# Cleaning the dataset

We assume that all data exploration has already been done, so here we only remove the fields we do not need.

In [None]:
import json
import numpy as np
import pandas as pd
import pickle as pkl
import pyspark as ps
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize

In [None]:
EXAMPLE_PATH = 'swiss-tweet/example.json'

# Read the JSON dump as UTF-8 explicitly instead of relying on the platform's
# default encoding, which would garble accented characters on some systems.
with open(EXAMPLE_PATH, encoding='utf-8') as data_file:
    example = json.load(data_file)

# Flatten the nested records into one column per leaf field.
# `pd.json_normalize` is the supported entry point; the older
# `pandas.io.json.json_normalize` alias is deprecated.
cleaned = pd.json_normalize(example)

# Remove the '_source.' marker that flattening leaves in the column names.
cleaned.columns = [column.replace('_source.', '') for column in cleaned.columns]

## Preparatory Steps :

### Data cleaning :

Looking at all the columns we have, only 10 are worth keeping:
- main: this is the tweet in itself
- published: this is the time when the tweet was published
- source_spam_probability: needed to filter out spam tweets
- source_location: this is the place where the tweet was published
- tags: useful because hashtags (#) give us concise information
- lang: allows us to choose the languages we will work with
- sentiment: allows us to choose our subset
- author_gender: can help us get more insight
- source_followers: can help us get more insight
- source_following: can help us get more insight

In [None]:
# Only keep the columns needed for the analysis.
KEPT_COLUMNS = [
    'main', 'published', 'source_spam_probability', 'source_location', 'tags',
    'lang', 'sentiment', 'author_gender', 'source_followers', 'source_following',
]

# .copy() makes the subset an independent frame, so the in-place edits made
# further down do not trigger pandas' SettingWithCopyWarning.
cleaned = cleaned[KEPT_COLUMNS].copy()

We start by keeping only the languages we want to work with: German (`de`), French (`fr`) and English (`en`).

In [None]:
# Language codes kept for the analysis: German, French and English.
KEPT_LANGUAGES = ['de', 'fr', 'en']

# True for every row to discard: any other language, or no language at all.
lang_mask = ~cleaned['lang'].isin(KEPT_LANGUAGES)

# Filter into a new, re-indexed frame rather than dropping rows in place.
cleaned = cleaned[~lang_mask].reset_index(drop=True)

The next step is to drop all tweets that have a 'POSITIVE' sentiment.

In [None]:
# Flag the tweets whose sentiment label is exactly 'POSITIVE' ...
sent_mask = cleaned['sentiment'].eq('POSITIVE')
# ... remove them by index label, then renumber the remaining rows from 0.
cleaned.drop(index=cleaned.index[sent_mask], inplace=True)
cleaned.reset_index(inplace=True, drop=True)

Next, we drop all tweets that have a probability of at least 50% of being spam.

In [None]:
# Rows whose spam probability is 0.5 or higher.
spam_mask = cleaned['source_spam_probability'].ge(0.5)
spam_rows = cleaned.loc[spam_mask].index
# Drop those rows in place and rebuild a contiguous 0..n-1 index.
cleaned.drop(spam_rows, axis=0, inplace=True)
cleaned.reset_index(inplace=True, drop=True)

In [None]:
# Preview the filtered frame; showing a bounded number of rows keeps the
# notebook output small.
cleaned.head(10)

### Data preprocessing :

Some of the data is not yet in a usable form. To fix this, we reformat it in several steps, starting with the date.

In [None]:
# Parse the raw 'published' values into pandas datetimes.
cleaned['published'] = cleaned['published'].pipe(pd.to_datetime)
# Show the first ten parsed timestamps as a sanity check.
cleaned['published'].iloc[:10]

Now we focus on the main element of our analysis, the tweets. We start by converting everything to lowercase and stripping accents, which reduces the text to plain ASCII.

In [None]:
import unicodedata


def _to_ascii(text):
    """Strip diacritics from ``text`` and drop whatever is still non-ASCII."""
    # NFD splits an accented letter into base letter + combining mark; the
    # ASCII encode with 'ignore' then keeps the base letter and drops the mark.
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')


# casefold() lowercases like lower() but also expands letters such as the
# German 'ß' to 'ss'. With lower(), 'ß' has no NFD decomposition and would be
# silently deleted by the ASCII step ('straße' -> 'strae').
cleaned['main'] = cleaned['main'].astype(str).str.casefold().apply(_to_ascii)
cleaned['main'].head()

Then, we remove URLs and all non-alphanumeric characters (punctuation and symbols).

In [None]:
# Remove links first, then every remaining character that is neither a word
# character nor whitespace.
# regex=True is required: recent pandas versions treat the pattern as literal
# text by default, which would leave the tweets unchanged.
# Raw strings keep '\S', '\w' and '\s' from being read as (invalid) string
# escapes, and the dot in 'pic.twitter' is escaped so it only matches a dot.
cleaned['main'] = (
    cleaned['main']
    .str.replace(r'www\S+', '', regex=True)
    .str.replace(r'http\S+', '', regex=True)
    .str.replace(r'pic\.twitter\S+', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
)
cleaned['main'].head()

The next step is removing stopwords and stemming (keeping only the root of each word) using an NLP library (NLTK).

In [None]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

def process_words(language):
    """Remove stopwords from, and stem, every tweet written in ``language``.

    Works on the notebook-level ``cleaned`` frame: for the rows whose 'lang'
    code matches ``language``, the 'main' string is split on whitespace,
    stopwords are dropped and each remaining word is stemmed. The cell value
    becomes a list of stemmed tokens.

    Parameters
    ----------
    language : str
        Language name passed to both ``SnowballStemmer`` and
        ``stopwords.words`` (e.g. 'english', 'french', 'german').
        The 'lang' code is taken from the first two letters of the name, with
        'german' special-cased to 'de'. Any other language whose code is not
        the first two letters of its name will match no rows.
    """
    import re
    import unicodedata

    # Two-letter code as stored in the 'lang' column.
    lang = language[:2]
    if language == 'german':
        lang = 'de'

    stemmer = SnowballStemmer(language)

    def _like_tweet_text(word):
        # Mirror the tweet preprocessing (lowercase, accents stripped,
        # punctuation removed) so that a stopword such as "été" or "don't"
        # can actually match the tokens "ete" / "dont" found in the tweets.
        folded = unicodedata.normalize('NFD', word.lower())
        folded = folded.encode('ascii', 'ignore').decode('ascii')
        return re.sub(r'[^\w\s]', '', folded)

    # A set gives constant-time membership tests; it holds both the original
    # and the normalized spelling of every stopword.
    raw_stopwords = stopwords.words(language)
    lang_set = set(raw_stopwords) | {_like_tweet_text(word) for word in raw_stopwords}

    # Compute the row selection once and reuse it for the read and the write.
    rows = cleaned['lang'] == lang
    cleaned.loc[rows, 'main'] = cleaned.loc[rows, 'main'].str.split().apply(
        lambda tweet: [stemmer.stem(word) for word in tweet if word not in lang_set]
    )

In [None]:
# Apply stopword removal and stemming to each supported language in turn.
for language in ('english', 'french', 'german'):
    process_words(language)
cleaned['main'].head()