***
# <h1 align = 'center'>Data preprocessing</h1> 
#### <center> Abderahmane BELLAMINE, Manal EL IDRISSI </center>
#### <center> Ecole Centrale Casablanca </center>
#### <center> January 2021 </center>
***

The purpose of this Lab is to clean data previously collected from Twitter using regex and some nlp techniques ( tokenization, stemmatizatiob). This data will be used for visualization and the creation of Bokeh app. 

### Objectives of this jupyter notebooks file:
- Data Preparation and Cleansing

#### Dependencies import
 

In [1]:
import json
import pandas as pd
import re
import spacy
from spacy.lang.en import English
import nltk
import string

In [2]:
def load_data(fname):
    data = list()
    with open(fname, 'r') as file:
        for line in file:
            data += json.loads(line)
    return data

In [3]:
usa_tweets = load_data("../data/usa_tweets.jsonl")
df_usa = pd.DataFrame(usa_tweets)

In [4]:
uk_tweets = load_data("../data/uk_tweets.jsonl")
df_uk = pd.DataFrame(uk_tweets)

In [5]:
germany_tweets = load_data("../data/germany_tweets.jsonl")
df_germany = pd.DataFrame(germany_tweets)

### Cleaning the data

In this section, we will use the data we stored and perform some text mining tasks on it. 

In [26]:
def clean_tweet(tweet):
    user_handles = re.findall(r'@[A-Za-z0-9]+', tweet) #search @mentions
    hashtags = re.findall(r'#[A-Za-z0-9]+', tweet) #search #hashtags
    links = re.findall(r' https?:\/\/\S*', tweet) #search hyperlinks
    
    cleaned_tweet = re.sub(r'@[A-Za-z0-9]+','', tweet) #remove @mentions
    cleaned_tweet = re.sub(r'#[A-Za-z0-9]+', '', cleaned_tweet) #remove #hashtags 
    cleaned_tweet = re.sub(r' https?:\/\/\S*', '', cleaned_tweet, flags=re.MULTILINE) #remove hyperlinks
    return cleaned_tweet.strip(), user_handles, hashtags, links

In [27]:
#Perform a small test
example1="Digital X Worldwide | Today Is Steve Jobs Day In California @apple http://t.co/QSCHuMIN"
example2="Wow. Great deals on refurbed #iPad (first gen) models. RT: Apple offers great deals on refurbished 1st-gen iPads http://t.co/ukWOKBGd @Apple"

In [28]:
print(clean_tweet(example1))
# ('Digital X Worldwide | Today Is Steve Jobs Day In California', ['@apple'], [], ['http://t.co/QSCHuMIN'])
print(clean_tweet(example2))
# ('Wow. Great deals on refurbed  (first gen) models. RT: Apple offers great deals on refurbished 1st-gen iPads', ['@Apple'], ['#iPad'], ['http://t.co/ukWOKBGd'])

('Digital X Worldwide | Today Is Steve Jobs Day In California', ['@apple'], [], [' http://t.co/QSCHuMIN'])
('Wow. Great deals on refurbed  (first gen) models. RT: Apple offers great deals on refurbished 1st-gen iPads', ['@Apple'], ['#iPad'], [' http://t.co/ukWOKBGd'])


In [29]:
def clean_all_tweets(df):
    df['cleaned_tweet'] = df['text'].apply(lambda x: clean_tweet(x)[0])
    df['user_handles'] = df['text'].apply(lambda x: clean_tweet(x)[1])
    df['hashtags'] = df['text'].apply(lambda x: clean_tweet(x)[2])
    df['links'] = df['text'].apply(lambda x: clean_tweet(x)[3])
    return df

In [30]:
def remove_punct(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    text = re.sub('[0–9]+','', text)
    return text

In [31]:
def tokenizer(text):
    '''
    Function that takes a text, tokenizes it using spacy's nlp(text), processes that text in spaCy and appends the results to a list 
    Return: list of tokens
    '''
    # write your code
    list_tokens = list()
    doc = nlp(text)
    for token in doc:
        list_tokens.append(token)
    return list_tokens

In [32]:
nlp = English()

In [33]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [34]:
# The total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

# The first ten stop words:
print('First twenty stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First twenty stop words: ['because', 'none', 'otherwise', 'fifteen', 'get', 'everyone', 'would', 'us', 'also', 'alone', 'after', 'unless', '’m', 'thence', 'eleven', 'at', 'whatever', '’re', 'others', 'seem']


In [35]:
def tokenizer_noStopw(text):
    '''
    Function that takes a text, tokenizes it using spacy's nlp function, removes the stopwords 
    Return: list of filtered tokens
    '''
    # write your code
    list_tokens = list()
    doc = nlp(text)
    for token in doc:
        if token.is_stop == False:
            list_tokens.append(token.text)
    return list_tokens

In [36]:
ps = nltk.PorterStemmer()

In [37]:
def stemming(text):
    text = [ps.stem(word) for word in text]
    return " ".join(text)

In [38]:
def build_dataframe(df):
    
    df = clean_all_tweets(df)
    df['punct'] = df.cleaned_tweet.apply(remove_punct)
    df['tokenized_text'] = df.punct.apply(tokenizer)
    df['tokenizer_noStopw'] = df.punct.apply(tokenizer_noStopw)
    df['stemmed_text'] = df.tokenizer_noStopw.apply(stemming)
    
    return df

In [39]:
# Display the head of your Dataframe
new_df_usa = build_dataframe(df_usa)
new_df_usa.head()

Unnamed: 0,date,author,twitter_name,text,number_of_likes,number_of_retweets,cleaned_tweet,user_handles,hashtags,links,punct,tokenized_text,tokenizer_noStopw,stemmed_text
0,1643812291000,Grant Rivers 🏳️‍🌈,SnowAndBeach,In addition…\n\nChildren over the age of 12 ar...,0,0,In addition…\n\nChildren over the age of 12 ar...,[],[],[ https://t.co/AZnsErX4lR],In addition…\n\nChildren over the age of 12 ar...,"[In, addition, …, \n\n, Children, over, the, a...","[addition, …, \n\n, Children, age, 12, able, e...",addit … \n\n children age 12 abl enter countri...
1,1643812282000,NFTurk (L-Constant Panic),MichaelTurk,#covid should be like the perfect attendance a...,0,0,should be like the perfect attendance award in...,[],"[#covid, #US]",[ https://t.co/hKiM3oTffw],should be like the perfect attendance award in...,"[should, be, like, the, perfect, attendance, a...","[like, perfect, attendance, award, high, schoo...",like perfect attend award high school person ...
2,1643812202000,Gate 15,Gate_15_Analyst,As anti-vaccine mandate protest enters 5th day...,0,0,As anti-vaccine mandate protest enters 5th day...,[],[#COVID],[ https://t.co/rbRYVYF29P],As antivaccine mandate protest enters 5th day ...,"[As, antivaccine, mandate, protest, enters, 5t...","[antivaccine, mandate, protest, enters, 5th, d...",antivaccin mandat protest enter 5th day ottawa...
3,1643812183000,Gate 15,Gate_15_Analyst,Ottawa's police chief says the response to the...,0,0,Ottawa's police chief says the response to the...,[],[#COVID],[ https://t.co/ScmVUAUfYJ],Ottawas police chief says the response to the ...,"[Ottawas, police, chief, says, the, response, ...","[Ottawas, police, chief, says, response, prote...",ottawa polic chief say respons protest success...
4,1643812175000,Gate 15,Gate_15_Analyst,COVID study in which young adults were infecte...,0,0,COVID study in which young adults were infecte...,[],[#COVID],[ https://t.co/B7krPeoSaX],COVID study in which young adults were infecte...,"[COVID, study, in, which, young, adults, were,...","[COVID, study, young, adults, infected, virus,...",covid studi young adult infect viru reveal result


In [40]:
new_df_uk = build_dataframe(df_uk)
new_df_uk.head()

Unnamed: 0,date,author,twitter_name,text,number_of_likes,number_of_retweets,cleaned_tweet,user_handles,hashtags,links,punct,tokenized_text,tokenizer_noStopw,stemmed_text
0,1643812310000,Thomas (Tom) Yoritaka,tomyoritaka,God-given #science - plus consideration for fe...,0,0,God-given - plus consideration for fellow cit...,[],[#science],[ https://t.co/Vjqaz8LCJV],Godgiven plus consideration for fellow citiz...,"[Godgiven, , plus, consideration, for, fello...","[Godgiven, , plus, consideration, fellow, ci...",godgiven plu consider fellow citizen save...
1,1643812260000,CartoonStock,CartoonStock,"""We're running low so some of you will have to...",0,0,"""We're running low so some of you will have to...",[],[],[ https://t.co/3iVlWv71F5],Were running low so some of you will have to s...,"[Were, running, low, so, some, of, you, will, ...","[running, low, share, \n\n, Cartoon, Jeremy, B...",run low share \n\n cartoon jeremi banx \n\n fi...
2,1643812159000,Gary #NHSPay15 💙🌍🌈💖,GarySyms,"@benonwine Yep, me &amp; my Husband 3× jabbed ...",0,0,"Yep, me &amp; my Husband 3× jabbed still , ha...",[@benonwine],[#WearAMask],[ https://t.co/Ik8Tqzy8pH],Yep me amp my Husband 3× jabbed still hand s...,"[Yep, me, amp, my, Husband, 3×, jabbed, still,...","[Yep, amp, Husband, 3×, jabbed, , hand, sani...",yep amp husband 3× jab hand sanit lft touch...
3,1643812147000,Ed Kiernan,Eddie_K_1974,#BorisJohnson announced he will end the #Covid...,0,0,announced he will end the pandemic by no long...,[],"[#BorisJohnson, #Covid]",[ https://t.co/j13sTKZCO2],announced he will end the pandemic by no long...,"[announced, he, will, end, the, , pandemic, b...","[announced, end, , pandemic, longer, publishi...",announc end pandem longer publish figur numb...
4,1643812146000,Doffou Radio Bordeaux,DoffouRadio,#NowPlaying Lauryn Hill - Doo Wop (That Thing)...,0,0,Lauryn Hill - Doo Wop (That Thing) …,[],"[#NowPlaying, #radioking, #bokaomw, #doffourad...",[ https://t.co/pRsqhj4lFu],Lauryn Hill Doo Wop That Thing …,"[Lauryn, Hill, , Doo, Wop, That, Thing, ...","[Lauryn, Hill, , Doo, Wop, Thing, , …]",lauryn hill doo wop thing …


In [41]:
new_df_germany = build_dataframe(df_germany)
new_df_germany.head()

Unnamed: 0,date,author,twitter_name,text,number_of_likes,number_of_retweets,cleaned_tweet,user_handles,hashtags,links,punct,tokenized_text,tokenizer_noStopw,stemmed_text
0,1643812310000,Thomas (Tom) Yoritaka,tomyoritaka,God-given #science - plus consideration for fe...,0,0,God-given - plus consideration for fellow cit...,[],[#science],[ https://t.co/Vjqaz8LCJV],Godgiven plus consideration for fellow citiz...,"[Godgiven, , plus, consideration, for, fello...","[Godgiven, , plus, consideration, fellow, ci...",godgiven plu consider fellow citizen save...
1,1643812260000,CartoonStock,CartoonStock,"""We're running low so some of you will have to...",0,0,"""We're running low so some of you will have to...",[],[],[ https://t.co/3iVlWv71F5],Were running low so some of you will have to s...,"[Were, running, low, so, some, of, you, will, ...","[running, low, share, \n\n, Cartoon, Jeremy, B...",run low share \n\n cartoon jeremi banx \n\n fi...
2,1643812159000,Gary #NHSPay15 💙🌍🌈💖,GarySyms,"@benonwine Yep, me &amp; my Husband 3× jabbed ...",0,0,"Yep, me &amp; my Husband 3× jabbed still , ha...",[@benonwine],[#WearAMask],[ https://t.co/Ik8Tqzy8pH],Yep me amp my Husband 3× jabbed still hand s...,"[Yep, me, amp, my, Husband, 3×, jabbed, still,...","[Yep, amp, Husband, 3×, jabbed, , hand, sani...",yep amp husband 3× jab hand sanit lft touch...
3,1643812147000,Ed Kiernan,Eddie_K_1974,#BorisJohnson announced he will end the #Covid...,0,0,announced he will end the pandemic by no long...,[],"[#BorisJohnson, #Covid]",[ https://t.co/j13sTKZCO2],announced he will end the pandemic by no long...,"[announced, he, will, end, the, , pandemic, b...","[announced, end, , pandemic, longer, publishi...",announc end pandem longer publish figur numb...
4,1643812146000,Doffou Radio Bordeaux,DoffouRadio,#NowPlaying Lauryn Hill - Doo Wop (That Thing)...,0,0,Lauryn Hill - Doo Wop (That Thing) …,[],"[#NowPlaying, #radioking, #bokaomw, #doffourad...",[ https://t.co/pRsqhj4lFu],Lauryn Hill Doo Wop That Thing …,"[Lauryn, Hill, , Doo, Wop, That, Thing, ...","[Lauryn, Hill, , Doo, Wop, Thing, , …]",lauryn hill doo wop thing …


In [42]:
import sys
sys.setrecursionlimit(1500)

In [43]:
new_df_usa.to_csv("../cleaned_data/usa_cleaned_tweets.csv")

In [44]:
new_df_uk.to_csv("../cleaned_data/uk_cleaned_tweets.csv")

In [45]:
new_df_germany.to_csv("../cleaned_data/germany_cleaned_tweets.csv")