# Importing data

In [8]:
#regex 
import re

#data manipulation:
import pandas as pd
import numpy as np

#nlp:
import nltk
import gensim

#language detection:
from langdetect import detect

# unidecode
import unidecode

# emoji support
import emoji

#spellchecker
from autocorrect import Speller

#tqdm for notebook progressbars
from tqdm.notebook import tqdm
tqdm.pandas() # pandas.progress_map & pandas.progress_apply 

# seeding numpy:
np.random.seed(2021)

  from tqdm.autonotebook import tqdm


In [9]:
"""
Topic modeling 
Tweets of the COVID_19
These tweets are collected using Twitter API and a Python script.
 A query for this high-frequency hashtag (#covid19) is run on a daily basis for a certain time period, to collect a larger number of tweets samples.
The tweets have #covid19 hashtag. Collection started on 25/7/2020, with an initial 17k batch.
"""
#Importing data
df = pd.read_csv(r"./data/covid19_tweets.csv", error_bad_lines=False)
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


# Checking for undesirable rows :

In [10]:
# csv exploration showed that we have no retweets :
print("Number of rows in the df :", df.shape[0])
print("Number of retweets :", df[df["is_retweet"] == True].shape[0])
print("List of all possible values in 'is_retweet' :",df.is_retweet.unique())

# we can safely drop the is_retweet column :
df.drop('is_retweet',axis=1, inplace=True)

Number of rows in the df : 179108
Number of retweets : 0
List of all possible values in 'is_retweet' : [False]


In [11]:
# since we are not supposed to have retweets, identical tweets are probably spam/bots !
# counting identical tweets :
print("Total number of rows in the df :", df.shape[0])
print("Nbr of identical tweets :", df.shape[0] - len(df.value_counts('text')))

# dropping identical tweets:
df.drop_duplicates('text', inplace=True)
print("Remaining rows after dropping the duplicates :", df.shape[0])

Total number of rows in the df : 179108
Nbr of identical tweets : 425
Remaining rows after dropping the duplicates : 178683


In [12]:
# checking if we have rows with nothing in the tweet body:
print("Number of rows with no tweet body :", len(df[df.text.isna()])) #ok

Number of rows with no tweet body : 0


# Processing the tweets :

In [22]:
# regex patterns:
url_pattern = re.compile(r'https?://\S+|www\.\S+')
number_pattern = re.compile(r'\d+')
specialchar_pattern = re.compile(r'[^\w\s]')
multiplespaces_pattern = re.compile(r'\s\s+')

def preprocessor(text):
    #replace emojis with text :
    text = emoji.demojize(text, delimiters=(" ", " "))
    
    #remove all unicode funk:
    text = unidecode.unidecode(text)
    
    #remove caps:
    text = text.lower()
    
    #remove urls :
    text = re.sub(pattern=url_pattern, repl='', string=text)
    
    #remove numbers :
    text = re.sub(pattern=number_pattern, repl='', string=text)
    
    #remove special chars :
    text = re.sub(pattern=specialchar_pattern, repl='', string=text)
    
    #remove the '\n' probably created by unidecode and the remaining "_"
    text = text.replace("\n", " ").replace("_", " ")
    
    #merge multiple spaces into 1:
    text = re.sub(pattern=multiplespaces_pattern, repl=' ', string=text)
    
    return(text)

In [23]:
df["c_text"] = df.text.progress_map(preprocessor)

  0%|          | 0/178683 [00:00<?, ?it/s]

In [24]:
df.head(5)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,c_text,lang
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,if i smelled the scent of hand sanitizers toda...,en
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,hey yankees yankeespr and mlb wouldnt it have ...,en
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,diane wdunlap realdonaldtrump trump never once...,en
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,brookbanktv the one gift covid has give me is ...,en
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,july media bulletin on novel coronavirusupdat...,en


# Checking the tweets after processing :

In [25]:
# It's likely that the preprocessor fnc completely deleted some tweets.
# Count the lines and remove them.
# also remove all-whitespace tweets & single letter tweets :
print("Number of empty tweets after preprocess :", df.shape[0] -df[df['c_text'].apply(
    lambda x: len(str(x).replace(" ","")) > 1)].shape[0])
# this is surprising!

# just in case, checking min and avg tweet len():
print("Minimum tweet len() :", df.c_text.str.len().min())
print("Mean tweet len() :", df.c_text.str.len().mean())
# I was wrong, no need to remove anything !

Number of empty tweets after preprocess : 0
Minimum tweet len() : 10
Mean tweet len() : 100.6656369100586


# Trying to detect the language of every tweet :

In [None]:
#use langdetect to detect the lang of every tweet in the df:
# this is pretty slow (~10 min)
df["lang"] = df.c_text.progress_map(detect)

  0%|          | 0/178683 [00:00<?, ?it/s]

In [None]:
# pickle the results:
import pickle as pk
with open("./tweet_df.pk", "wb") as fp:
    pk.dump(df, fp)

In [None]:
# Number of tweets that are probably not in english:
print("Nbr of non-english tweets :", df[df["lang"]!="en"].shape[0])

# 10 most detected tweet languages:
print("\n10 most detected languages :\n",df.lang.value_counts()[:10], end="")

In [None]:
# dump the non-english tweets in a file for manual exploration :
with open("./noneng.csv", "w", encoding="utf8") as fp:
    [fp.write(f"{elem}\n\n") for elem in df[df["lang"]!="en"].c_text]

In [None]:
#after manually reviewing the tweets tagged as non-english, 
#I'm ok with ok with removing them from the dataframe:
df.drop(df[df["lang"] != "en"].index, inplace = True)

print("Remaining tweets :",df.lang.value_counts())

In [None]:
# tokenize, remove stopwords and lemmatize everything
# creating a dedicated spacy pipeline :
import spacy
import lemminflect

nlp = spacy.load('en_core_web_trf', exclude=["transformer"])

df["doc"]= df.c_text.map_apply(nlp)

In [None]:
[elem.lemma_ for elem in docs[3]]

In [None]:
for token in docs[1]:
    print(token.lemma_)

In [None]:
# bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# bow_corpus[420]

# tfidf = models.TfidfModel(bow_corpus)
# corpus_tfidf = tfidf[bow_corpus]

# # define LDA model using the TF-IDF dictionary
# lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=4, id2word=dictionary, passes=3, workers=3)

# for idx, topic in lda_model_tfidf.print_topics(-1):
#     print('Topic: {} Word: {}'.format(idx, topic))