# General Preprocessing (Data Cleaning)

In [31]:
import pandas as pd
import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji

In [32]:
# Location of the raw tweet exports. For Google Colab, switch to:
#   base_path = "/content/drive/MyDrive/SMA_Project/"
base_path = "Data/"
files = ["Tweets10062021.xlsx", "Tweets23052021.xlsx"]

# Read every export into its own frame, then stack them into a single frame
# with a fresh 0..n-1 index.
dfs = [pd.read_excel(base_path + file) for file in files]
df_tweets = pd.concat(dfs, ignore_index=True)

In [33]:
# Preview the combined raw tweets loaded from both Excel exports.
df_tweets

Unnamed: 0,text,user screen name,user followers,url,created at,replies,retweets,likes
0,@MatthewDavidH @EricTopol @TheEconomist @US_FD...,spadesgeek,19,https://mobile.twitter.com,2021-06-10 19:24:26.999999,0,0,0
1,"@DharkArk @JoeBiden You're right, too bad this...",selirodz,78,http://twitter.com/download/android,2021-06-10 19:24:26.000000,0,0,0
2,"@irishdean Salam, here is your unroll: 1/ @CDC...",threadreaderapp,476172,https://threadreaderapp.com,2021-06-10 19:24:04.999999,0,0,0
3,@theredshift11 @POTUS @studentsfordemo The vac...,FlowerGirlBaker,1486,http://twitter.com/#!/download/ipad,2021-06-10 19:23:57.000000,0,0,0
4,.@BorisJohnson @JustinTrudeau @POTUS @EUCounci...,DevizesGreens,150,https://mobile.twitter.com,2021-06-10 19:23:49.000000,0,0,0
...,...,...,...,...,...,...,...,...
22538,@POTUS @HillaryClinton @Jaemyung_Lee \n\nThe c...,LSungoun,90,http://twitter.com/download/android,2021-05-16 04:11:06.000000,0,0,0
22539,@POTUS Why can you not go maskless with out va...,bcgov115,38,http://twitter.com/download/iphone,2021-05-16 04:11:01.000000,0,0,0
22540,"@CDCgov You say the immunocomprimised ""should ...",dmdmdtweet,1,http://twitter.com/download/android,2021-05-16 04:08:22.999999,0,0,1
22541,"@50treeK8 @OregonGovBrown @CDCgov Well, Kate, ...",TuffCrusherPlus,83,http://twitter.com/download/iphone,2021-05-16 04:08:03.000000,0,0,0


In [34]:
# Move @mentions out of the tweet text into their own comma-separated column.
#
# Pattern from https://stackoverflow.com/questions/46633758/extracting-mentions-from-tweets-using-findall-python-giving-incorrect-results
# The negative look-behind skips an "@" that directly follows a word character
# or another "@" (e.g. the "@" inside an e-mail address); the capture group
# keeps the handle without the leading "@".
reg_ex = r'(?<![@\w])@(\w{1,25})'

# .str.join propagates missing tweets as NaN, whereas .apply(','.join) would
# raise a TypeError on them.
df_tweets['mention'] = df_tweets.text.str.findall(reg_ex).str.join(',')
# regex=True must be explicit: since pandas 2.0 str.replace defaults to
# regex=False, which would treat the pattern as a literal string and leave
# every mention in the text.
df_tweets['text'] = df_tweets.text.str.replace(reg_ex, "", regex=True)

# Move #hashtags into their own comma-separated column (stored without "#").
reg_ex = r"#(\w+)"
df_tweets['hashtag'] = df_tweets.text.str.findall(reg_ex).str.join(',')
df_tweets['text'] = df_tweets.text.str.replace(reg_ex, "", regex=True)




In [35]:
# Remove URLs: everything from "http" up to the next whitespace character.
# regex=True must be explicit: since pandas 2.0 str.replace defaults to
# regex=False, which would treat r'http\S+' as a literal string and remove nothing.
# (re.MULTILINE was dropped: the pattern has no ^/$ anchors, so the flag had no effect.)
df_tweets['text'] = df_tweets.text.str.replace(r'http\S+', '', regex=True)



In [36]:
# Trim leading and trailing whitespace left behind by the removals above.
df_tweets['text'] = df_tweets.text.str.strip()

# Replace the slang word "vax" with "vaccine".
# A word-boundary regex is used instead of the literal ' vax ' so that the word
# is also caught at the start or end of a tweet (always the case after strip()),
# next to punctuation ("vax."), when capitalised ("Vax"), and when it occurs
# twice in a row. Longer words such as "vaxxed" are left untouched.
df_tweets['text'] = df_tweets.text.str.replace(
    r'\bvax\b', "vaccine", flags=re.IGNORECASE, regex=True
)





In [37]:
# Drop every tweet posted by the threadreaderapp account.
# Threadreaderapp (https://twitter.com/threadreaderapp) is a bot to make a thread more readable
not_thread_bot = df_tweets["user screen name"].ne("threadreaderapp")
df_tweets = df_tweets.loc[not_thread_bot]


In [38]:
# Inspect the cleaned frame, which now carries the new 'mention' and 'hashtag' columns.
df_tweets

Unnamed: 0,text,user screen name,user followers,url,created at,replies,retweets,likes,mention,hashtag
0,Doesn't vaccine approval typically take 6 and ...,spadesgeek,19,https://mobile.twitter.com,2021-06-10 19:24:26.999999,0,0,0,"MatthewDavidH,EricTopol,TheEconomist,US_FDA,la...",
1,"You're right, too bad this vaccine doesn't hav...",selirodz,78,http://twitter.com/download/android,2021-06-10 19:24:26.000000,0,0,0,"DharkArk,JoeBiden",
3,The vaccine is free…❤️,FlowerGirlBaker,1486,http://twitter.com/#!/download/ipad,2021-06-10 19:23:57.000000,0,0,0,"theredshift11,POTUS,studentsfordemo",
4,. the world needs vaccine access now. Th...,DevizesGreens,150,https://mobile.twitter.com,2021-06-10 19:23:49.000000,0,0,0,"BorisJohnson,JustinTrudeau,POTUS,EUCouncil,Reg...","COVID19,G7,EndThePandemic,COVAX"
5,But you can get it.\nI remember getting my Mum...,canfixstoopid,355,https://mobile.twitter.com,2021-06-10 19:23:42.000000,0,0,0,"SerendipityOr,Shockwave_Shaun,Ozymandiyaas,Joe...",
...,...,...,...,...,...,...,...,...,...,...
22538,The country need to check on people who take o...,LSungoun,90,http://twitter.com/download/android,2021-05-16 04:11:06.000000,0,0,0,"POTUS,HillaryClinton,Jaemyung_Lee",
22539,Why can you not go maskless with out vaccine? ...,bcgov115,38,http://twitter.com/download/iphone,2021-05-16 04:11:01.000000,0,0,0,POTUS,
22540,"You say the immunocomprimised ""should discuss ...",dmdmdtweet,1,http://twitter.com/download/android,2021-05-16 04:08:22.999999,0,0,1,CDCgov,
22541,"Well, Kate, not everyone is vaccinated and the...",TuffCrusherPlus,83,http://twitter.com/download/iphone,2021-05-16 04:08:03.000000,0,0,0,"50treeK8,OregonGovBrown,CDCgov",


In [39]:
# Persist the cleaned tweets next to the raw exports; the frame's index is not written.
cleaned_path = base_path + "Tweets_cleaned.xlsx"
df_tweets.to_excel(cleaned_path, index=False)