### **Implementation Details**


#### **Step 2: Data Preprocessing**
In the data preprocessing step, we clean the collected tweets and make them ready for our topic analysis.
Key activities:

* Replacing Empty Locations with Unknown
* Filtering out Non English Tweets    
* Lowercasing
* Removing special characters 
* Removing Whitespaces
* Removing tagged Usernames
* Removing Hashtags
* Removing RT
* Removing URLs and Http tags 
* Removing Punctuations
* Removing Emojis
* Stopword Removal 
* Lemmatization



In [1]:
# Libraries needed
import pandas as pd
pd.options.display.max_columns = 50
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
plt.style.use('bmh')
# packages for data cleaning function
import re
import string
import pickle

import nltk
from sklearn.feature_extraction import text 
from nltk.stem import WordNetLemmatizer 
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.collocations import *
from nltk.tokenize import word_tokenize 
from emot.emo_unicode import UNICODE_EMOJI



Import *tweets_final.csv*

In [2]:
# Read the collected tweets into a DataFrame and preview the first rows
tweets_csv_path = '../data/tweets_final.csv'
df = pd.read_csv(tweets_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Date,ID,location,tweet,num_of_likes,num_of_retweet,language
0,0,2022-11-07 23:59:59+00:00,1589769667765469186,"California, USA",Taking into account personal contributions &am...,2,1,en
1,1,2022-11-07 23:59:59+00:00,1589769667652235267,@jlo follows ♡ 01.29.21,whats your fav song?\n\n❥ I’m voting #Jennifer...,0,10,en
2,2,2022-11-07 23:59:59+00:00,1589769667127934977,,@MayoIsSpicyy He is allowed to speak his opini...,0,0,en
3,3,2022-11-07 23:59:59+00:00,1589769666918244352,USA,HEY NY DISTRICT 10! PLEASE VOTE FOR @danielsgo...,1,1,en
4,4,2022-11-07 23:59:59+00:00,1589769666679144448,DMV,@YDanasmithdutra @BaddCompani @politicalblond ...,3,0,en


* Initial Data Preparation

In [3]:
### Data Cleaning

# Count the tweets whose location is missing (NaN)
df['location'].isna().sum()

7809

In [4]:
### Data Cleaning

# Keep every known location and substitute 'Unknown' wherever the value is NaN
location_known = df['location'].notna()
df['location'] = df['location'].where(location_known, 'Unknown')
df.head()

Unnamed: 0.1,Unnamed: 0,Date,ID,location,tweet,num_of_likes,num_of_retweet,language
0,0,2022-11-07 23:59:59+00:00,1589769667765469186,"California, USA",Taking into account personal contributions &am...,2,1,en
1,1,2022-11-07 23:59:59+00:00,1589769667652235267,@jlo follows ♡ 01.29.21,whats your fav song?\n\n❥ I’m voting #Jennifer...,0,10,en
2,2,2022-11-07 23:59:59+00:00,1589769667127934977,Unknown,@MayoIsSpicyy He is allowed to speak his opini...,0,0,en
3,3,2022-11-07 23:59:59+00:00,1589769666918244352,USA,HEY NY DISTRICT 10! PLEASE VOTE FOR @danielsgo...,1,1,en
4,4,2022-11-07 23:59:59+00:00,1589769666679144448,DMV,@YDanasmithdutra @BaddCompani @politicalblond ...,3,0,en


In [5]:
# Re-count missing locations after the fill; the expected result is 0
df['location'].isna().sum()

0

In [6]:
### Data Cleaning

# Keep only English tweets: drop (in place) every row whose language code is not 'en'
is_non_english = df['language'] != 'en'
non_english_index = df.loc[is_non_english].index
df.drop(index=non_english_index, inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,Date,ID,location,tweet,num_of_likes,num_of_retweet,language
0,0,2022-11-07 23:59:59+00:00,1589769667765469186,"California, USA",Taking into account personal contributions &am...,2,1,en
1,1,2022-11-07 23:59:59+00:00,1589769667652235267,@jlo follows ♡ 01.29.21,whats your fav song?\n\n❥ I’m voting #Jennifer...,0,10,en
2,2,2022-11-07 23:59:59+00:00,1589769667127934977,Unknown,@MayoIsSpicyy He is allowed to speak his opini...,0,0,en
3,3,2022-11-07 23:59:59+00:00,1589769666918244352,USA,HEY NY DISTRICT 10! PLEASE VOTE FOR @danielsgo...,1,1,en
4,4,2022-11-07 23:59:59+00:00,1589769666679144448,DMV,@YDanasmithdutra @BaddCompani @politicalblond ...,3,0,en


In [8]:
# Persist the filtered DataFrame as a pickle so later cells can reload it
filtered_pickle_path = "../source/filtered_df.pkl"
df.to_pickle(filtered_pickle_path)


* Lowercasing
* Removing special characters 
* Removing Whitespaces
* Removing tagged Usernames
* Removing Hashtags
* Removing RT
* Removing URLs and Http tags 
* Removing Punctuations
* Removing Emojis
* Stopword Removal
* Lemmatization

In [16]:
# Stop words: every whitespace-separated token in the stop-word file is one entry
with open('../data/stopwords.txt','r') as stopword_file:
    eng_stopwords = stopword_file.read().split()

# Keys of emot's UNICODE_EMOJI mapping, used by clean_tweet to drop emoji tokens
emoji = [*UNICODE_EMOJI]

# NLTK English word list, used by clean_tweet as the vocabulary of tokens to keep
words = set(nltk.corpus.words.words())

# Generic Data Cleaning function
def clean_tweet(text):
    """Normalise a raw tweet into a space-separated string of lemmatised words.

    Pipeline: lowercase; remove ``&amp;`` entities, @mentions, URLs, #hashtags,
    digits and ASCII punctuation; remove the standalone retweet marker ``rt``
    and leftover ``httptco``/``httpstco`` fragments; tokenise; drop stop words,
    emoji tokens and any token that is not in the NLTK English word list
    (``democrats``, ``dem`` and ``dems`` are always kept); lemmatise with WordNet.

    Relies on the module-level ``eng_stopwords``, ``emoji`` and ``words``
    collections.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN from a missing
            tweet) are treated as an empty tweet.

    Returns:
        str: The cleaned tweet; may be an empty string.
    """
    # A missing tweet arrives as a float NaN from pandas; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove the HTML-escaped ampersand first: once punctuation is stripped it
    # degrades to the bare token "amp" and can no longer be recognised.
    text = re.sub(r'&amp;', ' ', text)

    # Mentions (including underscores in handles), a special character that is
    # followed by a space, and scheme://... URLs are replaced by a space; the
    # split/join then collapses runs of whitespace.
    text = ' '.join(re.sub(r"(@\w+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", text).split())

    # Remaining mentions, hashtags (with their text) and digit runs.
    text = re.sub(r'@\w+|#\w+|\d+', '', text)

    # Remaining ASCII punctuation and digits.
    punct = str.maketrans('', '', string.punctuation + string.digits)
    text = text.translate(punct)

    # Drop the retweet marker only when it is a whole word, so words that merely
    # contain "rt" (party, support, start, ...) are left intact.
    text = re.sub(r'\brt\b', '', text)

    # Residue of t.co short links whose punctuation was stripped above.
    text = re.sub('(httptco)', '', text)
    text = re.sub('(httpstco)', '', text)

    # Political shorthand that must survive the English-vocabulary filter.
    always_keep = {'democrats', 'dem', 'dems'}
    kept_tokens = [
        token for token in word_tokenize(text)
        if token not in eng_stopwords
        and token not in emoji
        and (token in words or token in always_keep)
    ]

    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [17]:
# Reload the pickled filtered DataFrame and preview the first rows
filtered_pickle_path = '../source/filtered_df.pkl'
df_filtered = pd.read_pickle(filtered_pickle_path)
df_filtered.head()

Unnamed: 0.1,Unnamed: 0,Date,ID,location,tweet,num_of_likes,num_of_retweet,language
0,0,2022-11-07 23:59:59+00:00,1589769667765469186,"California, USA",Taking into account personal contributions &am...,2,1,en
1,1,2022-11-07 23:59:59+00:00,1589769667652235267,@jlo follows ♡ 01.29.21,whats your fav song?\n\n❥ I’m voting #Jennifer...,0,10,en
2,2,2022-11-07 23:59:59+00:00,1589769667127934977,Unknown,@MayoIsSpicyy He is allowed to speak his opini...,0,0,en
3,3,2022-11-07 23:59:59+00:00,1589769666918244352,USA,HEY NY DISTRICT 10! PLEASE VOTE FOR @danielsgo...,1,1,en
4,4,2022-11-07 23:59:59+00:00,1589769666679144448,DMV,@YDanasmithdutra @BaddCompani @politicalblond ...,3,0,en


In [18]:
# Run clean_tweet over every tweet and store the result in a new column
df_filtered['cleaned_tweets'] = df_filtered['tweet'].map(clean_tweet)
df_filtered.head()

Unnamed: 0.1,Unnamed: 0,Date,ID,location,tweet,num_of_likes,num_of_retweet,language,cleaned_tweets
0,0,2022-11-07 23:59:59+00:00,1589769667765469186,"California, USA",Taking into account personal contributions &am...,2,1,en,taking account personal bad everyone better tr...
1,1,2022-11-07 23:59:59+00:00,1589769667652235267,@jlo follows ♡ 01.29.21,whats your fav song?\n\n❥ I’m voting #Jennifer...,0,10,en,whats song voting
2,2,2022-11-07 23:59:59+00:00,1589769667127934977,Unknown,@MayoIsSpicyy He is allowed to speak his opini...,0,0,en,speak opinion like rest u opinion vote republi...
3,3,2022-11-07 23:59:59+00:00,1589769666918244352,USA,HEY NY DISTRICT 10! PLEASE VOTE FOR @danielsgo...,1,1,en,hey district please vote
4,4,2022-11-07 23:59:59+00:00,1589769666679144448,DMV,@YDanasmithdutra @BaddCompani @politicalblond ...,3,0,en,vote blue matter


In [19]:
# Persist the DataFrame, now including the cleaned_tweets column, as a second pickle
cleaned_pickle_path = "../source/cleaned_df.pkl"
df_filtered.to_pickle(cleaned_pickle_path)