# Imports

In [4]:
%matplotlib inline

import pandas as pd

import string
import re

import nltk as nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer


# Loading The DF After Phase 2


In [5]:
def load_dataframe(filename):
    return pd.read_csv(filename)

df = load_dataframe('trumpTweets.csv')

# Cleaning The DF
## Removing Stop-Words

In [6]:
Stop_words = set (stopwords.words('English'))

In [7]:
#Printing the dataframe befor manipulation
print("The dataframe befor manipulation:")
df.head(1000)

The dataframe befor manipulation:


Unnamed: 0,text,isRetweet,favorites,retweets,date
0,Republicans and Democrats have both created ou...,f,49,255,02/08/2011 18:07
1,I was thrilled to be back in the Great city of...,f,73748,17404,03/03/2020 01:34
2,RT @CBS_Herridge: READ: Letter to surveillance...,t,0,7396,17/01/2020 03:22
3,The Unsolicited Mail In Ballot Scam is a major...,f,80527,23502,12/09/2020 20:10
4,RT @MZHemingway: Very friendly telling of even...,t,0,9081,17/01/2020 13:13
...,...,...,...,...,...
995,But 2020 is a long way from over! https://t.co...,f,187699,32040,25/11/2020 15:44
996,RT @ScottAdamsSays: The most corrupt instituti...,t,0,19473,14/11/2020 05:29
997,RT @jacobkschneider: Boston Herald endorses Pr...,t,0,6642,27/10/2020 15:29
998,RT @marklevinshow: The Boston Herald endorses ...,t,0,8716,27/10/2020 15:28


In [8]:
#Exploring the data before manipulation
print("shape of the dataframe:" ,df.shape)
print("\nThe dataframe count of null values:\n",df.isnull().sum(axis = 0))
df.describe(include='all')

shape of the dataframe: (56571, 5)

The dataframe count of null values:
 text         0
isRetweet    0
favorites    0
retweets     0
date         0
dtype: int64


Unnamed: 0,text,isRetweet,favorites,retweets,date
count,56571,56571,56571.0,56571.0,56571
unique,56118,2,,,50345
top,MAKE AMERICA GREAT AGAIN!,f,,,29/06/2020 12:12
freq,51,46694,,,10
mean,,,28349.55,8618.987467,
std,,,57815.64,13306.132408,
min,,,0.0,0.0,
25%,,,10.0,59.0,
50%,,,164.0,3450.0,
75%,,,43938.5,13014.5,


# Actions:
## Tokenizing each tweet
## Cleaning Punctuations
## Cleaning Links
## Cleaning Emojis
## Stemming
## Lemmataizing

In [9]:
def tokenizeTweets(sentance):
    words = word_tokenize(sentance)
    noise_free_words = [word for word in words if word not in Stop_words]
    return noise_free_words

In [10]:
def cleanPunctuation(sentance):
    new_sentance = sentance.translate(str.maketrans('', '', string.punctuation))
    new_sentance = new_sentance.replace("“","").replace("’","").replace("”","")
    return new_sentance

In [11]:
def cleanLinks(sentance):
    sentance = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', sentance, flags=re.MULTILINE)
    return(sentance)

In [12]:
def stemming(wordsList):
    #getting the root of every word for all tweets
    rootList = []
    ps = PorterStemmer()
    for w in wordsList:
        #remove the 'Retweet' tag
        if (w != 'RT'):
            rootList.append(ps.stem(w))
    return rootList

In [13]:
def lemmataizeSentance(wordsList):
    #getting the root of every word for all tweets
    rootList = []
    wl = WordNetLemmatizer()
    for w in wordsList:
        #remove the 'Retweet' tag
        if (w != 'RT'):
            rootList.append(wl.lemmatize(w))
    return rootList

In [14]:
def replace_empty_to_nan(sentance, dfm , index):
    if len(sentance) == 0 :
        dfm['text'][index] = float('NaN')
        
    return dfm    

In [15]:
def remove_nan_rows(df):
    nan_value = float("NaN")
    df.replace("", nan_value, inplace=True)
    df.dropna(subset = ["text"], inplace=True)
    return df

In [16]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [17]:
def remove_unnecessary_data(dataframe):
    dfm = df_lemmatize = df_stem = dataframe.copy()
    
    #remove duplicate rows
    dfm = dfm.drop_duplicates(keep='first').copy() 
    
    #removing links,emoji's, punc and tokenzing the words and making 3 dfs in 3 methods
    for i in range(0, len(df.index)):
        temp = df['text'][i].lower()
        temp = remove_emoji(temp)
        temp = cleanLinks(temp)
        temp = cleanPunctuation(temp)
        temp = tokenizeTweets(temp)
        df_stem['text'][i] = stemming(temp)
        df_lemmatize['text'][i] = lemmataizeSentance(temp)
       
        df_stem = replace_empty_to_nan(temp ,df_stem, i)
        df_lemmatize = replace_empty_to_nan(temp ,df_lemmatize, i)
    
    df_stem =  remove_nan_rows(df_stem)    
    df_lemmatize =  remove_nan_rows(df_lemmatize)    
    
             
    return df_stem , df_lemmatize

# Saving The Stemmed & The Lemmatzied DFs into CSV

In [18]:
# create tokenize data
df_stem = remove_unnecessary_data(df)[0]
df_lemmatize = remove_unnecessary_data(df)[1]

#save the progress to easy continuation
df_lemmatize.to_csv("df_lemmatize_token.csv")
df_stem.to_csv("df_stem_token.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stem['text'][i] = stemming(temp)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lemmatize['text'][i] = lemmataizeSentance(temp)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfm['text'][index] = float('NaN')
