In [1]:
# import libraries
import pandas as pd
import numpy as np
import re
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, PCA, TruncatedSVD, FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from bs4 import BeautifulSoup as bs
from nltk.util import ngrams
import cgi
import nltk
import html
import readability 
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michaellewis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michaellewis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Data Pre-Processing**

In [2]:
biden = pd.read_csv("biden_filtered.csv")
trump = pd.read_csv("trump_filtered.csv")
biden['author'] = "Biden"
trump['author'] = "Trump"

In [3]:
trump = trump[trump.columns[1:]]
biden = biden[biden.columns[1:]]

In [4]:
tweets = pd.concat([biden, trump])

In [5]:
class TwitterCleaner:
    def __init__(self, df, column):
        self.df = df
        self.column = column

    def decode_xml_entities(self, text):
        text = html.unescape(text)
        text = str(text)
        return text

    def clean_text(self, lemmatize=False, stem=False, remove_stopwords=True):
        stop_words = set(stopwords.words('english'))
        self.df[self.column] = self.df[self.column].apply(lambda x: str(x))
        clean_list = []
        for text in self.df[self.column]:
            # remove URLs
            text = re.sub(r'http\S+','', text)  

            # remove '@' twitter mentions
            text = re.sub(r'@[A-Za-z0-9_]+','', text) 

            # XML to characters
            text = self.decode_xml_entities(text)

            # lowercase the text
            text = text.lower() 
            words = word_tokenize(text)

            clean_words = []

            # stemming / lemmatization

            for word in words:
                if remove_stopwords == True and word in stop_words:
                    continue
                if lemmatize == True: 
                    lemmatizer = WordNetLemmatizer()
                    word = lemmatizer.lemmatize(word)
                if stem == True: 
                    stemmer = PorterStemmer()
                    word = stemmer.stem(word)
                clean_words.append(word)

            clean_text = ' '.join(clean_words)
            clean_list.append(clean_text)

        if lemmatize:
            if remove_stopwords:
                self.df[f"{self.column}_clean_lemmatized_stopwords"] = clean_list
            else:
                self.df[f"{self.column}_clean_lemmatized"] = clean_list
        elif stem:
            if remove_stopwords:
                self.df[f"{self.column}_clean_stemmed_stopwords"] = clean_list
            else:
                self.df[f"{self.column}_clean_stemmed"] = clean_list
        else:
            if remove_stopwords:
                self.df[f"{self.column}_clean_stopwords"] = clean_list
            else:
                self.df[f"{self.column}_clean"] = clean_list

        return self.df


In [6]:
tc = TwitterCleaner(tweets, 'text')
tweets = tc.clean_text(lemmatize = True, stem = False, remove_stopwords = True)
tweets = tc.clean_text(lemmatize = False, stem = True, remove_stopwords = True)
tweets

Unnamed: 0,text,likes,retweets,timestamp,id,author,text_clean_lemmatized_stopwords,text_clean_stemmed_stopwords
0,Every single human being deserves to be treate...,11574,2423,2020-01-01 18:35:00,1.212442e+18,Biden,every single human deserves treated dignity . ...,everi singl human deserv treat digniti . every...
1,With just over one month until the Iowa Caucus...,1457,368,2020-01-02 00:01:00,1.212524e+18,Biden,"one month iowa caucus , need hand deck talk fo...","one month iowa caucu , need hand deck talk fol..."
2,This election is about the soul of our nation ...,44886,10192,2020-01-02 01:05:00,1.212540e+18,Biden,election soul nation — donald trump poison soul .,elect soul nation — donald trump poison soul .
3,Every day that Donald Trump remains in the Whi...,9581,2005,2020-01-02 02:07:00,1.212556e+18,Biden,every day donald trump remains white house put...,everi day donald trump remain white hous put f...
4,It was a privilege to work with @JulianCastro ...,17156,2284,2020-01-02 16:10:00,1.212768e+18,Biden,"privilege work obama administration , true hon...","privileg work obama administr , true honor tal..."
...,...,...,...,...,...,...,...,...
6403,"Iran never won a war, but never lost a negotia...",303007,57253,2020-01-03 12:44:30,1.213079e+18,Trump,"iran never war , never lost negotiation !","iran never war , never lost negoti !"
6404,Thank you to the @dcexaminer Washington Examin...,35044,9213,2020-01-01 01:03:15,1.212177e+18,Trump,thank washington examiner . list growing every...,thank washington examin . list grow everi day !
6405,One of my greatest honors was to have gotten C...,56731,12761,2020-01-01 00:55:01,1.212175e+18,Trump,one greatest honor gotten choice approved grea...,one greatest honor gotten choic approv great v...
6406,Just signed an order to support the workers of...,176289,36001,2020-10-22 21:04:21,1.319384e+18,Trump,signed order support worker delphi corporation...,sign order support worker delphi corpor make s...


**Analysis**

In [7]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()

def sentiment_score(sentence):
    score = analyzer.polarity_scores(sentence)
    return score['compound']

tweets['sentiment_scores'] = [sentiment_score(sentence) for sentence in tweets['text_clean_lemmatized_stopwords']]
tweets

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/michaellewis/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,text,likes,retweets,timestamp,id,author,text_clean_lemmatized_stopwords,text_clean_stemmed_stopwords,sentiment_scores
0,Every single human being deserves to be treate...,11574,2423,2020-01-01 18:35:00,1.212442e+18,Biden,every single human deserves treated dignity . ...,everi singl human deserv treat digniti . every...,-0.4019
1,With just over one month until the Iowa Caucus...,1457,368,2020-01-02 00:01:00,1.212524e+18,Biden,"one month iowa caucus , need hand deck talk fo...","one month iowa caucu , need hand deck talk fol...",0.6597
2,This election is about the soul of our nation ...,44886,10192,2020-01-02 01:05:00,1.212540e+18,Biden,election soul nation — donald trump poison soul .,elect soul nation — donald trump poison soul .,-0.5423
3,Every day that Donald Trump remains in the Whi...,9581,2005,2020-01-02 02:07:00,1.212556e+18,Biden,every day donald trump remains white house put...,everi day donald trump remain white hous put f...,-0.5719
4,It was a privilege to work with @JulianCastro ...,17156,2284,2020-01-02 16:10:00,1.212768e+18,Biden,"privilege work obama administration , true hon...","privileg work obama administr , true honor tal...",0.9442
...,...,...,...,...,...,...,...,...,...
6403,"Iran never won a war, but never lost a negotia...",303007,57253,2020-01-03 12:44:30,1.213079e+18,Trump,"iran never war , never lost negotiation !","iran never war , never lost negoti !",0.4071
6404,Thank you to the @dcexaminer Washington Examin...,35044,9213,2020-01-01 01:03:15,1.212177e+18,Trump,thank washington examiner . list growing every...,thank washington examin . list grow everi day !,0.5411
6405,One of my greatest honors was to have gotten C...,56731,12761,2020-01-01 00:55:01,1.212175e+18,Trump,one greatest honor gotten choice approved grea...,one greatest honor gotten choic approv great v...,0.9060
6406,Just signed an order to support the workers of...,176289,36001,2020-10-22 21:04:21,1.319384e+18,Trump,signed order support worker delphi corporation...,sign order support worker delphi corpor make s...,0.0000


In [8]:
print("Average sentiment score for Biden: " + str(tweets[tweets['author'] == "Biden"]['sentiment_scores'].mean()))

Average sentiment score for Biden: 0.17794974376494702


In [9]:
print("Average sentiment score for Trump: " + str(tweets[tweets['author'] == "Trump"]['sentiment_scores'].mean()))

Average sentiment score for Trump: 0.1705028870162297


In [10]:
# Basically the same sentiment score 