<h3>Imports</h3>

In [2]:
import pandas as pd
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
import pygal
from pygal.style import Style
from textblob import TextBlob

nltk.download('punkt')
nltk.download('stopwords')

data = pd.read_csv("csv/tweets.csv", sep = ',')

[nltk_data] Downloading package punkt to /home/nomdebrew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nomdebrew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h3>Drop columns except 'created_str' and 'text', separate data, and perform sentiment analysis</h3>

In [3]:
columns_to_use = ['created_str','text']
data = data[columns_to_use]
data = data.dropna(subset=['text'])

#convert created_str to datetime and drop hour, min, sec
data['created_str'] = pd.to_datetime(data['created_str'])
data.created_str = data.created_str.map(lambda x: x.replace(day=1, hour=0, minute=0, second=0))


clintonData = data[data['text'].str.contains('clinton|hillary', case=False)]
trumpData = data[data['text'].str.contains('trump|donald', case=False)]
bothData = data[data['text'].str.contains('clinton|hillary|trump|donald', case=False)]

#add polarity and subjectivity columns to dataframes
clintonData['polarity'] = clintonData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
clintonData['subjectivity'] = clintonData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)
trumpData['polarity'] = trumpData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
trumpData['subjectivity'] = trumpData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)
bothData['polarity'] = bothData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
bothData['subjectivity'] = bothData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)

df_C1 = clintonData.groupby('created_str').mean()
df_T1 = trumpData.groupby('created_str').mean()
df_B1 = bothData.groupby('created_str').mean()

print(data.shape)
print(clintonData.shape)
print(trumpData.shape)
print(bothData.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

(203430, 2)
(25555, 4)
(40403, 4)
(57521, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
#get dates back into column instead of being an index
df_C1.to_csv('csv/clinton_1.csv', sep=',')
df_C1 = pd.read_csv('csv/clinton_1.csv')

df_T1.to_csv('csv/trump_1.csv', sep=',')
df_T1 = pd.read_csv('csv/trump_1.csv')

df_B1.to_csv('csv/both_1.csv', sep=',')
df_B1 = pd.read_csv('csv/both_1.csv')

<h3>Generate Graphs with Pygal before preprocessing</h3>

In [5]:
custom_style = Style(colors=('red','blue','purple'))
date_chart = pygal.Line(x_label_rotation=60, fill=False, style=custom_style)
date_chart.title = 'Polarity Over Time'
date_chart.x_labels = df_C1['created_str']
date_chart.add("trump", df_T1['polarity'])
date_chart.add("Clinton", df_C1['polarity'])
date_chart.add("Both", df_B1['polarity'])
date_chart.render_to_file('images/polarity_date.svg')

date_chart = pygal.Line(x_label_rotation=60, fill=False, style=custom_style)
date_chart.title = 'Subjectivity Over Time'
date_chart.x_labels = df_C1['created_str']
date_chart.add("trump", df_T1['subjectivity'])
date_chart.add("Clinton", df_C1['subjectivity'])
date_chart.add("Both", df_B1['subjectivity'])
date_chart.render_to_file('images/subjectivity_date.svg')

In [6]:
custom_style = Style(colors=('red',))
gauge = pygal.SolidGauge(inner_radius=0.70, style=custom_style, show_legend=False)
percent_formatter = lambda x: '{:.10g}%'.format(x)
gauge.title = 'trump'
gauge.value_formatter = percent_formatter
gauge.add('trump', [{'value': round(len(trumpData)/len(data)*100,1), 'max_value': 100}])
gauge.render_to_file('images/t_guage.svg')

custom_style = Style(colors=('blue',))
gauge = pygal.SolidGauge(inner_radius=0.70, style=custom_style, show_legend=False)
percent_formatter = lambda x: '{:.10g}%'.format(x)
gauge.title = 'Clinton'
gauge.value_formatter = percent_formatter
gauge.add('Clinton', [{'value': round(len(clintonData)/len(data)*100,1), 'max_value': 100}])
gauge.render_to_file('images/c_guage.svg')

custom_style = Style(colors=('purple',))
gauge = pygal.SolidGauge(inner_radius=0.70, style=custom_style, show_legend=False)
percent_formatter = lambda x: '{:.10g}%'.format(x)
gauge.title = 'Both'
gauge.value_formatter = percent_formatter
gauge.add('Both', [{'value': round(len(bothData)/len(data)*100,1), 'max_value': 100}])
gauge.render_to_file('images/b_guage.svg')

In [7]:
#reindex dataframe
bothData = bothData.reset_index(drop=True)

<h3>Example Tweets</h3>

In [8]:
clintonData = clintonData.sort_values(by=['polarity'])
clintonData = clintonData.reset_index(drop=True)
print(clintonData.loc[0:19,['text','polarity']])

                                                 text  polarity
0   RT @luvGodncountry: Wikileaks: Clinton Adviser...      -1.0
1   RT @andersonDrLJA: #BillClinton #Obama #Hillar...      -1.0
2   RT @venus58: Hey @ShepNewsTeam your show was A...      -1.0
3   RT @Col_Connaughton: ASSANGE: CLINTON MEDIA 'E...      -1.0
4   Tennessee GOP\n@TEN_GOP\nTrump: "Hillary Clint...      -1.0
5   RT @HelloHello228: Terrible I'll bet that hurt...      -1.0
6   RT @netwrkguy: @StopStopHillary @sluggoD54 Loo...      -1.0
7   💣 BREAKING!!!!! 💣\n\n‘Calibration error’ chang...      -1.0
8                  @HillaryClinton VETERANS HATE YOU!      -1.0
9   RT @kevkid79: "Clinton Camp Claims Media Was P...      -1.0
10  Bernie campaign director endorses Trump, slams...      -1.0
11  RT @Merry__Can: @HillaryClinton #democRATS are...      -1.0
12  SHOCKING: Leaked photo of Hillary Clinton with...      -1.0
13  Hillary Clinton protects serial rapist Bill Cl...      -1.0
14  RT @bob_owens: If Trump is the worst

In [9]:
trumpData = trumpData.sort_values(by=['subjectivity'])
trumpData = trumpData.reset_index(drop=True)
print(trumpData.loc[0:19,['text','subjectivity']])

                                                 text  subjectivity
0   RT @mcicero10: #BernieSanders #Trump people sh...           0.0
1   RT @ChristiChat: MT @ChristiChat: #WakeUpAmeri...           0.0
2   RT @SCLconservative: The State Controlled Medi...           0.0
3   RT @FreedomChild3: Seven Ways Obama Is Trying ...           0.0
4   RT @theglobaluniter: @realDonaldTrump \n\n💥 Wh...           0.0
5   RT @Thiru_0914: 🇺🇸Donald J. Trump Rally SATURD...           0.0
6   RT @Trump__Pence: VIDEO : Senator Tim Scott Pr...           0.0
7   RT @business: Bloomberg exclusive: Vladimir Pu...           0.0
8   RT @realDonaldTrump: Today in Florida, I pledg...           0.0
9   Man is trying to scale Trump Tower in NYC usin...           0.0
10  RT @GoForTimmer: #ItsUnacceptableTo Act like r...           0.0
11  .@ericbolling: "Can you imagine the president-...           0.0
12  RT @TrueCOT: Team Hillary Gave GOP Establishme...           0.0
13  Driver uses cardboard Trump head in carpool 

<h3>Tokenize Tweets</h3>

In [10]:
clintonData['tokenized_sents'] = clintonData.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
trumpData['tokenized_sents'] = trumpData.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
bothData['tokenized_sents'] = bothData.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

<h3>Lemmatize Tweets</h3>

In [11]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in text]

clintonData['text_lemmatized'] = clintonData.tokenized_sents.apply(lemmatize_text)
trumpData['text_lemmatized'] = trumpData.tokenized_sents.apply(lemmatize_text)
bothData['text_lemmatized'] = bothData.tokenized_sents.apply(lemmatize_text)

<h3>Remove Stop Words From Tweets</h3>

In [12]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
custom_stopwords = ['RT','https','http','@',':']
stop.extend(custom_stopwords)

clintonData['without_stopwords'] = clintonData.text_lemmatized.apply(lambda x: [item for item in x if item not in stop])
trumpData['without_stopwords'] = trumpData.text_lemmatized.apply(lambda x: [item for item in x if item not in stop])
bothData['without_stopwords'] = bothData.text_lemmatized.apply(lambda x: [item for item in x if item not in stop])

<h3>Join tokens back together</h3>

In [13]:
clintonData['processed'] = clintonData.without_stopwords.apply(lambda x: ' '.join(word for word in x))
trumpData['processed'] = trumpData.without_stopwords.apply(lambda x: ' '.join(word for word in x))
bothData['processed'] = bothData.without_stopwords.apply(lambda x: ' '.join(word for word in x))

<h3>Processed Data</h3>

In [17]:
print(bothData.loc[0:19,['text','processed']])

                                                 text  \
0   RT @NahBabyNah: Twitchy: Chuck Todd caught out...   
1   RT @mcicero10: #BernieSanders #Trump people sh...   
2   RT @ItsJustJaynie: @HillaryClinton The undecid...   
3   @NickTomaWBRE Hi, Nick! We're holding a "Miner...   
4   RT @HillaryClinton: This one's for you, Hillar...   
5   RT @leonpui_: Hillary Clinton, Obama and the D...   
6   #TrumpBecause #DonaldTrump will not be bought!...   
7   RT @PrisonPlanet: Hillary's anti-Trump poster ...   
8   RT @TrumpSuperPAC: #AfricanAmericans like @Jer...   
9   RT @American_Woman4: #MAGA,#FEMININEAMERICA4TR...   
10  RT @Conservatexian: News post: "TWITTER Buries...   
11  RT @1_Hoof_Hearted: @TuckerCarlson\r\n@JRubinB...   
12  RT @babysgramma: @upayr Obama rules by exec or...   
13  Trump appears to encourage gun owners to take ...   
14  Obama on Trump winning: 'Anything's possible' ...   
15  RT @stormynights10: #TrumpsFavoriteHeadline Tr...   
16  RT @Grummz: CNN: "we got pl

<h3>Perform sentiment analysis on data after preprocessing</h3>

In [18]:
#add polarity and subjectivity columns to dataframes
clintonData['polarity_2'] = clintonData['processed'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
clintonData['subjectivity_2'] = clintonData['processed'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)
trumpData['polarity_2'] = trumpData['processed'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
trumpData['subjectivity_2'] = trumpData['processed'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)
bothData['polarity_2'] = bothData['processed'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
bothData['subjectivity_2'] = bothData['processed'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)


df_C2 = clintonData.groupby('created_str').mean()
df_T2 = trumpData.groupby('created_str').mean()
df_B2 = bothData.groupby('created_str').mean()


In [19]:
#get dates back into column instead of being an index
df_C2.to_csv('csv/clinton_2.csv', sep=',')
df_C2 = pd.read_csv('csv/clinton_2.csv')

df_T2.to_csv('csv/trump_2.csv', sep=',')
df_T2 = pd.read_csv('csv/trump_2.csv')

df_B2.to_csv('csv/both_2.csv', sep=',')
df_B2 = pd.read_csv('csv/both_2.csv')

<h3>Generate graphs with Pygal after preprocessing</h3>

In [20]:
custom_style = Style(colors=('red','blue','purple'))
date_chart = pygal.Line(x_label_rotation=60, fill=False, style=custom_style)
date_chart.title = 'Polarity Over Time'
date_chart.x_labels = df_C2['created_str']
date_chart.add("trump", df_T2['polarity_2'])
date_chart.add("Clinton", df_C2['polarity_2'])
date_chart.add("Both", df_B2['polarity_2'])
date_chart.render_to_file('images/polarity_date_processed.svg')

date_chart = pygal.Line(x_label_rotation=60, fill=False, style=custom_style)
date_chart.title = 'Subjectivity Over Time'
date_chart.x_labels = df_C2['created_str']
date_chart.add("trump", df_T2['subjectivity_2'])
date_chart.add("Clinton", df_C2['subjectivity_2'])
date_chart.add("Both", df_B2['subjectivity_2'])
date_chart.render_to_file('images/subjectivity_date_processed.svg')