In [1]:
import pandas as pd
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import pygal
from pygal.style import Style
from textblob import TextBlob

nltk.download('punkt')
nltk.download('stopwords')

data = pd.read_csv("csv/tweets.csv", sep = ',')

[nltk_data] Downloading package punkt to /home/nomdebrew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nomdebrew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
columns_to_use = ['created_str','text']
data = data[columns_to_use]
data = data.dropna(subset=['text'])

#convert created_str to datetime and drop hour, min, sec
data['created_str'] = pd.to_datetime(data['created_str'])
data.created_str = data.created_str.map(lambda x: x.replace(day=1, hour=0, minute=0, second=0))


clintonData = data[data['text'].str.contains('clinton|hillary', case=False)]
trumpData = data[data['text'].str.contains('trump|donald', case=False)]
bothData = data[data['text'].str.contains('clinton|hillary|trump|donald', case=False)]

#add polarity and subjectivity columns to dataframes
clintonData['polarity'] = clintonData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
clintonData['subjectivity'] = clintonData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)
trumpData['polarity'] = trumpData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
trumpData['subjectivity'] = trumpData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)
bothData['polarity'] = bothData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
bothData['subjectivity'] = bothData['text'].apply(lambda tweet: TextBlob(tweet).sentiment.subjectivity)

df_C1 = clintonData.groupby('created_str').mean()
df_T1 = trumpData.groupby('created_str').mean()
df_B1 = bothData.groupby('created_str').mean()

print(data.shape)
print(clintonData.shape)
print(trumpData.shape)
print(bothData.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

(203430, 2)
(25555, 4)
(40403, 4)
(57521, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
#get dates back into column instead of being an index
df_C1.to_csv('csv/clinton_1.csv', sep=',')
df_C1 = pd.read_csv('csv/clinton_1.csv')

df_T1.to_csv('csv/trump_1.csv', sep=',')
df_T1 = pd.read_csv('csv/trump_1.csv')

df_B1.to_csv('csv/both_1.csv', sep=',')
df_B1 = pd.read_csv('csv/both_1.csv')

In [5]:
custom_style = Style(colors=('red','blue','purple'))
date_chart = pygal.Line(x_label_rotation=60, fill=False, style=custom_style)
date_chart.title = 'Polarity Over Time'
date_chart.x_labels = df_C1['created_str']
date_chart.add("trump", df_T1['polarity'])
date_chart.add("Clinton", df_C1['polarity'])
date_chart.add("Both", df_B1['polarity'])
date_chart.render_to_file('images/polarity_date.svg')

date_chart = pygal.Line(x_label_rotation=60, fill=False, style=custom_style)
date_chart.title = 'Subjectivity Over Time'
date_chart.x_labels = df_C1['created_str']
date_chart.add("trump", df_T1['subjectivity'])
date_chart.add("Clinton", df_C1['subjectivity'])
date_chart.add("Both", df_B1['subjectivity'])
date_chart.render_to_file('images/subjectivity_date.svg')

In [6]:
custom_style = Style(colors=('red',))
gauge = pygal.SolidGauge(inner_radius=0.70, style=custom_style, show_legend=False)
percent_formatter = lambda x: '{:.10g}%'.format(x)
gauge.title = 'trump'
gauge.value_formatter = percent_formatter
gauge.add('trump', [{'value': round(len(trumpData)/len(data)*100,1), 'max_value': 100}])
gauge.render_to_file('images/t_guage.svg')

custom_style = Style(colors=('blue',))
gauge = pygal.SolidGauge(inner_radius=0.70, style=custom_style, show_legend=False)
percent_formatter = lambda x: '{:.10g}%'.format(x)
gauge.title = 'Clinton'
gauge.value_formatter = percent_formatter
gauge.add('Clinton', [{'value': round(len(clintonData)/len(data)*100,1), 'max_value': 100}])
gauge.render_to_file('images/c_guage.svg')

custom_style = Style(colors=('purple',))
gauge = pygal.SolidGauge(inner_radius=0.70, style=custom_style, show_legend=False)
percent_formatter = lambda x: '{:.10g}%'.format(x)
gauge.title = 'Both'
gauge.value_formatter = percent_formatter
gauge.add('Both', [{'value': round(len(bothData)/len(data)*100,1), 'max_value': 100}])
gauge.render_to_file('images/b_guage.svg')

<h3>Example Tweets</h3>

In [54]:
clintonData = clintonData.sort_values(by=['polarity'])
clintonData = clintonData.reset_index(drop=True)
print(clintonData.loc[0:19,['text','polarity']])

                                                 text  polarity
0   RT @luvGodncountry: Wikileaks: Clinton Adviser...      -1.0
1   RT @CalebDHurt1: They must be truly desperate ...      -1.0
2   🚨WATCH: Bernie Sanders booed  after demanding ...      -1.0
3   RT @LindaSuhler: #VoteTrump to STOP this globa...      -1.0
4   RT @NewSonsLiberty: 23 Shocking Revelations Fr...      -1.0
5   RT @TEN_GOP: 🚨WATCH: Bernie Sanders booed  aft...      -1.0
6   RT @qariwarmi: Maybe the spirits of the fallen...      -1.0
7   RT @acidrebel: The cheating has begun. Email u...      -1.0
8   RT @groggygirl85: Dear Hillary Voters: It's No...      -1.0
9   RT @kevkid79: "Clinton Camp Claims Media Was P...      -1.0
10  RT @anne19brown: RT if you are with me with in...      -1.0
11  RT @anastasiabeave4: #IGetDepressedWhen when I...      -1.0
12  RT @mischabi: These aren't debates. They are b...      -1.0
13  RT @LindaSuhler: #VoteTrump to STOP this globa...      -1.0
14  BREAKING: Hillary Clinton Just Got T

In [55]:
trumpData = trumpData.sort_values(by=['subjectivity'])
trumpData = trumpData.reset_index(drop=True)
print(trumpData.loc[0:19,['text','subjectivity']])

                                                 text  subjectivity
0   RT @mcicero10: #BernieSanders #Trump people sh...           0.0
1   RT @ForecasterEnten: Trump 46, Cruz 31, Rubio ...           0.0
2   RT @TrivWorks: • Trump\r\n• Celebrity Deaths \...           0.0
3   RT @micafarha: @LeahR77 @Braveheart_USA @dreww...           0.0
4   RT @realDailyWire: Ep. 220 - Trump Saved Jobs!...           0.0
5   RT @Born_To_DYE: #TrumpsFavoriteHeadline Putin...           0.0
6   RT @darleneturner53: For Trump to release his ...           0.0
7   Trump Jr. likens Syrian refugees to Skittles h...           0.0
8          Even #ChildrenThinkThat Trump is immature.           0.0
9   RT @MIPooh: Trump Ag Dept @USDA Orders Hiding ...           0.0
10  The Anti-Inauguration in a nutshell: we are of...           0.0
11  RT @TeaPartyOrg: Trump Cracks the Electoral Co...           0.0
12  #IGetDepressedWhen I have to inform the custom...           0.0
13  RT @YoungDems4Trump: Show this to those who 

In [9]:

from textblob.sentiments import NaiveBayesAnalyzer
opinion = TextBlob("EliteDataScience.com is dope!", analyzer=NaiveBayesAnalyzer())
opinion.sentiment#.classification

Sentiment(classification='neg', p_pos=0.35000000000000003, p_neg=0.6499999999999997)

Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)
0.39166666666666666


In [13]:
textData = data['text']
type(textData)

pandas.core.series.Series

In [3]:
data.shape

(203451, 16)

In [None]:
textData.head

In [56]:
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

In [60]:
def preprocessTweet(text):
    text = text.lower()
    text = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+","",text)
    text = text.replace("rt","")
    #other stuff go here
    tokens = TextBlob(text).words
    tokens = [w for w in tokens if w not in stopwords]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return tokens

preprocessTweet(data.loc[8,'text'])

['nicktomawbre',
 'hi',
 'nick',
 "'re",
 'holding',
 'miner',
 'trump',
 'rally',
 'tomorrow',
 "'re",
 'interested',
 'covering',
 'ple…']

In [None]:
stop_words = nltk.corpus.stopwords.words('english')
additional_stopwords = ['RT','https',':']
stop_words.extend(additional_stopwords)

for i in range(len(textData)):
    words = nltk.word_tokenize(str(textData[i]))
    for w in words:
        filtered_sentence = [w for w in words if not w in stop_words]
        filtered_sentence = []
        if w not in stop_words:
            filtered_sentence.append(w)
            #print(filtered_sentence)

In [None]:
punctuations="?:!.,;"

for i in range(len(textData)):
    text = nltk.word_tokenize(str(textData[i]))
    for word in text:
        if word in puncuations:
            text.remove(word)
    for word in text:
        print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos="v")))