# Sentiment Analysis (Trump vs Clinton)

In [2]:
import pandas as pd
from collections import Counter
import re
import stopwords as st

### Run the script 'tweetering.py' to collect tweets and store them in a text file.

In [2]:
%run tweetering.py trump 250     #save about 250 tweets containing "trump" or "#trump"

Listening to '#trump' and 'trump' ...


In [3]:
%run tweetering.py clinton 250       #save about 250 tweets containing "clinton" or "#clinton"

Listening to '#clinton' and 'clinton' ...


### Function to clean a tweet: remove usernames, URLs, numbers, etc.

In [4]:
def processTweet(tweet):
    """Normalise a raw tweet into lower-case text ready for word counting.

    Steps, in order:
      1. If the tweet starts with "RT", drop everything up to and including
         the first ':' and the single character that follows it.
      2. Lower-case the text.
      3. Remove URLs (www.* and http(s)://*), digit runs and @mentions.
      4. Collapse whitespace runs to one space, then remove '&amp;'.
      5. Drop the leading '#' from hashtags, keeping the tag text.
      6. Strip leading/trailing single and double quote characters.

    Args:
        tweet: the raw tweet text.

    Returns:
        The cleaned tweet. Leading/trailing whitespace is NOT stripped.
    """
    if tweet.startswith("RT"):
        # find() rather than index(): a tweet that starts with "RT" but has
        # no ':' is left as-is instead of raising ValueError.
        i = tweet.find(':')
        if i != -1:
            tweet = tweet[i+2:]

    tweet = tweet.lower()
    # Raw strings keep the regex escapes (\. and \s) out of Python's own
    # string-escape processing; the patterns themselves are unchanged.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    tweet = re.sub(r'([0-9]+)', '', tweet)
    tweet = re.sub(r'@[^\s]+', '', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub(r'&amp;', '', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = tweet.strip('\'"')
    return tweet

### A list of the most frequent words that do not count towards the sentiment analysis:

In [5]:
# Extra stop words for this analysis, used in addition to the generic English
# stop words: tokens that showed up in the collected tweets but are excluded
# from the word counts. Order and duplicate entries are kept as originally
# written.
stoplist = [
    "clinton", "trump", "hillary", "donald", "chair", "rt", ":", "oh",
    "clinton.", "trump,", "team", ".", "bill", "like", "made", "now",
    "trump:", "via", "\xe2\x80\x94", "de", "\xe2\x80\x93", "republican",
    "will", "going", "-", "campaign", "election", "us", "things]",
    "president",
    "camp", "seen", "--", "well", "breaking:", "tv", "november", "media",
    "anonymous", "video", "ng", "can", "just", "girl", "calls", "ready",
    "trump's", "clinton's", "saying", "really", "girl", "calls", "former",
    "electing", "teen",
    "say", "presidential", "%", "obama", "new", "take", "talk", "vote",
    "people", "may", "watch", "test", "voters:", "must", "gop", "live",
    "breaks",
    "poll", "yay", "one", "trump’s", "voting", "nothing", "trump.", "real",
    "old", "back",
]

### Function to read the tweets from a file and convert them into a dictionary mapping each word to its frequency:

In [6]:
def countinfile(filename):
    """Count how often each non-stop word occurs in a file of tweets.

    Every line of the file is treated as one tweet: it is cleaned with
    processTweet, split on whitespace, and each resulting word that is not
    a stop word is tallied. The stop words are st.get_stopwords("en")
    combined with the notebook-level `stoplist`.

    Args:
        filename: path of the text file to read.

    Returns:
        dict mapping each counted word to its number of occurrences.
    """
    # A set gives constant-time membership tests; the combined list used to
    # be scanned once for every word of every tweet.
    stopwords = set(st.get_stopwords("en") + stoplist)
    d = {}
    with open(filename, "r") as fp:
        for line in fp:
            # split() with no arguments already ignores leading/trailing
            # whitespace, so no separate strip() is needed.
            for word in processTweet(line).split():
                if word not in stopwords:
                    d[word] = d.get(word, 0) + 1
    return d

In [7]:
# Word-frequency tables built from each candidate's saved tweets.
dict_clinton = countinfile(filename="clinton.txt")
dict_trump = countinfile(filename="trump.txt")



### Collecting the 10 most frequent words for each candidate and storing them in a dataframe:

In [8]:
# The ten most frequent words per candidate, as (word, count) pairs.
data1 = Counter(dict_trump).most_common(10)
data2 = Counter(dict_clinton).most_common(10)

# One row per word: the word itself and how often it occurred.
df_trump = pd.DataFrame(data1, columns=["Word", "Frequency"])
df_clinton = pd.DataFrame(data2, columns=["Word", "Frequency"])

# print(...) with a single argument produces the same output on Python 2
# and Python 3, whereas the bare print statement is a SyntaxError on 3.
print("Trump:")
print(df_trump)
print("\nClinton:")
print(df_clinton)

Trump:
         Word  Frequency
0       women          7
1      sexist          6
2  challenges          4
3        drug          4
4         win          3
5        help          3
6       white          3
7      women:          3
8       woman          2
9     support          2

Clinton:
            Word  Frequency
0         emails         13
1     foundation         12
2  podestaemails         10
3      wikileaks          9
4   neverhillary          6
5        cocaine          6
6     corruption          6
7         muslim          6
8          email          6
9          china          5


### Assigning positive or negative sentiments to the words:

In [9]:
# Hand-assigned polarity for each top word: 1 = positive, -1 = negative.
# The labels are keyed by word instead of by row position, so each label
# stays attached to its word even if the ranking changes between runs.
# A word that has no entry here gets NaN rather than someone else's label.
_trump_sentiment = {
    "women": -1, "sexist": -1, "challenges": 1, "drug": -1, "win": 1,
    "help": 1, "white": -1, "women:": -1, "woman": -1, "support": 1,
}
_clinton_sentiment = {
    "emails": -1, "foundation": 1, "podestaemails": -1, "wikileaks": -1,
    "neverhillary": -1, "cocaine": -1, "corruption": -1, "muslim": 1,
    "email": -1, "china": 1,
}

df_trump["Sentiment"] = df_trump["Word"].map(_trump_sentiment)
df_clinton["Sentiment"] = df_clinton["Word"].map(_clinton_sentiment)

# print(...) with a single argument works on both Python 2 and Python 3.
print("Trump:")
print(df_trump)
print("\nClinton:")
print(df_clinton)

Trump:
         Word  Frequency  Sentiment
0       women          7         -1
1      sexist          6         -1
2  challenges          4          1
3        drug          4         -1
4         win          3          1
5        help          3          1
6       white          3         -1
7      women:          3         -1
8       woman          2         -1
9     support          2          1

Clinton:
            Word  Frequency  Sentiment
0         emails         13         -1
1     foundation         12          1
2  podestaemails         10         -1
3      wikileaks          9         -1
4   neverhillary          6         -1
5        cocaine          6         -1
6     corruption          6         -1
7         muslim          6          1
8          email          6         -1
9          china          5          1


### Calculating the average sentiment value:

In [10]:
def _weighted_sentiment_sum(df):
    """Return the sum of Frequency * Sentiment over all rows of df."""
    # skipna=False keeps the behaviour of a plain running total: a NaN in
    # either column propagates to the result instead of being dropped.
    return (df["Frequency"] * df["Sentiment"]).sum(skipna=False)


# The average divides by the actual number of rows instead of a hard-coded
# 10, so it stays correct when fewer than ten words were collected.
# max(..., 1) keeps an empty table at 0.0 rather than dividing by zero.
sum1 = _weighted_sentiment_sum(df_trump)
print("Trump's average sentiment: " + str(float(sum1) / max(len(df_trump), 1)))

sum2 = _weighted_sentiment_sum(df_clinton)
print("Clinton's average sentiment: " + str(float(sum2) / max(len(df_clinton), 1)))

Trump's average sentiment: -1.3
Clinton's average sentiment: -3.3


### It seems that Donald Trump has a better chance of winning against Hillary Clinton, according to the worldwide tweets collected on Oct 14, 2016.