# Sentiment analysis per state
Combine Sentiment analysis with state data
*First start MongoDB service:*  
`mongod --config /usr/local/etc/mongod.conf`  
`brew services start mongodb-community`  

In [1]:
# Load dependencies
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
import pandas as pd

In [2]:
# Configuration
# Location MongoDB:
mongo_host = None
# Name collection
client_name = "fundamentals"
# Postive-negative sentiment file
posneg_file = '../datasets/pos_neg_sentiment_by_state.pkl'

In [3]:
# Create connection
try:
    client = MongoClient(mongo_host)
    client.admin.command('ismaster')
    db = client[client_name] 
    twitter_db = db.twitter
    
except ConnectionFailure:
    print("Connection to MongoDB server could not be established")
    exit()

In [None]:
# Test connection by counting
print("Total amount of tweets:",twitter_db.count_documents({}))

In [None]:
# Create pickle of DataFrame of all us tweets with added state locations and mentions
names = ['realDonaldTrump', 'HillaryClinton']

print("Number of US country tweets", twitter_db.count_documents(filter = 
 {"place.country_code" : "US"}))

# Set up pipeline to find tweets mentioning one or both candidates 
# and create columns with booleans for the mention of each candidate
pipeline_sen_state = [{"$match" : {"place.country_code" : "US",
                                   "place.state":{"$exists" : True},
                                   "entities.user_mentions.screen_name":{"$in": names}
                                  }},
                      {"$project": { "_id" : 1, "id": 1,"state": "$place.state",
                                    "Mentions_Trump": {"$in": [names[0],
                                                "$entities.user_mentions.screen_name"]},
                                     "Mentions_Clinton": {"$in": [names[1],
                                                "$entities.user_mentions.screen_name"]}
                                   }}]
sen_counter = twitter_db.aggregate(pipeline_sen_state)

sen_state_df = pd.DataFrame(sen_counter)
print("Number of tweets from the US mentioning one or both candidates: "+str(
    sen_state_df.shape[0]))


In [None]:
display(sen_state_df.head())

In [6]:
# Load pos-neg sentiment
pos_neg_df = pd.read_pickle(posneg_file)[["id","sentiment"]]

In [7]:
# Merge datasetpos_neg
pos_neg_state = sen_state_df.merge(pos_neg_df, on = "id", how = "inner")
print("N mentioning with sentiment: "+str(pos_neg_state.shape[0]))
pos_neg_state.head()

N mentioning with sentiment: 330512


Unnamed: 0,id,state,Mentions_Trump,Mentions_Clinton,sentiment
0,764039733076897792,Louisiana,True,False,pos
1,764039917924069376,California,True,False,neg
2,764039926161604608,New Jersey,True,False,pos
3,764039928116240384,Texas,False,True,neg
4,764039948567576576,Maryland,True,False,pos


In [8]:
# test_df.to_pickle("../state_id.pkl")

In [9]:
names = ['realDonaldTrump', 'HillaryClinton']
print("Trump: "+str(twitter_db.count_documents({"entities.user_mentions.screen_name": names[0]})))
print("Clinton: "+str(twitter_db.count_documents({"entities.user_mentions.screen_name": names[1]})))
print("Both: "+str(twitter_db.count_documents({"$and":[{"entities.user_mentions.screen_name": names[0]},
                                                       {"entities.user_mentions.screen_name": names[1]}]})))

Trump: 325834
Clinton: 140740
Both: 28253


In [10]:
# Assign mentions column to Trump, Clinton or Both
series_both = (pos_neg_state["Mentions_Trump"] & pos_neg_state["Mentions_Clinton"])
series_trump = (pos_neg_state["Mentions_Trump"] & ~pos_neg_state["Mentions_Clinton"])
series_clinton = (~pos_neg_state["Mentions_Trump"] & pos_neg_state["Mentions_Clinton"])
pos_neg_state["Mentions"] = None
print(sum(pos_neg_state["Mentions"].isna()))
pos_neg_state.loc[series_both,"Mentions"] = "both"
pos_neg_state.loc[series_trump,"Mentions"] = "trump"
pos_neg_state.loc[series_clinton,"Mentions"] = "clinton"
print(sum(pos_neg_state["Mentions"].isna()))
pos_neg_state.head()

330512
0


Unnamed: 0,id,state,Mentions_Trump,Mentions_Clinton,sentiment,Mentions
0,764039733076897792,Louisiana,True,False,pos,trump
1,764039917924069376,California,True,False,neg,trump
2,764039926161604608,New Jersey,True,False,pos,trump
3,764039928116240384,Texas,False,True,neg,clinton
4,764039948567576576,Maryland,True,False,pos,trump


In [11]:
# Check for sanity
print("Both: ","\t",sum(pos_neg_state["Mentions_Trump"] & pos_neg_state["Mentions_Clinton"]),"\n",
      "Trump: ","\t",sum(pos_neg_state["Mentions_Trump"] & ~pos_neg_state["Mentions_Clinton"]),"\n",
      "Clinton: ",sum(~pos_neg_state["Mentions_Trump"] & pos_neg_state["Mentions_Clinton"])
     , sep="")
print("Both: ","\t",sum(pos_neg_state["Mentions"] == "both"),"\n",
      "Trump: ","\t",sum(pos_neg_state["Mentions"] == "trump"),"\n",
      "Clinton: ",sum(pos_neg_state["Mentions"] == "clinton")
     , sep="")

Both: 	21404
Trump: 	222381
Clinton: 86727
Both: 	21404
Trump: 	222381
Clinton: 86727


In [13]:
pos_neg_state.pivot_table(values)

NameError: name 'values' is not defined

In [41]:
state_summary_inc_both = pos_neg_state[["id","state", "Mentions","sentiment"]].groupby(["state", "Mentions","sentiment"]).count()

In [None]:
print(state_summary.loc[(slice(None),),:].sum().values[0])
print(state_summary.loc[(slice(None),["clinton", "trump"]),:].sum().values[0])

In [44]:
state_summary = state_summary_inc_both.loc[(slice(None),["clinton", "trump"]),:]

In [45]:
state_summary.to_pickle("../datasets/state_sen_summary.pkl")

In [6]:
state_summary = pd.read_pickle("../datasets/state_sen_summary.pkl")
state_summary.unstack(level = "Mentions").unstack(level = "sentiment").columns

MultiIndex([('id', 'clinton', 'neg'),
            ('id', 'clinton', 'pos'),
            ('id',   'trump', 'neg'),
            ('id',   'trump', 'pos')],
           names=[None, 'Mentions', 'sentiment'])

In [12]:
state_summary.loc[(slice(None),["clinton", "trump"]),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id
state,Mentions,sentiment,Unnamed: 3_level_1
Alabama,clinton,neg,358
Alabama,clinton,pos,527
Alabama,trump,neg,1345
Alabama,trump,pos,2854
Alaska,clinton,neg,80
...,...,...,...
Wisconsin,trump,pos,1601
Wyoming,clinton,neg,67
Wyoming,clinton,pos,65
Wyoming,trump,neg,153


In [27]:
state_summary.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id
state,Mentions,sentiment,Unnamed: 3_level_1
Alabama,both,neg,104
Alabama,both,pos,171
Alabama,clinton,neg,358
Alabama,clinton,pos,527
Alabama,trump,neg,1345
