## Senate NLP Project - Sentiment Analysis
#### By: Mitch Brinkman

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import patsy
import re
import pickle
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer 
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.probability import FreqDist, ConditionalFreqDist
import nltk
nltk.download('punkt')
nltk.download('wordnet')
%matplotlib inline
pd.set_option('display.max_rows', 500)
from sklearn.feature_extraction import text 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#### Function Imports

In [1]:
from senate_func import full_vader_speech
from senate_func import build_sentiment_tables
from senate_func import drop_columns

#### DF Types & Names:
    Topic
        Education: edu_df
        Healthcare: hc_df
        Banking: fin_df
    Era
        1980-1988: reagan_df
        1989-1992: bush_df
        1993-2000: clinton_df
        2001-2008: w_bush_df
        2009-2016: obama_df
    Gender
        female_df
        male_df
        edu_female_df
        hc_female_df
        fin_female_df
        edu_male_df
        hc_male_df
        fin_male_df 
    Party
        edu_dem_df
        hc_dem_df
        fin_dem_df
        edu_rep_df
        hc_rep_df
        fin_rep_df
    

### Sentiment Tables & Pickles
    build_sentiment_tables(kind, name) reads the pickled DataFrame stored at ./data/pickles/<kind>/<name>.pkl and applies the full_vader_speech function to every speech, adding a sentiment_cmpd column that holds the average compound VADER sentiment score of that speech. The two arguments select which pickle to load from this directory structure.

In [None]:
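## Uncomment (if needed) to build the topic and era sentiment tables.
## Note: later cells read edu_df, hc_df and fin_df, so those three must be built (or otherwise loaded) first.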

# edu_df = build_sentiment_tables('topic','edu_df')
# hc_df = build_sentiment_tables('topic','hc_df')
# fin_df = build_sentiment_tables('topic','fin_df')
# reagan_df = build_sentiment_tables('era','reagan_df')
# bush_df = build_sentiment_tables('era','bush_df')
# clinton_df = build_sentiment_tables('era','clinton_df')
# w_bush_df = build_sentiment_tables('era','w_bush_df')
# obama_df = build_sentiment_tables('era','obama_df')

In [None]:
## Build the gender (overall and per-topic) and per-topic party sentiment tables.
## Each call loads one pickle by kind/name; comment this cell out if the tables already exist.

_gender_table_names = (
    'female_df', 'male_df',
    'edu_male_df', 'edu_female_df',
    'hc_female_df', 'fin_female_df',
    'hc_male_df', 'fin_male_df',
)
_party_table_names = (
    'edu_dem_df', 'edu_rep_df',
    'hc_dem_df', 'fin_dem_df',
    'hc_rep_df', 'fin_rep_df',
)

# Unpacking targets are listed in the same order as the name tuples above.
(female_df, male_df,
 edu_male_df, edu_female_df,
 hc_female_df, fin_female_df,
 hc_male_df, fin_male_df) = [
    build_sentiment_tables('gender', table_name)
    for table_name in _gender_table_names
]

(edu_dem_df, edu_rep_df,
 hc_dem_df, fin_dem_df,
 hc_rep_df, fin_rep_df) = [
    build_sentiment_tables('party', table_name)
    for table_name in _party_table_names
]

In [None]:
# Persist the scored female_df table. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE(review): female_df is loaded from the 'gender' pickle directory but is
# written under 'party' here - confirm this destination is intended.
with open("./data/pickles/party/female_df.pkl", "wb") as pkl_file:
    pickle.dump(female_df, pkl_file)

### Vader Analysis

#### Sentiment Capture & Indexing

In [None]:
# Add a calendar-year column to each topic table (in place), then average the
# per-speech compound score within each year.
_topic_frames = (edu_df, hc_df, fin_df)
for _frame in _topic_frames:
    _frame['year'] = pd.DatetimeIndex(_frame['date']).year

# sort=False keeps the years in the order they first appear in each table.
edu_df_sent, hc_df_sent, fin_df_sent = [
    _frame.groupby(['year'], as_index=False, sort=False)['sentiment_cmpd'].mean()
    for _frame in _topic_frames
]

##### Democrats 

In [None]:
# Add a calendar-year column to each Democrat table (in place), then average
# the per-speech compound score within each year.
_dem_frames = (edu_dem_df, hc_dem_df, fin_dem_df)
for _frame in _dem_frames:
    _frame['year'] = pd.DatetimeIndex(_frame['date']).year

# sort=False keeps the years in the order they first appear in each table.
edu_dem_df_sent, hc_dem_df_sent, fin_dem_df_sent = [
    _frame.groupby(['year'], as_index=False, sort=False)['sentiment_cmpd'].mean()
    for _frame in _dem_frames
]

##### Republicans 

In [None]:
# Add a calendar-year column to each Republican table (in place), then average
# the per-speech compound score within each year.
_rep_frames = (edu_rep_df, hc_rep_df, fin_rep_df)
for _frame in _rep_frames:
    _frame['year'] = pd.DatetimeIndex(_frame['date']).year

# sort=False keeps the years in the order they first appear in each table.
edu_rep_df_sent, hc_rep_df_sent, fin_rep_df_sent = [
    _frame.groupby(['year'], as_index=False, sort=False)['sentiment_cmpd'].mean()
    for _frame in _rep_frames
]

##### Males

In [None]:
# Add a calendar-year column to each male-speaker table (in place), then
# average the per-speech compound score within each year.
_male_frames = (edu_male_df, hc_male_df, fin_male_df)
for _frame in _male_frames:
    _frame['year'] = pd.DatetimeIndex(_frame['date']).year

# sort=False keeps the years in the order they first appear in each table.
edu_male_df_sent, hc_male_df_sent, fin_male_df_sent = [
    _frame.groupby(['year'], as_index=False, sort=False)['sentiment_cmpd'].mean()
    for _frame in _male_frames
]

##### Females

In [None]:
# Add a calendar-year column to each female-speaker table (in place), then
# average the per-speech compound score within each year.
_female_frames = (edu_female_df, hc_female_df, fin_female_df)
for _frame in _female_frames:
    _frame['year'] = pd.DatetimeIndex(_frame['date']).year

# sort=False keeps the years in the order they first appear in each table.
edu_female_df_sent, hc_female_df_sent, fin_female_df_sent = [
    _frame.groupby(['year'], as_index=False, sort=False)['sentiment_cmpd'].mean()
    for _frame in _female_frames
]

#### Edu/HC/Fin OVERALL

In [None]:
# Plot the yearly mean compound sentiment for Healthcare, Education and
# Banking on one figure, with a reference line at the 0.05 threshold.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw every series exactly once. Repeating these calls per row of hc_df would
# only stack identical lines (and duplicate legend entries) on the same axes.
plt.plot(hc_df_sent.year, hc_df_sent.sentiment_cmpd,
         color='red', marker="o", label='Healthcare')
plt.plot(edu_df_sent.year, edu_df_sent.sentiment_cmpd,
         color='blue', marker='>', ls='--', label='Education')
plt.plot(fin_df_sent.year, fin_df_sent.sentiment_cmpd,
         color='green', marker='P', ls='-', label='Banking')

# Flat reference line at 0.05 spanning 1980-2016; the y-array is sized from
# the year range so the two can never fall out of step.
threshold_years = np.arange(1980, 2017)
plt.plot(threshold_years, np.full(threshold_years.shape, .05),
         color='yellow', ls='--', label='positive sentiment threshold', lw=5)

plt.title('Education, Healthcare & Banking: \n Yearly Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Positive --->',**gfont, fontsize=40)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
# Optional toggles: legend (one entry per series) and saving to disk.
# plt.legend(loc=0,numpoints=1)
# plt.savefig('ehb_overall.png');
plt.show();

#### Edu/HC/Fin - DEMOCRATS

In [None]:
# Plot the yearly mean compound sentiment of Democrats for Education,
# Healthcare and Banking on one figure.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw every series exactly once. Repeating these calls per row of edu_dem_df
# would only stack identical lines (and duplicate legend entries).
plt.plot(edu_dem_df_sent.year, edu_dem_df_sent.sentiment_cmpd,
         color='blue', marker=">", label='Education')
plt.plot(hc_dem_df_sent.year, hc_dem_df_sent.sentiment_cmpd,
         color='red', marker='P', ls='--', label='Healthcare')
plt.plot(fin_dem_df_sent.year, fin_dem_df_sent.sentiment_cmpd,
         color='green', marker='o', ls='-', label='Banking')

plt.title('Education, Healthcare & Banking: \n Democrat Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Compound Sentiment Score (.05 and above is positive)',**gfont, fontsize=25)
# Optional toggles: legend (one entry per series) and saving to disk.
# plt.legend(loc=0,numpoints=1)
# plt.savefig('ehb_dems.png');
plt.show();

#### Edu/HC/Fin - REPUBLICANS

In [None]:
# Plot the yearly mean compound sentiment of Republicans for Education,
# Healthcare and Banking on one figure.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw every series exactly once. Repeating these calls per row of edu_rep_df
# would only stack identical lines (and duplicate legend entries).
plt.plot(edu_rep_df_sent.year, edu_rep_df_sent.sentiment_cmpd,
         color='cornflowerblue', marker=">", label='Education')
plt.plot(hc_rep_df_sent.year, hc_rep_df_sent.sentiment_cmpd,
         color='red', marker='P', ls='--', label='Healthcare')
plt.plot(fin_rep_df_sent.year, fin_rep_df_sent.sentiment_cmpd,
         color='green', marker='o', ls='-', label='Banking')

plt.title('Education, Healthcare & Banking: \n Republican Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Compound Sentiment Score (.05 and above is positive)',**gfont, fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
# Optional toggles: legend (one entry per series) and saving to disk.
# plt.legend(loc=0,numpoints=1)
# plt.savefig('ehb_reps.png');
plt.show();

#### Education - DEMS/REPS

In [None]:
# Compare Republican and Democrat yearly mean compound sentiment on Education
# and save the figure to edu_dem_rep.png.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw each series exactly once. Repeating these calls per row of edu_rep_df
# would only stack identical lines (and duplicate legend entries).
plt.plot(edu_rep_df_sent.year, edu_rep_df_sent.sentiment_cmpd,
         color='red', marker=">", label='Republicans')
plt.plot(edu_dem_df_sent.year, edu_dem_df_sent.sentiment_cmpd,
         color='cornflowerblue', marker='P', ls='--', label='Democrats')

# Fixed y-range for this figure.
plt.ylim(-.02, .20)

plt.title('Education: \n Dem vs. Rep Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Compound Sentiment Score (.05 and above is positive)',**gfont, fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
# Optional toggle: legend (one entry per series).
# plt.legend(loc=0,numpoints=1)
plt.savefig('edu_dem_rep.png');
plt.show();

#### Healthcare - DEMS/REPS

In [None]:
# Compare Republican and Democrat yearly mean compound sentiment on Healthcare
# and save the figure to hc_dem_rep.png.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw each series exactly once. Repeating these calls per row of hc_rep_df
# would only stack identical lines (and duplicate legend entries).
plt.plot(hc_rep_df_sent.year, hc_rep_df_sent.sentiment_cmpd,
         color='red', marker=">", label='Republicans')
plt.plot(hc_dem_df_sent.year, hc_dem_df_sent.sentiment_cmpd,
         color='cornflowerblue', marker='P', ls='--', label='Democrats')

# Fixed y-range for this figure.
plt.ylim(-.02, .20)

plt.title('Healthcare: \n Dem vs. Rep Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Compound Sentiment Score (.05 and above is positive)',**gfont, fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
# Optional toggle: legend (one entry per series).
# plt.legend(loc=0,numpoints=1)
plt.savefig('hc_dem_rep.png');
plt.show();

#### Banking - DEMS/REPS

In [None]:
# Compare Republican and Democrat yearly mean compound sentiment on Banking
# and save the figure to fin_dem_rep.png.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw each series exactly once. Repeating these calls per row of fin_rep_df
# would only stack identical lines (and duplicate legend entries).
plt.plot(fin_rep_df_sent.year, fin_rep_df_sent.sentiment_cmpd,
         color='red', marker=">", label='Republicans')
plt.plot(fin_dem_df_sent.year, fin_dem_df_sent.sentiment_cmpd,
         color='cornflowerblue', marker='P', ls='--', label='Democrats')

# Fixed y-range for this figure (lower bound is wider than the other topics).
plt.ylim(-.075, .20)

plt.title('Banking: \n Dem vs. Rep Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Compound Sentiment Score (.05 and above is positive)',**gfont, fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
# Optional toggle: legend (one entry per series).
# plt.legend(loc=0,numpoints=1)
plt.savefig('fin_dem_rep.png');
plt.show();

#### Education - Male/Female

In [None]:
# Compare male and female yearly mean compound sentiment on Education and
# save the figure to edu_m_f.png.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw each series exactly once. Repeating these calls per row of edu_male_df
# would only stack identical lines (and duplicate legend entries).
plt.plot(edu_male_df_sent.year, edu_male_df_sent.sentiment_cmpd,
         color='red', marker=">", label='male')
plt.plot(edu_female_df_sent.year, edu_female_df_sent.sentiment_cmpd,
         color='cornflowerblue', marker='P', ls='--', label='female')

# Fixed y-range for this figure.
plt.ylim(-.02, .20)

plt.title('Education: \n Female vs. Male Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Compound Sentiment Score (.05 and above is positive)',**gfont, fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
# Optional toggle: legend (one entry per series).
# plt.legend(loc=0,numpoints=1)
plt.savefig('edu_m_f.png');
plt.show();

#### Healthcare - Male/Female

In [None]:
# Compare male and female yearly mean compound sentiment on Healthcare and
# save the figure to hc_m_f.png.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw each series exactly once. Repeating these calls per row of hc_male_df
# would only stack identical lines (and duplicate legend entries).
plt.plot(hc_male_df_sent.year, hc_male_df_sent.sentiment_cmpd,
         color='red', marker=">", label='male')
plt.plot(hc_female_df_sent.year, hc_female_df_sent.sentiment_cmpd,
         color='cornflowerblue', marker='P', ls='--', label='female')

# Fixed y-range for this figure (upper bound is higher than the other topics).
plt.ylim(-.02, .30)

plt.title('Healthcare: \n Female vs. Male Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Compound Sentiment Score (.05 and above is positive)',**gfont, fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
# Optional toggle: legend (one entry per series).
# plt.legend(loc=0,numpoints=1)
plt.savefig('hc_m_f.png');
plt.show();

#### Banking - Male/Female

In [None]:
# Compare male and female yearly mean compound sentiment on Banking and save
# the figure to fin_m_f.png.
import matplotlib.pyplot as plt
sfont = {'fontname':'Spectral Bold'}
gfont = {'fontname':'Georgia'}
ifont = {'fontname':'Impact'}

plt.rcParams['figure.figsize'] = [18, 14]
sns.set(font_scale = 1.4)

# Draw each series exactly once. Repeating these calls per row of fin_male_df
# would only stack identical lines (and duplicate legend entries).
plt.plot(fin_male_df_sent.year, fin_male_df_sent.sentiment_cmpd,
         color='red', marker=">", label='male')
plt.plot(fin_female_df_sent.year, fin_female_df_sent.sentiment_cmpd,
         color='cornflowerblue', marker='P', ls='--', label='female')

# Fixed y-range for this figure (lower bound is wider than the other topics).
plt.ylim(-.075, .20)

plt.title('Banking: \n Female vs. Male Sentiment Analysis',
          **gfont, fontsize=25,fontweight=40)
plt.xlabel('Year of our Lord',**gfont, fontsize=40)
plt.ylabel('Compound Sentiment Score (.05 and above is positive)',**gfont, fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
# Optional toggle: legend (one entry per series).
# plt.legend(loc=0,numpoints=1)
plt.savefig('fin_m_f.png');
plt.show();

### Code Archive - DISREGARD

In [None]:
# Stop 1 on my journey to building the automatic VADER machine full_vader_speech

# def vader_scoring (tokens):
#     agg_score = []
#     for token in tokens:
#         score = vader.polarity_scores(token)
#         agg_score.append(score)
#     return agg_score

In [None]:
# Stop 2 on my journey to building the automatic VADER machine full_vader_speech

# def speech_sentiment(vader_scores):
#     cmpd = 0
#     for i in vader_scores:
#         cmpd+=(i['compound'])
#     final_tally = cmpd / len(vader_scores)
#     print(cmpd)
#     print(len(vader_scores))
#     return final_tally
    

In [None]:
# Stop 3 on my journey to building the automatic VADER machine full_vader_speech

# def vader_scores(text):
#     token_sent = sent_tokenize().tokenize(text)
#     score = vader.polarity_scores(sentence)
#     print("{:-<40} {}".format(sentence, str(score)))

In [None]:
# def full_vader_speech (speech):
#     """
#     Full speech is passed. Tokenizes by sentence. Vader sentiment scores each sentence and
#     appends that to a list of scores. Then each compound score is added to a summed value 
#     of compound vader scores. Lastly dividing summed compound value by the length of the 
#     scores list, you find average compound sentiment score for each speech.
#     Print statements can be uncommented to provide sanity check as it did for me.
#     """
#     vader = SentimentIntensityAnalyzer()
#     tokens = sent_tokenize(speech)
#     all_scores = []
#     cmpd = 0
#     for token in tokens:
#         score = vader.polarity_scores(token)
#         all_scores.append(score)
#     for i in all_scores:
#         cmpd+=(i['compound'])
#     final_tally = cmpd / len(all_scores)
# #     print(cmpd)
# #     print(len(all_scores))
#     return final_tally

In [None]:
# def build_sentiment_tables(kind, dataframe):
#     dataframe = pd.read_pickle('./data/pickles/'+kind+'/'+dataframe+'.pkl')
#     dataframe['sentiment_cmpd']= dataframe.speech.apply(lambda x: full_vader_speech(x))
#     return dataframe
        