### Merge approval rating data into the tidy data CSV. Approval data start on 2009-01-21.

<A HREF="https://news.gallup.com/poll/116479/barack-obama-presidential-job-approval.aspx">Gallup Approval Numbers</A><BR>

In [69]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import warnings
import nltk
import en_core_web_md
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict
from textblob import TextBlob
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize

In [41]:
# Load the approval table, parse both date columns, and order the rows
# chronologically by 'beg' with a fresh 0..n-1 index.
approval = (
    pd.read_csv('./Data/genData/approvalData.csv')
    .assign(
        beg=lambda df: pd.to_datetime(df['beg'], format='%Y-%m-%d'),
        end=lambda df: pd.to_datetime(df['end'], format='%Y-%m-%d'),
    )
    .sort_values(by='beg', ignore_index=True)
)
approval.shape

(418, 5)

In [6]:
# Load the tidy feature table and parse its 'date' column (YYYY-MM-DD) to datetime
tidy_data = pd.read_csv('./Data/genData/tidy_data.csv').assign(
    date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d')
)
tidy_data.head(2)

Unnamed: 0,date,source,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,sadness,surprise,trust,num_sents,num_words,num_unique_words,depth,TBsubjectivity,TBpolarity,words_per_sentence
0,2008-06-04,nyt,0.064458,0.065088,0.0624,0.064962,0.064416,0.063408,0.055598,0.06492,...,0.07563,0.046218,0.130252,47,1459,620,7.276596,0.406976,0.13588,31.042553
1,2008-06-04,oba,0.064649,0.06467,0.06444,0.064398,0.064471,0.064638,0.054967,0.064503,...,0.042017,0.048739,0.159664,217,5856,939,5.986175,0.445383,0.167408,26.986175


In [48]:
# Daily calendar covering the whole approval data range
idx = pd.date_range(start='2009-01-21', end='2017-01-19')
# Index the approval rows by their 'beg' date, expand to one row per calendar
# day, and carry the most recent values forward into the days with no row
app = approval.set_index('beg').reindex(idx).ffill()
app.head(10)

Unnamed: 0,end,approval,disapproval,no opinon
2009-01-21,2009-01-25,67.0,13.0,22.0
2009-01-22,2009-01-25,67.0,13.0,22.0
2009-01-23,2009-01-25,67.0,13.0,22.0
2009-01-24,2009-01-25,67.0,13.0,22.0
2009-01-25,2009-01-25,67.0,13.0,22.0
2009-01-26,2009-02-01,66.0,18.0,17.0
2009-01-27,2009-02-01,66.0,18.0,17.0
2009-01-28,2009-02-01,66.0,18.0,17.0
2009-01-29,2009-02-01,66.0,18.0,17.0
2009-01-30,2009-02-01,66.0,18.0,17.0


In [50]:
# Expose the daily index as a regular 'date' column so it can be used as a
# merge key, and drop the 'end' column.
# assign() returns a new frame and errors='ignore' tolerates an already-dropped
# column, so re-running this cell does not raise KeyError.
app = app.assign(date=app.index).drop(columns='end', errors='ignore')
app

Unnamed: 0,approval,disapproval,no opinon,date
2009-01-21,67.0,13.0,22.0,2009-01-21
2009-01-22,67.0,13.0,22.0,2009-01-22
2009-01-23,67.0,13.0,22.0,2009-01-23
2009-01-24,67.0,13.0,22.0,2009-01-24
2009-01-25,67.0,13.0,22.0,2009-01-25
...,...,...,...,...
2017-01-15,57.0,39.0,4.0,2017-01-15
2017-01-16,59.0,37.0,4.0,2017-01-16
2017-01-17,59.0,37.0,4.0,2017-01-17
2017-01-18,59.0,37.0,4.0,2017-01-18


In [53]:
# Attach the daily approval figures to every tidy_data row by calendar date.
# validate='many_to_one' makes pandas raise MergeError if `app` ever holds a
# repeated date, which would otherwise silently duplicate tidy_data rows.
new = pd.merge(tidy_data, app, on='date', how='left', validate='many_to_one')
# Drop only the rows that are missing approval data (dates before 2009-01-21,
# when approval data starts). A bare dropna() would also discard any row that
# has a NaN in an unrelated feature column.
new = new.dropna(subset=app.columns.drop('date'))
new

Unnamed: 0,date,source,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,num_sents,num_words,num_unique_words,depth,TBsubjectivity,TBpolarity,words_per_sentence,approval,disapproval,no opinon
12,2009-04-02,nyt,0.069702,0.070678,0.070569,0.069539,0.066883,0.070136,0.021192,0.070732,...,46,1256,561,6.260870,0.357102,0.067402,27.304348,62.0,28.0,10.0
13,2009-04-02,oba,0.069923,0.070734,0.070779,0.070644,0.070284,0.070509,0.000000,0.070554,...,76,1491,642,5.328947,0.422101,0.098396,19.618421,62.0,28.0,10.0
14,2009-04-02,wsj,0.071057,0.071814,0.067367,0.066373,0.071672,0.071719,0.000000,0.071530,...,62,1420,632,5.951613,0.296947,0.057422,22.903226,62.0,28.0,10.0
15,2009-04-15,nyt,0.069284,0.067063,0.070365,0.070545,0.068624,0.069224,0.000000,0.070605,...,39,1128,538,6.846154,0.404185,0.087420,28.923077,62.0,29.0,9.0
16,2009-04-15,oba,0.064179,0.064196,0.063894,0.064032,0.063980,0.064170,0.053538,0.064075,...,283,7027,1735,5.908127,0.430164,0.090057,24.830389,62.0,29.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2016-11-09,oba,0.069479,0.069178,0.069730,0.069078,0.068376,0.069429,0.042841,0.069329,...,79,1303,470,4.392405,0.490976,0.237315,16.493671,57.0,41.0,3.0
296,2016-11-09,wsj,0.069166,0.068667,0.068939,0.068576,0.068078,0.069120,0.000000,0.069211,...,61,1446,623,6.098361,0.397283,0.114535,23.704918,57.0,41.0,3.0
297,2017-01-11,nyt,0.068761,0.069864,0.067920,0.065872,0.069811,0.069916,0.041708,0.070074,...,53,1275,558,5.679245,0.454084,0.178910,24.056604,57.0,39.0,4.0
298,2017-01-11,oba,0.066897,0.066921,0.066529,0.066823,0.066811,0.066885,0.066676,0.066493,...,231,5143,1592,4.900433,0.500204,0.182039,22.264069,57.0,39.0,4.0


In [54]:
# Save data
# The line below would write the merged frame `new` to CSV without the index.
# NOTE(review): the write is commented out, presumably to avoid overwriting an
# existing file on re-run — confirm, then either re-enable it or delete this cell.
# new.to_csv('./Data/genData/tidy_data_approval.csv', index=False)

In [60]:
# Correlation of each numeric column with approval: apart from disapproval,
# nothing correlates with approval to any meaningful degree
(
    new
    .corr(numeric_only=True)
    .loc[:, 'approval']
)

ADJ                   0.012558
ADP                   0.041829
ADV                  -0.040080
AUX                  -0.032651
CCONJ                 0.055880
DET                   0.042971
INTJ                 -0.058567
NOUN                  0.044863
NUM                   0.009160
PART                 -0.004145
PRON                  0.018160
PROPN                -0.058026
PUNCT                 0.043187
SCONJ                -0.095271
SYM                   0.059188
VERB                  0.039351
PCA1                  0.001198
PCA2                  0.097942
anger                 0.029574
anticipation         -0.040887
disgust               0.063164
fear                 -0.022048
joy                  -0.026764
negative              0.001365
positive             -0.023861
sadness               0.042273
surprise              0.105901
trust                -0.006237
num_sents             0.051250
num_words             0.034043
num_unique_words      0.006119
depth                -0.096575
TBsubjec

<A HREF="https://plotly.com/python/plotly-express/">Plotly Express</A>

In [68]:
# "Do you approve or disapprove of the way Barack Obama is handling his job as president?"
# Line chart of the approval column, plotted against each row's 'beg' date.
fig = px.line(approval,
             x="beg",
             y="approval",
             hover_name="approval",
             hover_data={'approval':':.3f'},
             title = 'Percent of polled Americans who approve of Barack Obama')
# Replace the raw column names ('beg', 'approval') on the axes with readable
# titles, matching the combined approval/disapproval figure.
fig.update_layout(xaxis_title='Date',
                  yaxis_title='Percent')
fig.show()

In [73]:
# "Do you approve or disapprove of the way Barack Obama is handling his job as president?"
# Overlay approval and disapproval as two line traces on one shared date axis
fig = go.Figure()
for column in ('approval', 'disapproval'):
    fig.add_trace(
        go.Scatter(
            x=approval['beg'],
            y=approval[column],
            mode='lines',
            name=column,
        )
    )
fig.update_layout(
    title='Gallup Poll Obama Presidential Approval Rating',
    xaxis_title='Date',
    yaxis_title='Percent',
)
fig.show()