# Libraries

In [3]:
import json
import requests
import pandas as pd
from pandas.io.json import json_normalize

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from collections import Counter

In [2]:
from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

# Import Data

In [4]:
data = pd.read_csv('data/data_full_sentiment.csv')

In [103]:
data = data.dropna()
data

Unnamed: 0,Candidate,Date,Headline,Site,Headline_Cleaned,Headline_Keywords,Headline_Stopwords,Headline_Tokens,Sentiment_Values,Sentiment,Headline_polarity
0,trump,2016-10-01,Trump and the Intellectuals,New York Times,Trump and the Intellectuals,"Trump, Intellectuals","and, the","Trump, and, the, Intellectuals","[{'neg': 0.0, 'neu': 0.536, 'pos': 0.464, 'com...",neu,0.3818
1,trump,2016-10-01,The Other Trump,New York Times,The Other Trump,Trump,"The, Other","The, Other, Trump","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...",neu,0.0000
2,trump,2016-10-01,Trump Infrastructure Plan’s Fatal Flaw,New York Times,Trump Infrastructure Plan’s Fatal Flaw,"Trump, Infrastructure, Plan, Fatal, Flaw",’s,"Trump, Infrastructure, Plan, ’s, Fatal, Flaw","[{'neg': 0.467, 'neu': 0.533, 'pos': 0.0, 'com...",neu,-0.5423
3,trump,2016-10-01,Donald Trump’s Pathetic Fraternity,New York Times,Donald Trump’s Pathetic Fraternity,"Donald, Trump, Pathetic, Fraternity",’s,"Donald, Trump, ’s, Pathetic, Fraternity","[{'neg': 0.552, 'neu': 0.448, 'pos': 0.0, 'com...",neg,-0.5719
4,trump,2016-10-01,How Could Anyone Vote for Trump?,New York Times,How Could Anyone Vote for Trump?,"Vote, Trump, ?","How, Could, Anyone, for","How, Could, Anyone, Vote, for, Trump, ?","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...",neu,0.0000
5,trump,2016-10-01,Donald Trump Opens New Line of Attack on Hilla...,New York Times,Donald Trump Opens New Line of Attack on Hilla...,"Donald, Trump, Opens, New, Line, Attack, Hilla...","of, on, Her","Donald, Trump, Opens, New, Line, of, Attack, o...","[{'neg': 0.22, 'neu': 0.78, 'pos': 0.0, 'compo...",neu,-0.4767
6,trump,2016-10-01,Report That Donald Trump Did Business in Cuba ...,New York Times,Report That Donald Trump Did Business in Cuba ...,"Report, Donald, Trump, Business, Cuba, Ups, An...","That, Did, in, the, in","Report, That, Donald, Trump, Did, Business, in...","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...",neu,0.0000
7,trump,2016-10-01,"After Trump-Clinton, Vice-Presidential Debate ...",New York Times,"After Trump-Clinton, Vice-Presidential Debate ...","Trump, -, Clinton, ,, Vice, -, Presidential, D...","After, Is, n’t, the, of","After, Trump, -, Clinton, ,, Vice, -, Presiden...","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...",neu,0.0000
8,trump,2016-10-01,Donald Trump Is Seen as Helping Push Asian-Ame...,New York Times,Donald Trump Is Seen as Helping Push Asian-Ame...,"Donald, Trump, Seen, Helping, Push, Asian, -, ...","Is, as, Into","Donald, Trump, Is, Seen, as, Helping, Push, As...","[{'neg': 0.0, 'neu': 0.82, 'pos': 0.18, 'compo...",neu,0.2960
9,trump,2016-10-01,Girl Talk at Trump Tower,New York Times,Girl Talk at Trump Tower,"Girl, Talk, Trump, Tower",at,"Girl, Talk, at, Trump, Tower","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...",neu,0.0000


# General Frequency

In [6]:
def wordextractdf(df, col):
    """
    Extract words in the dataframe

    Inputs:
    df -- dataframe
    col -- set of words that we can extract ('keywords', 'hashtags',...)

    Outputs:
    wordsfreq - Counter containing a list of words and its frequency in the dataframe
    """
    words = df[col][df[col].isnull() == False]
    lst = [w for word in words for w in word.split(",")]
    return Counter(lst)

def calc_freq(df, col):
    kw = wordextractdf(df, col)
    dictlist = [[key, value] for key, value in kw.items()]
    freq_df = pd.DataFrame()
    freq_df['words'] = [i[0] for i in dictlist]
    freq_df['freq_total'] = [i[1] for i in dictlist]
    return freq_df

sorted(wordextractdf(data, 'Headline_Keywords').most_common())
freq = calc_freq(data, 'Headline_Keywords')

In [67]:
freq['words'] = freq['words'].apply(lambda x: x.strip())

freq.drop(freq.index[freq['words'].isin([' ', ':','?','-',"'",'’','‘','(',')'])], inplace = True)
freq_clean= freq.pivot_table(index='words', values ='freq_total', aggfunc = 'sum').sort_values(by=['freq_total'], ascending=False)
freq_clean = freq_clean.reset_index()
# freq_clean.drop([1], inplace = True)
freq_clean.head()

Unnamed: 0,words,freq_total
0,Trump,962
1,,927
2,Clinton,467
3,Donald,350
4,Hillary,164


In [69]:
#freq_clean.to_csv('data/frequency_full.csv', index=False)

# Frequency before the election

In [78]:
data_prev = data[(data['Date'] < '2016-11-08')]

In [79]:
sorted(wordextractdf(data_prev, 'Headline_Keywords').most_common())
freq = calc_freq(data_prev, 'Headline_Keywords')

In [80]:
freq['words'] = freq['words'].apply(lambda x: x.strip())

freq.drop(freq.index[freq['words'].isin([' ', ':','?','-',"'",'’','‘','(',')'])], inplace = True)
freq_clean= freq.pivot_table(index='words', values ='freq_total', aggfunc = 'sum').sort_values(by=['freq_total'], ascending=False)
freq_clean = freq_clean.reset_index()
freq_clean.drop([1], inplace = True)
freq_clean.head(20)

Unnamed: 0,words,freq_total
0,Trump,616
2,Clinton,383
3,Donald,256
4,Hillary,133
5,Debate,44
6,Election,39
7,New,35
8,Presidential,33
9,WikiLeaks,32
10,Says,31


In [81]:
#freq_clean.to_csv('data/frequency_before_election.csv', index=False)

# Frequency per Day

In [104]:
dates = data['Date'].tolist()
dates = set(dates)

In [122]:
#freq_days = pd.DataFrame(columns=['Date', 'KeyWords'])
freq_days = []

for i in dates:
    data_day = data[(data['Date'] == i)]
    sorted(wordextractdf(data_day, 'Headline_Keywords').most_common())
    freq = calc_freq(data_day, 'Headline_Keywords')
    freq['words'] = freq['words'].apply(lambda x: x.strip())
    freq['words'] = freq['words'].apply(lambda x: x.lower())
    freq.drop(freq.index[freq['words'].isin(['',' ', ':','?','-',"'",'’','‘','(',')','trump','donald','clinton','hillary','white','house','america','election'])], inplace = True)
    #print(freq.columns)
    freq_clean= freq.pivot_table(index='words', values ='freq_total', aggfunc = 'sum').sort_values(by=['freq_total'], ascending=False)
    freq_clean = freq_clean.reset_index()
    freq_list = freq_clean['words'].tolist()[:5]
   # print(freq_list)
    freq_days_dict = {'Date': i, 'KeyWords': freq_list }
    freq_days.append(freq_days_dict)

In [126]:
df_freq_days = pd.DataFrame(freq_days)
df_freq_days = df_freq_days.sort_values(by='Date')
df_freq_days.head()

Unnamed: 0,Date,KeyWords
41,2016-10-01,"[debate, talk, presidential, plan, new]"
22,2016-10-02,"[tax, leaked, return, bill, past]"
46,2016-10-03,"[tax, taxes, plan, james, returns]"
5,2016-10-04,"[says, debate, presidential, pence, tax]"
34,2016-10-05,"[pence, debate, mike, kaine, g.o.p]"


In [127]:
df_freq_days.to_csv('data/frequency_days.csv', index=False)