In [33]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go # for fancy interactive plot
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup
import os # to check if directory exists and create it if it doesn't
from datetime import datetime # to parse speech date
from nrclex import NRCLex
import spacy
import en_core_web_md
import csv
import prince # for correspondence analysis
import nltk
from scipy.stats import power_divergence # for g-test (log likelihood ratio)
from scipy.stats.contingency import association
from scipy.stats import chi2_contingency
from scipy.stats import chi2
stopwords = pd.read_table('./Data/word_lists/kaggle_stopwords.txt')

In [34]:
#cont_table = pd.read_csv('./Data/genData/ContingencyTable.csv', index_col=0)
cont_table = pd.read_csv('./Data/genData/contingency_table_noSW_oba.csv', index_col=0)
cont_table = cont_table.astype(int)
ctab_gwb = pd.read_csv('./Data/genData/contingency_table_noSW_gwb.csv', index_col=0)
ctab_gwb = ctab_gwb.astype(int)

<A HREF="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.power_divergence.html">SciPy power divergence for G-test or log-likelihood ratio</A> set lambda_='log-likelihood' for g-test

In [13]:
g_test = power_divergence(cont_table, lambda_='log-likelihood')

  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)


<A HREF="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.contingency.association.html#scipy.stats.contingency.association">SciPy association</A>

In [15]:
len(g_test[0])

7716

<A HREF="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html">SciPy contingency table Chi2 test</A>

from <A HREF="https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test">Wikipedia</A> : In cases where the expected value, E, is found to be small (indicating a small underlying population probability, and/or a small number of observations), the normal approximation of the multinomial distribution can fail, and in such cases it is found to be more appropriate to use the G-test, a likelihood ratio-based test statistic

Starting with <A HREF="https://aclanthology.org/J93-1003/">Dunning 1993</A> the g-test (likelihood ratio) became popular for computational linguistics

In [None]:
# Problems with no stopwords version of Obama contingency table ..... arghhh
res_gtest = chi2_contingency(cont_table, lambda_="log-likelihood") # g-test
print("Statistic:",res_gtest[0],"\tp-value:",res_gtest[1],"\tdf:",res_gtest[2])

In [19]:
col_sums = np.reshape(cont_table.sum(axis=0).to_numpy(), (-1,1))
row_sums = np.reshape(cont_table.sum(axis=1).to_numpy(), (-1,1))
total = row_sums.sum()

In [20]:
expected = pd.DataFrame(np.matmul(row_sums, np.transpose(col_sums))/total, columns=cont_table.columns, index=cont_table.index)

### Compare words which occur more than expected - Obama vs. Bush

In [21]:
# Obama Words occuring more often than expected
difference = cont_table.subtract(expected)
difference.mean(axis=0).sort_values(ascending=False).head(20)

know         1.494954e-16
world        1.099231e-16
serve        9.893076e-17
tonight      9.013692e-17
insurance    8.683923e-17
stand        8.354153e-17
challenge    8.134307e-17
human        8.134307e-17
year         7.914461e-17
care         7.694615e-17
trade        7.035077e-17
place        7.035077e-17
home         7.035077e-17
family       7.035077e-17
effort       6.815230e-17
like         6.595384e-17
american     6.595384e-17
high         6.265615e-17
oil          6.155692e-17
end          5.716000e-17
dtype: float64

In [22]:
col_sums_gwb = np.reshape(ctab_gwb.sum(axis=0).to_numpy(), (-1,1))
row_sums_gwb = np.reshape(ctab_gwb.sum(axis=1).to_numpy(), (-1,1))
total_gwb = row_sums_gwb.sum()
expected_gwb = pd.DataFrame(np.matmul(row_sums_gwb, np.transpose(col_sums_gwb))/total, 
                        columns=ctab_gwb.columns, index=ctab_gwb.index)
difference_gwb = ctab_gwb.subtract(expected_gwb)
# GWB Words occuring more often than expected
difference_gwb.mean(axis=0).sort_values(ascending=False).head(20)

people       3.243760
work         2.252611
country      2.150493
help         1.952263
world        1.898200
new          1.886186
american     1.844138
nation       1.766047
year         1.711985
time         1.561810
great        1.459692
come         1.429657
child        1.411636
need         1.381602
terrorist    1.363581
life         1.363581
freedom      1.309518
know         1.309518
today        1.213407
economy      1.159344
dtype: float64

In [37]:
def top_20_diff(ctab):
    col_sums = np.reshape(ctab.sum(axis=0).to_numpy(), (-1,1))
    row_sums = np.reshape(ctab.sum(axis=1).to_numpy(), (-1,1))
    total = row_sums.sum()
    expected = pd.DataFrame(np.matmul(row_sums, np.transpose(col_sums))/total, 
                            columns=ctab.columns, index=ctab.index)
    difference = ctab.subtract(expected)
    # GWB Words occuring more often than expected
    print(difference.mean(axis=0).sort_values(ascending=False).head(20))
top_20_diff(ctab_gwb)

stem            1.332268e-16
year            1.097162e-16
world           7.836868e-17
economic        7.053182e-17
learn           6.791953e-17
pass            6.138880e-17
today           5.485808e-17
terror          5.224579e-17
come            4.963350e-17
agency          4.571507e-17
new             4.179663e-17
future          4.049049e-17
confidence      4.049049e-17
achieve         4.049049e-17
story           3.918434e-17
long            3.918434e-17
resolution      3.853127e-17
income          3.657205e-17
peace           3.657205e-17
compensation    3.526591e-17
dtype: float64


In [39]:
top_20_diff(cont_table)

know         1.494954e-16
world        1.099231e-16
serve        9.893076e-17
tonight      9.013692e-17
insurance    8.683923e-17
stand        8.354153e-17
challenge    8.134307e-17
human        8.134307e-17
year         7.914461e-17
care         7.694615e-17
trade        7.035077e-17
place        7.035077e-17
home         7.035077e-17
family       7.035077e-17
effort       6.815230e-17
like         6.595384e-17
american     6.595384e-17
high         6.265615e-17
oil          6.155692e-17
end          5.716000e-17
dtype: float64


In [88]:
def speech_diff(ctab, date):
    col_sums = np.reshape(ctab.sum(axis=0).to_numpy(), (-1,1))
    row_sums = np.reshape(ctab.sum(axis=1).to_numpy(), (-1,1))
    total = row_sums.sum()
    expected = pd.DataFrame(np.matmul(row_sums, np.transpose(col_sums))/total, 
                            columns=ctab.columns, index=ctab.index)
    diff = ctab.subtract(expected)
    row = diff.iloc[diff.index.str.startswith(date).argmax()]
    sorted_row_desc = row.sort_values(ascending=False)
    print(sorted_row_desc[:20])
    
speech_diff(cont_table, '2014-03-26')

people           18.761196
ideal            15.006051
nation           13.126060
international    12.151876
world            11.169223
young             8.033683
individual        7.586729
human             7.058693
meet              6.785196
russian           6.751513
believe           6.527229
history           6.406414
voice             5.928399
law               5.899960
free              5.810206
conflict          5.285599
force             5.220049
stand             5.117387
come              4.862042
ethnic            4.860226
Name: 2014-03-26-NATOBelgium, dtype: float64


In [None]:
### From obama_vs_gwb.ipynb PCA of obama speechs, far left on plot = '2016-05-06', upper right = '2016-03-23'

In [89]:
speech_diff(cont_table, '2016-05-06')

water         36.421945
kid           19.642423
lead          18.532012
city          16.421945
sure          15.779177
say           15.406253
government    13.946283
mindset       13.695797
community     13.322872
child         13.296186
happen        10.425610
start         10.425610
feel           9.946627
tell           9.539686
hear           9.193792
invest         8.908601
filter         8.828886
people         7.895887
problem        7.718474
raise          7.231818
Name: 2016-05-06-FlintWaterCrisis, dtype: float64


In [90]:
speech_diff(cont_table, '2016-03-23')

people         3.176832
naval          2.997034
vessel         1.997034
fly            1.994067
station        1.991842
flag           1.985910
staff          1.982943
facility       1.971819
half           1.945864
military       1.885795
fortieth       0.999258
consular       0.999258
legation       0.999258
hereunto       0.999258
perpetrate     0.998517
sunset         0.998517
vest           0.998517
sixteen        0.997775
length         0.997775
perpetrator    0.996292
Name: 2016-03-23-BrusselsTerrorism, dtype: float64


In [None]:
# And speeches on opposite sides from GWB's PCA plot: '2008-09-04' and '2003-02-01'

In [91]:
speech_diff(ctab_gwb, '2008-09-04')

fight         30.457572
country       18.680385
know          17.498111
work          13.095375
let            9.694374
stand          9.180231
friend         9.143776
lose           8.210559
government     8.113449
job            7.977842
change         7.455530
love           7.281426
think          6.313796
bad            6.139692
tough          6.002042
party          5.728786
cause          5.556724
believe        5.455530
bless          5.451445
big            5.208516
Name: 2008-09-04-presidential-nominat, dtype: float64


In [94]:
speech_diff(ctab_gwb, '2003-02-01')

naval         2.995696
vessel        1.997848
station       1.995696
staff         1.991391
flag          1.991391
facility      1.989239
fly           1.986011
half          1.972022
military      1.895621
hereunto      0.998924
legation      0.998924
consular      0.998924
possession    0.998924
aboard        0.998924
vest          0.997848
embassy       0.997848
seventh       0.997848
length        0.997848
building      0.991391
post          0.989239
Name: 2003-02-01-7646-honoring-memory, dtype: float64


In [96]:
# Obama first inaugural address
speech_diff(cont_table, '2009-01-20')

nation        9.801565
generation    6.154402
common        4.923401
spirit        4.364294
new           4.171286
man           4.098309
seek          4.056694
meet          3.877562
understand    3.503015
father        3.446321
carry         3.436067
let           3.401083
endure        3.302774
remain        3.169481
word          3.056694
far           2.974668
long          2.862483
moment        2.861881
woman         2.682749
crisis        2.656815
Name: 2009-01-20-Inauguration, dtype: float64


In [95]:
# GWB first inaugural address
speech_diff(ctab_gwb, '2001-01-20')

story         9.626142
citizen       7.930429
nation        7.074491
common        4.639494
ideal         4.599438
courage       4.412509
country       4.219958
commitment    4.198876
promise       3.985242
civility      3.919888
deep          3.692902
time          3.528461
character     3.492621
purpose       3.479269
principle     3.465917
public        3.345748
duty          3.198876
community     3.011947
life          2.969079
grand         2.946592
Name: 2001-01-20-inaugural-52, dtype: float64


In [23]:
# Sort words from one speech by difference between observed and expected
date = '2011-09-09'
# Words occuring more often than expected - over represented
difference[difference.index.str.startswith(date)].iloc[0].sort_values(ascending=False)[0:20]

job         26.703030
tax         20.635265
right       16.577658
company     14.161356
business    13.460690
plan        13.392313
cut         13.165558
pass        11.505691
build        9.805026
away         9.445106
worker       8.614866
economy      8.298722
hire         8.044388
pay          7.902207
ask          7.100771
idea         7.056383
work         6.908773
small        6.748643
proposal     6.384521
meet         5.647260
Name: 2011-09-09-SpeechJobsAct, dtype: float64

In [24]:
# Words occuring less often than expected
difference[difference.index.str.startswith(date)].iloc[0].sort_values(ascending=False)[-20:]

military   -2.494309
love       -2.542900
progress   -2.542900
ago        -2.623884
student    -2.623884
end        -2.676676
effort     -2.838644
child      -2.859044
force      -2.899229
nuclear    -2.915426
day        -3.037209
stand      -3.049203
war        -3.146384
energy     -3.417527
come       -3.487130
think      -3.502714
change     -3.928033
like       -4.547714
say        -5.320959
today      -6.320959
Name: 2011-09-09-SpeechJobsAct, dtype: float64

In [25]:
# LOAD CONTINGENCY TABLE
# Read in saved contingency table
ctab = pd.read_csv('./Data/genData/obamaContingencyTable_noSW.csv',index_col=0) 
# Shorten row index names to date only
ctab = ctab.rename(index = lambda x: x[0:10])
ca = prince.CA(n_components=2,
               n_iter=3,copy=True,
               check_input=True,
               engine='sklearn',
               random_state=33)

In [None]:
%matplotlib notebook

### Ok, let's cram this into one cell. Enter data of interest at top... out pops a nice CA plot

In [None]:
# 2011-07-26 debt ceiling
# 2011-05-02 Osama bin Laden killed
# 2011-09-09 Speech Jobs Act
# 2011-06-23 Afghanistan Pullout
# 2011-05-23 Missouri Tornado

# Sort words from one speech by difference between observed and expected
date = '2011-05-02'
# label location adjustment
x=0.1
y=-0.1
#make word list from difference between observed and expected
over_rep = difference[difference.index.str.startswith(date)].iloc[0].sort_values(ascending=False)[0:75]
under_rep = difference[difference.index.str.startswith(date)].iloc[0].sort_values(ascending=False)[-75:]
over_rep = pd.DataFrame(over_rep)
under_rep = pd.DataFrame(under_rep)

word_list = pd.concat([over_rep, under_rep])
word_list.columns = ['difference']
word_list['word']=word_list.index
word_list.drop(word_list.loc[word_list['word']=='â'].index, inplace=True) # get rid of that pesky goofy a
# Make a short contingency list using topic words
ctabshort = ctab.loc[ : , [col for col in ctab.columns if col in list(word_list['word'])]]
ctabshort.dropna(inplace=True)
# Keep it to the most common 150 words
if np.shape(ctabshort)[1] > 150:
    ctabshort=ctabshort.iloc[:, : 150]
    
ctabshort.columns.rename('words', inplace=True)
ctabshort.index.rename('speeches', inplace=True)
# Fit Correspondence Analysis with speeches and specified word list
ca = ca.fit(ctabshort)

date1 = '2011-05-02'     # date1 - red X   --- uncomment ax.plot line 46 to highlight two points
date2 = date             # date2 - green pentagon 
df1=ca.row_coordinates(ctabshort)
df1.columns=['Comp1', 'Comp2']
df2=ca.column_coordinates(ctabshort)
df2.columns=['Comp1', 'Comp2']

# df1 for speeches
ax = df1.plot(x='Comp1', y='Comp2', figsize=(10, 10), kind='scatter', color='blue')
#Label speech points
#for i, txt in enumerate(df1.index.values.tolist()):
#    ax.annotate(txt, (df1['Comp1'][i], df1['Comp2'][i]))

# df2 for words
df2.plot(ax=ax, x='Comp1', y='Comp2', kind='scatter', color='orange')
for i, txt in enumerate(df2.index.values.tolist()):
    ax.annotate(txt, (df2['Comp1'][i], df2['Comp2'][i]))

#ax.plot(df1.loc[[date1]]['Comp1'], df1.loc[[date1]]['Comp2'], marker='X', color='red', markersize=20)
#ax.annotate(date1, (df1['Comp1'][date1]+0.1, df1['Comp2'][date1]), color='darkred')
ax.plot(df1.loc[[date2]]['Comp1'], df1.loc[[date2]]['Comp2'], marker='p', color='green', markersize=20)
ax.annotate(date2, (df1['Comp1'][date2]+x, df1['Comp2'][date2]+y), color='darkgreen')

#plt.xlim([-0.25, 5.0])
#plt.ylim([-1.2, 1.0])
plt.show()