In [118]:
import pandas as pd
import numpy as np
import unicodedata
import matplotlib.pyplot as plt
import plotly.express as px
import os # to check if directory exists and create it if it doesn't
from datetime import datetime # to parse speech date
import spacy
import en_core_web_md
import csv
import nltk
import re
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import manifold # for t-sne
import plotly.graph_objects as go
import plotly.figure_factory as ff
import kaleido

# [To PCA on tidy data csv](#pcatidydata)

In [3]:
# Load the two data sets needed
# string encodings to try utf_8, iso8859_15
facetPlotData = pd.read_csv('facetPlotData.csv')
df = pd.read_csv('fullEmotionData.csv', encoding='utf_8') # apostrophes aren't being read correctly
# Pandas reads date as string, cast to datetime object
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
facetPlotData['date'] = pd.to_datetime(facetPlotData['date'], format='%Y-%m-%d')
facetPlotData.shape # Should be (3000, 4) 

(3000, 4)

In [4]:
sources = ['text_oba', 'text_nyt', 'text_wsj']
for source in sources:
    tmp1 = [re.sub(r"[^\w\s^']", " ", text).lower() for text in df[source]]
    df[source] = [re.sub(' +', ' ', text).replace('\n', '').replace("â " , "'") for text in tmp1]

In [5]:
# Read in new articles from OtherSources directory
# Load up the files
path = './OtherSources/' 
list_of_files = []

# Get file names in target directory
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.txt'):
            list_of_files.append(os.path.join(root,file))

# Open files and read in text
speeches = []
for file in list_of_files:
    with open(file, encoding='utf-8') as f:
        text = f.read()
    f.close()
    speeches.append(text)

#clean out goofy unicode  space characters 
speeches = [unicodedata.normalize("NFKD", speech) for speech in speeches]

info = [(file.split('_')[0][15:]+'.txt', file.split('_')[1][:-4]) for file in list_of_files]

# speeches[i] has the article text
# info[i][0] has the filename to match row of original Obama speech
# info[i][1] has the source name for naming the column
# Some new source/column names are: FoxNews, InfoWars, NationalReview, Intercept

for i in range(len(info)):
    df[info[i][1]] = None

for i in range(len(list_of_files)):
    df.loc[df['file'] == info[i][0], [info[i][1]]] = speeches[i]

<A HREf="https://arxiv.org/abs/1905.05583">Paper with some stuff on dealing with long text</A> This doesn't seem to be an issue with spacy encodings

In [6]:
# Clean up Obama speeches and what the heck, let's do NYT and WSJ too, just to be sure no more unicode garbage
df['text_oba'] = [unicodedata.normalize("NFKD", speech) for speech in df['text_oba']]
df['text_nyt'] = [unicodedata.normalize("NFKD", speech) for speech in df['text_nyt']]
df['text_wsj'] = [unicodedata.normalize("NFKD", speech) for speech in df['text_wsj']]

In [7]:
nlp = spacy.load('en_core_web_md')

In [8]:
# This takes a little bit to calculate the encodings... maybe a minute
# Encode all the texts and save to source specific columns
sources = ['text_oba', 'text_nyt', 'text_wsj', 'FoxNews', 'InfoWars', 'NationalReview', 'Intercept']
enc_col = ['enc_oba', 'enc_nyt', 'enc_wsj', 'enc_fn', 'enc_iw', 'enc_nr', 'enc_int']

# initialize the new encoding columns
for col in enc_col:
    df[col]=None

for i in range(len(df)):
    for j in range(len(sources)):
        #print(i, sources[j])
        if df[sources[j]][i] != None:
            # dfmi.loc[:, ('one', 'second')]
            df[enc_col[j]][i] = list(nlp(df[sources[j]][i]).vector)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[enc_col[j]][i] = list(nlp(df[sources[j]][i]).vector)


In [9]:
pcaBiplotData = pd.DataFrame(columns=['date','source'])

In [10]:
# Reshape data for pca analysis and plotting
for i in range(len(df)):
    for j in range(len(sources)):
        if df.loc[i,sources[j]] != None:
            index = (i*len(sources))+j
            pcaBiplotData.loc[index, 'date' ] = pd.to_datetime(df['date'][i])
            pcaBiplotData.loc[index, 'source' ] = sources[j]
            for k in range(len(df['enc_oba'][99])):
                columnname = 'enc'+str(k+1)
                # initialize column, not sure this is necessary
                pcaBiplotData.loc[index, columnname] = None
                encoding = enc_col[j]
                pcaBiplotData.loc[index, columnname] = df[encoding][i][k]
pcaBiplotData = pcaBiplotData.reset_index()

  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnname] = None
  pcaBiplotData.loc[index, columnn

In [11]:
pcaBiplotData['date'] = pd.to_datetime(pcaBiplotData['date'], format='%Y-%m-%d')

In [12]:
# Selecting dataframe subset with columns that start with 'enc'
# pcaBiplotData.loc[:, pcaBiplotData.columns.str.startswith('enc')]

In [13]:
# All encoding values are already on same scale
pca = PCA(n_components = 2)
pcafit = pca.fit_transform(pcaBiplotData.loc[:, pcaBiplotData.columns.str.startswith('enc')])

In [14]:
plot_data = pd.DataFrame()
plot_data = pd.concat([pd.DataFrame(pcafit), pcaBiplotData['date'], pcaBiplotData['source']], axis=1, ignore_index=True)

<A HREF="https://plotly.com/python/discrete-color/#color-sequences-in-plotly-express">Plotly discrete color palettes/sequences</A><BR><A HREF="https://plotly.com/python/plotly-express/">Plotly Express docs</A>

In [64]:
fig = px.scatter(pcafit, 
                 x=0, 
                 y=1, 
                 color=pcaBiplotData['source'],
                 hover_name=pcaBiplotData['date'],
                 color_discrete_sequence=px.colors.qualitative.Bold,
                 title="Principle Components Biplot")
fig.show()

In [16]:
# What is that NYT way out there?
plot_data.loc[plot_data[0].idxmax()]

0              10.995829
1               3.820567
2    2012-07-21 00:00:00
3               text_oba
Name: 140, dtype: object

Very short article announcing new Intel fab and mentioning that Obama spoke at Intel.

### Save the data

In [17]:
#pcaBiplotData.to_csv('pcaBiplotData.csv', index=False)

In [18]:
#df.to_csv('df_encodings.csv')

Playing with python's PCA

In [19]:
pca2 = PCA(n_components = 10)
pcafit2 = pca2.fit_transform(pcaBiplotData.loc[:, pcaBiplotData.columns.str.startswith('enc')])

In [20]:
# The singular values corresponding to each of the selected components - guessing they're refering to eigenvalues
print(pca2.singular_values_)

[81.78093018 42.05430485 31.00536398 29.35742016 20.7621332  17.90038185
 17.76776161 16.05083271 13.82075759 13.030196  ]


In [21]:
# The amount of variance explained by each of the selected components.
print(pca2.explained_variance_)

[21.64440304  5.72350989  3.11110872  2.78918485  1.39503616  1.03696981
  1.02166134  0.83375156  0.61816615  0.54946928]


In [22]:
# Percentage of variance explained by each of the selected components.
print(pca2.explained_variance_ratio_)

[0.47480208 0.12555368 0.06824678 0.06118491 0.03060219 0.02274747
 0.02241166 0.01828958 0.01356039 0.01205342]


In [23]:
# Sklearn's T-SNE
# https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
from sklearn import manifold

In [24]:
# Perplexity value can greatly influence the shape of the results, typical ranges from 5 to 50
# Some perplexity values with decent separation: 6,7,9,10,12,14,15
t_sne = manifold.TSNE(
    n_components=2,
    perplexity=10,
    init="random",
    n_iter=250,
    random_state=0,
);
tsne_data = pcaBiplotData.loc[:, pcaBiplotData.columns.str.startswith('enc')]
S_t_sne = t_sne.fit_transform(tsne_data);
fig = px.scatter(S_t_sne, 
                 x=0, 
                 y=1, 
                 color=pcaBiplotData['source'],
                 hover_name=pcaBiplotData['date'],
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()

In [25]:
# Join PCA components to biplot dataframe and then save
pcaBiplotData.drop(['index'], axis=1, inplace=True)
pcafitdf = pd.DataFrame(pcafit, columns=['PCA1', 'PCA2'])
pcaBiplotData = pcaBiplotData.join(pcafitdf)

In [26]:
#pcaBiplotData.to_csv('pcaBiplotData.csv', index=False)

### <a id='pcatidydata'>Try PCA on the big tidy data set with emo, pos, sentence and word counts</a>

In [95]:
scaling=StandardScaler()
df_tidy = pd.read_csv('tidy_data.csv')
df_tidy_all = df_tidy.copy(deep=True)
# drop variables which will simply increase with length of text
df_tidy = df_tidy.drop(['sent_count', 'word_count', 'char_count', 'syl_count', 'dc_word_count', 'gf_word_count', 'poly_word_count'], axis=1)
#df_tidy.columns

In [89]:
pca_data = df_tidy.drop(['date', 'source', 'enc_value','PCA1', 'PCA2','num_sents', 'num_words'], axis=1, inplace=False)
scaling.fit(pca_data)
scaled_data = scaling.transform(pca_data)
pca = PCA(n_components = 2)
pcafit2 = pca.fit_transform(scaled_data)

In [90]:
fig = px.scatter(pcafit2, 
                 x=0, 
                 y=1, 
                 color=df_tidy['source'],
                 hover_name=df_tidy['date'],
                 color_discrete_sequence=px.colors.qualitative.Bold,
                 title="Principle Components Biplot")
fig.update_layout(
    xaxis_title="PC1",
    yaxis_title="PC2")
fig.show()

In [91]:
print(pca.explained_variance_ratio_)

[0.33800732 0.17423761]


In [92]:
print(pca.singular_values_)

[66.03252531 47.40954697]


In [93]:
loadings=pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=pca_data.columns)
loadings=loadings.sort_values(by='PC1')
def large_bold(val):
    bold = 'bold' if abs(val) > 0.18 else ''
    return 'font-weight: %s' % bold
loadings.T.style.applymap(large_bold) # .T makes it wide instead of tall

Unnamed: 0,INTJ,num_unique_words,joy,TBpolarity,TBsubjectivity,SYM,anticipation,surprise,sadness,positive,disgust,trust,SCONJ,ADV,NUM,anger,PRON,negative,fear,CCONJ,AUX,PROPN,PART,ADP,VERB,PUNCT,ADJ,DET,NOUN,word_per_sent,words_per_sentence,dc_word_perc,gf_word_perc,depth,poly_word_perc,dale_chall,gunning_fog,char_per_word,syl_per_word,ari,smog,flesch_kincaid,coleman_liau
PC1,-0.16702,-0.132444,-0.132269,-0.129103,-0.126149,-0.073157,-0.065939,-0.037702,-0.012965,-0.005148,-0.001795,0.005319,0.016202,0.021169,0.023487,0.050428,0.053226,0.056491,0.069414,0.115504,0.143954,0.144245,0.155025,0.167701,0.168216,0.168301,0.168391,0.169595,0.173988,0.183866,0.189563,0.199262,0.203023,0.206478,0.209358,0.214998,0.216969,0.21886,0.21905,0.221974,0.223523,0.22412,0.227326
PC2,-0.104625,-0.094602,0.026973,0.000585,0.057694,-0.213439,-0.022706,0.004712,-0.011963,-0.005324,0.00543,-0.027861,0.030459,0.035742,-0.113038,0.0161,0.057522,-0.021456,0.039992,0.162808,0.258094,0.204661,0.239667,0.264102,0.269095,0.262491,0.249891,0.264945,0.259209,-0.171505,-0.164969,-0.090393,-0.140303,-0.121135,-0.134169,-0.170053,-0.179448,-0.107936,-0.115903,-0.169312,-0.171812,-0.168814,-0.122389


In [94]:
fig = px.bar(loadings, 
             x=loadings.index, 
             y='PC1', 
             color='PC1', 
             title = 'PC1 loadings')
fig.show()

In [87]:
fig = px.bar(loadings, 
             x=loadings.index, 
             y='PC2', 
             color='PC2', 
             title = 'PC2 loadings')
fig.show()

In [97]:
# Furthes right Obama speech 2016-03-23
df_tidy_all.query('source == "oba"').query('date == "2016-03-23"').T

Unnamed: 0,274
date,2016-03-23
source,oba
ADJ,0.069703
ADP,0.076828
ADV,0.061958
AUX,0.053903
CCONJ,0.078377
DET,0.077447
INTJ,0.0
NOUN,0.078686


In [112]:
df_tidy_all.query('source == "oba"').anger.median()

0.05572808833678395

In [114]:
df_tidy_all.query('source == "oba"').query('date == "2015-01-08"').anger

229    0.163636
Name: anger, dtype: float64

In [115]:
df_tidy_all.query('source == "nyt"').query('date == "2015-10-02"').T

Unnamed: 0,258
date,2015-10-02
source,nyt
ADJ,0.067064
ADP,0.067354
ADV,0.064005
AUX,0.064842
CCONJ,0.067128
DET,0.067193
INTJ,0.028815
NOUN,0.067386


In [117]:
df_tidy_all.query('source == "nyt"').median(numeric_only=True)

ADJ                      0.069442
ADP                      0.069782
ADV                      0.068327
AUX                      0.069364
CCONJ                    0.068404
DET                      0.069633
INTJ                     0.000000
NOUN                     0.070115
NUM                      0.065055
PART                     0.068897
PRON                     0.069390
PROPN                    0.068867
PUNCT                    0.070332
SCONJ                    0.067977
SYM                      0.000000
VERB                     0.069982
PCA1                    -2.821981
PCA2                     0.247196
anger                    0.070658
anticipation             0.094775
disgust                  0.026988
fear                     0.092532
joy                      0.052242
negative                 0.152947
positive                 0.230911
sadness                  0.063914
surprise                 0.038627
trust                    0.154262
num_sents               51.500000
num_words     

<A HREF="https://plotly.com/python/figure-factory-table/">Plotly Figure Factory</A>

In [42]:
df_tidy.query('source == "oba"').words_per_sentence.median()

20.66673983026046

In [39]:
df_tidy.query('source == "oba"').words_per_sentence.std()

3.974525548147794

In [43]:
df_tidy.query('source == "nyt"').words_per_sentence.median()

27.24308300395257

In [40]:
df_tidy.query('source == "nyt"').words_per_sentence.std()

4.13493756317543

In [44]:
df_tidy.query('source == "wsj"').words_per_sentence.median()

24.980288461538464

In [41]:
df_tidy.query('source == "wsj"').words_per_sentence.std()

3.3016256473246495

In [53]:
df_tidy.query('source == "oba"').dale_chall.std()

1.8660367791631192

In [54]:
df_tidy.query('source == "nyt"').dale_chall.std()

2.0675398047665117

In [55]:
df_tidy.query('source == "wsj"').dale_chall.std()

1.748271429016244

<A HREF="https://plotly.com/python/figure-factory-table/">Plotly figure factory documentation</A>

In [119]:
to_png = loadings.copy()
to_png['var'] = to_png.index
table = ff.create_table(round(to_png,3))
table.update_layout(
    autosize=False,
    width=400,
    height=700)

In [120]:
# Not working
import kaleido
table.write_image("table_plotly.png", scale=2)

In [121]:
font_color=['black']*2+[['red' if boolv else 'black' for boolv in abs(to_png['PC1'])>0.16]]
table_trace = go.Table(
                 header = dict(height = 50,
                               values = [['Var'], ['PC1'], ['PC2']],
                               align = ['center']*3,
                               fill_color = '#686dea',
                               font_color = '#ffffff',
                               font_size =20),
                 cells = dict(values = [to_png['var'], round(to_png['PC1'],3), round(to_png['PC2'],3)],
                              line = dict(color='#506784'),
                              align = ['left']*3,
                              font_color=font_color,
                              font_family="Arial", 
                              font_size=14,   
                              height = 25,
                              fill = dict(color='rgb(245,245,245)'))
                             )
layout = go.Layout(width=600, height=800, autosize=False,
                   title_text='Principle Components',
                   title_x=0.5, showlegend=False)
fig = go.Figure(data=[table_trace], layout=layout)
fig.show()

In [122]:
fig = px.imshow(to_png,
                text_auto=True
               )
fig.update_xaxes(side="top")
fig.show()

<A HREF="https://distill.pub/2016/misread-tsne/">Some info about t-sne</A>

In [137]:
# Maybe the right perplexity in t-sne will separate these .... mostly not
# Perplexity value can greatly influence the shape of the results, typical ranges from 5 to 50
# Some perplexity values with decent separation: 6,7,9,10,12,14,15
t_sne = manifold.TSNE(
    n_components=2,
    perplexity=10,
    init="random",
    n_iter=1200,
    random_state=0,
);
tsne_data = scaled_data
S_t_sne = t_sne.fit_transform(tsne_data);
fig = px.scatter(S_t_sne, 
                 x=0, 
                 y=1, 
                 color=df_tidy['source'],
                 hover_name=df_tidy['date'],
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()

In [124]:
# ok then, how about MDS?
# https://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling
# Data needs to be a similarity or disimilarity matrix ... maybe another day
md_scaling = manifold.MDS(
    n_components=2,
    max_iter=50,
    n_init=4,
    random_state=0
)
S_scaling = md_scaling.fit_transform(scaled_data)

fig = px.scatter(S_scaling, 
                 x=0, 
                 y=1, 
                 color=df_tidy['source'],
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()



