In [2]:
import pandas as pd
import numpy as np
import unicodedata
import matplotlib.pyplot as plt
import plotly.express as px
import os # to check if directory exists and create it if it doesn't
from datetime import datetime # to parse speech date
import spacy
import en_core_web_md
import csv
import nltk
import re
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import manifold # for t-sne

In [3]:
# Load the two data sets needed
# string encodings to try utf_8, iso8859_15
facetPlotData = pd.read_csv('facetPlotData.csv')
df = pd.read_csv('fullEmotionData.csv', encoding='utf_8') # apostrophes aren't being read correctly
# Pandas reads date as string, cast to datetime object
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
facetPlotData['date'] = pd.to_datetime(facetPlotData['date'], format='%Y-%m-%d')
facetPlotData.shape # Should be (3000, 4) 

(3000, 4)

In [4]:
sources = ['text_oba', 'text_nyt', 'text_wsj']
for source in sources:
    tmp1 = [re.sub(r"[^\w\s^']", " ", text).lower() for text in df[source]]
    df[source] = [re.sub(' +', ' ', text).replace('\n', '').replace("â " , "'") for text in tmp1]

In [5]:
# Read in new articles from OtherSources directory
# Load up the files
path = './OtherSources/' 
list_of_files = []

# Get file names in target directory
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.txt'):
            list_of_files.append(os.path.join(root,file))

# Open files and read in text
speeches = []
for file in list_of_files:
    with open(file, encoding='utf-8') as f:
        text = f.read()
    f.close()
    speeches.append(text)

#clean out goofy unicode  space characters 
speeches = [unicodedata.normalize("NFKD", speech) for speech in speeches]

info = [(file.split('_')[0][15:]+'.txt', file.split('_')[1][:-4]) for file in list_of_files]

# speeches[i] has the article text
# info[i][0] has the filename to match row of original Obama speech
# info[i][1] has the source name for naming the column
# Some new source/column names are: FoxNews, InfoWars, NationalReview, Intercept

for i in range(len(info)):
    df[info[i][1]] = None

for i in range(len(list_of_files)):
    df.loc[df['file'] == info[i][0], [info[i][1]]] = speeches[i]

<A HREf="https://arxiv.org/abs/1905.05583">Paper with some stuff on dealing with long text</A> This doesn't seem to be an issue with spacy encodings

In [6]:
# Clean up Obama speeches and what the heck, let's do NYT and WSJ too, just to be sure no more unicode garbage
df['text_oba'] = [unicodedata.normalize("NFKD", speech) for speech in df['text_oba']]
df['text_nyt'] = [unicodedata.normalize("NFKD", speech) for speech in df['text_nyt']]
df['text_wsj'] = [unicodedata.normalize("NFKD", speech) for speech in df['text_wsj']]

In [91]:
nlp = spacy.load('en_core_web_md')

In [92]:
# This takes a little bit to calculate the encodings... maybe a minute
# Encode all the texts and save to source specific columns
sources = ['text_oba', 'text_nyt', 'text_wsj', 'FoxNews', 'InfoWars', 'NationalReview', 'Intercept']
enc_col = ['enc_oba', 'enc_nyt', 'enc_wsj', 'enc_fn', 'enc_iw', 'enc_nr', 'enc_int']

# initialize the new encoding columns
for col in enc_col:
    df[col]=None

for i in range(len(df)):
    for j in range(len(sources)):
        #print(i, sources[j])
        if df[sources[j]][i] != None:
            # dfmi.loc[:, ('one', 'second')]
            df[enc_col[j]][i] = list(nlp(df[sources[j]][i]).vector)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [93]:
pcaBiplotData = pd.DataFrame(columns=['date','source'])

In [94]:
# Reshape data for pca analysis and plotting
for i in range(len(df)):
    for j in range(len(sources)):
        if df.loc[i,sources[j]] != None:
            index = (i*len(sources))+j
            pcaBiplotData.loc[index, 'date' ] = pd.to_datetime(df['date'][i])
            pcaBiplotData.loc[index, 'source' ] = sources[j]
            for k in range(len(df['enc_oba'][99])):
                columnname = 'enc'+str(k+1)
                # initialize column, not sure this is necessary
                pcaBiplotData.loc[index, columnname] = None
                encoding = enc_col[j]
                pcaBiplotData.loc[index, columnname] = df[encoding][i][k]
pcaBiplotData = pcaBiplotData.reset_index()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [9]:
# Selecting dataframe subset with columns that start with 'enc'
# pcaBiplotData.loc[:, pcaBiplotData.columns.str.startswith('enc')]

In [95]:
# All encoding values are already on same scale
pca = PCA(n_components = 2)
pcafit = pca.fit_transform(pcaBiplotData.loc[:, pcaBiplotData.columns.str.startswith('enc')])

In [96]:
plot_data = pd.DataFrame()
plot_data = pd.concat([pd.DataFrame(pcafit), pcaBiplotData['date'], pcaBiplotData['source']], axis=1, ignore_index=True)

<A HREF="https://plotly.com/python/discrete-color/#color-sequences-in-plotly-express">Plotly discrete color palettes/sequences</A><BR><A HREF="https://plotly.com/python/plotly-express/">Plotly Express docs</A>

In [97]:
fig = px.scatter(pcafit, 
                 x=0, 
                 y=1, 
                 color=pcaBiplotData['source'],
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()

In [98]:
# What is that NYT way out there?
plot_data.loc[plot_data[0].idxmax()]

0              10.995829
1               3.820567
2    2012-07-21 00:00:00
3               text_oba
Name: 140, dtype: object

Very short article announcing new Intel fab and mentioning that Obama spoke at Intel.

Save the data

In [67]:
pcaBiplotData.to_csv('pcaBiplotData.csv', index=False)

In [68]:
df.to_csv('df_encodings.csv')

Playing with python's PCA

In [69]:
pca2 = PCA(n_components = 10)
pcafit2 = pca2.fit_transform(pcaBiplotData.loc[:, pcaBiplotData.columns.str.startswith('enc')])

In [70]:
# The singular values corresponding to each of the selected components - guessing they're refering to eigenvalues
print(pca2.singular_values_)

[81.78093018 42.05430485 31.00536398 29.35742016 20.7621332  17.90038185
 17.76776161 16.05083271 13.82075759 13.030196  ]


In [71]:
# The amount of variance explained by each of the selected components.
print(pca2.explained_variance_)

[21.64440304  5.72350989  3.11110872  2.78918485  1.39503616  1.03696981
  1.02166134  0.83375156  0.61816615  0.54946928]


In [72]:
# Percentage of variance explained by each of the selected components.
print(pca2.explained_variance_ratio_)

[0.47480208 0.12555368 0.06824678 0.06118491 0.03060219 0.02274747
 0.02241166 0.01828958 0.01356039 0.01205342]


In [73]:
# Sklearn's T-SNE
# https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
from sklearn import manifold

In [74]:
# Perplexity value can greatly influence the shape of the results, typical ranges from 5 to 50
# Some perplexity values with decent separation: 6,7,9,10,12,14,15
t_sne = manifold.TSNE(
    n_components=2,
    perplexity=10,
    init="random",
    n_iter=250,
    random_state=0,
);
tsne_data = pcaBiplotData.loc[:, pcaBiplotData.columns.str.startswith('enc')]
S_t_sne = t_sne.fit_transform(tsne_data);
fig = px.scatter(S_t_sne, 
                 x=0, 
                 y=1, 
                 color=pcaBiplotData['source'],
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [None]:
# Join PCA components to biplot dataframe and then save
pcaBiplotData.drop(['index'], axis=1, inplace=True)
pcafitdf = pd.DataFrame(pcafit, columns=['PCA1', 'PCA2'])
pcaBiplotData = pcaBiplotData.join(pcafitdf)

In [108]:
#pcaBiplotData.to_csv('pcaBiplotData.csv', index=False)

### Try PCA on the big tidy data set with emo, pos, sentence and word counts

In [14]:
scaling=StandardScaler()
df_tidy = pd.read_csv('tidy_data.csv')
#df_tidy.columns

In [18]:
# All encoding values are already on same scale
pca_data = df_tidy.loc[:, ~df_tidy.columns.isin(['date', 'source', 'enc_value','PCA1', 'PCA2'])]
scaling.fit(pca_data)
scaled_data = scaling.transform(pca_data)
pca = PCA(n_components = 2)
pcafit2 = pca.fit_transform(scaled_data)

In [19]:
fig = px.scatter(pcafit2, 
                 x=0, 
                 y=1, 
                 color=df_tidy['source'],
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()

<A HREF="https://distill.pub/2016/misread-tsne/">Some info about t-sne</A>

In [77]:
# Maybe the right perplexity in t-sne will separate these .... mostly not
# Perplexity value can greatly influence the shape of the results, typical ranges from 5 to 50
# Some perplexity values with decent separation: 6,7,9,10,12,14,15
t_sne = manifold.TSNE(
    n_components=2,
    perplexity=43,
    init="random",
    n_iter=1200,
    random_state=0,
);
tsne_data = scaled_data
S_t_sne = t_sne.fit_transform(tsne_data);
fig = px.scatter(S_t_sne, 
                 x=0, 
                 y=1, 
                 color=df_tidy['source'],
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [62]:
# ok then, how about MDS?
# https://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling
# Data needs to be a similarity or disimilarity matrix ... maybe another day
md_scaling = manifold.MDS(
    n_components=2,
    max_iter=50,
    n_init=4,
    random_state=0
)
S_scaling = md_scaling.fit_transform(scaled_data)

fig = px.scatter(S_scaling, 
                 x=0, 
                 y=1, 
                 color=df_tidy['source'],
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()