In [1]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import warnings
import nltk
import spacy
import en_core_web_md
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict
from textblob import TextBlob
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
# Fetch the Punkt data through NLTK's downloader; the cell output below reports
# when the package is already up to date.
# NOTE(review): presumably required by sent_tokenize, which is used later - confirm.
nltk.download('punkt')
# Shared Treebank tokenizer instance; its .tokenize method is used later to
# split each text into word tokens.
word_token = TreebankWordTokenizer()

[nltk_data] Downloading package punkt to /home/muddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load up the speeches

In [101]:
# Load up the files
paths = ['./speeches/', './NYTimes/', './WSJ/']
list_of_files = []

# NOTE(review): `dates` is not used in this cell; it is kept because other
# cells may rely on it.
dates = pd.read_csv('dateSpeeches.csv')

# Walk each source directory recursively and collect every .txt file.
for path in paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.txt'):
                list_of_files.append(os.path.join(root, file))

# Read each file as a [text, filepath] record. The `with` block closes the
# handle on exit, so no explicit close() is needed.
speeches = []
for file in list_of_files:
    with open(file, encoding='utf-8') as f:
        text = f.read()
    speeches.append([text, file])

# Clean out goofy unicode space characters (NFKD compatibility normalisation)
# and drop files that contain no text. The filter checks the text itself: the
# [text, filepath] record always has length 2, so testing the record's length
# never filtered anything.
speeches = [
    (unicodedata.normalize("NFKD", text), filepath)
    for text, filepath in speeches
    if len(text) > 0
]

# remove [stuff] in between square brackets
def remove_bracket(text):
    """Strip bracketed annotations such as ``[Applause]`` from ``text``.

    Each ``[...]`` group is removed on its own, together with at most one
    whitespace character that directly follows it. Text lying between two
    bracketed groups is left untouched.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with every ``[...]`` group removed.
    """
    # `[^\]]*` stops at the first closing bracket, so one match can never span
    # from one annotation to a later one. The raw string keeps the regex
    # escapes out of Python's own string-escape processing.
    return re.sub(r'\[[^\]]*\]\s?', '', text)
# Run every loaded text through remove_bracket, keeping its file path alongside.
speeches = [
    (remove_bracket(raw_text), filepath)
    for raw_text, filepath in speeches
]

def get_source(text):
    """Return the lowercase source tag for a file path.

    The first regex match in ``str(text)`` is taken as the source name
    (for './speeches/...' that is 'speeches'). 'speeches' is abbreviated to
    'oba' and 'NYTimes' to 'nyt'; any other name is only lowercased.

    Raises IndexError when the pattern matches nothing.
    """
    abbreviations = {'speeches': 'oba', 'NYTimes': 'nyt'}
    matches = re.findall("[^./][a-zA-Z]+[^/]", str(text))
    name = matches[0]
    return abbreviations.get(name, name).lower()

def get_date(text):
    """Return the first ``digits-digits-digits`` run found in ``str(text)``.

    For a path such as './speeches/2014-07-01-Immigration.txt' this yields
    '2014-07-01'.

    Raises IndexError when no such run is present.
    """
    # Raw string with a plain hyphen: equivalent to the former "[\-]" form but
    # without an invalid string escape sequence.
    regex = r"([0-9]+-[0-9]+-[0-9]+)"
    return re.findall(regex, str(text))[0]

def get_filename(text):
    """Return the first run of ASCII letters that directly follows a hyphen.

    For './speeches/2014-07-01-Immigration.txt' this yields 'Immigration'.

    Raises IndexError when no hyphen is followed by a letter.
    """
    pattern = re.compile("[-]([a-zA-Z]+)")
    return pattern.findall(str(text))[0]

# One row per loaded document: the text plus the path it was read from.
cols = ['text', 'filepath']
text_df = pd.DataFrame(speeches, columns=cols)

# Metadata parsed out of the file path: the date (as a datetime) and the source tag.
filepaths = text_df['filepath']
text_df['date'] = pd.to_datetime(filepaths.apply(get_date), format='%Y-%m-%d')
text_df['source'] = filepaths.apply(get_source)

# Sentence-level and word-level tokenisations of each text.
raw_text = text_df['text']
text_df['sentences'] = raw_text.apply(sent_tokenize)
text_df['words'] = raw_text.apply(word_token.tokenize)

# Size statistics: sentence count, token count, and distinct-token count.
for count_col, token_col in (('num_sents', 'sentences'), ('num_words', 'words')):
    text_df[count_col] = text_df[token_col].apply(len)
text_df['word_set'] = text_df['words'].apply(set)
text_df['num_unique_words'] = text_df['word_set'].apply(len)
text_df.head(3)

Unnamed: 0,text,filepath,date,source,sentences,words,num_sents,num_words,word_set,num_unique_words
0,"Good afternoon, everybody. One year ago this m...",./speeches/2014-07-01-Immigration.txt,2014-07-01,oba,"[Good afternoon, everybody., One year ago this...","[Good, afternoon, ,, everybody., One, year, ag...",112,2095,"{letter, Speaker, -–, board., more, why, Secre...",725
1,"Good morning, everybody. I want to take just a...",./speeches/2009-12-25-UnderwearBomber.txt,2009-12-25,oba,"[Good morning, everybody., I want to take just...","[Good, morning, ,, everybody., I, want, to, ta...",53,1166,"{public., brutality, closely, more, Third, ter...",501
2,"Hello, Chicago.\n\nIf there is anyone out ther...",./speeches/2008-11-05-ObamaElected.txt,2008-11-05,oba,"[Hello, Chicago., If there is anyone out there...","[Hello, ,, Chicago., If, there, is, anyone, ou...",96,2254,"{heard, more, debt, help., bombs, spoken, hear...",755


<A HREF="https://textblob.readthedocs.io/en/latest/quickstart.html">TextBlob Quickstart guide</A>

In [102]:
# Run TextBlob's sentiment analysis once per document and read both scores
# from the same result, instead of analysing every text twice.
sentiments = [TextBlob(text).sentiment for text in text_df['text']]
text_df['TBsubjectivity'] = [sentiment.subjectivity for sentiment in sentiments]
text_df['TBpolarity'] = [sentiment.polarity for sentiment in sentiments]

In [7]:
# Load the pre-built feature table from disk. The preview below shows a date, a
# source tag, POS-tag columns (ADJ, ADP, ...), emotion columns (fear, joy, ...)
# and the sentence / word / unique-word counts.
# NOTE(review): how tidy_data.csv was produced is not shown here - confirm it
# matches the texts loaded above.
tidy_data = pd.read_csv('tidy_data.csv')
tidy_data.head(2)

Unnamed: 0,date,source,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,fear,joy,negative,positive,sadness,surprise,trust,num_sents,num_words,num_unique_words
0,2008-06-04,nyt,0.064458,0.065088,0.0624,0.064962,0.064416,0.063408,0.055598,0.06492,...,0.063025,0.079832,0.130252,0.281513,0.07563,0.046218,0.130252,47,1459,620
1,2008-06-04,oba,0.064649,0.06467,0.06444,0.064398,0.064471,0.064638,0.054967,0.064503,...,0.095798,0.095798,0.115966,0.233613,0.042017,0.048739,0.159664,217,5856,939


<A HREF="https://plotly.com/python/plotly-express/">Plotly Express</A><BR><A HREF="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf">Pandas cheat sheet</A>

In [84]:
# Mean number of sentences per text
# Average every numeric column within each source; the bar heights come from num_sents.
means = tidy_data.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means.index,
    y="num_sents",
    color=means.index,
    title='Mean number of sentences per text',
)
fig = px.bar(means, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()

In [85]:
# Mean number of unique words per text
# Average every numeric column within each source; the bar heights come from num_unique_words.
means = tidy_data.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means.index,
    y="num_unique_words",
    color=means.index,
    title='Mean number of unique words per text',
)
fig = px.bar(means, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()

<A HREF="https://universaldependencies.org/u/pos/">Universal POS tags</A>

In [88]:
# Proportion of words that are adjectives
# Per-source mean of the ADJ column, shown as one bar per source.
means = tidy_data.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means.index,
    y="ADJ",
    color=means.index,
    hover_name=means.index,
    labels={"ADJ":"Proportion"},
    hover_data={'ADJ':':.3f'},  # three decimals in the hover box
    title='Proportion of words that are adjectives',
)
fig = px.bar(means, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()

<A HREF="https://en.wikipedia.org/wiki/Interjection">Wikipedia - Interjections</A>

In [87]:
# Proportion of words that are interjections
# Per-source mean of the INTJ column, shown as one bar per source.
means = tidy_data.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means.index,
    y="INTJ",
    color=means.index,
    hover_name=means.index,
    labels={"INTJ":"Proportion"},
    hover_data={'INTJ':':.3f'},  # three decimals in the hover box
    title='Proportion of words that are interjections',
)
fig = px.bar(means, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()

In [86]:
# Proportion of words that are numbers
# Per-source mean of the NUM column, shown as one bar per source.
means = tidy_data.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means.index,
    y="NUM",
    color=means.index,
    hover_name=means.index,
    labels={"NUM":"Proportion"},
    hover_data={'NUM':':.3f'},  # three decimals in the hover box
    title='Proportion of words that are numbers',
)
fig = px.bar(means, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()

<A HREF="https://pypi.org/project/NRCLex/">NRCLex</A>

In [89]:
# anger score from NRCLex
# Per-source mean of the anger column, shown as one bar per source.
means = tidy_data.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means.index,
    y="anger",
    color=means.index,
    hover_name=means.index,
    hover_data={'anger':':.3f'},  # three decimals in the hover box
    title='NRCLex anger score',
)
fig = px.bar(means, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()

In [90]:
# joy score from NRCLex
# Per-source mean of the joy column, shown as one bar per source.
means = tidy_data.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means.index,
    y="joy",
    color=means.index,
    hover_name=means.index,
    hover_data={'joy':':.3f'},  # three decimals in the hover box
    title='NRCLex joy score',
)
fig = px.bar(means, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()

In [107]:
# Mean TextBlob subjectivity scores
# subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
# Averages are taken over the numeric columns of text_df, grouped by source.
means2 = text_df.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means2.index,
    y="TBsubjectivity",
    color=means2.index,
    hover_name=means2.index,
    hover_data={'TBsubjectivity':':.3f'},  # three decimals in the hover box
    title='Mean TextBlob subjectivity scores',
)
fig = px.bar(means2, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()

In [108]:
# Mean TextBlob polarity scores
# polarity is a float within the range [-1.0, 1.0]
# Averages are taken over the numeric columns of text_df, grouped by source.
means2 = text_df.groupby('source').mean(numeric_only=True)
bar_options = dict(
    x=means2.index,
    y="TBpolarity",
    color=means2.index,
    hover_name=means2.index,
    hover_data={'TBpolarity':':.3f'},  # three decimals in the hover box
    title='Mean TextBlob polarity scores',
)
fig = px.bar(means2, **bar_options)
# Fix the left-to-right bar order to oba, nyt, wsj.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=['oba', 'nyt', 'wsj']))
fig.show()