In [31]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import warnings
import nltk
import spacy
import en_core_web_md
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
word_token = TreebankWordTokenizer()

[nltk_data] Downloading package punkt to /home/muddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
# Load up the files
paths = ['./speeches/', './NYTimes/', './WSJ/'] 
list_of_files = []

dates = pd.read_csv('dateSpeeches.csv')
for path in paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.txt'):
                list_of_files.append(os.path.join(root,file))

speeches = []
for file in list_of_files:
    with open(file, encoding='utf-8') as f:
        #print(file)
        text = f.read()
    f.close()
    speeches.append([text, file])

#clean out goofy unicode  space characters 
speeches = [(unicodedata.normalize("NFKD", speech[0]), speech[1]) for speech in speeches if len(speech)>0 ]

# remove [stuff] in between square brackets
def remove_bracket(text):
    return re.sub('(\[[^w]*\]\s)', '',text)
speeches = [(remove_bracket(speech[0]), speech[1]) for speech in speeches]

def get_source(text):
    regex = "[^./][a-zA-Z]+[^/]"
    string = re.findall(regex, str(text))[0]
    if string == 'speeches': string = 'oba'
    if string == 'NYTimes': string = 'nyt'
    return string.lower()

def get_date(text):
    regex = "([0-9]+[\-][0-9]+[\-][0-9]+)"
    return re.findall(regex, str(text))[0]

def get_filename(text):
    regex = "[-]([a-zA-Z]+)"
    return re.findall(regex, str(text))[0]

cols = ['text', 'filepath']
text_df = pd.DataFrame(speeches, columns=cols)
text_df['date'] = text_df['filepath'].apply(get_date)
text_df['date'] = pd.to_datetime(text_df['date'], format='%Y-%m-%d')
text_df['source'] = text_df['filepath'].apply(get_source)

text_df['sentences'] = text_df['text'].apply(sent_tokenize)
text_df['words'] = text_df['text'].apply(word_token.tokenize)
text_df['num_sents'] = text_df['sentences'].apply(len)
text_df['num_words'] = text_df['words'].apply(len)
text_df['word_set'] = text_df['words'].apply(set)
text_df['num_unique_words'] = text_df['word_set'].apply(len)
text_df.head(3)

Unnamed: 0,text,filepath,date,source,sentences,words,num_sents,num_words,word_set,num_unique_words
0,"Good afternoon, everybody. One year ago this m...",./speeches/2014-07-01-Immigration.txt,2014-07-01,oba,"[Good afternoon, everybody., One year ago this...","[Good, afternoon, ,, everybody., One, year, ag...",112,2095,"{strengthen, barbecue, Republican, right, lett...",725
1,"Good morning, everybody. I want to take just a...",./speeches/2009-12-25-UnderwearBomber.txt,2009-12-25,oba,"[Good morning, everybody., I want to take just...","[Good, morning, ,, everybody., I, want, to, ta...",53,1166,"{seek, destroy, season., international., stren...",501
2,"Hello, Chicago.\n\nIf there is anyone out ther...",./speeches/2008-11-05-ObamaElected.txt,2008-11-05,oba,"[Hello, Chicago., If there is anyone out there...","[Hello, ,, Chicago., If, there, is, anyone, ou...",96,2254,"{seek, heart, itself, perished, live, Democrat...",755


In [61]:
from nrclex import NRCLex

In [163]:
emo_of_interest = 'anger'
emo_series = pd.DataFrame(columns=['speech_no', 'sentence_no', 'emotion', 'emotion_value'])
for i in range(0,10):
    emos = ['fear','anger','anticip','trust','surprise','positive','negative','sadness','disgust','joy']
    for emo in emos:
        for_one_emotion = []
        for s, sentence in enumerate(text_df['sentences'][i]):
            nrclex = NRCLex(sentence)
            emo_value = nrclex.affect_frequencies[emo]
            emo_series.loc[len(emo_series)] = [str(text_df['date'][i]), s, emo, emo_value]



In [164]:
fig = px.line(emo_series, x="sentence_no", y="emotion_value", color="speech_no",hover_name="speech_no",
        line_shape="spline", render_mode="svg")
fig.show()