### A start at some correspondence analysis, like I did for Corpus Linguistics

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go # for fancy interactive plot
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup
import os # to check if directory exists and create it if it doesn't
from datetime import datetime # to parse speech date
from nrclex import NRCLex
import spacy
import en_core_web_md
import csv
import prince # for correspondence analysis
import nltk
# Stop-word list, loaded as a DataFrame from a tab-separated text file.
# NOTE(review): with the default header inference the file's first line becomes
# the column name -- confirm kaggle_stopwords.txt starts with a header row,
# otherwise the first stop word is missing from the data.
stopwords = pd.read_csv('kaggle_stopwords.txt', sep='\t')

In [None]:
# One-time NLTK data download, left disabled ('all' is probably more than is needed):
# nltk.download('all', halt_on_error=False)

In [31]:
# Read the two input tables. pandas loads the date columns as strings, so each
# one is cast to datetime right after its frame is read.
facetPlotData = pd.read_csv('facetPlotData.csv')
facetPlotData['date'] = pd.to_datetime(facetPlotData['date'], format='%Y-%m-%d')

# String encodings to try: utf_8, iso8859_15 (apostrophes aren't being read correctly)
df = pd.read_csv('fullEmotionData.csv', encoding='utf_8')
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Sanity check on the loaded data: expected shape is (3000, 4)
facetPlotData.shape

(3000, 4)

In [18]:
# Show the full text of a cell when printing an article or speech instead of
# pandas' default 50-character preview.
# (The bare `pd.options.display.max_rows` lookup that used to sit here only read
# the option without changing or displaying it, so it has been dropped;
# max_rows stays at its current value.)
pd.set_option('display.max_colwidth', None)

In [19]:
# Function to find date of max emoValue for a particular source and emotion
def findmax(sourcefm, emofm, printyn):
    '''
    Print the date and value of the largest emoValue for one source/emotion pair.

    The pair is looked up in the notebook-level ``facetPlotData`` frame; when
    requested, the matching ``text_<source>`` entries of the notebook-level
    ``df`` frame for that date are printed as well.

    Parameters
    ----------
    sourcefm : str
        Source name (oba, nyt, wsj).
    emofm : str
        Emotion name (fear, anger, trust, surprise, positive, negative,
        sadness, disgust, joy, anticipation).
    printyn : int
        1 to also print the text for the date of the maximum; any other
        value prints only the date and the rounded value.

    Returns
    -------
    None
        Everything is reported through ``print``.

    Raises
    ------
    ValueError
        If no row of ``facetPlotData`` matches the source/emotion pair.
    '''
    is_match = (facetPlotData['source'] == sourcefm) & (facetPlotData['emotion'] == emofm)
    subset = facetPlotData[is_match]
    # idxmax() on an empty selection fails with an unhelpful message, so report
    # the bad source/emotion pair explicitly (same exception type as before).
    if subset.empty:
        raise ValueError(
            'no rows in facetPlotData for source={!r}, emotion={!r}'.format(sourcefm, emofm)
        )

    result = subset.loc[subset['emoValue'].idxmax()]
    print(result['date'].strftime('%Y-%m-%d'), round(result['emoValue'], 4))

    if printyn == 1:
        text = 'text_' + sourcefm
        # Compare on the Timestamp itself rather than round-tripping through a string.
        print(df.loc[df['date'] == result['date'], text])

In [20]:
# Date and value of the peak 'positive' score for the 'oba' source, plus the text for that date
findmax(sourcefm='oba', emofm='positive', printyn=1)

2014-03-07 0.4107
65    Good afternoon, everybody. Before Jay takes some of your questions, I wanted to provide a brief update on our efforts to address the ongoing crisis in Ukraine.\n\nSince the Russian intervention, we've been mobilizing the international community to condemn this violation of international law and to support the people and Government of Ukraine. This morning I signed an Executive order that authorizes sanctions on individuals and entities responsible for violating the sovereignty and territorial integrity of Ukraine or for stealing the assets of the Ukrainian people.\n\nAccording to my guidance, the State Department has also put in place restrictions on the travel of certain individuals and officials. These decisions continue our efforts to impose a cost on Russia and those responsible for the situation in Crimea. And they also give us the flexibility to adjust our response going forward based on Russia's actions.\n\nWe took these steps in close coordination with o

In [21]:
# TODO: come back to this later. Maybe translate my R code from class to Python.

In [None]:
## Frequency list of the words in `nouns`, showing the 20 most frequent
# NOTE(review): `nouns` is not created in any cell visible here -- presumably a
# list of (word, tag) pairs from an earlier tagging step; confirm before running.
nouns_freq = nltk.FreqDist(word for word, _tag in nouns)
nouns_freq.most_common(20)

In [None]:
# Whitespace-separated tokens of the 'text_oba' entry at row label 0
df.loc[0, 'text_oba'].split()

### <A HREF="https://www.nltk.org/index.html">NLTK Documentation</A> <A HREF="https://www.tomaarsen.com/nltk/api/nltk.html">API reference</A>

In [66]:
# Frequency list of every lower-cased word across all Obama speeches.
# The nested generator flattens the per-speech token lists into one stream.
# str.split() only splits on whitespace, so punctuation stays attached:
# 'you.' and 'you,' are counted separately from 'you'.
freqlist = nltk.FreqDist(
    token.lower()
    for speech in df['text_oba']
    for token in speech.split()
)
freqlist

FreqDist({'the': 11945, 'and': 9614, 'to': 8730, 'of': 6802, 'that': 5227, 'a': 4920, 'in': 4302, 'we': 4166, 'our': 3598, 'for': 2590, ...})

In [67]:
# FreqDist's own repr shows only the first few entries; a plain dict displays every token and its count
{token: count for token, count in freqlist.items()}

{'thank': 249,
 'you....thank': 2,
 'you.': 124,
 'what': 827,
 'a': 4920,
 '--': 2046,
 'wonderful': 20,
 'reception.': 2,
 'you,': 128,
 'saint': 3,
 'paul.': 2,
 'minnesota.': 9,
 'joann': 2,
 'syverson,': 2,
 'for': 2590,
 'the': 11945,
 'introduction.': 4,
 'michelle': 32,
 'obama': 8,
 'and': 9614,
 'malia': 13,
 'sasha': 9,
 'obama.': 4,
 'you': 1201,
 'to': 8730,
 'my': 515,
 'brothers': 15,
 'sisters.': 2,
 'staff.': 2,
 'our': 3598,
 'volunteers.': 2,
 'political': 72,
 'team.': 12,
 'campaign': 33,
 'manager,': 3,
 'david': 10,
 'plouffe,': 2,
 'who': 1267,
 'never': 182,
 'gets': 33,
 'any': 192,
 'credit,': 7,
 'but': 1306,
 'has': 806,
 'built': 55,
 'best': 151,
 'organization': 14,
 'in': 4302,
 'country.': 88,
 'grandmother,': 4,
 'helped': 70,
 'raise': 49,
 'me,': 31,
 'is': 2435,
 'sitting': 8,
 'hawaii': 2,
 'somewhere': 9,
 'right': 337,
 'now': 301,
 'because': 655,
 'she': 204,
 "can't": 71,
 'travel,': 3,
 'poured': 2,
 'everything': 88,
 'had': 296,
 'into': 2