### Just trying to organize one big tall dataset here, each text (speech or article) as its own row

In [1]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import warnings
import nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
word_token = TreebankWordTokenizer()

[nltk_data] Downloading package punkt to /home/muddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load up the files
paths = ['./Data/speeches/', './Data/NYTimes/', './Data/WSJ/'] 
list_of_files = []

dates = pd.read_csv('./Data/genData/dateSpeeches.csv')
for path in paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.txt'):
                list_of_files.append(os.path.join(root,file))

speeches = []
for file in list_of_files:
    with open(file, encoding='utf-8') as f:
        #print(file)
        text = f.read()
    f.close()
    speeches.append([text, file])

#clean out goofy unicode  space characters 
speeches = [(unicodedata.normalize("NFKD", speech[0]), speech[1]) for speech in speeches if len(speech)>0 ]

# remove [stuff] in between square brackets
def remove_bracket(text):
    return re.sub('(\[[^w]*\]\s)', '',text)
speeches = [(remove_bracket(speech[0]), speech[1]) for speech in speeches]

<A HREF="https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes">Python datetime format codes</A>

In [3]:
pos = pd.read_csv('./Data/genData/POSFacetPlotData.csv', index_col=None) #date, source, pos, proportion
wide = pd.read_csv('./Data/genData/df_encodings.csv', index_col=None) #file, text, emos, enc vectors
pca = pd.read_csv('./Data/genData/pcaBiplotData.csv', index_col=None) #date, source, enc1-300, PCA1, PCA2
emo = pd.read_csv('./Data/genData/facetPlotData.csv', index_col=None) #date, source, emo, emoValue
pos['date'] = pd.to_datetime(pos['date'], format='%Y-%m-%d')
wide['date'] = pd.to_datetime(wide['date'], format='%Y-%m-%d')
pca['date'] = pd.to_datetime(pca['date'], format='%Y-%m-%d')
emo['date'] = pd.to_datetime(emo['date'], format='%Y-%m-%d')

In [4]:
pos.head(3)

Unnamed: 0.1,Unnamed: 0,date,source,pos,proportion
0,0,2008-06-04,oba,PRON,0.064597
1,1,2008-06-04,oba,NUM,0.058727
2,2,2008-06-04,oba,DET,0.064638


In [5]:
pos_wide=pos.pivot(index=['date', 'source'],columns='pos', values='proportion').reset_index()
pos_wide.shape

(300, 18)

In [6]:
pos_wide.head()

pos,date,source,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB
0,2008-06-04,nyt,0.064458,0.065088,0.0624,0.064962,0.064416,0.063408,0.055598,0.06492,0.065046,0.065004,0.06513,0.064668,0.065172,0.061602,0.04342,0.06471
1,2008-06-04,oba,0.064649,0.06467,0.06444,0.064398,0.064471,0.064638,0.054967,0.064503,0.058727,0.064315,0.064597,0.06468,0.064691,0.064377,0.0,0.064628
2,2008-06-04,wsj,0.072536,0.0713,0.070185,0.071579,0.072456,0.072177,0.0,0.072616,0.065402,0.072098,0.071938,0.072217,0.072656,0.070344,0.0,0.072496
3,2008-11-05,nyt,0.065299,0.065636,0.065602,0.064593,0.065131,0.065266,0.042603,0.065972,0.065064,0.065703,0.065837,0.065669,0.066005,0.063315,0.042569,0.065736
4,2008-11-05,oba,0.067288,0.067345,0.064371,0.067174,0.067145,0.06726,0.066802,0.066745,0.065544,0.065,0.067088,0.067374,0.067403,0.06623,0.0,0.067231


In [7]:
wide.head(1)

Unnamed: 0.1,Unnamed: 0,file,text_oba,text_wsj,text_nyt,emo_oba,emo_nyt,emo_wsj,date,obafear,...,InfoWars,NationalReview,Intercept,enc_oba,enc_nyt,enc_wsj,enc_fn,enc_iw,enc_nr,enc_int
0,0,2008-06-04-ObamaNomination.txt,thank you thank you what a what a wonderful re...,sen barack obama captured enough convention de...,senator barack obama claimed the democratic pr...,<nrclex.NRCLex object at 0x7f09f2a0eda0>,<nrclex.NRCLex object at 0x7f09e5416260>,<nrclex.NRCLex object at 0x7f09e2782260>,2008-06-04,0.095798,...,,,,"[-0.717641, 0.19752467, -0.15350738, -0.080692...","[-0.68823427, 0.17678636, -0.034410875, -0.026...","[-0.6987737, 0.16180255, -0.02974237, -0.01259...",,,,


In [8]:
enc_oba=pd.melt(wide, id_vars =['date'], value_vars =['enc_oba'],var_name='source', value_name='enc_value')
enc_oba['source']='oba'
enc_nyt=pd.melt(wide, id_vars =['date'], value_vars =['enc_nyt'],var_name='source', value_name='enc_value')
enc_nyt['source']='nyt'
enc_wsj=pd.melt(wide, id_vars =['date'], value_vars =['enc_wsj'],var_name='source', value_name='enc_value')
enc_wsj['source']='wsj'
encodings=pd.concat([enc_oba,enc_nyt,enc_wsj])

In [9]:
encodings.shape

(300, 3)

In [10]:
try:
    pca_values=pca[['date', 'source', 'PCA1', 'PCA2']].copy()
except:
    pca_values=pca[['date', 'source']].copy()
pca_values['source']=pca_values['source'].replace('text_', '', regex=True)
pca_values.head(3)

Unnamed: 0,date,source
0,2008-06-04,oba
1,2008-06-04,nyt
2,2008-06-04,wsj


In [11]:
emo.head(13)

Unnamed: 0,date,source,emotion,emoValue
0,2008-06-04,oba,fear,0.095798
1,2008-06-04,oba,anger,0.055462
2,2008-06-04,oba,trust,0.159664
3,2008-06-04,oba,surprise,0.048739
4,2008-06-04,oba,positive,0.233613
5,2008-06-04,oba,negative,0.115966
6,2008-06-04,oba,sadness,0.042017
7,2008-06-04,oba,disgust,0.023529
8,2008-06-04,oba,joy,0.095798
9,2008-06-04,oba,anticipation,0.129412


<A HREF="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf">Pandas Cheat Sheet</A>

In [12]:
emo_wide=emo.pivot(index=['date', 'source'],columns='emotion', values='emoValue').reset_index()
emo_wide.shape

(300, 12)

In [13]:
emo_wide.head(3)

emotion,date,source,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,2008-06-04,nyt,0.058824,0.121849,0.012605,0.063025,0.079832,0.130252,0.281513,0.07563,0.046218,0.130252
1,2008-06-04,oba,0.055462,0.129412,0.023529,0.095798,0.095798,0.115966,0.233613,0.042017,0.048739,0.159664
2,2008-06-04,wsj,0.03663,0.087912,0.025641,0.051282,0.106227,0.095238,0.326007,0.054945,0.047619,0.168498


In [14]:
#pos_wide,encodings,pca_values,emo_wide  pd.merge(adf, bdf, how='left', on='x1')
df1 = pd.merge(pos_wide, encodings, how='left', on=['date', 'source'])
df2 = pd.merge(df1, pca_values, how='left', on=['date', 'source'])
df3 = pd.merge(df2, emo_wide, how='left', on=['date', 'source'])

In [16]:
df3.corr(numeric_only=True)[df3.corr(numeric_only=True)>0.5].loc[['PCA1', 'PCA2'] ].transpose()

KeyError: "None of [Index(['PCA1', 'PCA2'], dtype='object')] are in the [index]"

In [None]:
text_oba=pd.melt(wide, id_vars =['date'], value_vars =['text_oba'],var_name='source', value_name='text')
text_oba['source']='oba'
text_nyt=pd.melt(wide, id_vars =['date'], value_vars =['text_nyt'],var_name='source', value_name='text')
text_nyt['source']='nyt'
text_wsj=pd.melt(wide, id_vars =['date'], value_vars =['text_wsj'],var_name='source', value_name='text')
text_wsj['source']='wsj'
texts=pd.concat([text_oba,text_nyt,text_wsj])

In [None]:
texts

<A HREF="https://practicaldatascience.co.uk/data-science/how-to-use-python-regular-expressions-to-extract-information">Decent regex guide</A><BR><A HREF="https://regexr.com/">RegExr</A>

In [None]:
def get_source(text):
    regex = "[^./][a-zA-Z]+[^/]"
    string = re.findall(regex, str(text))[0]
    if string == 'speeches': string = 'oba'
    if string == 'NYTimes': string = 'nyt'
    return string.lower()

def get_date(text):
    regex = "([0-9]+[\-][0-9]+[\-][0-9]+)"
    return re.findall(regex, str(text))[0]

def get_filename(text):
    regex = "[-]([a-zA-Z]+)"
    return re.findall(regex, str(text))[0]

speech_df = pd.DataFrame(speeches)
print(get_source(speeches[50][1]))

print(get_date(speeches[50][1]))

print(get_filename(speeches[50][1]))

In [None]:
cols = ['text', 'filepath']
text_df = pd.DataFrame(speeches, columns=cols)
text_df['date'] = text_df['filepath'].apply(get_date)
text_df['date'] = pd.to_datetime(text_df['date'], format='%Y-%m-%d')
text_df['source'] = text_df['filepath'].apply(get_source)
text_df

In [None]:
text_df['sentences'] = text_df['text'].apply(sent_tokenize)
text_df['words'] = text_df['text'].apply(word_token.tokenize)
text_df['num_sents'] = text_df['sentences'].apply(len)
text_df['num_words'] = text_df['words'].apply(len)
text_df['word_set'] = text_df['words'].apply(set)
text_df['num_unique_words'] = text_df['word_set'].apply(len)

In [None]:
#text_df.to_csv('./Data/genData/text_sentences_words.csv', index=False)

In [None]:
text_df_numbers = text_df[['date', 'source', 'num_sents', 'num_words', 'num_unique_words']].copy()

In [None]:
df4 = pd.merge(df3, text_df_numbers, how='left', on=['date', 'source'])

In [None]:
df4

In [None]:
#df4.to_csv('./Data/genData/tidy_data.csv', index=False)

In [None]:
df4.corr(numeric_only=True)[df4.corr(numeric_only=True)>0.5].loc[['PCA1', 'PCA2'] ].transpose()