In [1]:
# --- Imports: standard library first, then third-party -----------------------
import configparser
import sys

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

# --- Environment configuration ------------------------------------------------
ENV_INI = "../../env.ini"
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. Without this check a missing or misplaced env.ini only
# shows up later as an opaque KeyError on 'data_home'.
if not config.read(ENV_INI):
    raise FileNotFoundError(f"Could not read configuration file: {ENV_INI}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

# Make the project-local library importable. The membership guard keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
if local_lib not in sys.path:
    sys.path.append(local_lib)
from textparser import TextParser  # local import; must follow the sys.path update

# Column names used as the index of CORPUS when it is loaded.
OHCO = ['speech_id', 'para_num', 'sent_num', 'token_num']

In [2]:
# Load the two pipe-delimited input tables from the working directory.
# LIB is indexed by speech_id; CORPUS is indexed by the OHCO columns.
LIB = (
    pd.read_csv('LIB.csv', sep='|')
    .set_index('speech_id')
)
CORPUS = (
    pd.read_csv('CORPUS.csv', sep='|')
    .set_index(OHCO)
)

# Index levels that are grouped together to form one document ("bag").
BAG = ['speech_id', 'para_num']

In [3]:
# Select tokens whose POS tag is exactly NN/NNS or VB/VBS.
# NOTE(review): r'^VBS?$' matches only "VB" and "VBS"; other verb tags such as
# VBD/VBG/VBN/VBP/VBZ (if the tagset is Penn Treebank) are excluded -- confirm
# that this is intended.
is_noun = CORPUS.pos.str.match(r'^NNS?$')
is_verb = CORPUS.pos.str.match(r'^VBS?$')


def _join_terms(terms):
    """Concatenate one group's terms into a single space-separated string."""
    return ' '.join(terms.astype(str))


# One row per BAG group, holding the joined terms in a `doc_str` column.
DOCS = (
    CORPUS[is_noun | is_verb]
    .groupby(BAG)
    .term_str
    .apply(_join_terms)
    .to_frame('doc_str')
)
DOCS

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_str
speech_id,para_num,Unnamed: 2_level_1
1,1,satisfaction opportunity prospects affairs acc...
1,2,consultations good encouragement reflection me...
1,3,objects engage attention defense merit regard ...
1,4,people be disciplined end uniform plan safety ...
1,5,establishment troops be be mature consideratio...
...,...,...
114,263,anger revenge retribution ideas lead ideas tak...
114,264,mine future correction democracy it future rig...
114,265,all future country be people youre reason future
114,266,future together remember capacity


In [4]:
# Count-vectorize the documents: English stop words are removed and the
# vocabulary is capped at 5000 features.
count_engine = CountVectorizer(stop_words='english', max_features=5000)
count_model = count_engine.fit_transform(DOCS.doc_str)
TERMS = count_engine.get_feature_names_out()

# Vocabulary table: an (initially column-less) frame indexed by term.
VOCAB = pd.DataFrame(index=pd.Index(TERMS, name='term_str'))

# Dense document-term matrix: rows follow DOCS, columns are the selected terms.
DTM = pd.DataFrame(
    data=count_model.toarray(),
    columns=TERMS,
    index=DOCS.index,
)
DTM

Unnamed: 0_level_0,Unnamed: 1_level_0,10,100,1000,10000,15,15000,1990s,2000,25000,250000,...,yield,youll,young,youre,youth,youve,zeal,zero,zone,zones
speech_id,para_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,263,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
114,264,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
114,265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
114,266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Number of topics to fit.
n_topics = 50

# random_state is fixed so repeated runs give the same model.
# NOTE(review): per the scikit-learn docs learning_offset applies to online
# learning; with the default learning_method it presumably has no effect -- confirm.
lda_engine = LDA(
    n_components=n_topics,
    max_iter=20,
    learning_offset=50.,
    random_state=123,
)

# Zero-padded topic labels ("T00" ... "T49" for 50 topics) so they sort lexically.
label_width = len(str(n_topics))
TNAMES = [f"T{topic_num:0{label_width}d}" for topic_num in range(n_topics)]

In [None]:
# Fit the LDA model on the document-term counts; fit_transform returns the
# document-topic matrix (one row per document, one column per topic).
lda_model = lda_engine.fit_transform(count_model)

# Attach row and column labels at construction time. Setting
# `THETA.columns.name` first and then assigning `THETA.columns = TNAMES`
# replaces the column Index and silently drops the 'topic_id' name, so the
# named Index is built up front instead.
THETA = pd.DataFrame(
    lda_model,
    index=DOCS.index,
    columns=pd.Index(TNAMES, name='topic_id'),
)
THETA.T.sample(10)

In [None]:
# Topic-term table: one row per topic, one column per vocabulary term.
# NOTE(review): scikit-learn documents components_ as unnormalized
# pseudo-counts, so rows presumably do not sum to 1 -- normalize if
# probabilities are needed.
PHI = pd.DataFrame(
    lda_engine.components_,
    index=pd.Index(TNAMES, name='topic_id'),
    columns=pd.Index(TERMS, name='term_str'),
)
PHI.T.sample(10)

In [None]:
def _top_terms(weights, n=5):
    """Return the n highest-weighted terms of one topic, space-separated.

    `weights` is one topic's slice of the stacked PHI table, so it carries a
    `term_str` index level that is read back out after sorting.
    """
    strongest = weights.sort_values(ascending=False).head(n)
    return ' '.join(strongest.reset_index().term_str)


# One row per topic: its top terms plus its mean weight across all documents,
# ordered from the most to the least prevalent topic.
TOPICS = (
    PHI.stack()
    .groupby('topic_id')
    .apply(_top_terms)
    .to_frame('top_terms')
)
TOPICS['doc_weight_avg'] = THETA.mean()
TOPICS = TOPICS.sort_values(by='doc_weight_avg', ascending=False)

# Sorted list of the distinct president names found in LIB.
PRES = sorted(LIB.president.value_counts().index.to_list())

# Mean topic weight per president (rows: presidents, columns: topics).
# Only the `president` column of LIB is joined, since nothing else is used.
pres_topic_means = (
    THETA.join(LIB[['president']], on='speech_id')
    .groupby('president')[TNAMES]
    .mean()
)
# For each topic, the president with the highest mean weight. Taking idxmax
# directly avoids adding one temporary column per president to TOPICS (assigned
# by position and then dropped); the result is aligned to TOPICS by topic label.
TOPICS['president'] = pres_topic_means.idxmax()

# Party label for each president name, used as metadata by the next part of
# the analysis. Names missing from this map get NaN in the `party` column.
# NOTE(review): keys are lowercase surnames -- confirm they match the values
# stored in LIB.president.
party_map = {
    'washington': 'IND',
    'adams': 'FED',
    'wilson': 'DEM',
    'harding': 'REP',
    'coolidge': 'REP',
    'hoover': 'REP',
    'roosevelt': 'DEM',
    'truman': 'DEM',
    'eisenhower': 'REP',
    'kennedy': 'DEM',
    'johnson': 'DEM',
    'nixon': 'REP',
    'ford': 'REP',
    'carter': 'DEM',
    'reagan': 'REP',
    'bush': 'REP',
    'clinton': 'DEM',
    'obama': 'DEM',
    'trump': 'REP',
    'biden': 'DEM',
}
TOPICS['party'] = TOPICS['president'].map(party_map)
TOPICS

In [None]:
# Write the result tables as CSV files (default comma separator) into the
# current working directory, in the same order as before.
for csv_name, frame in (
    ('count_matrix.csv', DTM),
    ('TOPICS.csv', TOPICS),
    ('PHI.csv', PHI),
    ('THETA.csv', THETA),
):
    frame.to_csv(csv_name)