In [1]:
# --- Standard library ---
import configparser
import sys

# --- Third-party ---
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

# Project settings live in an INI file addressed relative to the working directory.
config = configparser.ConfigParser()
config.read("../../env.ini")
data_home, output_dir, local_lib = (
    config['DEFAULT'][key] for key in ('data_home', 'output_dir', 'local_lib')
)

# The local library directory must be on sys.path before importing from it.
sys.path.append(local_lib)
from textparser import TextParser

# Column names that together form the index of the token table.
OHCO = ['speech_id', 'para_num', 'sent_num', 'token_num']

In [2]:
def _load_table(path, index_cols):
    """Read a pipe-delimited CSV file and index it by the given column(s)."""
    return pd.read_csv(path, sep='|').set_index(index_cols)


# A "bag" (one LDA document) is a paragraph within a speech: the first two OHCO levels.
BAG = OHCO[:2]

LIB = _load_table('LIB.csv', 'speech_id')
CORPUS = _load_table('CORPUS.csv', OHCO)

In [3]:
# Token filters on the POS tag: NN / NNS, and VB / VBS.
# NOTE(review): if `pos` holds Penn Treebank tags there is no "VBS" tag, so the
# second pattern would keep only base-form "VB" verbs — confirm this is intended.
is_noun = CORPUS.pos.str.match(r'^NNS?$')
is_verb = CORPUS.pos.str.match(r'^VBS?$')

# One row per BAG group: the kept terms joined into a space-separated string.
DOCS = (
    CORPUS[is_noun | is_verb]
    .groupby(BAG).term_str
    .apply(lambda terms: ' '.join(terms.astype(str)))
    .to_frame()
    .rename(columns={'term_str': 'doc_str'})
)
DOCS

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_str
speech_id,para_num,Unnamed: 2_level_1
0,1,satisfaction opportunity prospects affairs acc...
0,2,consultations good encouragement reflection me...
0,3,objects engage attention defense merit regard ...
0,4,people be disciplined end uniform plan safety ...
0,5,establishment troops be be mature consideratio...
...,...,...
113,263,anger revenge retribution ideas lead ideas tak...
113,264,mine future correction democracy it future rig...
113,265,all future country be people youre reason future
113,266,future together remember capacity


In [4]:
# Count-vectorise the document strings: at most 5000 features, English stop words removed.
count_engine = CountVectorizer(max_features=5000, stop_words='english')
count_model = count_engine.fit_transform(DOCS.doc_str)
TERMS = count_engine.get_feature_names_out()

# Empty vocabulary table keyed by term.
VOCAB = pd.DataFrame(index=TERMS).rename_axis('term_str')

# Dense document-term matrix: rows follow DOCS, columns follow TERMS.
DTM = pd.DataFrame(
    count_model.toarray(),
    index=DOCS.index,
    columns=TERMS,
)
DTM

Unnamed: 0_level_0,Unnamed: 1_level_0,10,100,1000,10000,15,15000,1990s,2000,25000,250000,...,yield,youll,young,youre,youth,youve,zeal,zero,zone,zones
speech_id,para_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,263,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
113,264,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
113,265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
113,266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
n_topics = 50

# NOTE(review): scikit-learn documents learning_offset as an online-learning
# parameter; with the default learning_method it may have no effect — confirm.
lda_engine = LDA(
    n_components=n_topics,
    max_iter=20,
    learning_offset=50.,
    random_state=123,
)

# Topic labels "T00" ... zero-padded to the number of digits in n_topics.
pad = len(str(n_topics))
TNAMES = [f"T{i:0{pad}d}" for i in range(n_topics)]

In [6]:
# Fit the LDA engine on the count matrix; fit_transform returns one row per
# document and one column per topic.
lda_model = lda_engine.fit_transform(count_model)

# Label the topic columns when building the frame and name the axis afterwards.
# Naming the axis first and then assigning `THETA.columns = TNAMES` replaces the
# column Index with a fresh, unnamed one, so the 'topic_id' name would be lost.
THETA = pd.DataFrame(lda_model, index=DOCS.index, columns=TNAMES)
THETA.columns.name = 'topic_id'

# Peek at ten randomly chosen topics (rows after transposing).
THETA.T.sample(10)

speech_id,0,0,0,0,0,0,0,0,0,0,...,113,113,113,113,113,113,113,113,113,113
para_num,1,2,3,4,5,6,7,8,9,10,...,258,259,260,261,262,263,264,265,266,267
T10,0.00125,0.000909,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.419101,0.001667,0.00087,0.002857,0.005,0.485637
T40,0.00125,0.000909,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.004,0.001667,0.00087,0.002857,0.005,0.003333
T48,0.00125,0.000909,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.004,0.168333,0.00087,0.002857,0.005,0.003333
T35,0.00125,0.052699,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.004,0.001667,0.637444,0.357797,0.245482,0.003333
T15,0.00125,0.077043,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.004,0.001667,0.00087,0.002857,0.005,0.003333
T39,0.00125,0.000909,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.004,0.001667,0.00087,0.002857,0.005,0.003333
T06,0.00125,0.000909,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.004,0.001667,0.00087,0.002857,0.005,0.003333
T13,0.00125,0.000909,0.002,0.704754,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.004,0.001667,0.00087,0.002857,0.005,0.003333
T47,0.00125,0.632056,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.510558,0.004,0.265901,0.00087,0.002857,0.005,0.003333
T17,0.00125,0.000909,0.002,0.001818,0.001538,0.001053,0.000833,0.002857,0.002857,0.000952,...,0.003333,0.005,0.02,0.004,0.004,0.001667,0.00087,0.002857,0.005,0.003333


In [7]:
# Topic-term table built from the fitted engine's components_: one row per
# topic label, one column per vocabulary term.
PHI = (
    pd.DataFrame(lda_engine.components_, index=TNAMES, columns=TERMS)
    .rename_axis(index='topic_id', columns='term_str')
)

# Peek at ten randomly chosen terms (rows after transposing).
PHI.T.sample(10)

topic_id,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,...,T40,T41,T42,T43,T44,T45,T46,T47,T48,T49
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
card,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
package,0.02,0.02,0.02,0.02,0.02,0.02,0.02,2.02,0.02,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
disregard,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
shall,0.02,0.02,0.02,0.02,0.02,1.02,0.02,0.02,0.02,4.019995,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.020017,0.02
disunity,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
farming,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,1.152737,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
lows,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,0.02,0.02,1.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
limb,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,3.02,0.02,0.02
major,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,1.02,0.02,0.02,0.02
baggage,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,2.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02


In [8]:
# Describe each topic by its five highest-weighted terms, joined in descending
# weight order. Working row-wise on PHI keeps the topic index (and its name).
TOPICS = PHI\
    .apply(lambda weights: ' '.join(weights.sort_values(ascending=False).head(5).index), axis=1)\
    .to_frame('top_terms')

# Average weight of each topic across all documents; rank topics by it.
TOPICS['doc_weight_avg'] = THETA.mean()
TOPICS = TOPICS.sort_values(by='doc_weight_avg', ascending=False)

# Sorted list of distinct presidents in LIB. Not needed below any more, but
# kept as a notebook-level name in case other cells reference it.
PRES = sorted(LIB.president.value_counts().index.to_list())

# Mean topic weight per president (rows: president, columns: topic). The
# president with the largest mean is then looked up per topic directly, rather
# than adding one temporary column per president to TOPICS and dropping them
# again: that route paired columns by position and failed whenever LIB listed
# a president that has no documents in THETA.
pres_topic_means = THETA.join(LIB, on='speech_id').groupby('president')[TNAMES].mean()
TOPICS['president'] = pres_topic_means.idxmax()  # aligned to TOPICS by topic label

# Party label for each president value; used as metadata for the next part.
# Presidents missing from this mapping get NaN in the 'party' column.
# NOTE(review): keys look like surnames, so presidents sharing a surname share
# one entry — confirm this matches the values in LIB.president.
party_map = {
    'washington': 'IND',
    'adams': 'FED',
    'wilson': 'DEM',
    'harding': 'REP',
    'coolidge': 'REP',
    'hoover': 'REP',
    'roosevelt': 'DEM',
    'truman': 'DEM',
    'eisenhower': 'REP',
    'kennedy': 'DEM',
    'johnson': 'DEM',
    'nixon': 'REP',
    'ford': 'REP',
    'carter': 'DEM',
    'reagan': 'REP',
    'bush': 'REP',
    'clinton': 'DEM',
    'obama': 'DEM',
    'trump': 'REP',
    'biden': 'DEM',
}
TOPICS['party'] = TOPICS['president'].map(party_map)
TOPICS

Unnamed: 0_level_0,top_terms,doc_weight_avg,president,party
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T16,budget tax year spending programs,0.042206,carter,DEM
T12,peace world freedom men war,0.040851,roosevelt,DEM
T27,security world defense nations strength,0.038175,carter,DEM
T47,years history year world time,0.036903,reagan,REP
T06,health care insurance people coverage,0.028061,clinton,DEM
T42,let welfare children work people,0.02622,clinton,DEM
T45,jobs workers economy year years,0.024335,obama,DEM
T33,program policy aid propose principles,0.024293,washington,IND
T11,school schools education children students,0.023706,clinton,DEM
T08,tax pay help income taxes,0.022502,obama,DEM


In [9]:
# Write each result table to a CSV file in the working directory.
for csv_name, frame in (
    ('count_matrix.csv', DTM),
    ('TOPICS.csv', TOPICS),
    ('PHI.csv', PHI),
    ('THETA.csv', THETA),
):
    frame.to_csv(csv_name)