In [1]:
import pandas as pd
import numpy as np
import configparser
import sys
from sklearn.decomposition import PCA
# Machine-specific locations are kept out of the notebook in a shared ini file.
config = configparser.ConfigParser()
config.read("../../env.ini")

# Every setting used here lives in the [DEFAULT] section.
_env = config['DEFAULT']
data_home = _env['data_home']
output_dir = _env['output_dir']
local_lib = _env['local_lib']

# Make the project's local helper library importable from this notebook.
sys.path.append(local_lib)

# Index levels, from speech down to token (Ordered Hierarchy of Content Objects).
OHCO = [
    'speech_id',
    'para_num',
    'sent_num',
    'token_num',
]

In [2]:
# Library table: one row per speech, keyed by speech_id (pipe-delimited file).
LIB = (
    pd.read_csv('LIB.csv', sep='|')
    .set_index('speech_id')
)

# Vocabulary table: one row per term, keyed by term_str (pipe-delimited file).
VOCAB = (
    pd.read_csv('VOCAB.csv', sep='|')
    .set_index('term_str')
)

# TFIDF matrix keyed by (speech_id, para_num).
# NOTE(review): unlike the two files above, this one is read with the default
# comma separator -- confirm that matches how TFIDF_L2.csv was written.
TFIDF = (
    pd.read_csv('TFIDF_L2.csv')
    .set_index(['speech_id', 'para_num'])
)

In [3]:
# --- PCA over the TFIDF matrix -------------------------------------------
# Number of principal components to keep, and the matching column labels
# shared by both the score table (DCM) and the loading table (LOADINGS).
n_comps = 10
pc_cols = [f"PC{i}" for i in range(n_comps)]

# Pin random_state: with the default svd_solver='auto', scikit-learn may choose
# a randomized solver, in which case an unseeded run is not guaranteed to
# reproduce the same scores/loadings after Restart & Run All.
pca_engine = PCA(n_components=n_comps, random_state=0)

# Document-component matrix: one row per (speech_id, para_num), one column per PC.
# Missing TFIDF values are treated as zero weight before fitting.
DCM = pd.DataFrame(
    pca_engine.fit_transform(TFIDF.fillna(0)),
    index=TFIDF.index,
    columns=pc_cols,
).reset_index()

# Shift speech ids by one before joining the library metadata.
# NOTE(review): presumably TFIDF's speech_id is 0-based while LIB's is 1-based
# -- confirm against the notebook that produced TFIDF_L2.csv.
DCM['speech_id'] = DCM['speech_id'] + 1
DCM = DCM.set_index(['speech_id', 'para_num'])
DCM = DCM.join(LIB, on='speech_id')

# Term loadings: each component vector scaled by the square root of its
# explained variance. Rows are the TFIDF columns (terms), columns reuse pc_cols
# so both tables are guaranteed to carry identical component labels.
# Index.rename returns a new Index, so TFIDF's own column index is not renamed.
LOADINGS = pd.DataFrame(
    pca_engine.components_.T * np.sqrt(pca_engine.explained_variance_),
    index=TFIDF.columns.rename('term_str'),
    columns=pc_cols,
)

# Attach the per-term vocabulary statistics to the loadings.
LOADINGS = LOADINGS.join(VOCAB)

In [4]:
# For every component, collect the ten terms at each end of its loading axis:
# pole 0 sorts descending (largest loadings first), pole 1 sorts ascending
# (smallest loadings first). Terms are joined into one space-separated string.
data = [
    (
        f"PC{comp}",
        pole,
        ' '.join(
            LOADINGS
            .sort_values(f'PC{comp}', ascending=bool(pole))
            .head(10)
            .index
            .to_list()
        ),
    )
    for comp in range(n_comps)
    for pole in [0, 1]
]

# One row per (component, pole), indexed by that pair.
comp_strs = (
    pd.DataFrame(data, columns=['pc', 'pole', 'top_terms'])
    .set_index(['pc', 'pole'])
)

In [5]:
# Write the per-component top-term summary to the working directory,
# then leave the frame as the cell's last expression so it is displayed.
comp_strs.to_csv(path_or_buf='components.csv')
comp_strs

Unnamed: 0_level_0,Unnamed: 1_level_0,top_terms
pc,pole,Unnamed: 2_level_1
PC0,0,mr members president speaker audience congress...
PC0,1,world peace nations government economic tax jo...
PC1,0,world peace nations united war freedom states ...
PC1,1,tax health year care jobs budget federal perce...
PC2,0,thank god bless america good much americans to...
PC2,1,audience members federal government defense bu...
PC3,0,thank audience bless god members federal membe...
PC3,1,mr people president world americans speaker am...
PC4,0,audience members member world america usa know...
PC4,1,mr congress president states speaker thank uni...


In [6]:
# Write the document-component matrix (PC scores joined with the library
# metadata) to the working directory, then display it as the cell output.
DCM.to_csv(path_or_buf='DCM.csv')
DCM

Unnamed: 0_level_0,Unnamed: 1_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,president,source_file_path,year,party
speech_id,para_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,0.205831,0.005653,-0.023521,-0.012175,-0.158875,0.489571,0.361118,0.098175,0.129201,0.033398,washington,C:/Users/patso/Documents/DS5001//data/sotu\179...,1790,IND
1,1,-0.014996,0.110949,-0.038285,0.028145,-0.090647,0.018797,-0.123015,-0.158420,0.079066,-0.040273,washington,C:/Users/patso/Documents/DS5001//data/sotu\179...,1790,IND
1,2,-0.008564,0.009144,-0.015104,0.034474,-0.030552,0.031981,-0.023164,-0.036983,-0.011194,-0.028648,washington,C:/Users/patso/Documents/DS5001//data/sotu\179...,1790,IND
1,3,-0.028436,0.116336,-0.063156,0.020987,-0.017645,-0.004080,0.010349,0.064467,-0.074628,-0.102150,washington,C:/Users/patso/Documents/DS5001//data/sotu\179...,1790,IND
1,4,-0.028593,0.049791,-0.040218,0.016126,-0.004389,-0.005460,-0.005663,0.056069,-0.023123,-0.011979,washington,C:/Users/patso/Documents/DS5001//data/sotu\179...,1790,IND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,263,-0.014451,0.018028,0.076430,-0.046979,0.054920,0.007220,0.016255,-0.016712,0.018065,0.047745,biden,C:/Users/patso/Documents/DS5001//data/sotu\202...,2024,DEM
114,264,-0.004057,-0.028991,0.063765,-0.055304,0.041181,0.063881,0.072085,-0.023368,0.014713,-0.014881,biden,C:/Users/patso/Documents/DS5001//data/sotu\202...,2024,DEM
114,265,0.055215,-0.028619,0.144953,-0.187336,0.066130,-0.003707,0.007423,-0.085288,0.005060,-0.011042,biden,C:/Users/patso/Documents/DS5001//data/sotu\202...,2024,DEM
114,266,-0.007100,0.049125,0.073843,-0.022842,0.003958,-0.016273,-0.055389,-0.035998,0.274672,-0.026088,biden,C:/Users/patso/Documents/DS5001//data/sotu\202...,2024,DEM


In [7]:
# Write the term loadings (joined with the vocabulary statistics) to the
# working directory, then display the frame as the cell output.
LOADINGS.to_csv(path_or_buf='LOADINGS.csv')
LOADINGS

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,n,n_chars,p,i,max_pos,max_pos_group,stop,porter_stem,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
people,-0.003365,0.000347,0.006763,-0.011926,0.004436,0.001126,-0.005065,-0.008718,0.002472,-0.002065,1924,6,0.003538,8.142906,NNS,NN,0,peopl,4049.914776
world,-0.005619,0.022832,0.000505,-0.011090,0.005764,-0.010251,0.006732,0.009931,0.000806,0.001892,1602,5,0.002946,8.407140,NN,NN,0,world,3667.352042
new,-0.002657,-0.003557,0.000197,-0.005482,0.001427,-0.005025,0.002089,-0.001748,0.004026,0.018633,1578,3,0.002902,8.428917,JJ,JJ,0,new,3624.966156
congress,0.015358,-0.004398,-0.005500,-0.003214,-0.010586,-0.000182,-0.006055,-0.003932,0.003580,-0.002739,1395,8,0.002565,8.606749,NNP,NN,0,congress,3578.575270
america,-0.002412,0.006197,0.014898,-0.007458,0.005711,-0.005199,0.003448,-0.002973,0.008622,0.006656,1412,7,0.002596,8.589274,NNP,NN,0,america,3534.140499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fairness,-0.000211,-0.000512,0.000030,-0.000120,0.000050,-0.000099,0.000147,-0.000007,0.000103,0.000002,32,8,0.000059,14.052799,NN,NN,0,fair,248.742029
active,-0.000207,0.000530,-0.000155,-0.000005,-0.000126,-0.000129,-0.000109,-0.000075,0.000046,-0.000078,33,6,0.000061,14.008405,JJ,JJ,0,activ,248.742029
transition,-0.000199,0.000111,-0.000188,-0.000078,0.000025,-0.000166,0.000178,0.000042,-0.000245,0.000150,33,10,0.000061,14.008405,NN,NN,0,transit,248.742029
rapid,-0.000120,0.000262,-0.000370,0.000105,-0.000179,-0.000248,-0.000024,-0.000056,-0.000078,0.000227,31,5,0.000057,14.098603,JJ,JJ,0,rapid,248.742029
