In [4]:
import pandas as pd
import numpy as np
import configparser
import sys
from sklearn.decomposition import PCA
# Load machine-specific settings from env.ini, two directories above the notebook.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list therefore means env.ini was not
# read, which would otherwise surface later as an opaque KeyError.
if not config.read("../../env.ini"):
    raise FileNotFoundError("Could not read config file: ../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']
# Make the configured local library directory importable. The membership check
# keeps repeated runs of this cell from appending the same entry again.
if local_lib not in sys.path:
    sys.path.append(local_lib)

# Column names that, judging by their names, run from speech level down to
# token level -- NOTE(review): not referenced in the visible cells; confirm use.
OHCO = ['speech_id', 'para_num', 'sent_num', 'token_num']

In [17]:
# Read the three input tables from the current working directory.
# LIB and VOCAB are pipe-delimited; TFIDF is read with read_csv's default
# separator. Each table is indexed by its key column(s) right after loading.
tfidf_keys = ['speech_id', 'para_num']

LIB = (
    pd.read_csv('LIB.csv', sep='|')
    .set_index('speech_id')
)
VOCAB = (
    pd.read_csv('VOCAB.csv', sep='|')
    .set_index('term_str')
)
TFIDF = (
    pd.read_csv('TFIDF_L2.csv')
    .set_index(tfidf_keys)
)

In [18]:
# Number of principal components to keep, and the column labels used for them
# in both the document-component matrix and the loadings table.
n_comps = 10
pc_cols = [f"PC{i}" for i in range(n_comps)]

# A fixed random_state makes the fit repeatable across kernel restarts in case
# scikit-learn selects a randomized SVD solver for this input.
pca_seed = 0
pca_engine = PCA(n_components=n_comps, random_state=pca_seed)

# Document-component matrix: the PCA projection of every TFIDF row, keeping the
# TFIDF row index. Missing TFIDF cells are replaced by 0 before fitting.
DCM = pd.DataFrame(
    pca_engine.fit_transform(TFIDF.fillna(0)),
    index=TFIDF.index,
    columns=pc_cols,
)
# Attach the LIB columns by matching the 'speech_id' index level to LIB's index.
DCM = DCM.join(LIB, on='speech_id')

# Loadings: each component vector scaled by the square root of its explained
# variance, one row per TFIDF column. Index.rename returns a new Index, so
# TFIDF's own column index is not modified when the name 'term_str' is applied.
LOADINGS = pd.DataFrame(
    pca_engine.components_.T * np.sqrt(pca_engine.explained_variance_),
    index=TFIDF.columns.rename('term_str'),
    columns=pc_cols,
)
# Add the VOCAB columns for each term (index-on-index join).
LOADINGS = LOADINGS.join(VOCAB)

In [38]:
# Summarise every component by the ten terms at each end of its loading scale.
# pole 0 ranks loadings in descending order (largest first); pole 1 ranks them
# in ascending order (smallest first).
data = []
for i in range(n_comps):
    pc = f"PC{i}"
    for j in (0, 1):
        ranked = LOADINGS.sort_values(pc, ascending=bool(j))
        top_terms = ranked.head(10).index.to_list()
        data.append((pc, j, ' '.join(top_terms)))

# One row per (component, pole) pair, indexed by that pair.
comp_strs = pd.DataFrame(data, columns=['pc', 'pole', 'top_terms'])
comp_strs = comp_strs.set_index(['pc', 'pole'])

In [26]:
# Write the per-component top-term summary to components.csv in the working directory.
comp_strs.to_csv('components.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
comp_strs

Unnamed: 0,pc,pole,top_terms
0,PC0,0,mr members president speaker audience congress...
1,PC0,1,world peace nations government economic tax jo...
2,PC1,0,world peace nations united war freedom states ...
3,PC1,1,tax health year care jobs federal budget perce...
4,PC2,0,thank god bless america good much americans to...
5,PC2,1,audience members federal government defense bu...
6,PC3,0,thank audience bless god members federal membe...
7,PC3,1,mr people world president americans speaker am...
8,PC4,0,audience members member america world usa amer...
9,PC4,1,mr congress president states federal speaker u...


In [27]:
# Write the document-component matrix (component scores plus the joined LIB
# columns) to DCM.csv in the working directory.
DCM.to_csv('DCM.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
DCM

Unnamed: 0_level_0,Unnamed: 1_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,president,source_file_path,year
speech_id,para_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,0.205850,0.005398,-0.024998,-0.012763,-0.166180,0.486503,0.358598,0.095884,0.117039,0.021002,,,
0,1,-0.014680,0.110470,-0.039154,0.030642,-0.090697,0.017164,-0.126550,-0.149847,0.084967,-0.060846,,,
0,2,-0.008399,0.009322,-0.013999,0.033296,-0.026102,0.040675,-0.017760,-0.042752,-0.008024,-0.010793,,,
0,3,-0.028420,0.115801,-0.062938,0.021429,-0.013655,-0.002523,0.008922,0.077639,-0.064673,-0.080911,,,
0,4,-0.028579,0.049705,-0.040260,0.016527,-0.004961,-0.005375,-0.008478,0.058094,-0.029617,-0.001358,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,263,-0.014487,0.018471,0.077653,-0.047516,0.051905,0.007535,0.009357,-0.019138,0.019318,0.050945,biden,C:/Users/patso/Documents/DS5001//data/sotu\202...,2023.0
113,264,-0.004082,-0.028720,0.065167,-0.055715,0.035094,0.065691,0.065535,-0.023202,0.010543,-0.017002,biden,C:/Users/patso/Documents/DS5001//data/sotu\202...,2023.0
113,265,0.055283,-0.028220,0.145989,-0.185174,0.066207,0.003253,-0.003460,-0.080861,0.008463,-0.003854,biden,C:/Users/patso/Documents/DS5001//data/sotu\202...,2023.0
113,266,-0.006955,0.049631,0.074456,-0.022679,0.004251,-0.013393,-0.061155,-0.043457,0.273300,-0.012034,biden,C:/Users/patso/Documents/DS5001//data/sotu\202...,2023.0


In [28]:
# Write the term loadings (component columns plus the joined VOCAB columns)
# to LOADINGS.csv in the working directory.
LOADINGS.to_csv('LOADINGS.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
LOADINGS

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,n,n_chars,p,i,max_pos,max_pos_group,stop,porter_stem,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
people,-0.003367,0.000334,0.006871,-0.012087,0.004108,0.001501,-0.005649,-0.008775,0.001516,-0.000056,1924,6,0.003538,8.142906,NNS,NN,0,peopl,4049.914776
world,-0.005636,0.022839,0.000418,-0.011256,0.005375,-0.010295,0.006567,0.009182,-0.000892,0.002526,1602,5,0.002946,8.407140,NN,NN,0,world,3667.352042
new,-0.002654,-0.003575,0.000241,-0.005386,0.001396,-0.005315,0.002166,-0.000887,0.005052,0.014458,1578,3,0.002902,8.428917,JJ,JJ,0,new,3624.966156
congress,0.015348,-0.004365,-0.005288,-0.003608,-0.010657,0.000457,-0.005959,-0.004658,0.003039,-0.000553,1395,8,0.002565,8.606749,NNP,NN,0,congress,3578.575270
america,-0.002414,0.006235,0.014906,-0.007483,0.005628,-0.005336,0.002727,-0.003487,0.008328,0.007398,1412,7,0.002596,8.589274,NNP,NN,0,america,3534.140499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fairness,-0.000212,-0.000511,0.000032,-0.000132,0.000047,-0.000067,0.000152,-0.000024,0.000059,0.000186,32,8,0.000059,14.052799,NN,NN,0,fair,248.742029
active,-0.000207,0.000529,-0.000157,0.000003,-0.000124,-0.000125,-0.000117,-0.000060,0.000063,-0.000126,33,6,0.000061,14.008405,JJ,JJ,0,activ,248.742029
transition,-0.000200,0.000111,-0.000183,-0.000074,0.000034,-0.000167,0.000170,0.000056,-0.000200,0.000148,33,10,0.000061,14.008405,NN,NN,0,transit,248.742029
rapid,-0.000120,0.000262,-0.000369,0.000111,-0.000179,-0.000228,-0.000018,-0.000053,-0.000063,0.000173,31,5,0.000057,14.098603,JJ,JJ,0,rapid,248.742029
