In [1]:
#########################################################
## Python SKLearn - fit Non Negative Matrix Factorization to sparse DTM
##
## Author: Chris Meaney
## Date: January 2023
#########################################################

In [2]:
################################
## Dependency Modules
################################

## Timing
import time
## Numerics
import numpy as np
## Sparse matrix support
import scipy.sparse
## Pandas data structures
import pandas as pd
## Factorizations/deomcposions
from sklearn.decomposition import NMF
## Session info
#from sinfo import sinfo
## Plotting
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
## Plotting
import seaborn as sns


In [3]:
## Investigate manually created corpus using NLTK
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\meaneych\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\meaneych\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [4]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)

In [5]:
################################
## Hyper-parameters
################################

k=50               # Number latent components
slvr='cd'          # {cd, mu}; coordinate descent vs. multiplicative updates
loss='frobenius'   # {frobenius, kullback-leibler, itakura-saito}
init_mat="random"  # {random, nndsvd. nndsvda, nndsvdar}
alp=0.01           # Controls strength of regularization, alpha=0 implies no regulatization
l1=1               # l1_ratio=0 implies ridge/frob penalty; l1_ratio=1 implies lasso penalty
reg='both'         # Can be 'both', components (H) or transformation (W)         

In [6]:
###################################
## Import IJX sparse matrix data
###################################

## Set working directory path
wd_path = "Enter_a_Path_to_a_Working_Directory_For_This_Project"

fpath_ijx = paste0(wd_path, "\\ijx_df.csv")

ijx_dtm = pd.read_csv(filepath_or_buffer=fpath_ijx,
                      delimiter=",",
                      header=0)

ijx_dtm.shape

(2325372, 3)

In [7]:
## Max of row/col indices
(np.max(ijx_dtm.i), np.max(ijx_dtm.j))

(27682, 78047)

In [8]:
##
## Convert from IJX format into Scipy.Sparse format
##
nrow = np.max(ijx_dtm.i)
ncol = np.max(ijx_dtm.j)

sp_dtm = scipy.sparse.coo_matrix((ijx_dtm.x, (ijx_dtm.i-1, ijx_dtm.j-1)), shape=(nrow,ncol))
sp_dtm.shape

(27682, 78047)

In [9]:
#####################################################
## Import additional feature/covariate information
#####################################################
fpath_x = wd_path + "\\X.csv"

X = pd.read_csv(filepath_or_buffer=fpath_x,
                      delimiter=",",
                      header=0,
                      encoding='latin1')

In [10]:
(X.shape, X.columns.values)

((27682, 26),
 array(['au_id', 'prism_url', 'eid', 'doi', 'issn', 'eissn', 'pub_title',
        'pub_author', 'pub_journal', 'pub_date', 'pub_abstract',
        'pub_keywords', 'pub_type', 'pub_subtype', 'open_access',
        'fund_acr', 'fund_num', 'fund_sponsor', 'cited_count', 'nam', 'id',
        'pub_year', 'au_country', 'au_institution', 'au_name',
        'au_institution_country'], dtype=object))

In [11]:
#####################################################
## Import dictionary/vocab file
#####################################################
fpath_vocab = wd_path + "\\vocab.csv"

vocab = pd.read_csv(filepath_or_buffer=fpath_vocab,
                      delimiter=",",
                      header=0,
                      encoding='latin1')

(vocab.shape, vocab.columns.values)

((78047, 1), array(['vocab'], dtype=object))

In [12]:
vocab.vocab.tolist()

['µct',
 'µfe',
 'µg',
 'µgd',
 'µgday',
 'µgdl',
 'µgdwere',
 'µgformoterol',
 'µgg',
 'µgh',
 'µghml',
 'µghrml',
 'µgkg',
 'µgkgday',
 'µgkghr',
 'µgkginfusion',
 'µgkgmin',
 'µgl',
 'µgliter',
 'µgm',
 'µgml',
 'µgmyears',
 'µl',
 'µlside',
 'µm',
 'µmµm',
 'µmgy',
 'µmol',
 'µmoll',
 'µms',
 'µopioid',
 'µsv',
 'µtw',
 'µu',
 'µubucaubduae',
 'µuml',
 'aa',
 'aaa',
 'ãââ',
 'aaaai',
 'aaaf',
 'aaag',
 'aaahpv',
 'aaarelated',
 'aaas',
 'aabp',
 'aac',
 'aacchb',
 'aacd',
 'aace',
 'aaces',
 'aacods',
 'aacr',
 'aact',
 'aad',
 'aaderived',
 'aadimethylmethylethylhindolepropanoic',
 'aadiss',
 'aadrenergic',
 'aadrenoceptors',
 'aads',
 'aae',
 'aafp',
 'aafps',
 'aag',
 'aagarose',
 'aagl',
 'aagonists',
 'aahp',
 'aahperd',
 'aainf',
 'aala',
 'aalborg',
 'aald',
 'aalen',
 'aallele',
 'aalleleprotein',
 'aalowering',
 'aamc',
 'aamediated',
 'aami',
 'aamieshiso',
 'aan',
 'aans',
 'aantagonist',
 'aantitrypsin',
 'aao',
 'aaohns',
 'aaor',
 'aaos',
 'aaosmodems',
 'aap',
 'aapc

In [13]:
## POS tag the vocabulary 
vocab_pos = nltk.pos_tag(vocab.vocab.astype('str').tolist())
vocab_pos

[('µct', 'JJ'),
 ('µfe', 'NNP'),
 ('µg', 'NNP'),
 ('µgd', 'NNP'),
 ('µgday', 'NNP'),
 ('µgdl', 'NNP'),
 ('µgdwere', 'NNP'),
 ('µgformoterol', 'NNP'),
 ('µgg', 'NNP'),
 ('µgh', 'NNP'),
 ('µghml', 'NNP'),
 ('µghrml', 'NNP'),
 ('µgkg', 'NNP'),
 ('µgkgday', 'NNP'),
 ('µgkghr', 'NNP'),
 ('µgkginfusion', 'NNP'),
 ('µgkgmin', 'NNP'),
 ('µgl', 'NNP'),
 ('µgliter', 'NNP'),
 ('µgm', 'NNP'),
 ('µgml', 'NNP'),
 ('µgmyears', 'NNP'),
 ('µl', 'NNP'),
 ('µlside', 'NNP'),
 ('µm', 'NNP'),
 ('µmµm', 'NNP'),
 ('µmgy', 'NNP'),
 ('µmol', 'NNP'),
 ('µmoll', 'NNP'),
 ('µms', 'NNP'),
 ('µopioid', 'NNP'),
 ('µsv', 'NNP'),
 ('µtw', 'NNP'),
 ('µu', 'NNP'),
 ('µubucaubduae', 'NNP'),
 ('µuml', 'NNP'),
 ('aa', 'VBZ'),
 ('aaa', 'JJ'),
 ('ãââ', 'NNP'),
 ('aaaai', 'NN'),
 ('aaaf', 'NN'),
 ('aaag', 'NN'),
 ('aaahpv', 'NN'),
 ('aaarelated', 'VBD'),
 ('aaas', 'JJ'),
 ('aabp', 'NN'),
 ('aac', 'NN'),
 ('aacchb', 'NN'),
 ('aacd', 'JJ'),
 ('aace', 'NN'),
 ('aaces', 'NNS'),
 ('aacods', 'NNS'),
 ('aacr', 'VBP'),
 ('aact', 'JJ')

In [14]:
vocab_pos_df = pd.DataFrame(vocab_pos, columns=['vocab','pos'])
vocab_pos_df['pos_'] = vocab_pos_df.pos.str.slice(stop=2)
vocab_pos_df

Unnamed: 0,vocab,pos,pos_
0,µct,JJ,JJ
1,µfe,NNP,NN
2,µg,NNP,NN
3,µgd,NNP,NN
4,µgday,NNP,NN
...,...,...,...
78042,zx,NNP,NN
78043,zxmri,NNP,NN
78044,zyban,NNP,NN
78045,zygosity,NN,NN


In [15]:
## Type of words (POS tags) in the vocab...
vocab_pos_df.pos.value_counts()

NN      30838
JJ      17534
NNS      9247
VBD      4858
VBP      4673
VBG      3755
RB       2345
VBN      2221
VBZ       954
IN        466
NNP       248
FW        230
VB        213
JJS       121
JJR       117
RBR        68
RP         33
WP         25
CC         21
CD         15
MD         15
PRP        13
DT         12
RBS         8
PRP$        7
WRB         5
EX          3
WDT         1
POS         1
Name: pos, dtype: int64

In [16]:
## Type of words (POS tags) in the vocab...
vocab_pos_df.pos_.value_counts()

NN    40333
JJ    17772
VB    16674
RB     2421
IN      466
FW      230
RP       33
WP       25
CC       21
PR       20
MD       15
CD       15
DT       12
WR        5
EX        3
WD        1
PO        1
Name: pos_, dtype: int64

In [17]:
## What do the tag-sets mean (some examples below)
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [18]:
## Describe the vocab/colSums objects
[
    len(vocab),
    sp_dtm.shape,
    sp_dtm.sum(axis=0).shape
]


[78047, (27682, 78047), (1, 78047)]

In [19]:
## Most frequently occurring words in abstract corpus
vocab_freq = sp_dtm.sum(axis=0)
vocab_freq = pd.Series(np.asarray(vocab_freq).reshape(-1))
vocab_pos_df['vocab_freq'] = vocab_freq
vocab_pos_df = vocab_pos_df.sort_values(by=['vocab_freq'], ascending=False)
vocab_pos_df.head(500)

Unnamed: 0,vocab,pos,pos_,vocab_freq
9187,care,NN,NN,37590
51396,patients,NNS,NN,36863
29168,health,NN,NN,33999
67872,study,VBP,VB,26605
16134,data,NNS,NN,20163
61470,risk,NN,NN,19982
41746,methods,NNS,NN,17060
56103,primary,JJ,JJ,15898
51014,participants,NNS,NN,13652
67841,studies,NNS,NN,13608


In [20]:
## Most freq occurring nouns
vocab_pos_df.loc[vocab_pos_df['pos_']=="NN"].sort_values(by=['vocab_freq'], ascending=False).head(10)

Unnamed: 0,vocab,pos,pos_,vocab_freq
9187,care,NN,NN,37590
51396,patients,NNS,NN,36863
29168,health,NN,NN,33999
16134,data,NNS,NN,20163
61470,risk,NN,NN,19982
41746,methods,NNS,NN,17060
51014,participants,NNS,NN,13652
67841,studies,NNS,NN,13608
77258,women,NNS,NN,12938
8840,cancer,NN,NN,12656


In [21]:
## Most freq occurring adjectives
vocab_pos_df.loc[vocab_pos_df['pos_']=="JJ"].sort_values(by=['vocab_freq'], ascending=False).head(10)

Unnamed: 0,vocab,pos,pos_,vocab_freq
56103,primary,JJ,JJ,15898
11629,clinical,JJ,JJ,12553
41007,medical,JJ,JJ,8619
33321,increased,JJ,JJ,7659
48257,objective,JJ,JJ,7277
32181,identified,JJ,JJ,6775
65731,social,JJ,JJ,6013
71686,total,JJ,JJ,5762
53054,physical,JJ,JJ,5606
54882,potential,JJ,JJ,4881


In [22]:
## Most freq occurring verbs
vocab_pos_df.loc[vocab_pos_df['pos_']=="VB"].sort_values(by=['vocab_freq'], ascending=False).head(10)

Unnamed: 0,vocab,pos,pos_,vocab_freq
67872,study,VBP,VB,26605
12878,compared,VBN,VB,10230
33215,included,VBD,VB,8527
68602,support,VBP,VB,7430
17744,diabetes,VBZ,VB,7124
60333,reported,VBD,VB,7096
65568,smoking,VBG,VB,6462
13420,conducted,VBD,VB,6220
33224,including,VBG,VB,5672
4718,assessed,VBN,VB,4921


In [23]:
## Most freq occurring adverbs
vocab_pos_df.loc[vocab_pos_df['pos_']=="RB"].sort_values(by=['vocab_freq'], ascending=False).head(10)

Unnamed: 0,vocab,pos,pos_,vocab_freq
49876,outcomes,RB,RB,10886
76577,weight,RB,RB,4421
20670,effectiveness,RB,RB,3443
48224,obesity,RB,RB,1934
6766,bias,RB,RB,1487
67139,statistically,RB,RB,1478
33382,independently,RB,RB,1325
28943,hazard,RB,RB,1177
30047,highly,RB,RB,1147
58556,randomly,RB,RB,1133


In [24]:
##########################################################
## Fit NMF to sparse DTM
##########################################################

In [25]:
##
## Initialize an NMF model (with specific hyper-parm settings)
##
nmf = NMF(n_components=k,                   # Number latent components
                max_iter=1000,               # Max updates of iterative algos
                tol=1e-6,                   # Convergence tolerance
                solver=slvr,                # solver={cd, mu}; coordinate descent vs. multiplicative updates
                beta_loss=loss,             # beta_loss={frobenius, kullback-leibler, itakura-saito}
                init=init_mat,              # {random, nndsvd. nndsvda, nndsvdar}
                alpha=alp,                 # Controls strength of regularization, alpha=0 implies no regulatization
                l1_ratio=l1,               # l1_ratio=0 implies ridge/frob penalty; l1_ratio=1 implies lasso penalty
                regularization='both',      # {both, components (H), transformation (W)}
                random_state=51423)

In [26]:
# help(nmf)

In [27]:
# Fit NMF model to sparse DTM
# t0 = time.time()
# nmf.fit(sp_dtm)
# t1 = time.time()
# t1-t0

In [28]:
## Extract the per document topic prevalence weights
t0 = time.time()
theta = nmf.fit_transform(sp_dtm)
t1 = time.time()
t1-t0



593.0528898239136

In [29]:
## Convert to pandas dataframe
theta_pd = pd.DataFrame(theta)
theta_pd.shape

(27682, 50)

In [30]:
## Rename columns in matrix of learned per-doc topic prevalence vecs
theta_colnames = ["Topic" + str(num+1) for num in range(k)]
theta_pd.columns = theta_colnames
theta_pd

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,Topic33,Topic34,Topic35,Topic36,Topic37,Topic38,Topic39,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49,Topic50
0,0.000000,0.000000,0.052074,0.155084,0.000748,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.030113,0.015280,0.000000,0.023410,0.009153,0.000000,0.000000,0.000000,0.000000,0.000000,0.033627,0.010518,0.000785,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.015793,0.001253,0.000000,0.000000,0.010520,0.000000,0.000000,0.000000,0.000000,0.029679,0.000000,0.000000,0.064470,0.019545,0.000000,0.000000,0.000000
1,0.005963,0.001661,0.006570,0.000000,0.004449,0.072075,0.000000,0.002364,0.005632,0.000000,0.0,0.002599,0.034876,0.000000,0.000429,0.049504,0.013313,0.001004,0.000000,0.002364,0.002426,0.018482,0.001545,0.000000,0.000537,0.000000,0.000000,0.000369,0.002258,0.002532,0.002170,0.001135,0.000000,0.000000,0.001472,0.000000,0.000084,0.003477,0.003544,0.000000,0.000000,0.000000,0.000022,0.018607,0.000877,0.027991,0.000000,0.001558,0.027643,0.049225
2,0.000000,0.000000,0.053118,0.046076,0.000000,0.011069,0.000000,0.008748,0.000000,0.000000,0.0,0.036189,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.019842,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030828,0.000000,0.044475,0.000000,0.000000,0.072346,0.000000,0.116665,0.000000,0.046225,0.000000,0.000000,0.012275,0.019941,0.128600,0.018762,0.000000
3,0.000000,0.000000,0.054457,0.000000,0.085724,0.016496,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.030324,0.000622,0.000000,0.000000,0.005002,0.002949,0.000000,0.000000,0.000000,0.000000,0.000453,0.009689,0.001375,0.000248,0.000000,0.017512,0.000000,0.004311,0.000457,0.000032,0.000000,0.060669,0.000000,0.000000,0.000000,0.000000,0.024842,0.000000,0.000000,0.000000,0.028510,0.000000,0.020738,0.104071,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000566,0.000000,0.010726,0.039615,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.029950,0.002385,0.000000,0.047224,0.036559,0.000912,0.000000,0.000000,0.000000,0.000000,0.046338,0.000000,0.002828,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030220,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.032427,0.000000,0.000000,0.142984,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27677,0.000000,0.000000,0.000000,0.001909,0.000000,0.029057,0.005253,0.000000,0.000000,0.000000,0.0,0.000000,0.058309,0.000000,0.000000,0.000000,0.000000,0.000000,0.000815,0.000000,0.000000,0.001121,0.015393,0.000000,0.000000,0.008346,0.001885,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.060444,0.001429,0.000000,0.000000,0.000000,0.000000,0.000000,0.050804,0.013809,0.038386,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000344
27678,0.000000,0.000000,0.000000,0.000000,0.000000,0.002343,0.000000,0.000000,0.000113,0.000000,0.0,0.000000,0.090446,0.003217,0.017154,0.000000,0.006081,0.000000,0.000000,0.000000,0.001746,0.000000,0.015802,0.000000,0.000000,0.000000,0.000000,0.000000,0.002228,0.000000,0.001765,0.017575,0.000000,0.046451,0.000000,0.000000,0.000367,0.000000,0.001794,0.000000,0.101651,0.006557,0.022471,0.000000,0.000000,0.000000,0.000017,0.000021,0.000000,0.000859
27679,0.000000,0.000000,0.000000,0.000910,0.000000,0.000000,0.000381,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.033784,0.000000,0.000000,0.000000,0.000000,0.000190,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000453,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030194,0.000699,0.000000,0.000000,0.000000,0.000346,0.000000,0.044269,0.001559,0.070982,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
27680,0.004197,0.000000,0.002162,0.000427,0.000996,0.004360,0.000000,0.003421,0.000000,0.001684,0.0,0.000000,0.001536,0.000000,0.000000,0.093913,0.002733,0.000000,0.000248,0.000000,0.019652,0.001810,0.000000,0.000309,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002431,0.002348,0.108667,0.000947,0.005485,0.000327,0.000000,0.000000,0.001698,0.000000,0.003147,0.012476,0.000000,0.003194,0.000000,0.000000,0.000000,0.000000,0.000000


In [31]:
## Extract the topical vectors
t0 = time.time()
phi = nmf.components_
t1 - time.time()
fit_time = t1-t0
fit_time

-0.20787453651428223

In [32]:
## Convert to pandas dataframe
phi_pd = pd.DataFrame(phi)
phi.shape

(50, 78047)

In [33]:
## Assign colnames to topic matrix
phi_pd.columns = vocab.vocab.tolist()
phi_pd

Unnamed: 0,µct,µfe,µg,µgd,µgday,µgdl,µgdwere,µgformoterol,µgg,µgh,µghml,µghrml,µgkg,µgkgday,µgkghr,µgkginfusion,µgkgmin,µgl,µgliter,µgm,µgml,µgmyears,µl,µlside,µm,...,zscorevariety,zsdss,zsdsw,zsfg,zstatistics,zte,ztest,ztests,ztpi,zuckerberg,zuclopenthixol,zuithoff,zung,zurich,zvl,zwaan,zwar,zwarenstein,zwc,zwfl,zx,zxmri,zyban,zygosity,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004468,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001889,0.0,0.002062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000621,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000441,0.049761,0.000798,0.000211,0.0,0.0,0.002091,0.0,0.000212,0.0,0.0,0.000837,0.000541,0.0,0.0,0.0,0.016629,0.001657,0.0,0.024204,0.0,0.003021,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000674,0.0,0.0,0.000391,0.0,0.0,0.001242,0.0,0.0,0.0,0.0,0.00042
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001227,0.0,0.0,0.0,0.0,0.0,0.000338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000574,0.0,0.0,0.0
3,0.000601,0.0,0.185995,0.0,0.003502,0.0,0.0,0.008008,0.0,0.0,0.000374,0.0,0.001228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0009,0.0,0.0,0.0,0.0,0.0,0.0,0.004292,0.010635,0.0,0.0,0.0,0.0,0.003589,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000113,0.0,0.0,0.0,0.0,0.005271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011853,0.0,0.0,0.0,0.0,0.001096,...,0.0,0.0,0.0,0.0,0.003495,0.0,0.003485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005719,0.0,0.0,0.0,0.0,0.0,0.005709,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066293,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
#####################################
## Frobenius norm error
#####################################
frob_norm_error = nmf.reconstruction_err_
frob_norm_error

2674.289590341051

In [35]:
####################################################
##
## 
## Summarize top-P words occuring for each topic k=1...K of matrix phi (these will provide topical/thematic summary of corpus)
##
##
####################################################

In [36]:
phi_pd_ = phi_pd.div(phi_pd.sum(axis=1), axis=0)
phi_pd_

Unnamed: 0,µct,µfe,µg,µgd,µgday,µgdl,µgdwere,µgformoterol,µgg,µgh,µghml,µghrml,µgkg,µgkgday,µgkghr,µgkginfusion,µgkgmin,µgl,µgliter,µgm,µgml,µgmyears,µl,µlside,µm,...,zscorevariety,zsdss,zsdsw,zsfg,zstatistics,zte,ztest,ztests,ztpi,zuckerberg,zuclopenthixol,zuithoff,zung,zurich,zvl,zwaan,zwar,zwarenstein,zwc,zwfl,zx,zxmri,zyban,zygosity,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.065545e-05,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.6e-05,0.0,2.800086e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8e-06,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,3.462835e-06,0.000391,6.260296e-06,2e-06,0.0,0.0,1.6e-05,0.0,1.664038e-06,0.0,0.0,7e-06,4e-06,0.0,0.0,0.0,0.000131,1.300475e-05,0.0,0.00019,0.0,2.4e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.289767e-06,0.0,0.0,3.071808e-06,0.0,0.0,9.749281e-06,0.0,0.0,0.0,0.0,3.297998e-06
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.078613e-05,0.0,0.0,0.0,0.0,0.0,5.724353e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.671152e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.728018e-06,0.0,0.0,0.0
3,3e-06,0.0,0.001078,0.0,2e-05,0.0,0.0,4.6e-05,0.0,0.0,2.170053e-06,0.0,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.330571e-06,0.0,0.0,0.0,0.0,0.0,0.0,3.017542e-05,7.477184e-05,0.0,0.0,0.0,0.0,2.5e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.744092e-07,0.0,0.0,0.0,0.0,3.132823e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.454826e-05,0.0,0.0,0.0,0.0,4e-06,...,0.0,0.0,0.0,0.0,1.3e-05,0.0,1.309672e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.290046e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.221048e-05,0.0,0.0,0.0,0.0,0.0,2.2e-05,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.589657e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000372,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
def get_top_words(topic, num_words):
    vec = phi_pd_.iloc[topic,:].sort_values(ascending=False).head(n=num_words)
    vec_toks = vec.index.tolist()
    vec_vals = vec.tolist() 
    vec_out = [tok + "  (" + str(np.round(val,2)) + ")" for tok, val in zip(vec_toks, vec_vals)]
    return vec_out

# get_top_words(1,10)

In [38]:
topic_list = []

for i in np.arange(k):
    topic_list.append( get_top_words(topic=i, num_words=10) )

topic_list

[['intervention  (0.17)',
  'control  (0.04)',
  'trial  (0.02)',
  'controlled  (0.01)',
  'randomized  (0.01)',
  'effectiveness  (0.01)',
  'feasibility  (0.01)',
  'support  (0.01)',
  'randomised  (0.01)',
  'usual  (0.01)'],
 ['months  (0.06)',
  'trial  (0.04)',
  'baseline  (0.02)',
  'followup  (0.02)',
  'outcome  (0.02)',
  'randomised  (0.01)',
  'difference  (0.01)',
  'weeks  (0.01)',
  'compared  (0.01)',
  'month  (0.01)'],
 ['primary  (0.32)',
  'secondary  (0.02)',
  'care  (0.02)',
  'outcome  (0.01)',
  'setting  (0.01)',
  'pcps  (0.01)',
  'referral  (0.01)',
  'total  (0.01)',
  'testing  (0.01)',
  'service  (0.01)'],
 ['treatment  (0.14)',
  'therapy  (0.01)',
  'treatments  (0.01)',
  'drug  (0.01)',
  'placebo  (0.01)',
  'treated  (0.01)',
  'effects  (0.0)',
  'hypertension  (0.0)',
  'oral  (0.0)',
  'effective  (0.0)'],
 ['participants  (0.18)',
  'adults  (0.01)',
  'setting  (0.01)',
  'design  (0.01)',
  'reported  (0.01)',
  'recruited  (0.01)',
  'ma

In [39]:
#################################################
##
##
## Get top-P documents loading most strongly on each of the k=1...K topical vectors of corpus
##
##
##################################################

In [40]:
theta_pd_ = theta_pd.div(theta_pd.sum(axis=1), axis=0)
theta_pd_

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,Topic33,Topic34,Topic35,Topic36,Topic37,Topic38,Topic39,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49,Topic50
0,0.000000,0.000000,0.110314,0.328531,0.001585,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.063791,0.032370,0.000000,0.049591,0.019391,0.000000,0.000000,0.00000,0.000000,0.000000,0.071235,0.022282,0.001663,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033456,0.002655,0.000000,0.000000,0.022285,0.000000,0.000000,0.000000,0.000000,0.062873,0.00000,0.000000,0.136574,0.041405,0.000000,0.000000,0.000000
1,0.016169,0.004503,0.017818,0.000000,0.012066,0.195454,0.000000,0.006410,0.015273,0.000000,0.0,0.007047,0.094578,0.000000,0.001164,0.134246,0.036103,0.002724,0.000000,0.00641,0.006578,0.050121,0.004190,0.000000,0.001456,0.000000,0.000000,0.00100,0.006123,0.006865,0.005884,0.003078,0.000000,0.000000,0.003991,0.000000,0.000227,0.009428,0.009611,0.000000,0.000000,0.000000,0.000059,0.05046,0.002379,0.075907,0.000000,0.004226,0.074962,0.133489
2,0.000000,0.000000,0.079858,0.069270,0.000000,0.016641,0.000000,0.013152,0.000000,0.000000,0.0,0.054407,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.029831,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.046346,0.000000,0.066864,0.000000,0.000000,0.108765,0.000000,0.175394,0.000000,0.069495,0.00000,0.000000,0.018455,0.029979,0.193337,0.028207,0.000000
3,0.000000,0.000000,0.116242,0.000000,0.182983,0.035211,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.064728,0.001328,0.000000,0.000000,0.010678,0.006295,0.000000,0.00000,0.000000,0.000000,0.000966,0.020681,0.002934,0.000530,0.000000,0.03738,0.000000,0.009202,0.000975,0.000068,0.000000,0.129502,0.000000,0.000000,0.000000,0.000000,0.053026,0.000000,0.000000,0.000000,0.060857,0.00000,0.044267,0.222146,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.001339,0.000000,0.025373,0.093711,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.070849,0.005643,0.000000,0.111711,0.086482,0.002158,0.000000,0.00000,0.000000,0.000000,0.109616,0.000000,0.006690,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.071487,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.076708,0.00000,0.000000,0.338234,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27677,0.000000,0.000000,0.000000,0.006645,0.000000,0.101137,0.018284,0.000000,0.000000,0.000000,0.0,0.000000,0.202950,0.000000,0.000000,0.000000,0.000000,0.000000,0.002838,0.00000,0.000000,0.003902,0.053576,0.000000,0.000000,0.029050,0.006559,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.210383,0.004975,0.000000,0.000000,0.000000,0.000000,0.000000,0.176830,0.048065,0.133608,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001197
27678,0.000000,0.000000,0.000000,0.000000,0.000000,0.006920,0.000000,0.000000,0.000333,0.000000,0.0,0.000000,0.267071,0.009499,0.050652,0.000000,0.017957,0.000000,0.000000,0.00000,0.005156,0.000000,0.046661,0.000000,0.000000,0.000000,0.000000,0.00000,0.006579,0.000000,0.005212,0.051896,0.000000,0.137162,0.000000,0.000000,0.001083,0.000000,0.005298,0.000000,0.300158,0.019360,0.066354,0.00000,0.000000,0.000000,0.000051,0.000063,0.000000,0.002537
27679,0.000000,0.000000,0.000000,0.004950,0.000000,0.000000,0.002075,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.183840,0.000000,0.000000,0.000000,0.000000,0.001034,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002467,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.164308,0.003804,0.000000,0.000000,0.000000,0.001882,0.000000,0.240899,0.008481,0.386259,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
27680,0.015090,0.000000,0.007773,0.001534,0.003580,0.015672,0.000000,0.012297,0.000000,0.006054,0.0,0.000000,0.005522,0.000000,0.000000,0.337616,0.009824,0.000000,0.000890,0.00000,0.070647,0.006506,0.000000,0.001111,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.008738,0.008441,0.390656,0.003405,0.019720,0.001177,0.000000,0.000000,0.006103,0.000000,0.011312,0.044851,0.00000,0.011481,0.000000,0.000000,0.000000,0.000000,0.000000


In [41]:
theta_pd_.describe()

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,Topic33,Topic34,Topic35,Topic36,Topic37,Topic38,Topic39,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49,Topic50
count,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0,27682.0
mean,0.037265,0.045007,0.041709,0.026253,0.0289,0.021684,0.011158,0.013906,0.011129,0.012146,0.015709,0.01193,0.05102,0.014939,0.009603,0.020853,0.020828,0.007879,0.01575,0.007727,0.013904,0.021963,0.022049,0.023173,0.010174,0.010773,0.017963,0.015409,0.01491,0.009795,0.018981,0.012057,0.01008,0.035878,0.008178,0.019528,0.007193,0.012007,0.00989,0.02356,0.017162,0.022251,0.051242,0.043812,0.020823,0.028208,0.013529,0.02183,0.023248,0.015035
std,0.09205,0.084561,0.08405,0.065345,0.059217,0.053639,0.040174,0.038907,0.053091,0.047348,0.042812,0.048114,0.062364,0.042806,0.03805,0.064615,0.049424,0.038065,0.052657,0.035995,0.036446,0.050817,0.043776,0.04368,0.035367,0.032403,0.064742,0.048401,0.059973,0.033726,0.046055,0.036911,0.044049,0.066536,0.037681,0.045677,0.031102,0.037641,0.030911,0.049656,0.047406,0.056222,0.07492,0.087197,0.043632,0.050734,0.034097,0.046927,0.054759,0.035184
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.000234,0.0,0.000214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001728,0.000254,0.0,0.0,0.0,0.0,0.0,0.0,0.000137,0.0,0.0,0.002538,0.0,0.0,0.0,0.0,0.0,2.2e-05,0.0,0.0,0.021245,0.004692,0.000242,0.000476,0.0,0.000903,0.0,6.4e-05
75%,0.015763,0.053191,0.053916,0.011422,0.029648,0.010281,0.003629,0.006312,0.000774,0.001929,0.009946,0.001257,0.080129,0.00798,0.001403,0.007562,0.0133,0.002479,0.001143,0.000775,0.00897,0.016895,0.029456,0.029515,0.002715,0.006746,0.00474,0.001595,0.006583,0.003485,0.013539,0.004409,0.003343,0.043796,0.002995,0.019057,0.000989,0.004938,0.003981,0.020185,0.011613,0.010715,0.072684,0.04661,0.02237,0.038014,0.008546,0.022434,0.015951,0.008795
max,0.792535,0.792851,0.683544,0.778201,0.649777,0.736508,0.600194,0.620364,0.787813,0.760083,0.669284,0.726141,0.67883,0.840975,0.789569,0.741694,0.650558,0.797018,0.800867,0.712268,0.581021,0.607407,0.744679,0.529538,0.6789,0.632222,0.886896,0.685524,0.790034,0.562567,0.81258,0.873258,0.76543,0.809024,0.720089,0.794573,0.446074,0.693489,0.625572,0.618197,0.719852,0.660791,0.784489,0.900039,0.781408,0.508332,0.629751,0.762796,0.788775,0.472657


In [42]:
theta_pd_.quantile(q=[0,0.1,0.5,0.75,0.90,0.95,0.975,0.99,1.00])

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,Topic33,Topic34,Topic35,Topic36,Topic37,Topic38,Topic39,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49,Topic50
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.5,0.0,0.0,0.000234,0.0,0.000214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001728,0.000254,0.0,0.0,0.0,0.0,0.0,0.0,0.000137,0.0,0.0,0.002538,0.0,0.0,0.0,0.0,0.0,2.2e-05,0.0,0.0,0.021245,0.004692,0.000242,0.000476,0.0,0.000903,0.0,6.4e-05
0.75,0.015763,0.053191,0.053916,0.011422,0.029648,0.010281,0.003629,0.006312,0.000774,0.001929,0.009946,0.001257,0.080129,0.00798,0.001403,0.007562,0.0133,0.002479,0.001143,0.000775,0.00897,0.016895,0.029456,0.029515,0.002715,0.006746,0.00474,0.001595,0.006583,0.003485,0.013539,0.004409,0.003343,0.043796,0.002995,0.019057,0.000989,0.004938,0.003981,0.020185,0.011613,0.010715,0.072684,0.04661,0.02237,0.038014,0.008546,0.022434,0.015951,0.008795
0.9,0.132138,0.158879,0.148908,0.089714,0.099814,0.074231,0.021637,0.043387,0.007373,0.017323,0.045083,0.012977,0.131489,0.043697,0.019223,0.052905,0.070692,0.013324,0.035221,0.008759,0.042943,0.075836,0.06112,0.07463,0.024766,0.028409,0.035052,0.043972,0.025165,0.020225,0.060433,0.034175,0.016202,0.107636,0.013118,0.055956,0.006653,0.030402,0.02785,0.081554,0.047913,0.076821,0.146137,0.133589,0.066671,0.09692,0.043859,0.069485,0.0754,0.052474
0.95,0.246582,0.231663,0.224612,0.164203,0.154607,0.126766,0.057853,0.079129,0.043378,0.068795,0.088055,0.06876,0.170119,0.081809,0.053082,0.1142,0.1158,0.030593,0.124066,0.03433,0.074726,0.127637,0.093157,0.112082,0.058653,0.055,0.098706,0.113314,0.055785,0.053597,0.100511,0.071443,0.035098,0.168468,0.027231,0.096268,0.034098,0.06971,0.056304,0.125679,0.092505,0.133364,0.202455,0.223274,0.10254,0.138676,0.075948,0.113613,0.134946,0.087852
0.975,0.346505,0.299089,0.29767,0.236901,0.205045,0.182983,0.118994,0.126552,0.150493,0.148966,0.144492,0.165044,0.211276,0.132724,0.1024,0.208312,0.166817,0.061849,0.196876,0.087719,0.114499,0.183243,0.134,0.151233,0.106058,0.096578,0.204814,0.177417,0.141158,0.106287,0.148666,0.120926,0.090644,0.233142,0.061558,0.148085,0.096986,0.124428,0.096536,0.170425,0.151091,0.193288,0.258975,0.317345,0.143507,0.17655,0.110372,0.160313,0.194678,0.12319
0.99,0.451892,0.380535,0.389014,0.321897,0.277497,0.258567,0.230706,0.198694,0.29924,0.265406,0.222453,0.271868,0.265925,0.212917,0.196819,0.365327,0.236601,0.157241,0.274444,0.193106,0.18108,0.24486,0.210524,0.204033,0.175951,0.165155,0.364314,0.252122,0.351185,0.18827,0.223829,0.188523,0.256511,0.311917,0.205162,0.231347,0.179883,0.198839,0.156548,0.233982,0.246256,0.278118,0.339968,0.435336,0.209496,0.224933,0.166108,0.225938,0.275787,0.167916
1.0,0.792535,0.792851,0.683544,0.778201,0.649777,0.736508,0.600194,0.620364,0.787813,0.760083,0.669284,0.726141,0.67883,0.840975,0.789569,0.741694,0.650558,0.797018,0.800867,0.712268,0.581021,0.607407,0.744679,0.529538,0.6789,0.632222,0.886896,0.685524,0.790034,0.562567,0.81258,0.873258,0.76543,0.809024,0.720089,0.794573,0.446074,0.693489,0.625572,0.618197,0.719852,0.660791,0.784489,0.900039,0.781408,0.508332,0.629751,0.762796,0.788775,0.472657


In [43]:
X_theta = pd.concat([X.reset_index(drop=True), theta_pd_], axis=1)
X_theta.shape

(27682, 76)

In [44]:
X_theta.columns.values

array(['au_id', 'prism_url', 'eid', 'doi', 'issn', 'eissn', 'pub_title',
       'pub_author', 'pub_journal', 'pub_date', 'pub_abstract',
       'pub_keywords', 'pub_type', 'pub_subtype', 'open_access',
       'fund_acr', 'fund_num', 'fund_sponsor', 'cited_count', 'nam', 'id',
       'pub_year', 'au_country', 'au_institution', 'au_name',
       'au_institution_country', 'Topic1', 'Topic2', 'Topic3', 'Topic4',
       'Topic5', 'Topic6', 'Topic7', 'Topic8', 'Topic9', 'Topic10',
       'Topic11', 'Topic12', 'Topic13', 'Topic14', 'Topic15', 'Topic16',
       'Topic17', 'Topic18', 'Topic19', 'Topic20', 'Topic21', 'Topic22',
       'Topic23', 'Topic24', 'Topic25', 'Topic26', 'Topic27', 'Topic28',
       'Topic29', 'Topic30', 'Topic31', 'Topic32', 'Topic33', 'Topic34',
       'Topic35', 'Topic36', 'Topic37', 'Topic38', 'Topic39', 'Topic40',
       'Topic41', 'Topic42', 'Topic43', 'Topic44', 'Topic45', 'Topic46',
       'Topic47', 'Topic48', 'Topic49', 'Topic50'], dtype=object)

In [45]:
topics = ["Topic"+str(i+1) for i in np.arange(k)]
topics

['Topic1',
 'Topic2',
 'Topic3',
 'Topic4',
 'Topic5',
 'Topic6',
 'Topic7',
 'Topic8',
 'Topic9',
 'Topic10',
 'Topic11',
 'Topic12',
 'Topic13',
 'Topic14',
 'Topic15',
 'Topic16',
 'Topic17',
 'Topic18',
 'Topic19',
 'Topic20',
 'Topic21',
 'Topic22',
 'Topic23',
 'Topic24',
 'Topic25',
 'Topic26',
 'Topic27',
 'Topic28',
 'Topic29',
 'Topic30',
 'Topic31',
 'Topic32',
 'Topic33',
 'Topic34',
 'Topic35',
 'Topic36',
 'Topic37',
 'Topic38',
 'Topic39',
 'Topic40',
 'Topic41',
 'Topic42',
 'Topic43',
 'Topic44',
 'Topic45',
 'Topic46',
 'Topic47',
 'Topic48',
 'Topic49',
 'Topic50']

In [46]:
def get_top_docs(topic, num_docs):
    theta_sort = X_theta.sort_values(by=topic, ascending=False)
    vals = theta_sort[[topic]].head(num_docs).to_numpy()
    notes = theta_sort.pub_abstract.head(num_docs).to_numpy()
    out = ["(" + str(np.round(v[0],3)) + ")  " + n for n, v in zip(notes, vals)]
    return out

In [47]:
## Loop over all of the topics, grabbing top-documents under given topic
doc_list = []

for topic_ in topics:
    doc_list.append(get_top_docs(topic=topic_, num_docs=5))
    
doc_list

[['(0.793)  Objective: Treat-to-target (TTT) is an accepted paradigm for care of patients with rheumatoid arthritis (RA). Because TTT can be associated with more medication switches, concerns arise regarding whether implementing TTT may increase adverse events and/or resource use. The aim of this study was to examine adverse events and resource use during the preintervention and intervention periods of the TTT intervention trial. Methods: We used data from 6 practices enrolled in an 18-month cluster-randomized controlled trial to compare adverse events and resource use before (months 1\x969) and during (months 10\x9618) a TTT intervention. The outcomes of interest, adverse events and resource use, were based on medical record review of all rheumatology visits for RA patients before and during the intervention. Results: We examined records for 321 patients before the intervention and 315 during the intervention. An adverse event was recorded in 10.2% of visits before the intervention an

In [48]:
########################
## Print topic-labels and top-documents
########################

In [49]:
for i in np.arange(k):
    print("Topic" + str(i) + "\n")
    print(" ".join(topic_list[i]), "\n")
    print("\t\t\n".join(doc_list[i]), "\n")
    print("\n\n\n")

Topic0

intervention  (0.17) control  (0.04) trial  (0.02) controlled  (0.01) randomized  (0.01) effectiveness  (0.01) feasibility  (0.01) support  (0.01) randomised  (0.01) usual  (0.01) 

(0.793)  Objective: Treat-to-target (TTT) is an accepted paradigm for care of patients with rheumatoid arthritis (RA). Because TTT can be associated with more medication switches, concerns arise regarding whether implementing TTT may increase adverse events and/or resource use. The aim of this study was to examine adverse events and resource use during the preintervention and intervention periods of the TTT intervention trial. Methods: We used data from 6 practices enrolled in an 18-month cluster-randomized controlled trial to compare adverse events and resource use before (months 19) and during (months 1018) a TTT intervention. The outcomes of interest, adverse events and resource use, were based on medical record review of all rheumatology visits for RA patients before and during the interventio

In [50]:
###############################################################
## Mean Prevalence Vectors in Each Year of Study
###############################################################

In [51]:
topic_vars = ["Topic" + str(x) for x in np.arange(k)]
topics_by_year = X_theta[X_theta.columns.intersection(topic_vars)].groupby(X_theta.pub_year).mean()
topics_by_year

Unnamed: 0_level_0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,Topic33,Topic34,Topic35,Topic36,Topic37,Topic38,Topic39,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49
pub_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
2000,0.033591,0.047664,0.034157,0.029143,0.018912,0.02669,0.012829,0.009322,0.013284,0.013269,0.017904,0.012102,0.049362,0.008719,0.013733,0.023619,0.018839,0.006509,0.020686,0.009634,0.014105,0.025351,0.020155,0.019509,0.013638,0.015312,0.015829,0.025904,0.020481,0.014643,0.020742,0.007404,0.00787,0.026656,0.006031,0.020198,0.013296,0.011875,0.00994,0.016045,0.01771,0.027911,0.034084,0.053416,0.023036,0.03348,0.00986,0.021074,0.019655
2001,0.037466,0.044968,0.030206,0.033919,0.016104,0.032119,0.00889,0.011105,0.016359,0.010867,0.017548,0.014943,0.051297,0.011718,0.010319,0.023354,0.022204,0.005329,0.017885,0.010081,0.014074,0.022687,0.017071,0.023448,0.012289,0.011472,0.017736,0.02441,0.020795,0.014482,0.015375,0.010536,0.006282,0.030847,0.005393,0.025841,0.006537,0.012426,0.010841,0.017482,0.013046,0.026058,0.034349,0.059418,0.014811,0.030006,0.013948,0.025205,0.019098
2002,0.032663,0.033005,0.040715,0.032093,0.016654,0.032721,0.010655,0.009391,0.014085,0.008986,0.016673,0.009354,0.050015,0.011601,0.011249,0.025462,0.01727,0.008612,0.019079,0.012971,0.017234,0.024759,0.018136,0.018848,0.012635,0.010216,0.019312,0.022005,0.024271,0.014371,0.01437,0.008749,0.006614,0.031258,0.005039,0.017995,0.007992,0.015096,0.010156,0.015465,0.018596,0.030723,0.038731,0.057074,0.020459,0.030292,0.011735,0.024579,0.022593
2003,0.029864,0.03752,0.038476,0.036137,0.020313,0.024925,0.011066,0.01061,0.01317,0.011878,0.016624,0.010282,0.048768,0.010677,0.008159,0.022178,0.021934,0.008111,0.016701,0.008284,0.013871,0.019951,0.019874,0.02362,0.012728,0.009162,0.017187,0.021715,0.019237,0.015686,0.020489,0.010593,0.009236,0.03728,0.004516,0.02305,0.006664,0.014002,0.010867,0.017228,0.023459,0.027351,0.037168,0.061623,0.018834,0.032921,0.010348,0.022051,0.019039
2004,0.036863,0.036898,0.030858,0.02628,0.020039,0.028327,0.009783,0.012243,0.014374,0.015604,0.017089,0.012011,0.049125,0.015586,0.010197,0.024859,0.020407,0.005505,0.017374,0.009567,0.017032,0.023791,0.01966,0.020721,0.014593,0.010271,0.017456,0.023358,0.017786,0.016243,0.015393,0.00879,0.006853,0.034412,0.006247,0.021802,0.011118,0.013499,0.01058,0.018494,0.020416,0.027064,0.040144,0.051806,0.019815,0.033326,0.011899,0.023405,0.016386
2005,0.038125,0.043101,0.032298,0.030921,0.02051,0.024636,0.013187,0.011357,0.01573,0.010344,0.018853,0.009853,0.047088,0.01221,0.010047,0.025781,0.01753,0.008042,0.017576,0.007617,0.014338,0.022606,0.021695,0.020668,0.011837,0.013188,0.021346,0.019002,0.015781,0.014506,0.020184,0.011072,0.006651,0.032515,0.006239,0.02183,0.01041,0.012692,0.011129,0.015837,0.018516,0.026492,0.042076,0.055732,0.019192,0.031749,0.011537,0.025508,0.015785
2006,0.035404,0.039767,0.037227,0.033404,0.021081,0.024276,0.010818,0.010819,0.014133,0.009741,0.017246,0.017054,0.050355,0.015941,0.012054,0.027396,0.017697,0.008986,0.015088,0.007903,0.01267,0.022025,0.020791,0.021062,0.011583,0.01135,0.019194,0.020092,0.019832,0.013323,0.017164,0.010625,0.006847,0.032869,0.01001,0.020766,0.007799,0.015058,0.009835,0.017823,0.0177,0.026805,0.041773,0.056194,0.021395,0.026628,0.011883,0.019154,0.016377
2007,0.030827,0.042875,0.030147,0.030127,0.020406,0.026102,0.013136,0.008693,0.012642,0.012791,0.01553,0.011399,0.05395,0.010535,0.010047,0.025212,0.01809,0.007584,0.019655,0.007301,0.015102,0.025318,0.019923,0.018874,0.012759,0.013153,0.021499,0.020349,0.017202,0.014786,0.019229,0.010359,0.009489,0.02918,0.009399,0.018021,0.009456,0.012245,0.012165,0.01893,0.019108,0.026163,0.041577,0.056136,0.021461,0.029902,0.011155,0.026473,0.01857
2008,0.035121,0.040976,0.038541,0.029081,0.026496,0.028662,0.012272,0.008885,0.012289,0.015254,0.013418,0.013344,0.049881,0.016659,0.011687,0.023105,0.019484,0.008466,0.016135,0.007781,0.012044,0.024991,0.019072,0.020186,0.013111,0.009563,0.02085,0.017625,0.014871,0.012706,0.017196,0.010118,0.009862,0.031246,0.01001,0.020879,0.006629,0.014289,0.012294,0.016636,0.016825,0.02171,0.045926,0.051344,0.024174,0.030039,0.01315,0.020264,0.017103
2009,0.030173,0.042452,0.036627,0.028539,0.024991,0.021088,0.010227,0.012557,0.018722,0.013218,0.01267,0.015591,0.05202,0.012977,0.011148,0.025859,0.019346,0.00916,0.019272,0.009432,0.013824,0.02711,0.018584,0.017642,0.011752,0.012897,0.018439,0.018633,0.015625,0.010272,0.020843,0.009596,0.008231,0.034623,0.007576,0.017634,0.005963,0.01121,0.009948,0.018057,0.014538,0.023337,0.045336,0.058926,0.022982,0.0272,0.012952,0.021795,0.021937


In [52]:
###################################################################
## Partition Coefficient
###################################################################

In [53]:
((theta_pd_.to_numpy()**2).sum())/theta_pd_.shape[0]

0.16241002477523875

In [54]:
fpath_theta = wd_path + "\\NMF_X_theta.csv"

X_theta.to_csv(path_or_buf=fpath_theta,
                      encoding="utf-8",
                      index=False)

In [55]:
fpath_phi = wd_path + "\\NMF_phi.csv"

phi_pd.to_csv(path_or_buf=fpath_phi,
                      encoding="utf-8",
                      index=False)

In [56]:
#############################
## Session Info
#############################
import IPython
print(IPython.sys_info())

{'commit_hash': '3813660de',
 'commit_source': 'installation',
 'default_encoding': 'cp1252',
 'ipython_path': 'C:\\Users\\meaneych\\Anaconda3\\envs\\pyenv_topicmodels3\\lib\\site-packages\\IPython',
 'ipython_version': '7.29.0',
 'os_name': 'nt',
 'platform': 'Windows-10-10.0.17763-SP0',
 'sys_executable': 'C:\\Users\\meaneych\\Anaconda3\\envs\\pyenv_topicmodels3\\python.exe',
 'sys_platform': 'win32',
 'sys_version': '3.7.12 | packaged by conda-forge | (default, Oct 26 2021, '
                '05:37:49) [MSC v.1916 64 bit (AMD64)]'}


In [57]:
#sinfo()