# Generate Vector Spaces

Create TFIDF, LDA, and NMF document / feature vector spaces.

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy
from lib.helper import Db 

## Config

In [2]:
n_terms     = 4000      # Vocabulary size
ngram_range = (1,2)     # ngram min and max lengths
n_topics    = 40        # Number of topics
max_iter    = 10        # Number of iterations for topic model

In [3]:
base_path = ''
db_file = f'{base_path}db/ussc.db'
OHCO = ['vol_num','case_num','position']

# Import CORPUS

In [4]:
db = Db(db_file)

In [5]:
db.import_table('CORPUS_COMPRESSED', table_index=OHCO)

In [6]:
CORPUS = db.CORPUS_COMPRESSED

In [7]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,doc_content,doc_len
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1
3,171,dissent,"DISSENT BY: CUSHING\nCUSHING, Justice. As I ha...",252
3,171,opinion,THE COURT delivered their opinions seriatim in...,7826
3,321,dissent,"DISSENT BY: WILSON\nWILSON, Justice. I conside...",16365
3,321,opinion,"ELSWORTH, Chief Justice. The question, how far...",824
3,386,dissent,"DISSENT BY: IREDELL\nIREDELL, Justice. Though ...",8908
...,...,...,...,...
554,471,opinion,Justice Souter delivered the opinion of the Co...,63276
554,527,dissent,DISSENT BY: Stevens \nDISSENT \nJustice Steven...,25244
554,527,opinion,Justice Scalia delivered the opinion of the Co...,45723
554,570,dissent,"Justice Breyer, with whom Justice Stevens, Jus...",150367


# Convert to Bag of Words 

ie. a __Count Vector Space__

We use Scikit Learn's CountVectorizer to convert our corpus of documents into a document-term vector space of word counts.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [9]:
count_engine = CountVectorizer(max_features=n_terms, stop_words='english', ngram_range=ngram_range)
count_model = count_engine.fit_transform(CORPUS.doc_content)

## Get Generated VOCAB

In [10]:
db.add_table('VOCAB', pd.DataFrame(count_engine.get_feature_names(), columns=['term_str']))
db.VOCAB = db.VOCAB.set_index('term_str')
db.VOCAB['ngram_len'] = None # To be added later

In [11]:
db.VOCAB.sample(10)

Unnamed: 0_level_0,ngram_len
term_str,Unnamed: 1_level_1
follows,
pipe,
committed,
133,
id,
national labor,
unit,
mandate,
governments,
associated,


## VOCAB Generated BOW

We do this just to show what the counter vectorizer produced. `DTM` stands for documet-term matrix. We convert this sparse matrix into a "thin" dataframe that keeps only terms with counts for each document. 

In [12]:
# db.add_table('DTM', pd.DataFrame(count_model.toarray(), index=CORPUS.index, columns=db.VOCAB.index))
DTM = pd.DataFrame(count_model.toarray(), index=CORPUS.index, columns=db.VOCAB.index)

In [13]:
db.add_table('BOW', DTM.stack().to_frame('n'))
db.BOW = db.BOW[db.BOW.n > 0]

In [14]:
DTM.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 12348 entries, (3, 171, 'dissent') to (554, 570, 'opinion')
Columns: 4000 entries, 000 to zone
dtypes: int64(4000)
memory usage: 377.0+ MB


In [15]:
db.BOW.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5701266 entries, (3, 171, 'dissent', 'affirmed') to (554, 570, 'opinion', 'young')
Columns: 1 entries, n to n
dtypes: int64(1)
memory usage: 81.8+ MB


## Compute TF-IDF

In [16]:
tfidf_engine = TfidfTransformer()
tfidf_model = tfidf_engine.fit_transform(count_model)

In [17]:
TFIDF = pd.DataFrame(tfidf_model.toarray(), index=CORPUS.index, columns=db.VOCAB.index)

In [18]:
TFIDF.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,000,000 000,10,100,101,102,103,104,104 ct,105,...,wrong,wrongful,wrote,xxx,year,years,years ago,york,young,zone
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,171,dissent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.249446,0.0,0.0,0.0,0.0,0.0,0.0
3,171,opinion,0.015294,0.0,0.0,0.015748,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,321,dissent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.009982,0.0,0.0,0.0,0.0,0.0,0.0
3,321,opinion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,386,dissent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.024362,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
db.BOW['tfidf'] = TFIDF.stack()

In [20]:
db.BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n,tfidf
vol_num,case_num,position,term_str,Unnamed: 4_level_1,Unnamed: 5_level_1
3,171,dissent,affirmed,1,0.188874
3,171,dissent,argument,1,0.187804
3,171,dissent,cause,1,0.200595
3,171,dissent,circuit,1,0.203941
3,171,dissent,circuit court,1,0.285703


## Add Features to VOCAB

In [21]:
db.VOCAB[['n_dissent','n_opinion']] = db.BOW.groupby(['term_str','position']).n.sum().unstack()

In [22]:
db.VOCAB[['tfidf_mean_dissent','tfidf_mean_opinion']] = db.BOW.groupby(['term_str','position']).tfidf.mean().unstack()

In [23]:
db.VOCAB['ngram_len'] = db.VOCAB.apply(lambda x: len(x.name.split()), 1)
db.VOCAB['n'] = DTM.sum()
db.VOCAB['tfidf_mean'] = TFIDF.mean()

In [24]:
db.VOCAB.sort_values('tfidf_mean_dissent', ascending=False).head(20)

Unnamed: 0_level_0,ngram_len,n_dissent,n_opinion,tfidf_mean_dissent,tfidf_mean_opinion,n,tfidf_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dissented,1,653,373,0.34065,0.013286,1026,0.017625
erisa,1,372,677,0.282743,0.268979,1049,0.001359
copyright,1,750,971,0.251849,0.219522,1721,0.001778
arbitration,1,960,1638,0.204579,0.174008,2598,0.0038
____,1,730,270,0.181868,0.077553,1000,0.000988
abortion,1,876,1141,0.170398,0.202744,2017,0.001805
liquors,1,341,554,0.1675,0.103268,895,0.001559
obscenity,1,396,545,0.165025,0.154501,941,0.00177
picketing,1,595,906,0.164289,0.150234,1501,0.002291
deportation,1,651,1232,0.159931,0.156403,1883,0.002864


In [25]:
db.VOCAB.sort_values('tfidf_mean_opinion', ascending=False).head(20)

Unnamed: 0_level_0,ngram_len,n_dissent,n_opinion,tfidf_mean_dissent,tfidf_mean_opinion,n,tfidf_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
erisa,1,372,677,0.282743,0.268979,1049,0.001359
copyright,1,750,971,0.251849,0.219522,1721,0.001778
abortion,1,876,1141,0.170398,0.202744,2017,0.001805
arbitration,1,960,1638,0.204579,0.174008,2598,0.0038
martial,1,537,564,0.139803,0.161275,1101,0.001461
obscene,1,401,758,0.138585,0.159094,1159,0.00217
deportation,1,651,1232,0.159931,0.156403,1883,0.002864
obscenity,1,396,545,0.165025,0.154501,941,0.00177
patent,1,2808,5250,0.153918,0.15184,8058,0.008308
picketing,1,595,906,0.164289,0.150234,1501,0.002291


In [26]:
db.VOCAB.ngram_len.value_counts()

1    3677
2     323
Name: ngram_len, dtype: int64

# Generate Topic Models

We run Scikit Learn's [LatentDirichletAllocation algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation) and extract the THETA and PHI tables.

In [27]:
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF

## Using LDA

In [28]:
lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)

### THETA

The Document-Topic Matrix

In [29]:
db.add_table('LDA_THETA', pd.DataFrame(lda_engine.fit_transform(count_model), index=CORPUS.index))
db.LDA_THETA.index.name = 'doc_id'
db.LDA_THETA.columns.name = 'topic_id'

In [30]:
db.LDA_THETA.head(20).style.background_gradient(axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
3,171,dissent,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.351581,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.282514,0.001389,0.001389,0.001389,0.001389,0.314516,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389
3,171,opinion,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,0.505161,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,0.092547,5.1e-05,5.1e-05,0.384036,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,0.016423,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05
3,321,dissent,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,0.709779,2.2e-05,2.2e-05,0.123482,2.2e-05,2.2e-05,0.018694,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,0.075749,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,0.071513,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05
3,321,opinion,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.490299,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.492108,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463
3,386,dissent,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,0.052357,4.6e-05,0.02494,4.6e-05,0.012795,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,0.018441,0.564303,4.6e-05,0.022537,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,0.297054,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,0.006098,4.6e-05,4.6e-05,4.6e-05
3,386,opinion,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.105125,1.5e-05,0.049685,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.06556,1.5e-05,1.5e-05,0.38268,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.067275,1.5e-05,0.270657,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.035879,0.022646,1.5e-05,1.5e-05,1.5e-05,1.5e-05
6,358,dissent,0.003382,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,0.089537,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,0.276998,2.5e-05,2.5e-05,0.304558,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,0.0971,2.5e-05,0.084064,2.5e-05,0.143538,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05
6,358,opinion,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.051908,1.5e-05,1.5e-05,1.5e-05,0.008324,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.21636,1.5e-05,1.5e-05,0.471563,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.175818,1.5e-05,1.5e-05,1.5e-05,0.075517,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
7,300,dissent,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.163554,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.708762,0.062922,0.000157,0.000157,0.000157,0.021862,0.000157,0.000157,0.037397,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157
7,300,opinion,0.035831,5e-05,5e-05,5e-05,5e-05,5e-05,0.180826,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,0.593913,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,0.049149,0.106839,5e-05,0.031751,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05


### PHI

In [32]:
db.add_table('LDA_PHI', pd.DataFrame(lda_engine.components_, columns=db.VOCAB.index))
db.LDA_PHI.index.name = 'topic_id'
db.LDA_PHI.columns.name = 'term_str'
db.LDA_PHI = db.LDA_PHI.T # Could do this earlier; necessary to store in db

In [33]:
db.LDA_PHI.head().style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
000,1553.637848,65.407266,0.125266,65.622506,299.562255,25.179877,17.913695,49.68737,2.239989,5.081494,628.439494,231.90275,35.772734,363.097204,1.810997,34.464415,259.031047,2.775564,0.975317,54.341982,106.184422,0.623962,94.680051,259.663165,0.025,867.51578,234.559986,142.106671,14.033432,452.609683,2705.521014,28.333109,52.550797,25.188094,21.915977,28.236047,259.604552,382.110881,37.326066,50.142242
000 000,244.0857,3.00865,0.025,0.463412,0.941354,0.025,0.025,2.729871,0.025,0.025,111.43508,41.575094,0.025,0.235028,0.025,0.025,0.94852,0.025,0.025,14.657041,7.38454,0.025,0.025,4.825079,0.025,28.285607,1.782753,6.574387,0.025,0.025,435.004741,0.025,0.025,0.025,0.025,0.025,111.644485,69.499297,0.025,34.394362
10,809.681943,287.692338,282.183839,796.137909,327.048595,353.827867,160.107967,173.928017,323.247266,48.436699,131.75527,205.784245,275.438894,619.565711,413.878833,199.592297,119.575252,282.264767,148.083542,267.929065,258.854648,25.783568,1161.34441,358.055893,195.890724,192.144707,345.451128,125.469375,103.18166,1407.969891,489.625593,277.499741,545.045371,559.070532,536.776934,355.033248,84.085486,385.555315,203.874624,102.126838
100,190.370792,48.483386,11.66703,119.095319,102.673164,83.777888,14.305062,9.593398,55.990384,27.43267,179.739952,175.427812,186.287126,121.292474,61.562217,76.75858,62.362451,12.582154,72.818744,50.047472,44.75045,20.222482,30.220986,58.168916,30.599489,52.241942,31.142055,5.741588,23.219864,124.502595,339.272234,201.713282,106.544424,935.747872,155.040253,29.503305,95.542277,141.777085,131.258538,10.522288
101,152.525743,75.190107,7.138726,81.510623,15.756212,8.830327,30.117414,6.585578,185.127591,71.703157,50.809802,19.255061,8.614717,69.359574,159.291463,5.536771,2.276193,0.943085,8.297371,35.40126,16.318419,2.591074,77.185621,7.00851,43.377645,7.175285,6.953485,2.851539,0.496442,127.897627,81.470933,283.036265,96.399326,773.796756,100.551572,10.362626,12.052664,1.038238,2.309755,2.855443


### Create Topic Glosses

In [34]:
n_top_words = 7

In [35]:
db.add_table('LDA_TOPICS', db.LDA_PHI.T.stack()\
    .to_frame('weight')\
    .groupby('topic_id')\
    .apply(lambda x: x.weight.sort_values(ascending=False)\
               .head(n_top_words)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str))

In [36]:
db.LDA_TOPICS

term_str,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,stock,bankruptcy,court,trust,securities,corporation,assets
1,united,united states,states,committee,government,petitioner,act
2,court,jury,evidence,trial,defendant,case,state
3,child,children,medical,benefits,state,court,women
4,political,state,election,party,voting,vote,county
5,search,court,fourth,warrant,fourth amendment,amendment,police
6,court,state,case,jurisdiction,judgment,law,states
7,religious,school,state,religion,schools,education,public
8,state,federal,court,law,courts,jurisdiction,states
9,patent,vessel,use,ship,case,admiralty,court


In [38]:
db.LDA_TOPICS['topwords'] = db.LDA_TOPICS.apply(lambda x: str(x.name) + ' ' + ', '.join(x), 1)

In [39]:
db.LDA_TOPICS

term_str,0,1,2,3,4,5,6,topwords
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,stock,bankruptcy,court,trust,securities,corporation,assets,"0 stock, bankruptcy, court, trust, securities,..."
1,united,united states,states,committee,government,petitioner,act,"1 united, united states, states, committee, go..."
2,court,jury,evidence,trial,defendant,case,state,"2 court, jury, evidence, trial, defendant, cas..."
3,child,children,medical,benefits,state,court,women,"3 child, children, medical, benefits, state, c..."
4,political,state,election,party,voting,vote,county,"4 political, state, election, party, voting, v..."
5,search,court,fourth,warrant,fourth amendment,amendment,police,"5 search, court, fourth, warrant, fourth amend..."
6,court,state,case,jurisdiction,judgment,law,states,"6 court, state, case, jurisdiction, judgment, ..."
7,religious,school,state,religion,schools,education,public,"7 religious, school, state, religion, schools,..."
8,state,federal,court,law,courts,jurisdiction,states,"8 state, federal, court, law, courts, jurisdic..."
9,patent,vessel,use,ship,case,admiralty,court,"9 patent, vessel, use, ship, case, admiralty, ..."


### Add Doc Weights

In [41]:
db.LDA_TOPICS['doc_weight_sum'] = db.LDA_THETA.sum()

In [42]:
db.LDA_TOPICS.iloc[:, 7:].sort_values('doc_weight_sum', ascending=False).style.bar()

term_str,topwords,doc_weight_sum
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
21,"21 justice, mr, mr justice, dissent, court, dissenting, case",1357.806087
26,"26 court, district, petitioner, district court, appeals, court appeals, case",598.441705
6,"6 court, state, case, jurisdiction, judgment, law, states",596.750701
16,"16 case, law, evidence, bank, court, said, contract",445.050056
29,"29 congress, statute, act, united, united states, states, language",403.773173
8,"8 state, federal, court, law, courts, jurisdiction, states",387.298366
22,"22 court, act, district, order, district court, review, congress",385.728609
19,"19 states, power, congress, united, united states, constitution, act",384.183585
2,"2 court, jury, evidence, trial, defendant, case, state",381.10521
33,"33 ed, 2d, ct, ed 2d, court, id, rule",341.275731


## Using NMF

In [43]:
nmf_engine = NMF(n_components=n_topics, init='nndsvd', random_state=1, alpha=.1, l1_ratio=.5)

### THETA

In [44]:
db.add_table('NMF_THETA', pd.DataFrame(nmf_engine.fit_transform(tfidf_model), index=CORPUS.index))
db.NMF_THETA.columns.name = 'topic_id'

In [46]:
db.NMF_THETA.sample(20).style.background_gradient()

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
343,747,opinion,0.009349,0.002329,0.001324,0.0,0.0,0.0,0.037215,0.00681,0.0,0.0,0.069802,0.0,0.003385,0.018671,0.0,0.0,0.056369,0.0,0.0,0.0,0.0,0.0,0.0,0.000265,0.002011,0.0,0.002119,0.0,0.0,0.000944,0.013831,0.0,0.001229,0.002461,0.023867,0.0,0.0,0.0,0.0,0.0
428,465,dissent,0.023595,7.5e-05,0.013598,0.0,0.0,0.0,0.003114,0.041387,0.0,0.0,0.076104,0.0,0.015972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001442,0.0,0.0,0.0,0.0,0.0,0.002796,0.0,0.0,0.0,0.012073,0.0,0.0,0.0,0.123325,0.0
479,367,dissent,0.011331,0.00038,0.001596,0.001356,0.0,0.0,0.0,0.0,0.0,0.0,0.120771,0.0,0.02536,0.0,0.0,0.0,0.0,0.0,0.002491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006177,0.0,0.0,0.0,0.0,0.028943
438,59,dissent,0.045326,0.00294,0.024477,0.006231,0.0,0.0,0.009794,0.018016,0.0,0.0,0.0,0.0,0.0,0.006362,0.0,0.0,0.0,0.0,0.0,0.020948,0.0,0.002232,0.0,0.0,0.027821,0.0,0.013089,0.0,0.0,0.016021,0.0,0.0,0.0,0.0,0.0,0.003653,0.0,0.003733,0.0,0.0
454,516,opinion,0.005654,0.0,0.01504,0.0,0.0,0.0,0.0,0.012944,0.0,0.0,0.0,0.0,0.020065,0.012296,0.0,0.015007,0.0,0.001043,0.0,0.0,0.0,0.004533,0.0,0.0,0.0,0.0,0.042105,0.0,0.0,0.0,0.0,0.0,2.5e-05,0.0,0.0,0.0,0.0,0.000731,0.0,0.0
82,195,opinion,0.03046,0.010866,0.0,0.0,0.0,0.0,0.0,0.001578,0.0,0.001741,0.0,0.0,0.0,0.015442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029263,0.0,0.0,0.0,0.0
526,813,dissent,0.010589,0.0,0.031537,0.018845,0.006354,0.0,0.012973,0.0,0.0,0.0,0.000522,0.0,0.017612,0.002889,0.0,0.0,0.009518,0.0,0.003599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03317,0.0,0.000341,0.024371,0.0,0.0,0.0,0.000282,0.0,0.0,0.0,0.0,0.0
494,344,dissent,0.0,0.0,0.0,0.00078,0.0,0.0,0.002915,0.010781,0.0,0.0,0.0101,0.0,0.008024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000691,0.0,0.0,0.0,0.0,0.0,0.009085,0.0,0.168273,0.0,0.0,0.0,0.0,0.010406
245,574,dissent,0.022535,0.000679,0.0,0.0,0.0,0.0,0.0,0.018608,0.0,0.0,0.0,0.041256,0.015722,0.033526,0.0,0.022442,0.0,0.0,0.0,0.003895,0.0,0.006887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017941,0.0,0.0
75,123,opinion,0.0,0.001482,0.011881,0.0,0.050449,0.0,0.036979,0.031953,0.035615,0.0,0.0,0.0,0.0,0.029844,0.0,0.0,0.0,0.016706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018655,0.0,0.0,0.0,0.000959,0.0,0.0,0.0


### PHI

In [47]:
db.add_table('NMF_PHI', pd.DataFrame(nmf_engine.components_, columns=db.VOCAB.index))

In [49]:
db.NMF_PHI.index.name = 'topic_id'
db.NMF_PHI.columns.name = 'term_str'
db.NMF_PHI = db.NMF_PHI.T

In [50]:
db.NMF_PHI.head().style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
000,0.048479,0.0,0.045279,0.0,0.110083,0.0,0.0,0.0,0.0,0.0,0.0,0.201255,0.011799,0.029554,0.0,0.009884,0.046192,0.0,0.0,0.092829,0.0,0.055097,0.0,0.0,0.062493,0.071082,0.123208,0.013511,0.14107,0.137425,0.0,0.0,0.0,0.012429,0.0,0.0,0.029415,0.0,0.0,0.024295
000 000,0.0,0.0,0.0,0.0,0.015067,0.0,0.0,0.0,0.0,0.0,0.0,0.075825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035246,0.0,0.0,0.0,0.0,0.0,0.002292,0.0201,0.0,0.017194,0.043127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.091573,0.0,0.207873,0.05923,0.016079,0.04479,0.0,0.0,0.0,0.0,0.025708,0.006995,0.065901,0.025595,0.0,0.008356,0.033219,0.0,0.02067,0.022646,0.0,0.012651,0.0,0.0,0.007449,0.0,0.001488,0.009078,0.021743,0.022885,0.006713,0.004358,0.0,0.0,0.027159,0.0,0.0,0.004378,0.0018,0.0
100,0.017035,0.0,0.0318,0.089907,0.023301,0.007989,0.000692,0.009946,0.009682,0.0,0.009933,0.025305,0.031345,0.009809,0.0,0.0,0.0,0.001855,0.006251,0.011622,0.0,0.009553,0.0,0.0,0.007294,0.0,0.0,0.010525,0.014355,0.022184,0.011802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,0.015587,0.0,0.046,0.086512,0.001601,0.019103,0.0,0.013905,0.0,0.0,0.0,0.0,0.015076,0.0,0.039617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000835,0.0,0.0,0.0,0.0,0.0,0.018122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Topics

In [52]:
db.add_table('NMF_TOPICS', db.NMF_PHI.T.stack()\
    .to_frame('weight')\
    .groupby('topic_id')\
    .apply(lambda x: 
           x.weight.sort_values(ascending=False)\
               .head(n_top_words)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str).rename_axis(columns={'term_str':'topic_features'}))

In [53]:
db.NMF_TOPICS

topic_features,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,court,district,district court,appeals,court appeals,case,courts
1,xxx,mr justice,mr,justice,dissent,chief justice,chief
2,congress,act,statute,secretary,federal,legislative,section
3,ed 2d,ct,ed,2d,ante,id,3d
4,tax,income,taxes,taxation,taxpayer,revenue,taxable
5,union,board,labor,employees,employer,bargaining,collective
6,states,united states,united,government,power,constitution,war
7,state,federal,court,law,jurisdiction,state court,constitution
8,commerce,interstate,interstate commerce,transportation,commerce clause,state,intrastate
9,dissented,xxx,mr justice,mr,justice,justice white,white


In [54]:
db.NMF_TOPICS['topwords'] = db.NMF_TOPICS.apply(lambda x: str(x.name) + ' ' + ', '.join(x), 1)

### Add Doc Weights

In [58]:
db.NMF_TOPICS['doc_weight_sum'] = db.NMF_THETA.sum()

In [60]:
# db.NMF_TOPICS

In [61]:
db.NMF_TOPICS.iloc[:, 7:].sort_values('doc_weight_sum', ascending=False).style.bar()

topic_features,topwords,doc_weight_sum
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"0 court, district, district court, appeals, court appeals, case, courts",142.860795
13,"13 plaintiff, case, court, law, defendant, judgment, said",133.340078
2,"2 congress, act, statute, secretary, federal, legislative, section",130.659881
12,"12 speech, amendment, public, ordinance, court, city, constitutional",127.394896
7,"7 state, federal, court, law, jurisdiction, state court, constitution",100.019478
6,"6 states, united states, united, government, power, constitution, war",90.213519
34,"34 counsel, trial, defendant, witness, accused, testimony, evidence",81.92882
3,"3 ed 2d, ct, ed, 2d, ante, id, 3d",81.513568
1,"1 xxx, mr justice, mr, justice, dissent, chief justice, chief",78.295554
4,"4 tax, income, taxes, taxation, taxpayer, revenue, taxable",68.188041


# Generate Doc2Vec

In [62]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [63]:
d2v_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(CORPUS.doc_content)]

In [64]:
d2v_model = Doc2Vec(d2v_docs, vector_size=40, window=3, min_count=10, workers=4)

In [65]:
db.add_table('D2V_THETA', pd.DataFrame(d2v_model.docvecs.vectors_docs, index=CORPUS.index))

In [67]:
db.D2V_THETA.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,171,dissent,0.085409,0.19598,0.283511,0.291919,0.086078,0.37522,0.430004,-0.058938,0.043758,0.13952,...,-0.348641,0.112546,-0.074039,-0.307339,0.247494,-0.159238,0.024389,0.237062,0.130155,-0.093181
3,171,opinion,-0.208106,-0.820854,-0.634768,-0.32418,-0.278607,0.800691,1.646429,0.278438,-0.192282,-0.198566,...,0.774213,0.470562,-0.439159,-0.64984,1.067658,0.474627,-0.222391,0.661339,0.095167,0.16181
3,321,dissent,0.239486,-0.623242,-0.672319,0.475378,-0.556374,0.791858,1.531926,-0.355855,-0.41273,-0.055047,...,0.344228,0.242254,0.205313,-1.034567,0.501684,0.254499,0.14311,0.139469,0.312661,-0.099638
3,321,opinion,0.117468,-0.014615,-0.088055,0.114286,-0.079074,0.187176,0.21943,0.044781,0.280036,0.108367,...,-0.141252,-0.042925,-0.035646,-0.038181,0.354754,0.186451,0.072087,0.05172,0.10452,-0.23661
3,386,dissent,-0.02823,-0.623071,-0.219738,0.072718,-0.23369,0.933976,0.87008,0.306954,-0.062692,0.226092,...,0.0648,0.506251,-0.679834,-0.887727,1.067448,-0.289511,0.384731,-0.125353,0.250832,0.319461


## Create Non-Negative Version

In [71]:
db.add_table('D2VP_THETA', db.D2V_THETA + np.abs(db.D2V_THETA.min()))

In [72]:
db.D2VP_THETA.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,12348.0,2.366781,0.387881,0.0,2.120016,2.361038,2.61098,3.848762
1,12348.0,1.974774,0.338561,0.0,1.774783,1.963915,2.17491,3.551294
2,12348.0,1.541675,0.363459,0.0,1.320656,1.537544,1.757436,3.114794
3,12348.0,1.762564,0.338771,0.0,1.559429,1.770526,1.969448,3.374248
4,12348.0,1.861245,0.390725,0.0,1.631237,1.879343,2.093472,3.767158
5,12348.0,1.251675,0.311478,0.0,1.050543,1.244287,1.451873,2.487003
6,12348.0,1.419695,0.333856,0.0,1.203735,1.403297,1.629722,3.120486
7,12348.0,2.057578,0.359692,0.0,1.848588,2.066574,2.269327,4.440407
8,12348.0,1.558085,0.338092,0.0,1.350563,1.556728,1.769786,3.180269
9,12348.0,1.892993,0.388406,0.0,1.66552,1.876485,2.113772,4.107063


# PCA

In [73]:
from sklearn.decomposition import PCA

In [74]:
n_comps = 10

In [75]:
pca_engine = PCA(n_components=n_comps)

In [76]:
pca_cols = [f'PC{i}' for i in range(n_comps)]

In [77]:
db.add_table('PCA_THETA', pd.DataFrame(pca_engine.fit_transform(TFIDF), columns=pca_cols, index=TFIDF.index))

In [93]:
db.add_table('PCA_PHI', pd.DataFrame(pca_engine.components_.T, index=db.VOCAB.index, columns=pca_cols))
db.PCA_PHI.columns.name = 'topic_id'

In [94]:
db.PCA_PHI.head()

topic_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
000,-0.009508,0.027223,0.002071,-0.002371,-0.02548,-0.000579,-0.022462,0.015318,-0.006232,0.010264
000 000,-0.001502,0.009044,0.000196,-0.000876,-0.006522,-0.001524,-0.001076,0.00705,-0.002384,0.003536
10,-0.011384,-0.003635,0.014754,-0.004947,0.006187,0.002353,-0.000421,-0.003411,0.005609,-0.004104
100,-0.004731,-0.001734,0.011916,0.006644,-0.000367,0.001011,0.000417,0.005781,0.000543,-0.001561
101,-0.003844,-0.003795,0.013368,0.003764,0.006114,0.000217,-0.004993,0.002055,0.001766,-0.003279


In [130]:
PCA_TOPICS_POS = db.PCA_PHI.T.stack()\
    .to_frame('weight')\
    .groupby('topic_id')\
    .apply(lambda x: 
           x.weight.sort_values(ascending=False)\
               .head(n_top_words)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str).rename_axis(columns={'term_str':'topic_features'})

In [131]:
PCA_TOPICS_POS['topwords'] = PCA_TOPICS_POS.apply(lambda x: ', '.join(x).strip(), 1)

In [132]:
PCA_TOPICS_NEG = db.PCA_PHI.T.stack()\
    .to_frame('weight')\
    .groupby('topic_id')\
    .apply(lambda x: 
           x.weight.sort_values(ascending=True)\
               .head(n_top_words)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str).rename_axis(columns={'term_str':'topic_features'})

In [133]:
PCA_TOPICS_NEG['topwords'] = PCA_TOPICS_NEG.apply(lambda x: ', '.join(x).strip(), 1)

In [134]:
PCA_TOPICS_POS.topwords

topic_id
PC0    xxx, mr justice, mr, justice, dissented, disse...
PC1    tax, commerce, company, property, interstate, ...
PC2           ed 2d, ct, ed, 2d, employees, board, labor
PC3          tax, state, ed 2d, ct, ed, income, taxation
PC4    state, federal, court, jurisdiction, congress,...
PC5    united states, united, states, search, governm...
PC6    commerce, interstate, commission, state, inter...
PC7    commission, ed 2d, ct, ed, commerce, interstat...
PC8    dissented, jury, union, justice harlan, labor,...
PC9    dissented, search, fourth amendment, court, po...
Name: topwords, dtype: object

In [135]:
PCA_TOPICS_NEG.topwords

topic_id
PC0      court, state, states, act, federal, law, united
PC1        jury, trial, court, petitioner, ed 2d, ct, ed
PC2    court, trial, state, states, jury, united stat...
PC3    union, labor, board, commission, employees, ac...
PC4    tax, jury, petitioner, income, trial, evidence...
PC5      court, state, union, board, tax, labor, federal
PC6       property, court, ed 2d, ct, ed, patent, income
PC7    tax, union, board, labor, employees, search, e...
PC8    dissent, douglas, justice douglas, mr justice,...
PC9    jury, congress, states, act, united states, un...
Name: topwords, dtype: object

In [136]:
PCA_TOPICS = PCA_TOPICS_POS[['topwords']].join(PCA_TOPICS_NEG[['topwords']], lsuffix='_pos', rsuffix='_neg')

In [139]:
PCA_TOPICS['topwords'] = PCA_TOPICS.apply(lambda x: x.topwords_pos + ' | ' + x.topwords_neg, 1)

In [142]:
db.add_table('PCA_TOPICS', PCA_TOPICS[['topwords']])

In [143]:
db.PCA_TOPICS

topic_features,topwords
topic_id,Unnamed: 1_level_1
PC0,"xxx, mr justice, mr, justice, dissented, disse..."
PC1,"tax, commerce, company, property, interstate, ..."
PC2,"ed 2d, ct, ed, 2d, employees, board, labor | c..."
PC3,"tax, state, ed 2d, ct, ed, income, taxation | ..."
PC4,"state, federal, court, jurisdiction, congress,..."
PC5,"united states, united, states, search, governm..."
PC6,"commerce, interstate, commission, state, inter..."
PC7,"commission, ed 2d, ct, ed, commerce, interstat..."
PC8,"dissented, jury, union, justice harlan, labor,..."
PC9,"dissented, search, fourth amendment, court, po..."


# Save the Model

In [147]:
# THESE TABLES ARE TOO WIDE FOR SQLITE
# db.tables.remove('DTM')
# db.tables.remove('TFIDF')
db.tables.remove('PCA_TOPICS_POS')
db.tables.remove('PCA_TOPICS_NEG')

In [151]:
db.tables

['VOCAB',
 'BOW',
 'LDA_THETA',
 'LDA_PHI',
 'LDA_TOPICS',
 'NMF_THETA',
 'NMF_PHI',
 'NMF_TOPICS',
 'D2V_THETA',
 'D2VP_THETA',
 'PCA_THETA',
 'PCA_PHI',
 'PCA_TOPICS']

In [152]:
db.save_all_tables()

Saving VOCAB
Saving BOW
Saving LDA_THETA
Saving LDA_PHI
Saving LDA_TOPICS
Saving NMF_THETA
Saving NMF_PHI
Saving NMF_TOPICS
Saving D2V_THETA
Saving D2VP_THETA
Saving PCA_THETA
Saving PCA_PHI
Saving PCA_TOPICS
