# Generate Vector Spaces

Create TFIDF, LDA, and NMF document / feature vector spaces.

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy
from lib.helper import Db 

## Config

In [2]:
n_terms     = 4000      # Vocabulary size
ngram_range = (1,2)     # ngram min and max lengths
n_topics    = 40        # Number of topics
max_iter    = 10        # Number of iterations for topic model

In [3]:
base_path = ''
db_file = f'{base_path}db/ussc.db'
OHCO = ['vol_num','case_num','position']

# Import CORPUS

In [4]:
db = Db(db_file)

In [5]:
db.import_table('CORPUS_COMPRESSED', table_index=OHCO)

In [6]:
CORPUS = db.CORPUS_COMPRESSED

In [7]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,doc_content,doc_len
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1
3,171,dissent,"DISSENT BY: CUSHING\nCUSHING, Justice. As I ha...",252
3,171,opinion,THE COURT delivered their opinions seriatim in...,7826
3,321,dissent,"DISSENT BY: WILSON\nWILSON, Justice. I conside...",16365
3,321,opinion,"ELSWORTH, Chief Justice. The question, how far...",824
3,386,dissent,"DISSENT BY: IREDELL\nIREDELL, Justice. Though ...",8908
...,...,...,...,...
554,471,opinion,Justice Souter delivered the opinion of the Co...,63276
554,527,dissent,DISSENT BY: Stevens \nDISSENT \nJustice Steven...,25244
554,527,opinion,Justice Scalia delivered the opinion of the Co...,45723
554,570,dissent,"Justice Breyer, with whom Justice Stevens, Jus...",150367


# Convert to Bag of Words 

ie. a __Count Vector Space__

We use Scikit Learn's CountVectorizer to convert our corpus of documents into a document-term vector space of word counts.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [9]:
count_engine = CountVectorizer(max_features=n_terms, stop_words='english', ngram_range=ngram_range)
count_model = count_engine.fit_transform(CORPUS.doc_content)

## Get Generated VOCAB

In [10]:
db.add_table('VOCAB', pd.DataFrame(count_engine.get_feature_names(), columns=['term_str']))
db.VOCAB = db.VOCAB.set_index('term_str')
db.VOCAB['ngram_len'] = None # To be added later

In [11]:
db.VOCAB.sample(10)

Unnamed: 0_level_0,ngram_len
term_str,Unnamed: 1_level_1
court state,
balance,
scrutiny,
extended,
customs,
works,
494,
substantial,
largely,
published,


## VOCAB Generated BOW

We do this just to show what the counter vectorizer produced. `DTM` stands for documet-term matrix. We convert this sparse matrix into a "thin" dataframe that keeps only terms with counts for each document. 

In [12]:
# db.add_table('DTM', pd.DataFrame(count_model.toarray(), index=CORPUS.index, columns=db.VOCAB.index))
DTM = pd.DataFrame(count_model.toarray(), index=CORPUS.index, columns=db.VOCAB.index)

In [13]:
db.add_table('BOW', DTM.stack().to_frame('n'))
db.BOW = db.BOW[db.BOW.n > 0]

In [15]:
DTM.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 12348 entries, (3, 171, 'dissent') to (554, 570, 'opinion')
Columns: 4000 entries, 000 to zone
dtypes: int64(4000)
memory usage: 377.0+ MB


In [16]:
db.BOW.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5701266 entries, (3, 171, 'dissent', 'affirmed') to (554, 570, 'opinion', 'young')
Columns: 1 entries, n to n
dtypes: int64(1)
memory usage: 81.8+ MB


## Compute TF-IDF

In [17]:
tfidf_engine = TfidfTransformer()
tfidf_model = tfidf_engine.fit_transform(count_model)

In [18]:
TFIDF = pd.DataFrame(tfidf_model.toarray(), index=CORPUS.index, columns=db.VOCAB.index)

In [19]:
TFIDF.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,000,000 000,10,100,101,102,103,104,104 ct,105,...,wrong,wrongful,wrote,xxx,year,years,years ago,york,young,zone
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,171,dissent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.249446,0.0,0.0,0.0,0.0,0.0,0.0
3,171,opinion,0.015294,0.0,0.0,0.015748,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,321,dissent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.009982,0.0,0.0,0.0,0.0,0.0,0.0
3,321,opinion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,386,dissent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.024362,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
db.BOW['tfidf'] = TFIDF.stack()

In [21]:
db.BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n,tfidf
vol_num,case_num,position,term_str,Unnamed: 4_level_1,Unnamed: 5_level_1
3,171,dissent,affirmed,1,0.188874
3,171,dissent,argument,1,0.187804
3,171,dissent,cause,1,0.200595
3,171,dissent,circuit,1,0.203941
3,171,dissent,circuit court,1,0.285703


## Add Features to VOCAB

In [22]:
db.VOCAB[['n_dissent','n_opinion']] = db.BOW.groupby(['term_str','position']).n.sum().unstack()

In [23]:
db.VOCAB[['tfidf_mean_dissent','tfidf_mean_opinion']] = db.BOW.groupby(['term_str','position']).tfidf.mean().unstack()

In [26]:
db.VOCAB['ngram_len'] = db.VOCAB.apply(lambda x: len(x.name.split()), 1)
db.VOCAB['n'] = DTM.sum()
db.VOCAB['tfidf_mean'] = TFIDF.mean()

In [27]:
db.VOCAB.sort_values('tfidf_mean_dissent', ascending=False).head(20)

Unnamed: 0_level_0,ngram_len,n_dissent,n_opinion,tfidf_mean_dissent,tfidf_mean_opinion,n,tfidf_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dissented,1,653,373,0.34065,0.013286,1026,0.017625
erisa,1,372,677,0.282743,0.268979,1049,0.001359
copyright,1,750,971,0.251849,0.219522,1721,0.001778
arbitration,1,960,1638,0.204579,0.174008,2598,0.0038
____,1,730,270,0.181868,0.077553,1000,0.000988
abortion,1,876,1141,0.170398,0.202744,2017,0.001805
liquors,1,341,554,0.1675,0.103268,895,0.001559
obscenity,1,396,545,0.165025,0.154501,941,0.00177
picketing,1,595,906,0.164289,0.150234,1501,0.002291
deportation,1,651,1232,0.159931,0.156403,1883,0.002864


In [28]:
db.VOCAB.sort_values('tfidf_mean_opinion', ascending=False).head(20)

Unnamed: 0_level_0,ngram_len,n_dissent,n_opinion,tfidf_mean_dissent,tfidf_mean_opinion,n,tfidf_mean
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
erisa,1,372,677,0.282743,0.268979,1049,0.001359
copyright,1,750,971,0.251849,0.219522,1721,0.001778
abortion,1,876,1141,0.170398,0.202744,2017,0.001805
arbitration,1,960,1638,0.204579,0.174008,2598,0.0038
martial,1,537,564,0.139803,0.161275,1101,0.001461
obscene,1,401,758,0.138585,0.159094,1159,0.00217
deportation,1,651,1232,0.159931,0.156403,1883,0.002864
obscenity,1,396,545,0.165025,0.154501,941,0.00177
patent,1,2808,5250,0.153918,0.15184,8058,0.008308
picketing,1,595,906,0.164289,0.150234,1501,0.002291


In [29]:
db.VOCAB.ngram_len.value_counts()

1    3677
2     323
Name: ngram_len, dtype: int64

# Generate Topic Models

We run Scikit Learn's [LatentDirichletAllocation algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation) and extract the THETA and PHI tables.

In [30]:
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF

## Using LDA

In [31]:
lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)

### THETA

The Document-Topic Matrix

In [32]:
db.add_table('THETA', pd.DataFrame(lda_engine.fit_transform(count_model), index=CORPUS.index))
db.THETA.index.name = 'doc_id'
db.THETA.columns.name = 'topic_id'

In [33]:
db.THETA.head(20).style.background_gradient(axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
3,171,dissent,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.351581,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.282514,0.001389,0.001389,0.001389,0.001389,0.314516,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389,0.001389
3,171,opinion,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,0.505161,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,0.092547,5.1e-05,5.1e-05,0.384036,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,0.016423,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05,5.1e-05
3,321,dissent,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,0.709779,2.2e-05,2.2e-05,0.123482,2.2e-05,2.2e-05,0.018694,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,0.075749,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,0.071513,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05
3,321,opinion,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.490299,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.492108,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463,0.000463
3,386,dissent,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,0.052357,4.6e-05,0.02494,4.6e-05,0.012795,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,0.018441,0.564303,4.6e-05,0.022537,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,0.297054,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,0.006098,4.6e-05,4.6e-05,4.6e-05
3,386,opinion,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.105125,1.5e-05,0.049685,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.06556,1.5e-05,1.5e-05,0.38268,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.067275,1.5e-05,0.270657,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.035879,0.022646,1.5e-05,1.5e-05,1.5e-05,1.5e-05
6,358,dissent,0.003382,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,0.089537,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,0.276998,2.5e-05,2.5e-05,0.304558,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,0.0971,2.5e-05,0.084064,2.5e-05,0.143538,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05
6,358,opinion,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.051908,1.5e-05,1.5e-05,1.5e-05,0.008324,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.21636,1.5e-05,1.5e-05,0.471563,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,0.175818,1.5e-05,1.5e-05,1.5e-05,0.075517,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
7,300,dissent,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.163554,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.708762,0.062922,0.000157,0.000157,0.000157,0.021862,0.000157,0.000157,0.037397,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157,0.000157
7,300,opinion,0.035831,5e-05,5e-05,5e-05,5e-05,5e-05,0.180826,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,0.593913,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,0.049149,0.106839,5e-05,0.031751,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05


### PHI

In [34]:
# db.PHI = pd.DataFrame(lda_engine.components_, columns=db.VOCAB.index)
db.add_table('PHI', pd.DataFrame(lda_engine.components_, columns=db.VOCAB.index))
db.PHI.index.name = 'topic_id'
db.PHI.columns.name = 'term_str'
db.PHI = db.PHI.T # Could do this earlier; necessary to store in db

In [35]:
db.PHI.head().style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
000,1553.637848,65.407266,0.125266,65.622506,299.562255,25.179877,17.913695,49.68737,2.239989,5.081494,628.439494,231.90275,35.772734,363.097204,1.810997,34.464415,259.031047,2.775564,0.975317,54.341982,106.184422,0.623962,94.680051,259.663165,0.025,867.51578,234.559986,142.106671,14.033432,452.609683,2705.521014,28.333109,52.550797,25.188094,21.915977,28.236047,259.604552,382.110881,37.326066,50.142242
000 000,244.0857,3.00865,0.025,0.463412,0.941354,0.025,0.025,2.729871,0.025,0.025,111.43508,41.575094,0.025,0.235028,0.025,0.025,0.94852,0.025,0.025,14.657041,7.38454,0.025,0.025,4.825079,0.025,28.285607,1.782753,6.574387,0.025,0.025,435.004741,0.025,0.025,0.025,0.025,0.025,111.644485,69.499297,0.025,34.394362
10,809.681943,287.692338,282.183839,796.137909,327.048595,353.827867,160.107967,173.928017,323.247266,48.436699,131.75527,205.784245,275.438894,619.565711,413.878833,199.592297,119.575252,282.264767,148.083542,267.929065,258.854648,25.783568,1161.34441,358.055893,195.890724,192.144707,345.451128,125.469375,103.18166,1407.969891,489.625593,277.499741,545.045371,559.070532,536.776934,355.033248,84.085486,385.555315,203.874624,102.126838
100,190.370792,48.483386,11.66703,119.095319,102.673164,83.777888,14.305062,9.593398,55.990384,27.43267,179.739952,175.427812,186.287126,121.292474,61.562217,76.75858,62.362451,12.582154,72.818744,50.047472,44.75045,20.222482,30.220986,58.168916,30.599489,52.241942,31.142055,5.741588,23.219864,124.502595,339.272234,201.713282,106.544424,935.747872,155.040253,29.503305,95.542277,141.777085,131.258538,10.522288
101,152.525743,75.190107,7.138726,81.510623,15.756212,8.830327,30.117414,6.585578,185.127591,71.703157,50.809802,19.255061,8.614717,69.359574,159.291463,5.536771,2.276193,0.943085,8.297371,35.40126,16.318419,2.591074,77.185621,7.00851,43.377645,7.175285,6.953485,2.851539,0.496442,127.897627,81.470933,283.036265,96.399326,773.796756,100.551572,10.362626,12.052664,1.038238,2.309755,2.855443


### Create Topic Glosses

In [36]:
n_top_words = 7

In [37]:
db.add_table('TOPICS', db.PHI.T.stack()\
    .to_frame('weight')\
    .groupby('topic_id')\
    .apply(lambda x: x.weight.sort_values(ascending=False)\
               .head(n_top_words)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str))

In [38]:
db.TOPICS

term_str,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,stock,bankruptcy,court,trust,securities,corporation,assets
1,united,united states,states,committee,government,petitioner,act
2,court,jury,evidence,trial,defendant,case,state
3,child,children,medical,benefits,state,court,women
4,political,state,election,party,voting,vote,county
5,search,court,fourth,warrant,fourth amendment,amendment,police
6,court,state,case,jurisdiction,judgment,law,states
7,religious,school,state,religion,schools,education,public
8,state,federal,court,law,courts,jurisdiction,states
9,patent,vessel,use,ship,case,admiralty,court


In [39]:
db.TOPICS['topwords'] = db.TOPICS.apply(lambda x: str(x.name) + ' ' + ', '.join(x), 1)

In [40]:
db.TOPICS

term_str,0,1,2,3,4,5,6,topwords
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,stock,bankruptcy,court,trust,securities,corporation,assets,"0 stock, bankruptcy, court, trust, securities,..."
1,united,united states,states,committee,government,petitioner,act,"1 united, united states, states, committee, go..."
2,court,jury,evidence,trial,defendant,case,state,"2 court, jury, evidence, trial, defendant, cas..."
3,child,children,medical,benefits,state,court,women,"3 child, children, medical, benefits, state, c..."
4,political,state,election,party,voting,vote,county,"4 political, state, election, party, voting, v..."
5,search,court,fourth,warrant,fourth amendment,amendment,police,"5 search, court, fourth, warrant, fourth amend..."
6,court,state,case,jurisdiction,judgment,law,states,"6 court, state, case, jurisdiction, judgment, ..."
7,religious,school,state,religion,schools,education,public,"7 religious, school, state, religion, schools,..."
8,state,federal,court,law,courts,jurisdiction,states,"8 state, federal, court, law, courts, jurisdic..."
9,patent,vessel,use,ship,case,admiralty,court,"9 patent, vessel, use, ship, case, admiralty, ..."


### Add Doc Weights

In [41]:
db.TOPICS['doc_weight_sum'] = db.THETA.sum()

In [42]:
db.TOPICS.iloc[:, 7:].sort_values('doc_weight_sum', ascending=False).style.bar()

term_str,topwords,doc_weight_sum
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
21,"21 justice, mr, mr justice, dissent, court, dissenting, case",1357.806087
26,"26 court, district, petitioner, district court, appeals, court appeals, case",598.441705
6,"6 court, state, case, jurisdiction, judgment, law, states",596.750701
16,"16 case, law, evidence, bank, court, said, contract",445.050056
29,"29 congress, statute, act, united, united states, states, language",403.773173
8,"8 state, federal, court, law, courts, jurisdiction, states",387.298366
22,"22 court, act, district, order, district court, review, congress",385.728609
19,"19 states, power, congress, united, united states, constitution, act",384.183585
2,"2 court, jury, evidence, trial, defendant, case, state",381.10521
33,"33 ed, 2d, ct, ed 2d, court, id, rule",341.275731


## Using NMF

In [43]:
nmf_engine = NMF(n_components=n_topics, init='nndsvd', random_state=1, alpha=.1, l1_ratio=.5)

### THETA

In [44]:
db.add_table('THETA_NMF', pd.DataFrame(nmf_engine.fit_transform(tfidf_model), index=CORPUS.index))
db.THETA_NMF.columns.name = 'topic_id'

In [45]:
db.THETA_NMF.sample(20).style.background_gradient()

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
96,291,dissent,0.0,0.0,0.010358,0.001028,0.0,0.0,0.041796,0.012558,0.0,0.0,0.0,0.0,0.0,0.069841,0.0,0.018031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002196,0.07206,0.0,0.0,0.003295,0.0,0.0,0.0,0.0,0.0,0.007137,0.0,0.001147,0.0,0.0
58,183,opinion,0.0,0.002723,0.0,0.0,2.6e-05,0.0,0.0,0.0,0.0,0.001617,0.0,0.020851,0.0,0.051341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002983,0.0,0.0,0.004854,0.012533,0.0,0.012056,0.007675,0.01313,0.008427,0.0,0.0,0.0,0.0
161,174,dissent,0.0,0.002156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,307,dissent,0.0022,0.00077,0.012841,0.0,0.0,0.0,0.026257,0.00573,0.000527,0.0,0.0,0.0,0.0,0.045745,0.0,0.0,0.0,0.0,0.066427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032865,0.0,0.0,0.08605,0.0
405,427,opinion,0.000641,0.0,0.0,0.0,0.0,0.0,0.0,0.005033,0.0,0.0,0.009902,0.0,0.0,0.005092,0.0,0.0,0.081244,0.0,0.010033,0.001203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029956,0.0,0.0,0.0,0.063785,0.0,0.0,0.0,0.0,0.0
397,358,opinion,0.012858,0.0,0.004963,0.006814,0.0,0.0,0.012396,0.0,0.0,0.0,0.001207,0.0,0.030006,0.004508,0.0,0.0,0.0,0.0,0.004757,0.001845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01506,0.0,0.0,0.014161,0.0,0.023816,0.0,0.039487,0.0,0.0,0.0,0.0,0.0
331,398,opinion,0.004597,0.0,0.013315,0.0,0.0,0.103838,0.0124,0.0,0.0,0.0,0.002443,0.0,0.005774,0.007646,0.0,0.0,0.0,0.0,0.0,0.00379,0.0,0.0,0.0,0.0,0.006517,0.0,0.000468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002433,0.0,0.0,0.001322,0.046139
346,379,opinion,0.046941,0.0,0.009074,0.0,0.000157,0.0,0.014334,0.002159,0.0,0.0,0.0,0.0,0.0,0.00494,0.0,0.0,0.043593,0.0,0.0,0.0,0.0,0.001056,0.0,0.0,0.001651,0.0,0.0,0.0,0.0,0.013789,0.0,0.0,0.0,0.022216,0.0,0.005622,0.0,0.0,0.013689,0.010436
446,398,dissent,0.019945,0.002006,0.055104,0.0,0.0,0.0,0.001956,0.0,0.0,0.002741,0.002027,0.0,0.002016,0.0,0.0,0.0,0.0,0.0,0.022766,0.005811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073325,0.010427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
489,546,opinion,0.056087,0.0,0.003956,0.002198,0.0,0.0,0.009833,0.00396,0.000525,0.0,0.0,0.0,0.010949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000357,0.0,0.0,0.011309,0.0,0.006466,0.0,0.0,0.0,0.0,0.0


### PHI

In [46]:
db.add_table('PHI_NMF', pd.DataFrame(nmf_engine.components_, columns=db.VOCAB.index))

In [47]:
db.PHI_NMF.index.name = 'topic_id'
db.PHI_NMF.columns.name = 'term_str'
db.PHI_NMF = db.PHI_NMF.T

In [48]:
db.PHI_NMF.head().style.background_gradient()

topic_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
000,0.048479,0.0,0.045279,0.0,0.110083,0.0,0.0,0.0,0.0,0.0,0.0,0.201255,0.011799,0.029554,0.0,0.009884,0.046192,0.0,0.0,0.092829,0.0,0.055097,0.0,0.0,0.062493,0.071082,0.123208,0.013511,0.14107,0.137425,0.0,0.0,0.0,0.012429,0.0,0.0,0.029415,0.0,0.0,0.024295
000 000,0.0,0.0,0.0,0.0,0.015067,0.0,0.0,0.0,0.0,0.0,0.0,0.075825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035246,0.0,0.0,0.0,0.0,0.0,0.002292,0.0201,0.0,0.017194,0.043127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.091573,0.0,0.207873,0.05923,0.016079,0.04479,0.0,0.0,0.0,0.0,0.025708,0.006995,0.065901,0.025595,0.0,0.008356,0.033219,0.0,0.02067,0.022646,0.0,0.012651,0.0,0.0,0.007449,0.0,0.001488,0.009078,0.021743,0.022885,0.006713,0.004358,0.0,0.0,0.027159,0.0,0.0,0.004378,0.0018,0.0
100,0.017035,0.0,0.0318,0.089907,0.023301,0.007989,0.000692,0.009946,0.009682,0.0,0.009933,0.025305,0.031345,0.009809,0.0,0.0,0.0,0.001855,0.006251,0.011622,0.0,0.009553,0.0,0.0,0.007294,0.0,0.0,0.010525,0.014355,0.022184,0.011802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,0.015587,0.0,0.046,0.086512,0.001601,0.019103,0.0,0.013905,0.0,0.0,0.0,0.0,0.015076,0.0,0.039617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000835,0.0,0.0,0.0,0.0,0.0,0.018122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Topics

In [49]:
db.add_table('TOPICS_NMF', db.PHI_NMF.T.stack()\
    .to_frame('weight')\
    .groupby('topic_id')\
    .apply(lambda x: 
           x.weight.sort_values(ascending=False)\
               .head(n_top_words)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str).rename_axis(columns={'term_str':'topic_features'}))

In [50]:
db.TOPICS_NMF

topic_features,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,court,district,district court,appeals,court appeals,case,courts
1,xxx,mr justice,mr,justice,dissent,chief justice,chief
2,congress,act,statute,secretary,federal,legislative,section
3,ed 2d,ct,ed,2d,ante,id,3d
4,tax,income,taxes,taxation,taxpayer,revenue,taxable
5,union,board,labor,employees,employer,bargaining,collective
6,states,united states,united,government,power,constitution,war
7,state,federal,court,law,jurisdiction,state court,constitution
8,commerce,interstate,interstate commerce,transportation,commerce clause,state,intrastate
9,dissented,xxx,mr justice,mr,justice,justice white,white


In [51]:
db.TOPICS_NMF['topwords'] = db.TOPICS_NMF.apply(lambda x: str(x.name) + ' ' + ', '.join(x), 1)

### Add Doc Weights

In [52]:
db.TOPICS_NMF['doc_weight_sum'] = db.THETA_NMF.sum()

In [53]:
db.TOPICS_NMF.iloc[:, 7:].sort_values('doc_weight_sum', ascending=False).style.bar()

topic_features,topwords,doc_weight_sum
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"0 court, district, district court, appeals, court appeals, case, courts",142.860795
13,"13 plaintiff, case, court, law, defendant, judgment, said",133.340078
2,"2 congress, act, statute, secretary, federal, legislative, section",130.659881
12,"12 speech, amendment, public, ordinance, court, city, constitutional",127.394896
7,"7 state, federal, court, law, jurisdiction, state court, constitution",100.019478
6,"6 states, united states, united, government, power, constitution, war",90.213519
34,"34 counsel, trial, defendant, witness, accused, testimony, evidence",81.92882
3,"3 ed 2d, ct, ed, 2d, ante, id, 3d",81.513568
1,"1 xxx, mr justice, mr, justice, dissent, chief justice, chief",78.295554
4,"4 tax, income, taxes, taxation, taxpayer, revenue, taxable",68.188041


# Generate Doc2Vec

In [54]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [55]:
d2v_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(CORPUS.doc_content)]

In [56]:
d2v_model = Doc2Vec(d2v_docs, vector_size=40, window=3, min_count=10, workers=4)

In [57]:
db.add_table('D2V', pd.DataFrame(d2v_model.docvecs.vectors_docs, index=CORPUS.index))

In [58]:
db.D2V

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,171,dissent,-0.169884,-0.102200,-0.040527,-0.186953,0.152922,-0.050267,0.126498,0.425124,-0.337860,-0.291705,...,0.132204,0.062374,0.081895,-0.195036,0.180266,0.014624,0.080728,0.236278,0.133165,-0.266887
3,171,opinion,-1.247737,-0.380945,0.088340,-0.868145,0.285673,-0.807885,0.280288,0.987596,0.775861,0.317572,...,0.076143,0.840030,0.654778,-0.152558,0.162342,0.264531,-0.769392,-0.349308,-0.554755,-0.263988
3,321,dissent,-2.095453,-0.244773,-0.182080,-1.115058,-0.759947,-0.377939,-0.941164,1.970771,1.459684,0.435265,...,-0.191296,0.961100,0.798699,-0.061913,0.064373,-0.345928,-0.670637,-0.573886,-1.064151,-0.143272
3,321,opinion,-0.857782,0.179522,-0.047587,-0.217101,-0.260413,-0.236349,-0.218680,0.837275,0.215174,0.024422,...,0.072181,0.264568,0.359609,-0.009326,-0.001594,0.067163,-0.432190,-0.117347,0.045557,-0.123154
3,386,dissent,-1.652375,-0.097727,-0.447747,-0.154175,0.020608,-0.998915,-0.482418,1.094503,0.790695,0.304575,...,0.207728,0.673496,0.278429,-0.467558,0.012203,0.045109,-0.356781,-0.014735,-0.383691,0.011943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554,471,opinion,-0.268496,-0.098909,0.054163,0.003142,0.474827,-1.095070,0.252813,0.143124,-0.432959,0.160409,...,0.199636,0.494767,1.052008,-0.002625,0.342421,0.344742,-0.337799,-0.100577,0.232507,0.366823
554,527,dissent,0.210962,-0.114022,0.450748,0.396773,-0.841598,-0.340674,-0.399371,0.744098,-0.079800,0.013236,...,0.445365,0.160354,0.086646,-0.239611,-0.281588,-0.135622,0.070073,-0.038128,-0.452045,-0.226124
554,527,opinion,0.188530,-0.206437,0.264088,0.108713,0.285788,-0.114796,-0.133810,0.664123,-0.420781,0.016501,...,0.185295,0.346796,0.145510,0.131372,-0.077017,-0.171855,-0.112667,-0.132902,-0.024377,0.150250
554,570,dissent,-0.592528,-0.469292,0.014814,-0.384084,-0.182350,-0.702161,-0.244282,0.611468,-0.704961,0.351548,...,-0.164045,-0.437539,-0.070248,0.949124,0.693748,-0.469736,-0.238311,-0.390396,0.041242,0.244375


In [59]:
db.D2V.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,12348.0,-0.154694,0.40884,-2.095453,-0.406896,-0.156003,0.0911,2.032917
1,12348.0,-0.048799,0.360938,-1.738457,-0.264073,-0.064761,0.15663,2.024187
2,12348.0,0.039129,0.382633,-1.709864,-0.194489,0.02758,0.280928,1.968295
3,12348.0,0.135634,0.313457,-1.193237,-0.056597,0.116858,0.325395,1.721403
4,12348.0,0.02564,0.384479,-1.812494,-0.208882,0.037415,0.259448,1.708361
5,12348.0,-0.28326,0.382898,-2.492419,-0.519226,-0.284247,-0.056389,1.462921
6,12348.0,-0.23876,0.351853,-1.816552,-0.455156,-0.230855,-0.021633,1.552645
7,12348.0,0.266112,0.371692,-1.463007,0.030417,0.264316,0.496114,1.970771
8,12348.0,0.125789,0.394698,-1.733589,-0.104755,0.109737,0.37164,2.669485
9,12348.0,-0.045253,0.364742,-1.969257,-0.249986,-0.042485,0.176218,1.892149


## Create Non-Negative Version

In [60]:
db.add_table('D2VP', db.D2V + np.abs(db.D2V.min()))

In [61]:
db.D2VP.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,12348.0,1.940763,0.40884,0.0,1.688557,1.93945,2.186553,4.128371
1,12348.0,1.689661,0.360938,0.0,1.474384,1.673696,1.895088,3.762644
2,12348.0,1.748996,0.382633,0.0,1.515376,1.737444,1.990792,3.678159
3,12348.0,1.328872,0.313457,0.0,1.13664,1.310095,1.518632,2.91464
4,12348.0,1.838133,0.384479,0.0,1.603612,1.84991,2.071942,3.520855
5,12348.0,2.209158,0.382897,0.0,1.973193,2.208172,2.43603,3.95534
6,12348.0,1.577794,0.351853,0.0,1.361396,1.585696,1.794919,3.369197
7,12348.0,1.729122,0.371692,0.0,1.493424,1.727323,1.959121,3.433778
8,12348.0,1.859374,0.394698,0.0,1.628835,1.843326,2.105229,4.403075
9,12348.0,1.924007,0.364742,0.0,1.719271,1.926772,2.145475,3.861406


# PCA

In [65]:
from sklearn.decomposition import PCA

In [66]:
pca_engine = PCA(n_components=10)

In [74]:
db.add_table('DCM', pd.DataFrame(pca_engine.fit_transform(TFIDF), index=TFIDF.index))
db.DCM.columns = ['PC{}'.format(i) for i in db.DCM.columns]

In [75]:
db.DCM

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
vol_num,case_num,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,171,dissent,0.169357,-0.046985,-0.068075,-0.036822,0.004968,-0.065154,-0.097576,0.026542,-0.096159,0.063054
3,171,opinion,-0.052705,0.280886,-0.004748,0.318536,-0.178593,-0.019479,-0.077309,-0.142467,0.058459,-0.067408
3,321,dissent,-0.085346,-0.040910,-0.219872,-0.064885,0.144820,-0.069979,-0.146403,0.065300,-0.048466,0.032041
3,321,opinion,0.008449,-0.025956,-0.060586,-0.067452,-0.059557,-0.020682,-0.081806,0.026657,-0.054638,0.062145
3,386,dissent,-0.066854,0.091441,-0.091435,0.031101,0.144099,0.045961,0.062325,-0.019389,0.056138,-0.084985
...,...,...,...,...,...,...,...,...,...,...,...,...
554,471,opinion,-0.076913,-0.118360,0.187168,0.037588,0.015533,0.014193,-0.099002,0.113303,0.019580,-0.036083
554,527,dissent,-0.103096,-0.093499,0.399958,0.048514,0.036386,0.016020,-0.016227,0.350884,-0.076119,0.028490
554,527,opinion,-0.067003,0.048522,0.250611,-0.069508,-0.064153,-0.009499,0.043315,0.280562,-0.065064,0.060573
554,570,dissent,-0.163186,-0.190603,0.271568,0.179976,0.152720,0.140622,0.010282,0.081586,0.013987,-0.022990


# Save the Model

In [62]:
# THESE TABLES ARE TOO WIDE FOR SQLITE
# db.tables.remove('DTM')
# db.tables.remove('TFIDF')

In [76]:
db.tables

['VOCAB',
 'BOW',
 'THETA',
 'PHI',
 'TOPICS',
 'THETA_NMF',
 'PHI_NMF',
 'TOPICS_NMF',
 'D2V',
 'D2VP',
 'DCM']

In [77]:
db.save_all_tables()

Saving VOCAB
Saving BOW
Saving THETA
Saving PHI
Saving TOPICS
Saving THETA_NMF
Saving PHI_NMF
Saving TOPICS_NMF
Saving D2V
Saving D2VP
Saving DCM
