# Basic Topic Modelling

In [1]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import re
import moods
import os
import pandas as pd
import random
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel, CoherenceModel, TfidfModel

In [3]:
def shuffled_paths(pattern, seed=0):
    """Return the paths matching ``pattern`` in a reproducible shuffled order.

    ``glob.glob`` returns matches in arbitrary (filesystem-dependent) order, so
    the matches are sorted first and then shuffled with a private, seeded
    ``random.Random``. The same ``pattern``/``seed`` therefore always yields the
    same order, and the global ``random`` state is left untouched.

    Parameters
    ----------
    pattern : str
        Glob pattern to expand.
    seed : int, optional
        Seed for the shuffle (default 0).

    Returns
    -------
    list of str
        The matching paths, shuffled deterministically.
    """
    paths = sorted(glob.glob(pattern))
    random.Random(seed).shuffle(paths)
    return paths


billpaths = shuffled_paths('/data/bills/txt/*')

In [46]:
import logging

# Show INFO-level progress messages in the "LEVEL : message" format.
# basicConfig() does nothing when the root logger already has handlers, so the
# level is also set explicitly on the root logger. setLevel() is used instead of
# assigning to `.level` because it also invalidates the loggers' cached
# isEnabledFor() results (CPython 3.7+), which a bare attribute write skips.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

In [35]:
# Sanity check: print only the first item produced by the bag-of-words iterator.
# NOTE(review): `dictionary` is only assigned in a later cell of this notebook,
# so on a fresh kernel that cell has to be run before this one.
txts = moods.cleaned_bow_iter(
    '/data/bills/cleaned/by_para.gz',
    dictionary,
    include_name=False,
)
try:
    txt = next(iter(txts))
except StopIteration:
    pass  # empty iterator: nothing to show
else:
    print(txt)

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 2), (5, 2), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 2), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 2), (28, 1), (29, 2)]


In [31]:
%%time
# Build (or reload) the dictionary, the serialized paragraph corpus and the
# TF-IDF model. All three artifacts are cached on disk; they are rebuilt together
# unless every one of them is present, so a partially written cache can no longer
# send the load branch into a missing-file error.
CLEANED_PARAGRAPHS_PATH = '/data/bills/cleaned/by_para.gz'
DICTIONARY_PATH = '/data/bills/bills.dictionary'
CORPUS_PATH = '/data/bills/paragraphs.mm'
TFIDF_PATH = '/data/bills/models/tfidf.model'

cached_artifacts = (DICTIONARY_PATH, CORPUS_PATH, TFIDF_PATH)
if not all(os.path.exists(path) for path in cached_artifacts):
    # First time training of dictionary, serializing corpus as matrix market, and learning term weights
    print('learn')
    txts = moods.cleaned_txts_iter(CLEANED_PARAGRAPHS_PATH, include_name=False)
    dictionary = Dictionary(txts)
    # Drop tokens that occur in fewer than 10 documents (other thresholds stay at gensim's defaults).
    dictionary.filter_extremes(no_below=10)
    dictionary.compactify()
    dictionary.save(DICTIONARY_PATH)

    # Stream the paragraphs again as bag-of-words vectors and persist them in Matrix Market format.
    bows = moods.cleaned_bow_iter(CLEANED_PARAGRAPHS_PATH, dictionary, include_name=False, min_tokens_per_doc=20)
    MmCorpus.serialize(CORPUS_PATH, bows, progress_cnt=200000)
    mmcorp = MmCorpus(CORPUS_PATH)

    tfidf = TfidfModel(mmcorp)
    # The models directory may not exist yet on a fresh data volume.
    os.makedirs(os.path.dirname(TFIDF_PATH), exist_ok=True)
    tfidf.save(TFIDF_PATH)
else:
    # Everything is already cached on disk: just load it.
    dictionary = Dictionary.load(DICTIONARY_PATH)
    mmcorp = MmCorpus(CORPUS_PATH)
    tfidf = TfidfModel.load(TFIDF_PATH)

# TF-IDF-weighted view of the corpus, obtained by indexing the model with the corpus.
tfidf_corpus = tfidf[mmcorp]

In [39]:
%%time
# Unconditionally re-serialize the paragraph corpus with the current `dictionary`
# and refit the TF-IDF model on it, overwriting the files already on disk.
corpus_path = '/data/bills/paragraphs.mm'
tfidf_path = '/data/bills/models/tfidf.model'

bows = moods.cleaned_bow_iter(
    '/data/bills/cleaned/by_para.gz',
    dictionary,
    include_name=False,
    min_tokens_per_doc=20,
)
MmCorpus.serialize(corpus_path, bows, progress_cnt=200000)
mmcorp = MmCorpus(corpus_path)

# Open question from the author: does TF-IDF weighting hurt LDA training?
tfidf = TfidfModel(mmcorp)
tfidf.save(tfidf_path)
tfidf_corpus = tfidf[mmcorp]

CPU times: user 16.3 s, sys: 258 ms, total: 16.5 s
Wall time: 16.5 s


In [57]:
# Sanity check: print only the first document of the TF-IDF-weighted corpus.
try:
    text = next(iter(tfidf_corpus))
except StopIteration:
    pass  # empty corpus: nothing to show
else:
    print(text)

[(0, 0.3955654281267836), (1, 0.03828989578045541), (2, 0.10837295089734587), (3, 0.11449119223505617), (4, 0.25161749142333556), (5, 0.32341913163609726), (6, 0.2196162732052297), (7, 0.10517988138080964), (8, 0.162142454596323), (9, 0.10714095632968969), (10, 0.08678382209608766), (11, 0.17371679419139766), (12, 0.2403933458886631), (13, 0.07626644757137868), (14, 0.14150504754712298), (15, 0.12269731063521946), (16, 0.11818381183918968), (17, 0.11156722018575885), (18, 0.17329008712957192), (19, 0.28550055112632916), (20, 0.1654840656856758), (21, 0.1312050090110232), (22, 0.08321076334368835), (23, 0.03615902067005341), (24, 0.07714863502627033), (25, 0.41305107392028306), (26, 0.059691167048662695), (27, 0.08257625500452671), (28, 0.17520476589479506), (29, 0.08610330920935243)]


In [58]:
# Train a 200-topic LDA model on the bag-of-words corpus (`mmcorp`), save it,
# then list the 12 highest-weighted words of every topic.
LDA_SEED = 0  # fixed seed so that re-running the cell reproduces the same topics

root_logger = logging.getLogger()
previous_level = root_logger.level
# Too much logging at the INFO level, temporarily switch to WARN
root_logger.setLevel(logging.WARN)
try:
    lda_200 = LdaModel(mmcorp, num_topics=200, passes=2, alpha='auto', eta='auto',
                       id2word=dictionary, per_word_topics=True, random_state=LDA_SEED)
    # Persist right after training so a failure while printing cannot lose the model.
    lda_200.save('/data/bills/models/lda_200.gensim', separately=['expElogbeta', 'sstats', 'alpha', 'eta'])
finally:
    # Restore the previous verbosity even when training or saving raises.
    root_logger.setLevel(previous_level)

for topicno in range(lda_200.num_topics):
    top_words = [word for word, prob in lda_200.show_topic(topicno, topn=12)]
    print(topicno, "\t", " ".join(top_words))

0 	 law enforcement continue set applicable accord origin privilege accordance aside trial impair
1 	 fund act available trust appropriate provide shall purpose use pursuant ground santa
2 	 protection homeland border agent customs mobile stabilization sensitive patrol homeowner headquarters implementing
3 	 consider record house senate congressional pass vol july march august amendment presidential
4 	 federal agency government direct head capitol public architect shall receive originate purpose
5 	 control own railroad line bay global parcel lot materially passive domestically include
6 	 disability permit permanent leave use remainder vii connect accommodate deduct refusal older
7 	 cost total include selection sponsor significant weather possible barrier exceed relate negotiation
8 	 appoint executive schedule position competitive appointment pay regard shall rate director level
9 	 education institution educational student college dissemination quality include setting purpose rece

## Examine topic models

In [31]:
# Score the model with the 'c_v' coherence measure against the texts in by_title.gz.
# NOTE(review): `lda_40` is not created in any cell visible in this notebook;
# it has to exist in the kernel already for this cell to run.
named_texts = moods.cleaned_txts_iter('/data/bills/cleaned/by_title.gz')
txts = [tokens for _name, tokens in named_texts]  # keep the texts, drop the names
cm = CoherenceModel(model=lda_40, texts=txts, coherence='c_v')
coherence = cm.get_coherence()
coherence

0.33035154894675844

In [35]:
# Bill-level coherence of each individual topic; show the five lowest scores.
# Caveat (author's hunch): the topics were trained on paragraphs, so this
# probably confounds the strength of the model with how concise the bills are overall.
per_topic_c = cm.get_coherence_per_topic()
coherence_ascending = pd.Series(per_topic_c).sort_values()
coherence_ascending.head(5)

4     0.142472
32    0.192494
27    0.205974
34    0.212980
35    0.222561
dtype: float64

In [37]:
N_EXTREME_TOPICS = 5      # how many topics to show at each end of the ranking
TOP_WORDS_PER_TOPIC = 12  # how many words to print per topic


def print_topics(heading, topic_ids):
    """Print ``heading``, then one line per topic: id, coherence score, top words.

    Reads the coherence scores from ``per_topic_c`` and the topic words from
    ``lda_40``, both of which must already exist in the notebook namespace.
    """
    print(heading)
    for t in topic_ids:
        t = int(t)  # plain int for list indexing and display
        top_words = [word for word, prob in lda_40.show_topic(t, topn=TOP_WORDS_PER_TOPIC)]
        print(t, "%.2f" % per_topic_c[t], " ".join(top_words))


# Rank topic ids by coherence (ascending) instead of hard-coding ids copied from
# an earlier output, which would go stale as soon as the model is retrained.
topics_by_coherence = pd.Series(per_topic_c).sort_values().index
print_topics('Least coherent topics', topics_by_coherence[:N_EXTREME_TOPICS])
print_topics('\nMost coherent topics', topics_by_coherence[-N_EXTREME_TOPICS:])

Least coherent topics
4 0.14 treasury arizona sector advance feature voluntary repayment army regulations derive t. michigan
32 0.19 prevent comptroller critical guideline missouri respond invest oregon longer displace minimize diverse
27 0.21 relevant encourage task projects train minnesota works expect quantity fourth applications leasing
34 0.21 description operate clause own connection single political reside possible saving economy household
35 0.22 insert strike president mexico speaker recreation vice engage bring issuance enforce john

Most coherent topics
15 0.46 s. november revenue march april combined donor promulgate treat expenditure october july
13 0.48 northern subparagraph calendar facilities mariana electric petroleum describe strategic specific deconfliction redirection
3 0.51 precede course generally d. fiscal year requests thereof begin originator nichols systemwide
25 0.53 respectively retirement accounting advancement records employees inspector administration soo

### Infer probability distributions 

In [40]:
import gzip
import numpy as np 
from SRP import Vector_file


def dense_topic_vector(doc_topics, num_topics):
    """Turn sparse ``(topic_id, probability)`` pairs into a dense vector.

    Each probability is written at the position of its topic id, so the result
    always has ``num_topics`` entries even if the model omitted some topics;
    omitted topics are left at 0. When every topic is present the result equals
    the probabilities listed in topic order, with the same dtype.

    Parameters
    ----------
    doc_topics : list of (int, float)
        Pairs as returned by ``LdaModel.get_document_topics``.
    num_topics : int
        Length of the dense vector.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``num_topics``.
    """
    topic_ids = np.array([topicno for topicno, prob in doc_topics], dtype=np.intp)
    probs = np.array([prob for topicno, prob in doc_topics])
    vec = np.zeros(num_topics, dtype=probs.dtype)
    vec[topic_ids] = probs
    return vec


# For every grouping of the bills, infer one topic distribution per document and
# write it as a row named "<grouping>_<document name>" to a vector file.
for cat in ['title', 'congress', 'sponsor', 'part', 'para']:
    print(cat)
    with Vector_file('data/topic-dists-%s.bin' %cat, mode='w', dims=lda_40.num_topics) as vfile:
        for name, bow in moods.cleaned_bow_iter('data/cleaned_by_%s.gz' % cat, dictionary):
            # NOTE(review): inference is run on TF-IDF-weighted input; confirm that
            # `lda_40` was trained on the TF-IDF corpus rather than on raw counts.
            # gensim clamps minimum_probability to a small positive floor, so -1 does
            # not guarantee that every topic is returned; hence the dense scatter.
            doc_topics = lda_40.get_document_topics(tfidf[bow], minimum_probability=-1)
            vec = dense_topic_vector(doc_topics, lda_40.num_topics)
            vfile.add_row("%s_%s" % (cat, name), vec)

title
congress
sponsor
part
para
