In [2]:
import pandas as pd
import re
import warnings; warnings.simplefilter('ignore')
import numpy as np
import warnings
import os
import sys
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

# Put the parent directory on the import path so the `src` package
# imported below can be resolved from this notebook's location.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    
from src.data.read_parallel import read_parallel_local

In [3]:
# Seed shared by every random draw in this cell so that a fresh
# Restart & Run All selects the same bills each time.
SEED = 0

df = pd.read_csv("temp_vn.csv", encoding="latin1", parse_dates=True)
print(len(df))

# Drop rows with a missing id, then cast the id column to int. Building a
# new frame (instead of assigning a column on a filtered slice) avoids
# pandas' chained-assignment ambiguity.
df = df.loc[df["id"].notna()].astype({"id": int})

# Sample a random version of each bill, at the end we only want
# one version of the bill in the dataset.
df_one_bill = (
    df[['bill_id', 'version_number']]
    .groupby('bill_id')
    .sample(n=1, random_state=SEED)
)
df = df_one_bill.merge(
    df[['id', 'version_number', 'bill_id', 'partisan_lean', 'sc_id', 'signed']],
    on=['version_number', 'bill_id'],
)
print(len(df))

# Keep len(df)/1.3 rows: the max that fits in local memory right now.
# The draw is seeded (it previously was not), and the index is reset
# *before* the text column is attached so the assignment below lines up
# row-for-row.
# NOTE(review): the return type of read_parallel_local is not visible here;
# with a fresh 0..n-1 index the assignment is aligned whether it returns a
# list or a Series -- confirm against src/data/read_parallel.py.
df = df.sample(int(len(df)/1.3), random_state=SEED).reset_index(drop=True)
print(len(df))

df['text'] = read_parallel_local(df['id'], "../data/clean/")

485388
199451
153423
Took 6.076287909348806 min to open 153423 files with 20 processes.


In [4]:
df.head()

Unnamed: 0,bill_id,version_number,id,partisan_lean,sc_id,signed,text
0,1227085,1,2375742,0.754207,580-2,0,x back button image: click to go to previous ...
1,1340732,1,2682190,0.236118,562-1,1,<billno> <sponsor> senate joint resolution by...
2,1263736,2,2519963,0.272146,628-1,1,*sb0177.1* january 2020 senate bill no. diges...
3,1200331,3,2659359,0.396835,563-2,0,by:aalang h.b.ano.a3851 a bill to be entitled...
4,1097029,1,2126414,0.1278,576-1,1,| | | search results close toggle navigation ...


In [5]:
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

def sent_to_words(sentences):
    """Lazily tokenize documents, yielding one token list per input item.

    Each item is coerced to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. Per gensim's
    documentation that flag strips accent marks from tokens; punctuation is
    discarded by the tokenizer itself.
    """
    for raw_doc in sentences:
        doc_text = str(raw_doc)
        tokens = gensim.utils.simple_preprocess(doc_text, deacc=True)
        yield tokens
         
# Tokenize every document's text up front; the token lists are reused for
# both the dictionary and the bag-of-words corpus.
data_words = list(sent_to_words(df["text"].tolist()))
texts = data_words

# Vocabulary: token <-> integer id mapping built from all documents.
id2word = corpora.Dictionary(texts)

# Prune the vocabulary: drop tokens seen in fewer than 15 documents or in
# more than half of them, then cap the vocabulary at 100000 tokens.
id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))


In [6]:
# Sanity check: show the tokens of the first document next to its raw text
# so the effect of the preprocessing can be inspected by eye.
print(data_words[0])
df["text"][0]

['back', 'button', 'image', 'click', 'to', 'go', 'to', 'previous', 'page', 'quick', 'search', 'bill', 'numberbill', 'keyword', 'text', 'bill', 'text', 'bill', 'information', 'hr', 'share', 'this', 'bill', 'start', 'california', 'legislature', 'regular', 'session', 'house', 'resolution', 'no', 'introduced', 'by', 'assembly', 'member', 'kamlager', 'dove', 'may', 'relative', 'to', 'the', 'university', 'of', 'california', 'los', 'angeles', 'legislative', 'counsel', 'digest', 'hr', 'as', 'introduced', 'kamlager', 'dove', 'digest', 'key', 'bill', 'text', 'whereas', 'the', 'university', 'of', 'california', 'los', 'angeles', 'was', 'founded', 'on', 'may', 'and', 'whereas', 'in', 'classes', 'began', 'at', 'its', 'current', 'campus', 'in', 'the', 'westwood', 'neighborhood', 'of', 'los', 'angeles', 'and', 'whereas', 'ucla', 'consistently', 'receives', 'more', 'applications', 'for', 'undergraduate', 'admission', 'than', 'any', 'other', 'university', 'in', 'the', 'country', 'with', 'over', 'applica

" x back button image: click to go to previous page   quick search: bill numberbill keyword  >> >> text  bill text bill information hr-37            share this: bill start california legislature regular session house resolution no. 37-introduced by assembly member kamlager-dove  may 2019 relative to the university of california, los angeles. legislative counsel's digest hr as introduced, kamlager-dove. digest key bill text whereas, the university of california, los angeles, was founded on may 1919; and whereas, in classes began at its current campus in the westwood neighborhood of los angeles; and whereas, ucla consistently receives more applications for undergraduate admission than any other university in the country, with over applications received for the academic year; and whereas, in u.s. news and world reports overall hospital rankings, the ronald reagan ucla medical center is ranked first in los angeles, second in california, and seventh in the country; and whereas, ucla is the 

In [7]:
# LDA hyper-parameters: number of latent topics and number of full passes
# over the corpus during training.
n_topics = 40
n_passes = 10

# Build LDA model.
# random_state pins the model's random initialisation so that re-running the
# notebook starts from the same point instead of producing an unrelated
# set of topics every time.
# NOTE(review): LdaMulticore trains with several worker processes, so small
# run-to-run differences may remain even with a fixed seed -- confirm if
# exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=n_topics,
                                       passes=n_passes,
                                       random_state=0)

NameError: name 'pprint' is not defined

In [8]:
# Index the trained LDA model with the whole bag-of-words corpus to obtain
# its per-document topic output.
# NOTE(review): gensim presumably returns a lazy, re-iterable wrapper here
# rather than a materialised list -- confirm before relying on len()/indexing.
doc_lda = lda_model[corpus]

In [9]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os
# Visualize the topics
pyLDAvis.enable_notebook()

# Output files are keyed by the topic count used to train `lda_model`.
# (This cell previously referenced `num_topics`, a name that is never defined
# in the notebook; the training cell defines `n_topics`.)
LDAvis_data_filepath = './lda_results/ldavis_prepared_' + str(n_topics)

# The results folder must exist before anything is written into it.
os.makedirs(os.path.dirname(LDAvis_data_filepath), exist_ok=True)

# Preparing the visualisation is a bit time consuming. Set this flag to
# False to skip the preparation and reuse the pickle written by an earlier run.
PREPARE_LDAVIS = True
if PREPARE_LDAVIS:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the pre-prepared pyLDAvis data from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared

In [None]:
def dense_vector(ldav, n):
    v = [0]*n
    for (i, p) in ldav:
        v[i] = p
    return tuple(v)

# Turn each document's sparse topic output into a fixed-width row so the
# topic weights can be used as a feature matrix (one row per document, one
# column per topic). `n_topics` is the value the LDA model was trained with;
# the previously used name `num_topics` is not defined anywhere in the
# notebook and raised a NameError on a fresh kernel.
doc_topics = [dense_vector(lda_model[bow], n_topics) for bow in corpus]
features = np.asarray(doc_topics)
print(f"Features shape:{features.shape}.")


In [19]:
# Hyper-parameters recorded in the output file name (n_topics, n_passes).
n_topics, n_passes = 40, 10

# Create the output folder on first use.
if os.path.isdir("bow_models") is False:
    os.mkdir("bow_models")

# Persist the trained LDA model together with its dictionary so both can be
# restored as a pair later.
file_name = f"bow_models/lda_{n_topics}_{n_passes}.dat"
with open(file_name, "wb") as out_file:
    pickle.dump([lda_model, id2word], out_file)



  and should_run_async(code)


In [21]:
from gensim.models import LsiModel
# Fit an LSI model on the same bag-of-words corpus, using the same number of
# topics as the LDA model.
lsi_kwargs = dict(corpus=corpus, id2word=id2word, num_topics=n_topics)
lsi_model = LsiModel(**lsi_kwargs)

# Persist the model together with its dictionary in the bow_models folder.
file_name = f"bow_models/lsi_{n_topics}.dat"
with open(file_name, "wb") as out_file:
    pickle.dump([lsi_model, id2word], out_file)

  and should_run_async(code)
