# Pre-process Bills

Write cleaned (stopword-filtered, lemmatized, lowercased) bill text to a file as space-separated tokens, so that the NLP parsing does not have to be repeated.

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local code
# (presumably the `moods` module used below) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [8]:
import glob
import moods
import gzip
import random
import pandas as pd
# Fixed seed so the shuffled processing order is identical on every
# Restart & Run All.
SHUFFLE_SEED = 0

# glob returns paths in arbitrary, filesystem-dependent order; sorting first
# gives the seeded shuffle a stable starting sequence.
billpaths = sorted(glob.glob('/data/bills/txt/*'))

# A dedicated Random instance shuffles in place without touching the global
# `random` state that other code may rely on.
random.Random(SHUFFLE_SEED).shuffle(billpaths)

# Write cleaned sections to file

Parse the bills, remove stopwords, and write whitespace-joined versions of the text to file: one file with all cleaned tokens and one with nouns only.

In [None]:
# Write one record per bill line to two gzip files: every cleaned token, and
# a noun-only variant. Each record is "<bill name>_<line index>\t<tokens>\n",
# encoded as UTF-8.
all_tokens_path = '/data/bills/cleaned/by_para.gz'
noun_tokens_path = '/data/bills/cleaned/nouns_by_para.gz'

with gzip.open(all_tokens_path, 'wb') as all_out, gzip.open(noun_tokens_path, 'wb') as noun_out:
    for bill_path in billpaths:
        bill = moods.Bill(bill_path)
        for idx, doc in enumerate(bill.lines(nlp_doc=True)):
            # (output handle, extra keyword arguments for clean_words);
            # the full-token file comes first, then the noun-only file.
            targets = (
                (all_out, {}),
                (noun_out, {'pos': ['NOUN', 'PROPN']}),
            )
            for sink, clean_kwargs in targets:
                tokens = moods.clean_words(doc, **clean_kwargs)
                record = "%s_%d\t%s\n" % (bill.name, idx, " ".join(tokens))
                sink.write(record.encode('utf-8'))

In [19]:
# Sanity check: show the first record of the cleaned paragraph file.
# Reads 'by_para.gz', the file this notebook writes and later loads with
# pandas, rather than a differently named archive no cell produces.
with gzip.open('/data/bills/cleaned/by_para.gz', 'rb') as cleaned_file:
    # readline() decompresses only the first record instead of loading the
    # whole archive into memory.
    first_record = cleaned_file.readline()

# An empty file yields b''; print nothing in that case.
if first_record:
    print(first_record)

b'115_H-R-_4840_0\t115th congress public law public law 115th congress act designate facility united states postal service locate east franklin street oviedo florida sergeant class alwyn crendall cashe post office building note july enact senate house representatives united states america congress assemble section sergeant class alwyn crendall cashe post office building\n'


## Save folded lists (by bill, by congress, by sponsor, by part)

In [58]:
# Load the per-paragraph file: column 1 is the record key
# "<congress>_<sponsor>_<billno>_<part>", column 2 the space-joined tokens.
df = pd.read_csv(
    '/data/bills/cleaned/by_para.gz',
    sep='\t',
    names=['name', 'txt'],
)

# Break the key into its four underscore-separated fields.
df[['congress', 'sponsor', 'billno', 'part']] = df['name'].str.split('_', expand=True)

# Bill-level identifier: the key without the trailing part index.
df['title'] = df['congress'].str.cat([df['sponsor'], df['billno']], sep='_')

# For each grouping level, concatenate the text of all rows in a group
# (missing text counts as an empty string) and write "<group>\t<text>" rows.
for level in ('title', 'congress', 'sponsor', 'part'):
    folded = (
        df['txt']
        .fillna('')
        .groupby(df[level])
        .agg(" ".join)
        .reset_index()
    )
    folded.to_csv(
        '/data/bills/cleaned/by_%s.gz' % level,
        compression='gzip',
        sep='\t',
        index=False,
        header=False,
    )

Unnamed: 0,name,txt,congress,sponsor,billno,part,title
0,115_H-R-_4840_0,115th congress public law public law 115th con...,115,H-R-,4840,0,115_H-R-_4840
1,115_H-R-_4840_1,designation facility united states postal serv...,115,H-R-,4840,1,115_H-R-_4840
2,115_H-R-_4840_2,shall deem reference sergeant class alwyn cren...,115,H-R-,4840,2,115_H-R-_4840
3,115_H-R-_4840_3,congressional record vol consider pass house j...,115,H-R-,4840,3,115_H-R-_4840
4,115_H-R-_4840_4,,115,H-R-,4840,4,115_H-R-_4840


In [10]:
# Save noun-only version
#
# The noun-only paragraphs are loaded into their own frame, and every
# following step uses that frame, so the nouns_by_* files hold noun-only text.
# (Previously the frame was bound to an unused name and the folds ran on `df`,
# the full-token data.)
nouns_df = pd.read_csv('/data/bills/cleaned/nouns_by_para.gz', sep='\t', names=['name', 'txt'])

# Record key is "<congress>_<sponsor>_<billno>_<part>"; split it into fields.
nouns_df[['congress', 'sponsor', 'billno', 'part']] = nouns_df['name'].str.split('_', expand=True)

# Bill-level identifier: the key without the trailing part index.
nouns_df['title'] = nouns_df['congress'] + '_' + nouns_df['sponsor'] + '_' + nouns_df['billno']

# For each grouping level, join the text of all rows in a group (missing text
# counts as an empty string) and write "<group>\t<text>" rows.
for n in ['title', 'congress', 'sponsor', 'part']:
    nouns_folded = (
        nouns_df['txt']
        .fillna('')
        .groupby(nouns_df[n])
        .agg(" ".join)
        .reset_index()
    )
    nouns_folded.to_csv('/data/bills/cleaned/nouns_by_%s.gz' % n, compression='gzip', sep='\t', index=False, header=False)