1. predict 'Minor': column labeled 'minor'
2. predict 'Major': column labeled 'major'
3. predict only categories where topic_code.csv (also in opt/data) says include ==1 (I will update this soon.)


How?
1. Tokenize bill text
2. Break into 5000 word chunks per bill
3. Use SVM to predict --- cross-validate to tune
4. Show classification success and print out top coefficients for each category so that we can verify that the model makes sense

In [24]:
import pandas as pd
import numpy as np
import time

## Read Bills dataset (cleaned version)

In [2]:
df = pd.read_csv('../data/bills-out-clean.csv.gz', nrows=10)
df.columns

Index([u'uid', u'Major', u'Minor', u'clean_text'], dtype='object')

In [3]:
df = pd.read_csv('../data/bills-out-clean.csv.gz', usecols=['Major', 'Minor', 'clean_text'])

## Split long bill to smaller chunk (5000 words)

In [4]:
import re
def insert_split_marker(text, wc=1000):
    text = re.sub('\d+', '', text)
    words = text.split()
    out = ''
    for i, w in enumerate(words):
        if i != 0 and i % wc == 0:
            out += ' | ' + w
        else:
            out += ' ' + w
    return out

In [5]:
df['clean_text'] = df['clean_text'].apply(lambda c: insert_split_marker(c, 5000))

In [6]:
s = df['clean_text'].str.split('|', expand=True).stack()
i = s.index.get_level_values(0)
new_df = df.loc[i].copy()
new_df['chunk'] = s.index.get_level_values(1)
new_df['clean_text'] = s.values
new_df

Unnamed: 0,Major,Minor,clean_text,chunk
0,20,2012,congression bill th congress us govern print ...,0
0,20,2012,period end date evennumb calendar year regula...,1
1,3,300,congression bill th congress us govern print ...,0
1,3,300,health servic act amend section respect agree...,1
1,3,300,establish criteria valid regulatori accept al...,2
1,3,300,program sec revis extens certain program titl...,3
2,15,1520,congression bill th congress us govern print ...,0
3,20,2000,congression bill th congress us govern print ...,0
4,15,1522,congression bill th congress us govern print ...,0
5,1,107,congression bill th congress us govern print ...,0


## Vectorize

In [7]:
import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
import re
import string

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

#def tokenize(text):
#    tokens = nltk.word_tokenize(text)
#    stems = stem_tokens(tokens, stemmer)
#    return stems

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

In [9]:
#vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=0.01)
vect = CountVectorizer(ngram_range=(2, 3), min_df=0.01) 
#vect = CountVectorizer(ngram_range=(2, 3)) 
vect.fit(df.clean_text)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(2, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
len(vect.get_feature_names())

10114

In [13]:
ng_df = pd.DataFrame(vect.get_feature_names())
ng_df.columns = ['ngram']
ng_df

Unnamed: 0,ngram
0,aa amend
1,aa amend strike
2,aa strike
3,aa titl
4,ab amend
5,ab amend strike
6,ab titl
7,ac amend
8,academ year
9,academi scienc


In [14]:
#ng_df.to_csv('../data/bills-23gram.csv', index=False)

In [68]:
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print classlabel, feat, coef

def most_informative_feature_for_class_svm(vectorizer, classifier,  n=10):
    labelid = 3 # this is the coef we're interested in. 
    feature_names = vectorizer.get_feature_names()
    svm_coef = classifier.coef_.toarray() 
    topn = sorted(zip(svm_coef[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print feat, coef

def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label,
              " | ".join(feature_names[j] for j in top10)))

## Model (Major)

In [15]:
df.groupby('Major').agg({'Major': 'count'})

Unnamed: 0_level_0,Major
Major,Unnamed: 1_level_1
1,1021
2,539
3,2869
4,558
5,1429
6,1153
7,1228
8,979
10,973
12,1570


In [17]:
X = df.clean_text
y = df.Major

X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [20]:
X_train = vect.transform(X_train)
transformer = TfidfTransformer()
X_train = transformer.fit_transform(X_train)

In [22]:
X_test = vect.transform(X_test)
transformer = TfidfTransformer()
X_test = transformer.fit_transform(X_test)

In [40]:
# Perform classification with SVM, kernel=linear
classifier_liblinear = svm.LinearSVC()
t0 = time.time()
classifier_liblinear.fit(X_train, y_train)
t1 = time.time()
prediction_liblinear = classifier_liblinear.predict(X_test)
t2 = time.time()
time_liblinear_train = t1-t0
time_liblinear_predict = t2-t1

print("Results for LinearSVC")
print("Training time: %fs; Prediction time: %fs" % (time_liblinear_train, time_liblinear_predict))
print(classification_report(y_test, prediction_liblinear))

Results for LinearSVC()
Training time: 9.335489s; Prediction time: 0.052043s
             precision    recall  f1-score   support

          1       0.52      0.60      0.55       204
          2       0.57      0.35      0.43       108
          3       0.83      0.89      0.86       574
          4       0.72      0.65      0.69       112
          5       0.76      0.73      0.75       286
          6       0.81      0.79      0.80       231
          7       0.76      0.69      0.72       246
          8       0.76      0.76      0.76       196
         10       0.71      0.77      0.74       195
         12       0.70      0.72      0.71       314
         13       0.71      0.69      0.70       130
         14       0.70      0.65      0.67        96
         15       0.67      0.67      0.67       337
         16       0.69      0.69      0.69       296
         17       0.70      0.70      0.70       103
         18       0.93      0.93      0.93       479
         19       0.6

In [50]:
most_informative_feature_for_class(vect, classifier_liblinear, 99)
#most_informative_feature_for_class_svm(vect, classifier_linear)

99 appropri fee 0.761935747865
99 oper nation 0.806987489961
99 servic render 0.848222848601
99 shall paid 1.00985892779
99 legisl day 1.01397596396
99 time limit 1.33071359277
99 judiciari bill relief 1.66424232091
99 presid author 1.66799803254
99 bill relief 2.99838258892
99 session relief 3.05795669245


In [69]:
print_top10(vect, classifier_liblinear, classifier_liblinear.classes_)

1: receiv individu | avail secretari | basi properti | recipi assist | minimum tax | made decemb | domest product | section tax | congression budget | feder tax
2: may obtain | civil right | right act | account number | may disclos | person identifi | employ opportun | person inform | nation origin | amend constitut
3: medic servic | children health | titl xviii | amend titl xviii | public health servic | health insur | health care | prescript drug | care act | medic care
4: agricultur bill | relat tax | agricultur commod | committe agricultur bill | rule certain | section ae | refer committe agricultur | secretari agricultur | committe agricultur | section agricultur
5: individu employ | depart labor | plan purpos | immigr nation | occup safeti | individu retir | respons act | retir incom | job train | unemploy compens
6: elementari secondari | profession develop | higher educ | educ act | educ expens | educ assist | relat expens | student loan | public school | secretari educ
7: cont

In [61]:
from sklearn.linear_model import SGDClassifier

elastic_clf = SGDClassifier(loss='log', alpha=.00002, n_iter=200, penalty="elasticnet")
t0 = time.time()
elastic_clf.fit(X_train, y_train)
t1 = time.time()
prediction_elastic = elastic_clf.predict(X_test)
t2 = time.time()
time_elastic_train = t1-t0
time_elastic_predict = t2-t1

print("Results for Elastic Net")
print("Training time: %fs; Prediction time: %fs" % (time_elastic_train, time_elastic_predict))
print(classification_report(y_test, prediction_elastic))

Results for Elastic Net
Training time: 404.283877s; Prediction time: 0.067536s
             precision    recall  f1-score   support

          1       0.46      0.67      0.55       204
          2       0.69      0.32      0.44       108
          3       0.82      0.88      0.85       574
          4       0.68      0.62      0.65       112
          5       0.78      0.71      0.74       286
          6       0.82      0.78      0.80       231
          7       0.79      0.64      0.71       246
          8       0.79      0.73      0.76       196
         10       0.71      0.74      0.73       195
         12       0.67      0.70      0.69       314
         13       0.74      0.68      0.71       130
         14       0.73      0.60      0.66        96
         15       0.64      0.65      0.64       337
         16       0.66      0.67      0.66       296
         17       0.72      0.63      0.67       103
         18       0.92      0.92      0.92       479
         19       0

In [62]:
most_informative_feature_for_class(vect, elastic_clf, 99)

99 secretari treasuri 2.27162756979
99 refer committe judiciari 2.33396247279
99 servic render 2.46314464558
99 custom servic 2.5113799946
99 immigr visa 3.08030179582
99 presid author 3.46982699131
99 time limit 3.76281213036
99 judiciari bill relief 4.85264480504
99 session relief 8.33809740652
99 bill relief 8.43836098626


In [70]:
print_top10(vect,  elastic_clf,  elastic_clf.classes_)

1: econom growth | reconcili act | minimum tax | feder reserv | secretari treasuri | tax relief | incom tax | fiscal year | feder tax | congression budget
2: feder offic | privat properti | person identifi | employ opportun | nation origin | account number | amend constitut | right act | civil right | person inform
3: public health servic | health benefit | social secur act | medicar program | titl xviii | health insur | medic care | care act | health care | prescript drug
4: invest act | secretari shall | depart agricultur | agricultur bill | agricultur commod | committe agricultur bill | refer committe agricultur | secretari agricultur | section agricultur | committe agricultur
5: occup safeti | respons act | nation act | labor relat | immigr nation act | individu retir | immigr nation | job train | child care | unemploy compens
6: educ institut | elementari secondari | educ expens | relat expens | educ act | higher educ | student loan | secretari educ | public school | educ assist
7

## Model (Minor)

In [84]:
sdf = df.groupby('Minor').agg({'Minor': 'count'})
minors = sdf[sdf.Minor > 200].index
minors

Int64Index([ 107,  208,  301,  302,  321,  331,  332,  402,  503,  530,  601,
             602,  709,  803, 1003, 1007, 1204, 1208, 1209, 1303, 1501, 1521,
            1523, 1608, 1609, 1807, 2002, 2004, 2008, 2012, 2101, 2102, 2103,
            2104, 9999],
           dtype='int64', name=u'Minor')

In [85]:
X = df[df.Minor.isin(minors)].clean_text
y = df[df.Minor.isin(minors)].Minor

X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [86]:
X_train = vect.transform(X_train)
transformer = TfidfTransformer()
X_train = transformer.fit_transform(X_train)

In [87]:
X_test = vect.transform(X_test)
transformer = TfidfTransformer()
X_test = transformer.fit_transform(X_test)

In [88]:
# Perform classification with SVM, kernel=linear
classifier_liblinear = svm.LinearSVC()
t0 = time.time()
classifier_liblinear.fit(X_train, y_train)
t1 = time.time()
prediction_liblinear = classifier_liblinear.predict(X_test)
t2 = time.time()
time_liblinear_train = t1-t0
time_liblinear_predict = t2-t1

print("Results for LinearSVC()")
print("Training time: %fs; Prediction time: %fs" % (time_liblinear_train, time_liblinear_predict))
print(classification_report(y_test, prediction_liblinear))

Results for LinearSVC()
Training time: 5.939257s; Prediction time: 0.039521s
             precision    recall  f1-score   support

        107       0.65      0.82      0.72       152
        208       0.71      0.48      0.57        42
        301       0.50      0.40      0.44        43
        302       0.59      0.60      0.59        95
        321       0.77      0.73      0.75        41
        331       0.61      0.67      0.64        42
        332       0.60      0.66      0.63        41
        402       0.77      0.73      0.75        41
        503       0.69      0.68      0.68        81
        530       0.80      0.90      0.84        97
        601       0.88      0.70      0.78        87
        602       0.78      0.82      0.80        65
        709       0.77      0.72      0.74        67
        803       0.68      0.65      0.67        63
       1003       0.83      0.80      0.82        50
       1007       0.80      0.75      0.77        52
       1204       0.5

In [90]:
most_informative_feature_for_class(vect, classifier_liblinear, 9999)

9999 person violat 0.843023633852
9999 refer committe judiciari 0.886832131244
9999 sec nation 0.897203798013
9999 usc section 1.04024143168
9999 servic render 1.04533424117
9999 presid author 1.54521015975
9999 judiciari bill relief 1.695195477
9999 time limit 1.86713237437
9999 bill relief 3.10942289715
9999 session relief 3.16347633549


In [91]:
print_top10(vect, classifier_liblinear, classifier_liblinear.classes_)

107: made date | basi properti | made decemb | intern revenu | invest compani | incom tax | princip resid | feder tax | reconcili act | minimum tax
208: inform obtain | requir state | written consent | use inform | provis act | inform individu | person identifi | inform act | account number | person inform
301: amend strike | amount provid | health center | commun health | urban area | racial ethnic | medic care | rural area | health care | care act
302: limit number | elig individu | individu enrol | benefit part | care act | rule certain | medic care | medic treatment | medic servic | health insur
321: food drug administr | feder food | feder food drug | act respect | relat tax | final rule | emerg medic | market price | food drug | prescript drug
331: diseas control prevent | control prevent | diseas control | shall ensur | center diseas | grant loan | medicar program | period servic | public health | diagnosi treatment
332: state plan | section act usc | famili incom | benefit serv