5. Predict topics for congressional speech. The data is in opt/cong (or you can download it via the capitolwords API using the script you wrote). We want predictions for each congress.
   Predict each category: if you have 9 categories, you will get 9 columns.

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Congress number to process; it selects the input CSV and names the output CSV.
NTH = 112

## Read Congressional speech dataset

In [3]:
# Read only the first 10 rows, just to list the columns the file provides.
# (Disabled alternative input kept for reference:)
#df = pd.read_csv('../cong/cong_107_reg.csv', nrows=10)
df = pd.read_csv('../cong/capitolwords_clean_%d_dw.csv' % NTH, nrows=10)
df.columns

Index([u'biocong', u'speaker_state', u'speaker_raw', u'speaker_first',
       u'congress', u'title', u'origin_url', u'number', u'id', u'volume',
       u'chamber', u'session', u'speaker_last', u'pages', u'speaker_party',
       u'date', u'bills', u'bioguide_id', u'order', u'speaking',
       u'capitolwords_url', u'bills1', u'bills1n', u'chamb', u'bills_unique1',
       u'major', u'minor', u'billnum', u'idno', u'cong', u'state', u'cd',
       u'statenm', u'party', u'name', u'dwnom1', u'dwnom2', u'bootse1',
       u'bootse2', u'corr12', u'logl', u'nchoices', u'errors', u'gmp',
       u'fullname', u'bioguide_id_1'],
      dtype='object')

In [4]:
# Reload the whole file, keeping only the columns used afterwards:
# the speech text ('speaking') plus 'date' and 'party' for the output table.
df = pd.read_csv('../cong/capitolwords_clean_%d_dw.csv' % NTH, usecols=['party', 'date', 'speaking'])
# Preview the first rows instead of rendering the entire frame.
df.head(10)

Unnamed: 0,date,speaking,party
0,2011-05-13,mr speaker rise today seek congression recogni...,100
1,2012-06-06,mr speaker rise today celebr life evelyn weins...,100
2,2012-11-16,mr speaker rise today heavi heart tragic two c...,100
3,2012-02-29,mr speaker rise today recognit februari nation...,100
4,2011-10-13,mr speaker rise support korea colombia panama ...,100
5,2011-07-27,mr speaker tuesday juli inadvert vote aye roll...,100
6,2011-04-08,mr speaker rise today congratul chabad port wa...,100
7,2012-05-31,motion recommit speaker say motion thank mr sp...,100
8,2011-12-20,mr speaker rise today seek congression recogni...,100
9,2011-06-22,mr speaker rise today celebr recognit annivers...,100


In [5]:
# Disabled debugging aid: list the speeches containing more than 5000 spaces.
#df[df.speaking.str.count(' ') > 5000]

In [6]:
import re
def insert_split_marker(text, wc=1000):
    """Re-join the words of `text`, inserting ' | ' before every `wc`-th word.

    `text` is split on whitespace. Each word is emitted with one leading
    space, except the words at 0-based positions wc, 2*wc, ... which are
    preceded by ' | ' instead. A non-empty result therefore starts with a
    space, and empty or whitespace-only input gives ''.

    Parameters
    ----------
    text : str
        The text to mark.
    wc : int
        Number of words between consecutive markers.

    Returns
    -------
    str
        The marked text.
    """
    pieces = []
    for position, word in enumerate(text.split()):
        # Never mark the very first word, only later multiples of wc.
        at_boundary = position > 0 and position % wc == 0
        pieces.append((' | ' if at_boundary else ' ') + word)
    return ''.join(pieces)

In [7]:
# Insert a ' | ' marker every 5000 words so long speeches can be split into
# separate rows afterwards.
# Missing speeches are replaced by '' first: astype(str) alone would turn NaN
# into the literal text 'nan', which would then be treated as speech content.
df['speaking'] = df['speaking'].fillna('').astype(str).apply(lambda c: insert_split_marker(c, 5000))

In [8]:
# Split every speech on '|' into one column per chunk, then stack the columns
# into a single Series indexed by (original row, chunk number). By default
# stack() drops the missing cells of speeches that have fewer chunks.
s = df['speaking'].str.split('|', expand=True).stack()
# Original row label of each chunk (repeated for multi-chunk speeches).
i = s.index.get_level_values(0)
# Repeat the source row once per chunk so the other columns carry over.
new_df = df.loc[i].copy()
# Position of the chunk within its speech (0 for the first chunk).
new_df['chunk'] = s.index.get_level_values(1)
new_df['speaking'] = s.values
# Renumber the rows 0..n-1 and replace df with the chunk-level frame.
df = new_df.reset_index(drop=True)

In [9]:
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfTransformer

## Major Model

In [10]:
# Paths of the saved vectorizer and classifier that are loaded with joblib.
vect_name = '../models/vec_count_bills_23gram.joblib'
clf_name = '../models/major_bills_clf_liblinear.joblib'

In [11]:
# Load the saved vectorizer and classifier from disk.
# NOTE(review): joblib.load unpickles the file, so only load trusted model files.
vect = joblib.load(vect_name)
clf = joblib.load(clf_name)
# Number of entries in the loaded vectorizer's vocabulary.
len(vect.vocabulary_)

244682

In [12]:
# Turn every speech chunk into a feature vector with the loaded vectorizer.
X = vect.transform(df.speaking)
# NOTE(review): the TF-IDF weights are fitted here on the speech matrix itself
# (fit_transform) rather than loaded from disk -- confirm that this matches
# the features clf was trained on.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)
# Predicted class label for each chunk.
y_pred = clf.predict(X)

In [13]:
# Decision-function scores of the classifier for every chunk, wrapped in a
# DataFrame so the columns can be given names.
conf = clf.decision_function(X)
conf_df = pd.DataFrame(conf)

In [14]:
# Lookup table with a 'code' and a 'topic' column.
topics_df = pd.read_csv('../data/topic_code.csv')
# One topic name per classifier class, in the order of clf.classes_;
# the first matching row of the table is used for each code.
columns = [
    topics_df.loc[topics_df.code == c, 'topic'].values[0]
    for c in clf.classes_
]
# Replace the positional score column labels with the topic names.
conf_df.columns = columns

In [15]:
# Both frames must have the same number of rows; otherwise the column-wise
# concat would silently pad the shorter one with NaN.
assert len(df) == len(conf_df), 'df and conf_df have different lengths'
# Put date and party next to the per-topic scores (rows are aligned on the index).
result = pd.concat([df[['date', 'party']], conf_df], axis=1)
# Preview the first rows instead of rendering the whole table.
result.head(10)

Unnamed: 0,date,party,Macroeconomics,"Civil Rights, Minority Issues, and Civil Liberties",Health,Agriculture,"Labor, Employment, and Immigration",Education,Environment,Energy,...,Social Welfare,Community Development and Housing Issues,"Banking, Finance, and Domestic Commerce",Defense,"Space, Science, Technology and Communications",Foreign Trade,International Affairs and Foreign Aid,Government Operations,Public Lands and Water Management,"Other, Miscellaneous, and Human Interest"
0,2011-05-13,100,-1.287708,-1.059234,-0.764554,-0.988508,-0.911946,-0.881701,-0.875769,-0.812527,...,-0.888988,-0.898261,-0.759266,-1.167845,-0.875305,-1.034442,-0.848803,-0.784109,-0.749500,-1.142688
1,2012-06-06,100,-1.200736,-1.084577,-0.835755,-1.085810,-0.726804,-0.907196,-0.930183,-1.010703,...,-0.607531,-0.777010,-0.981621,-0.794522,-1.054590,-1.051958,-0.973204,-1.033517,-0.895963,-1.170943
2,2012-11-16,100,-1.029702,-1.047465,-0.675425,-0.973712,-0.959931,-0.749910,-0.890840,-0.888923,...,-1.021326,-0.956638,-0.881934,-0.904438,-1.016113,-0.975038,-0.816640,-0.674310,-0.889367,-1.116560
3,2012-02-29,100,-1.109533,-1.016445,-0.236678,-0.999440,-0.948862,-0.866812,-1.036200,-0.955548,...,-1.014551,-0.992536,-0.758786,-0.844287,-0.966489,-1.008745,-0.912412,-0.791645,-0.763000,-1.152004
4,2011-10-13,100,-1.152204,-1.052112,-0.847333,-0.923112,-1.074454,-0.962567,-0.848192,-0.855359,...,-0.994695,-1.019641,-0.974316,-1.081345,-0.982380,-0.077241,-0.797338,-1.018184,-0.777874,-1.177953
5,2011-07-27,100,-1.065748,-0.969707,-0.693393,-0.943766,-0.846071,-0.882804,-0.872835,-0.814268,...,-0.938813,-0.971865,-0.827461,-0.878129,-0.962900,-0.932060,-0.890127,-0.848255,-0.668266,-1.142674
6,2011-04-08,100,-1.105251,-1.044734,-0.741239,-0.983405,-0.895770,-0.787140,-0.919578,-0.849373,...,-0.661535,-0.941902,-0.845310,-0.965073,-0.915162,-0.969112,-0.841480,-0.923792,-0.671970,-1.150388
7,2012-05-31,100,-1.065748,-0.969707,-0.693393,-0.943766,-0.846071,-0.882804,-0.872835,-0.814268,...,-0.938813,-0.971865,-0.827461,-0.878129,-0.962900,-0.932060,-0.890127,-0.848255,-0.668266,-1.142674
8,2011-12-20,100,-1.183811,-1.056398,-0.843430,-1.029007,-1.044644,-0.617167,-0.864454,-0.924046,...,-1.022108,-0.987349,-0.859287,-1.027348,-0.994517,-0.952324,-0.362683,-1.174167,-0.818898,-1.096667
9,2011-06-22,100,-1.107367,-0.981371,-1.048833,-0.904000,-0.684830,-1.129451,-0.856543,-0.921107,...,-0.984263,-0.928064,-0.997982,-0.503159,-0.870078,-0.935322,-0.858542,-1.010862,-0.637788,-1.089713


In [16]:
# Save the result table to a CSV named after the congress number, without the row index.
result.to_csv('../data/congress_%d_predict_major.csv' % NTH, index=False)