5. Predict topics for congressional speeches. The data is in opt/cong (or you can download it via the Capitol Words API using the script you wrote). We want predictions for each Congress.
   Predict each category: if you have 9 categories, you will get 9 columns.

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Number of the Congress to process; it is substituted into the input and
# output CSV file names used by the cells below.
NTH = 107

## Read Congressional speech dataset

In [3]:
#df = pd.read_csv('../cong/cong_107_reg.csv', nrows=10)
# Load only the first 10 rows -- enough to list the columns the file offers
# without reading every speech.
preview_csv = '../cong/capitolwords_clean_%d_dw.csv' % NTH
df = pd.read_csv(preview_csv, nrows=10)
df.columns

Index([u'biocong', u'speaker_state', u'speaker_raw', u'speaker_first',
       u'congress', u'title', u'origin_url', u'number', u'id', u'volume',
       u'chamber', u'session', u'speaker_last', u'pages', u'speaker_party',
       u'date', u'bills', u'bioguide_id', u'order', u'speaking',
       u'capitolwords_url', u'bills1', u'bills1n', u'chamb', u'bills_unique1',
       u'major', u'minor', u'billnum', u'idno', u'cong', u'state', u'cd',
       u'statenm', u'party', u'name', u'dwnom1', u'dwnom2', u'bootse1',
       u'bootse2', u'corr12', u'logl', u'nchoices', u'errors', u'gmp',
       u'fullname', u'bioguide_id_1'],
      dtype='object')

In [4]:
# Full load, restricted to the three columns the prediction needs.
wanted_cols = ['party', 'date', 'speaking']
df = pd.read_csv('../cong/capitolwords_clean_%d_dw.csv' % NTH,
                 usecols=wanted_cols)
df

Unnamed: 0,date,speaking,party
0,2011-05-13,mr speaker rise today seek congression recogni...,100
1,2012-06-06,mr speaker rise today celebr life evelyn weins...,100
2,2012-11-16,mr speaker rise today heavi heart tragic two c...,100
3,2012-02-29,mr speaker rise today recognit februari nation...,100
4,2011-10-13,mr speaker rise support korea colombia panama ...,100
5,2011-07-27,mr speaker tuesday juli inadvert vote aye roll...,100
6,2011-04-08,mr speaker rise today congratul chabad port wa...,100
7,2012-05-31,motion recommit speaker say motion thank mr sp...,100
8,2011-12-20,mr speaker rise today seek congression recogni...,100
9,2011-06-22,mr speaker rise today celebr recognit annivers...,100


In [5]:
#df[df.speaking.str.count(' ') > 5000]

In [6]:
import re
def insert_split_marker(text, wc=1000):
    """Insert a ``' | '`` marker after every ``wc`` words of ``text``.

    The text is tokenised with ``str.split()``, so runs of whitespace collapse
    to single spaces in the result.

    Parameters
    ----------
    text : str
        Text to mark up.
    wc : int, optional
        Maximum number of words between two markers (default 1000).

    Returns
    -------
    str
        The words re-joined with single spaces, with ``' | '`` between
        consecutive groups of ``wc`` words. No leading or trailing whitespace
        is added; empty or whitespace-only input gives ``''``.

    Raises
    ------
    ValueError
        If ``wc`` is not positive.
    """
    if wc <= 0:
        raise ValueError('wc must be a positive integer, got %r' % (wc,))
    words = text.split()
    # Join once per group rather than growing a string word by word.
    groups = [' '.join(words[start:start + wc])
              for start in range(0, len(words), wc)]
    return ' | '.join(groups)

In [7]:
# Words per chunk: speeches longer than this are cut into several pieces.
CHUNK_WORDS = 5000

# Missing speeches are NaN; astype(str) alone would turn them into the literal
# text 'nan', so blank them out first.
df['speaking'] = (
    df['speaking']
    .fillna('')
    .astype(str)
    .apply(lambda c: insert_split_marker(c, CHUNK_WORDS))
)

In [8]:
# One row per '|'-delimited piece: stack() turns the expanded split into a
# Series indexed by (original row label, piece number).
pieces = df['speaking'].str.split('|', expand=True).stack()
source_rows = pieces.index.get_level_values(0)
piece_numbers = pieces.index.get_level_values(1)

# Repeat each source row once per piece, then overwrite the text with the
# piece itself and record its position within the speech.
df = (
    df.loc[source_rows]
    .assign(chunk=piece_numbers, speaking=pieces.values)
    .reset_index(drop=True)
)

In [9]:
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfTransformer

## Topic-1 Model

In [10]:
# Serialized artifacts used for the topic-1 predictions: the vectorizer that
# turns text into features, and the classifier applied to those features.
vect_name = '../models/vec_count_bills_23gram.joblib'
clf_name = '../models/topic_1_bills_clf_liblinear.joblib'

In [11]:
# joblib.load unpickles the files, which can execute arbitrary code -- only
# point these paths at model files from a trusted source.
vect = joblib.load(vect_name)
clf = joblib.load(clf_name)
# Vocabulary size of the loaded vectorizer (displayed as the cell output).
len(vect.vocabulary_)

244682

In [12]:
# Vectorize each chunk, then re-weight the features with TF-IDF.
# NOTE(review): the IDF weights are fitted on this prediction corpus. If the
# classifier was trained on features weighted by a TF-IDF transformer fitted
# on its training data, that fitted transformer should be loaded and applied
# with .transform() instead -- confirm against the training notebook.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(vect.transform(df.speaking))
# Hard label per chunk.
# NOTE(review): y_pred is not referenced by any later cell shown here.
y_pred = clf.predict(X)

In [13]:
# Raw decision scores from the classifier for every chunk, wrapped in a
# DataFrame so the columns can be labelled with topic names afterwards.
conf = clf.decision_function(X)
conf_df = pd.DataFrame(conf)

In [14]:
topics_df = pd.read_csv('../data/topic_code.csv')

# Build the code -> topic-name lookup once instead of filtering the frame for
# every class. drop_duplicates keeps the first row per code, so the first
# listed name wins if a code appears more than once.
unique_topics = topics_df.drop_duplicates(subset='code')
topic_by_code = dict(zip(unique_topics['code'], unique_topics['topic']))

# Fail with the offending codes named, rather than an opaque IndexError.
unknown = [c for c in clf.classes_ if c not in topic_by_code]
if unknown:
    raise KeyError('no topic name in topic_code.csv for class code(s): %r'
                   % (unknown,))

# Label the score columns following the order of clf.classes_.
columns = [topic_by_code[c] for c in clf.classes_]
conf_df.columns = columns

In [15]:
# Put the speech metadata next to the per-topic scores. The two frames are
# joined on their row index, so this relies on df having been re-indexed
# 0..n-1 after chunking.
# NOTE(review): 'chunk' is not carried over, so pieces of one long speech
# appear as separate rows with the same date and party.
speech_meta = df[['date', 'party']]
result = pd.concat([speech_meta, conf_df], axis=1)
result

Unnamed: 0,date,party,"Inflation, Prices, and Interest Rates",National Budget and Debt,"Taxation, Tax policy, and Tax Reform",Ethnic Minority and Racial Group Discrimination,Gender and Sexual Orientation Discrimination,Handicap or Disease Discrimination,"Voting Rights, Participation, and Related Issues",Freedom of Speech & Religion,...,Government Efficiency and Bureaucratic Oversight,Postal Service Issues (Including Mail Fraud),Nominations and Appointments,"Currency, Commemorative Coins, Medals, U.S. Mint",IRS Administration,"Federal Government Branch Relations and Administrative Issues, Congressional Operations","Regulation of Political Campaigns, Political Advertising, PAC regulation, Government Ethics",Census,"National Parks, Memorials, Historic Sites, and Recreation",Native American Affairs
0,2011-05-13,100,-1.053799,-1.082654,-1.118574,-1.037563,-1.043315,-1.034005,-1.044886,-1.040380,...,-1.024780,-1.036827,-1.047371,-1.044257,-1.086371,-0.894309,-1.006763,-0.970190,-0.927364,-0.931605
1,2012-06-06,100,-1.034199,-1.077097,-1.148049,-1.023602,-1.072300,-1.034652,-1.029499,-1.067686,...,-1.050018,-1.037001,-0.976492,-1.083964,-1.088072,-1.063623,-1.072820,-1.017375,-0.899395,-1.000065
2,2012-11-16,100,-1.031233,-1.075836,-0.952338,-1.034389,-1.048747,-1.029742,-1.042180,-1.040106,...,-0.984418,-1.055574,-1.060207,-0.980101,-1.076817,-0.868405,-0.978913,-1.014454,-0.959801,-0.961985
3,2012-02-29,100,-1.029576,-1.094841,-0.990757,-1.031279,-0.969749,-1.020204,-1.010210,-1.046663,...,-1.024556,-1.039647,-1.018949,-1.022220,-1.092745,-0.929577,-1.011078,-1.007622,-0.961526,-0.939298
4,2011-10-13,100,-1.038398,-1.059052,-1.044244,-1.047271,-1.059165,-1.035467,-1.027627,-1.043505,...,-1.094401,-1.034618,-1.052510,-1.026144,-1.095859,-0.952080,-1.054031,-1.008841,-0.960573,-0.963161
5,2011-07-27,100,-1.030535,-1.067803,-0.986884,-1.023464,-1.038935,-1.034369,-1.023792,-1.021740,...,-0.974220,-1.026989,-1.049473,-1.023114,-1.084648,-0.952553,-0.999121,-1.011407,-0.914692,-0.924935
6,2011-04-08,100,-1.023635,-1.053400,-1.061328,-1.038203,-1.049424,-1.045094,-1.030696,-0.999642,...,-1.002479,-1.018913,-1.053660,-1.016236,-1.083729,-0.974314,-1.024230,-1.022137,-0.889120,-0.928214
7,2012-05-31,100,-1.030535,-1.067803,-0.986884,-1.023464,-1.038935,-1.034369,-1.023792,-1.021740,...,-0.974220,-1.026989,-1.049473,-1.023114,-1.084648,-0.952553,-0.999121,-1.011407,-0.914692,-0.924935
8,2011-12-20,100,-1.042633,-1.076675,-1.057609,-1.012435,-1.070403,-1.039345,-0.994809,-1.081340,...,-1.050925,-1.064098,-1.051850,-1.025789,-1.091760,-1.028308,-1.045355,-0.998431,-0.873941,-1.020470
9,2011-06-22,100,-1.037521,-1.057160,-1.029128,-1.067383,-1.034024,-1.037787,-1.029731,-1.076428,...,-1.026235,-1.015349,-1.058077,-1.197407,-1.093987,-1.016912,-0.959771,-1.016348,-0.595788,-0.888860


In [16]:
# Persist the per-chunk topic scores for this Congress (row index omitted).
predictions_csv = '../data/congress_%d_predict_topic_1.csv' % NTH
result.to_csv(predictions_csv, index=False)