5. Predict topics for congressional speech. The data is in opt/cong (or you can download it via the Capitol Words API using the script you wrote). We want predictions for each Congress.
   Predict every category: if you have 9 categories, you will get 9 columns.

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Number of the Congress to process; it is substituted into the input CSV
# filename and into the name of the prediction file written at the end.
NTH = 112

## Read Congressional speech dataset

In [3]:
# Read just the first 10 rows: enough to list the columns the file offers
# without loading the whole dataset.
#df = pd.read_csv('../cong/cong_107_reg.csv', nrows=10)
preview_csv = '../cong/capitolwords_clean_%d_dw.csv' % NTH
df = pd.read_csv(preview_csv, nrows=10)
df.columns

Index([u'biocong', u'speaker_state', u'speaker_raw', u'speaker_first',
       u'congress', u'title', u'origin_url', u'number', u'id', u'volume',
       u'chamber', u'session', u'speaker_last', u'pages', u'speaker_party',
       u'date', u'bills', u'bioguide_id', u'order', u'speaking',
       u'capitolwords_url', u'bills1', u'bills1n', u'chamb', u'bills_unique1',
       u'major', u'minor', u'billnum', u'idno', u'cong', u'state', u'cd',
       u'statenm', u'party', u'name', u'dwnom1', u'dwnom2', u'bootse1',
       u'bootse2', u'corr12', u'logl', u'nchoices', u'errors', u'gmp',
       u'fullname', u'bioguide_id_1'],
      dtype='object')

In [4]:
# Full load, restricted to the three columns used below: the speech text
# plus the date and party that are carried through to the output.
speech_csv = '../cong/capitolwords_clean_%d_dw.csv' % NTH
df = pd.read_csv(speech_csv, usecols=['party', 'date', 'speaking'])
df

Unnamed: 0,date,speaking,party
0,2011-05-13,mr speaker rise today seek congression recogni...,100
1,2012-06-06,mr speaker rise today celebr life evelyn weins...,100
2,2012-11-16,mr speaker rise today heavi heart tragic two c...,100
3,2012-02-29,mr speaker rise today recognit februari nation...,100
4,2011-10-13,mr speaker rise support korea colombia panama ...,100
5,2011-07-27,mr speaker tuesday juli inadvert vote aye roll...,100
6,2011-04-08,mr speaker rise today congratul chabad port wa...,100
7,2012-05-31,motion recommit speaker say motion thank mr sp...,100
8,2011-12-20,mr speaker rise today seek congression recogni...,100
9,2011-06-22,mr speaker rise today celebr recognit annivers...,100


In [5]:
#df[df.speaking.str.count(' ') > 5000]

In [6]:
import re
def insert_split_marker(text, wc=1000):
    """Insert a '|' marker before every ``wc``-th word of ``text``.

    The text is tokenized on whitespace, so runs of spaces/newlines collapse
    to single spaces in the result. A standalone '|' token is placed in front
    of words number ``wc``, ``2 * wc``, ... (0-based), which lets the caller
    cut the text into pieces of at most ``wc`` words with ``split('|')``.

    Parameters
    ----------
    text : str
        Text to mark up.
    wc : int, default 1000
        Number of words per piece.

    Returns
    -------
    str
        The words joined by single spaces with ' | ' at each boundary.
        A non-empty result starts with one leading space (kept for
        compatibility with earlier output); empty or whitespace-only input
        yields ''.

    Notes
    -----
    A '|' already present in ``text`` is not escaped, so it will also act as
    a boundary when the result is split on '|'.
    """
    # Collect the pieces in a list and join once, instead of growing a string
    # with += on every word.
    pieces = []
    for i, word in enumerate(text.split()):
        if i and i % wc == 0:
            pieces.append('|')
        pieces.append(word)
    if not pieces:
        return ''
    return ' ' + ' '.join(pieces)

In [7]:
# Missing speeches are read as NaN. Blank them out before the string cast:
# astype(str) alone would turn NaN into the literal word 'nan', which would
# then be vectorized and scored like real speech text.
# Afterwards a '|' marker is inserted every 5000 words so that very long
# speeches can be cut into chunks.
df['speaking'] = (
    df['speaking']
    .fillna('')
    .astype(str)
    .apply(lambda c: insert_split_marker(c, 5000))
)

In [8]:
# Turn each speech into one row per '|'-delimited chunk.
# split(expand=True) yields one column per chunk position; stack() then gives
# a Series indexed by (original row, chunk position).
chunks = df['speaking'].str.split('|', expand=True).stack()
row_ids = chunks.index.get_level_values(0)
chunk_ids = chunks.index.get_level_values(1)

# Repeat each original row once per chunk, attach the chunk number and the
# chunk text, and renumber the rows 0..n-1.
df = (
    df.loc[row_ids]
    .assign(chunk=chunk_ids, speaking=chunks.values)
    .reset_index(drop=True)
)

In [9]:
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfTransformer

## Minor Model

In [10]:
# Paths of the serialized vectorizer and classifier that are loaded with
# joblib for this (minor-topic) model.
vect_name = '../models/vec_count_bills_23gram.joblib'
clf_name = '../models/minor_bills_clf_liblinear.joblib'

In [11]:
# NOTE: joblib.load unpickles the files, which can execute arbitrary code;
# only load model files that come from a trusted source.
vect = joblib.load(vect_name)
clf = joblib.load(clf_name)
# Show the size of the vectorizer's vocabulary as the cell output.
len(vect.vocabulary_)

244682

In [12]:
# Vectorize every speech chunk with the loaded vectorizer, apply TF-IDF
# weighting, and predict a label per chunk.
# NOTE(review): the TF-IDF transformer is fit on the speech data here rather
# than loaded from training -- confirm this matches how clf was trained.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(vect.transform(df['speaking']))
y_pred = clf.predict(X)

In [13]:
# Raw decision-function scores for every row of X (scores, not probabilities).
conf = clf.decision_function(X)
# Wrap in a DataFrame; its columns are renamed from clf.classes_ afterwards,
# so one column per class is expected.
conf_df = pd.DataFrame(conf)

In [14]:
# Replace the positional column labels of conf_df with readable topic names.
# For each class code, in the classifier's own class order, take the first
# matching `topic` entry from the code table.
topics_df = pd.read_csv('../data/topic_code.csv')
columns = [
    topics_df.loc[topics_df['code'] == code, 'topic'].values[0]
    for code in clf.classes_
]
conf_df.columns = columns

In [15]:
# Put the date/party metadata side by side with the per-topic scores.
# Both frames carry the same 0..n-1 row index, so rows line up one to one.
meta_cols = df[['date', 'party']]
result = pd.concat([meta_cols, conf_df], axis=1)
result

Unnamed: 0,date,party,General Domestic Macroeconomic Issues,"Inflation, Prices, and Interest Rates","Monetary Supply, Federal Reserve Board, and the Treasury",National Budget and Debt,"Taxation, Tax policy, and Tax Reform",Industrial Policy,General (includes combinations of multiple subtopics),Ethnic Minority and Racial Group Discrimination,...,"Regulation of Political Campaigns, Political Advertising, PAC regulation, Government Ethics",Census,District of Columbia Affairs,Relief of Claims against the U.S. Government,Federal Holidays,"National Parks, Memorials, Historic Sites, and Recreation",Native American Affairs,"Natural Resources, Public Lands, and Forest Management",Water Resources Development and Research,U.S. Dependencies and Territorial Issues
0,2011-05-13,100,-1.136128,-1.057166,-1.098748,-1.110071,-1.231015,-1.069806,-1.071017,-1.056559,...,-1.080551,-0.973740,-0.972383,-1.066377,-1.050335,-0.941464,-0.962316,-0.893860,-1.000399,-1.069301
1,2012-06-06,100,-1.125683,-1.050245,-1.045478,-1.089071,-1.175776,-1.060618,-1.051440,-1.043933,...,-1.081099,-1.026894,-1.031566,-1.079607,-1.091129,-0.975198,-1.008281,-0.944203,-1.092972,-1.125035
2,2012-11-16,100,-1.134458,-1.049635,-1.042959,-1.127004,-1.025676,-1.059214,-1.050879,-1.053074,...,-1.025034,-1.008632,-1.095053,-1.096427,-1.069889,-1.058545,-0.990630,-0.905117,-1.069331,-1.068322
3,2012-02-29,100,-1.134458,-1.041849,-1.050244,-1.146066,-1.075387,-1.056591,-1.045360,-1.034038,...,-1.043808,-1.008045,-1.056504,-1.064699,-1.079273,-0.943127,-0.987720,-0.942070,-1.028711,-1.055209
4,2011-10-13,100,-1.163375,-1.058412,-1.068499,-1.131151,-1.193849,-1.063203,-1.053317,-1.045676,...,-1.067316,-1.010653,-1.031497,-1.052446,-1.093480,-0.895898,-0.983151,-0.992123,-0.979238,-1.048660
5,2011-07-27,100,-1.150244,-1.051307,-1.046498,-1.108208,-1.041551,-1.052608,-1.023453,-1.038581,...,-1.019367,-1.023058,-1.030191,-1.054109,-1.092018,-0.859636,-0.958021,-0.936804,-1.003412,-1.048041
6,2011-04-08,100,-1.144200,-1.036166,-1.038275,-1.099926,-1.096970,-1.058583,-1.056788,-1.056467,...,-1.049188,-1.037464,-1.027600,-1.047562,-1.066953,-0.850374,-0.967300,-0.956534,-0.986374,-1.068818
7,2012-05-31,100,-1.150244,-1.051307,-1.046498,-1.108208,-1.041551,-1.052608,-1.023453,-1.038581,...,-1.019367,-1.023058,-1.030191,-1.054109,-1.092018,-0.859636,-0.958021,-0.936804,-1.003412,-1.048041
8,2011-12-20,100,-1.138313,-1.056775,-1.062321,-1.123015,-1.100023,-1.061033,-1.050457,-0.975658,...,-1.063201,-1.014266,-1.029083,-1.060717,-1.072635,-0.916281,-1.141849,-0.898550,-1.091117,-1.065858
9,2011-06-22,100,-1.155173,-1.063333,-1.052957,-1.069213,-1.090857,-1.057185,-1.011579,-1.075599,...,-0.954222,-1.037833,-0.994184,-1.081981,-1.130907,-0.731193,-0.880926,-1.006229,-1.072376,-1.055535


In [16]:
# Write the scores for this Congress to CSV, without the row index.
out_csv = '../data/congress_%d_predict_minor.csv' % NTH
result.to_csv(out_csv, index=False)