5. Predict topics for the congressional speeches. The data is in opt/cong (or you can download it via the Capitol Words API using the script you wrote). We want predictions for each Congress.
   Predict every category: if you have 9 categories, you will get 9 columns.

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Congress number to process; it selects the input CSV and is embedded in the output file name.
NTH = 107

## Read Congressional speech dataset

In [3]:
# Read only the first 10 rows, just to list the columns the file provides.
# Alternative input file: '../cong/cong_107_reg.csv'
sample_path = '../cong/capitolwords_clean_%d_dw.csv' % NTH
df = pd.read_csv(sample_path, nrows=10)
df.columns

Index([u'biocong', u'speaker_state', u'speaker_raw', u'speaker_first',
       u'congress', u'title', u'origin_url', u'number', u'id', u'volume',
       u'chamber', u'session', u'speaker_last', u'pages', u'speaker_party',
       u'date', u'bills', u'bioguide_id', u'order', u'speaking',
       u'capitolwords_url', u'bills1', u'bills1n', u'chamb', u'bills_unique1',
       u'major', u'minor', u'billnum', u'idno', u'cong', u'state', u'cd',
       u'statenm', u'party', u'name', u'dwnom1', u'dwnom2', u'bootse1',
       u'bootse2', u'corr12', u'logl', u'nchoices', u'errors', u'gmp',
       u'fullname', u'bioguide_id_1'],
      dtype='object')

In [4]:
# Full load, restricted to the three columns this notebook uses.
df = pd.read_csv(
    '../cong/capitolwords_clean_%d_dw.csv' % NTH,
    usecols=['party', 'date', 'speaking'],
)
df

Unnamed: 0,date,speaking,party
0,2002-03-19,mr speaker yield minut gentleman virginia mr m...,100
1,2002-11-19,mr speaker would like submit appropri congress...,100
2,2002-06-06,mr speaker yesterday flight lo angel delay dep...,100
3,2002-03-06,mr speaker earlier today unabl cast vote two r...,100
4,2002-04-16,mr speaker thank gentleman yield time mr speak...,100
5,2002-10-01,mr speaker yield minut gentlewoman new york ms...,100
6,2001-10-16,mr speaker friday octob unavoid miss two roll ...,100
7,2002-05-09,mr chairman although disappoint rule bodi rise...,100
8,2002-10-01,mr speaker yield minut gentlewoman texa ms edd...,100
9,2002-10-01,mr speaker yield minut gentleman new jersey mr...,100


In [5]:
#df[df.speaking.str.count(' ') > 5000]

In [6]:
import re
def insert_split_marker(text, wc=1000):
    """Re-join the words of ``text`` with a ``|`` marker before every ``wc``-th word.

    The text is split on whitespace. Each word is emitted with a leading
    space; words at 0-based positions ``wc``, ``2*wc``, ... are emitted as
    ``' | ' + word`` instead, so that a later split on ``'|'`` yields pieces
    of at most ``wc`` words.

    Args:
        text: String to process.
        wc: Number of words between consecutive markers.

    Returns:
        The rebuilt string. It starts with a single space when ``text``
        contains at least one word and is empty when it contains none.
    """
    pieces = []
    for pos, word in enumerate(text.split()):
        # The very first word never gets a marker, even though 0 % wc == 0.
        at_boundary = pos != 0 and pos % wc == 0
        pieces.append((' | ' if at_boundary else ' ') + word)
    return ''.join(pieces)

In [7]:
# Maximum number of words per chunk; a '|' marker is inserted at every
# multiple of this count so over-long speeches can be split on it.
MAX_CHUNK_WORDS = 5000

# A missing speech is NaN, and astype(str) alone would turn it into the
# literal word "nan", which would then be vectorized and scored like real
# text. Blank such rows first so they stay empty.
df['speaking'] = (
    df['speaking']
    .fillna('')
    .astype(str)
    .apply(lambda c: insert_split_marker(c, MAX_CHUNK_WORDS))
)

In [8]:
# Split every speech on the '|' markers and stack the pieces into one long
# Series indexed by (original row label, piece number).
s = df['speaking'].str.split('|', expand=True).stack()
i = s.index.get_level_values(0)
# Repeat each source row once per piece, then attach the piece number and
# replace the full text with the piece itself.
new_df = df.loc[i].assign(
    chunk=s.index.get_level_values(1),
    speaking=s.values,
)
# Renumber rows 0..n-1 so later frames can be lined up by position.
df = new_df.reset_index(drop=True)

In [9]:
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfTransformer

## Topic-1 Model

In [10]:
# Paths of the saved vectorizer and classifier that are loaded with joblib below.
# (The file names suggest a count vectorizer over 2-3-grams and a liblinear
# classifier fit on bills — not verifiable from this notebook.)
vect_name = '../models/vec_count_bills_23gram.joblib'
clf_name = '../models/topic_1_bills_clf_liblinear.joblib'

In [11]:
# Restore the saved vectorizer and classifier from disk.
# joblib.load unpickles arbitrary objects, so only load files from a trusted source.
vect = joblib.load(vect_name)
clf = joblib.load(clf_name)
# Vocabulary size, displayed as a quick check that the vectorizer loaded.
len(vect.vocabulary_)

244682

In [12]:
# Turn the chunks into counts with the loaded vocabulary, then TF-IDF weight them.
# NOTE(review): the IDF weights are fit here on the speech chunks themselves
# (fit_transform) rather than restored from training — confirm this matches
# how clf was trained.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(vect.transform(df.speaking))
# Predicted class label for every row of X.
y_pred = clf.predict(X)

In [13]:
# Raw decision-function scores from the classifier for every row of X,
# wrapped in a DataFrame; its columns are named after clf.classes_ in the next step.
conf = clf.decision_function(X)
conf_df = pd.DataFrame(conf)

In [14]:
# Map each class code of the classifier to its topic name and use those
# names as the score column headers. The first matching row of the lookup
# table wins; a code absent from the table raises IndexError.
topics_df = pd.read_csv('../data/topic_code.csv')
columns = [
    topics_df[topics_df.code == c].topic.values[0]
    for c in clf.classes_
]
conf_df.columns = columns

In [15]:
# Place the date and party of each chunk beside its topic scores; the two
# frames are aligned on their row index.
id_cols = df[['date', 'party']]
result = pd.concat([id_cols, conf_df], axis=1)
result

Unnamed: 0,date,party,"Inflation, Prices, and Interest Rates",National Budget and Debt,"Taxation, Tax policy, and Tax Reform",Ethnic Minority and Racial Group Discrimination,Gender and Sexual Orientation Discrimination,Handicap or Disease Discrimination,"Voting Rights, Participation, and Related Issues",Freedom of Speech & Religion,...,Government Efficiency and Bureaucratic Oversight,Postal Service Issues (Including Mail Fraud),Nominations and Appointments,"Currency, Commemorative Coins, Medals, U.S. Mint",IRS Administration,"Federal Government Branch Relations and Administrative Issues, Congressional Operations","Regulation of Political Campaigns, Political Advertising, PAC regulation, Government Ethics",Census,"National Parks, Memorials, Historic Sites, and Recreation",Native American Affairs
0,2002-03-19,100,-1.038918,-1.027012,-1.154653,-1.041305,-1.069807,-1.034519,-1.021988,-1.078464,...,-1.005909,-0.941631,-1.065322,-1.026871,-0.966541,-0.990079,-1.002104,-1.021846,-0.901212,-0.945942
1,2002-11-19,100,-1.036854,-1.118608,-0.948187,-1.009685,-1.015489,-1.043899,-1.037797,-1.048155,...,-0.748837,-1.050059,-1.053650,-1.016383,-1.086811,-0.984143,-1.037656,-0.995466,-0.958491,-0.952525
2,2002-06-06,100,-1.029529,-1.014305,-0.623790,-1.026822,-1.053546,-1.040477,-1.028825,-0.973097,...,-0.952068,-1.064037,-1.052979,-1.041325,-1.168251,-0.991517,-0.995382,-1.016932,-0.971173,-1.003063
3,2002-03-06,100,-1.030535,-1.067803,-0.986884,-1.023464,-1.038935,-1.034369,-1.023792,-1.021740,...,-0.974220,-1.026989,-1.049473,-1.023114,-1.084648,-0.952553,-0.999121,-1.011407,-0.914692,-0.924935
4,2002-04-16,100,-1.037111,-1.111847,-0.927158,-1.001794,-1.027821,-1.023011,-0.965411,-1.033744,...,-1.006615,-1.002247,-1.057002,-1.022276,-1.100282,-0.959636,-0.988292,-1.012892,-0.935614,-0.930353
5,2002-10-01,100,-1.032864,-1.088309,-1.034424,-1.045384,-0.975289,-1.038126,-0.853943,-1.030272,...,-1.099276,-1.041158,-1.056514,-1.023088,-1.090121,-0.864198,-1.049120,-1.022375,-0.868316,-0.991142
6,2001-10-16,100,-1.030535,-1.067803,-0.986884,-1.023464,-1.038935,-1.034369,-1.023792,-1.021740,...,-0.974220,-1.026989,-1.049473,-1.023114,-1.084648,-0.952553,-0.999121,-1.011407,-0.914692,-0.924935
7,2002-05-09,100,-1.042395,-1.035986,-1.088922,-1.037365,-0.908311,-1.015194,-0.958449,-1.048922,...,-1.010621,-0.990502,-1.038814,-1.046987,-1.090444,-0.909607,-1.093752,-1.009757,-0.964464,-1.026178
8,2002-10-01,100,-1.040939,-1.067111,-0.730692,-0.978292,-0.870934,-1.041585,-1.008745,-1.053521,...,-1.001018,-1.078412,-1.056560,-1.021079,-1.116421,-0.885743,-0.895426,-0.942470,-0.891356,-0.991543
9,2002-10-01,100,-1.048144,-1.069790,-1.040049,-1.032795,-0.971243,-1.038463,-1.070321,-1.074666,...,-0.978823,-1.066434,-1.053943,-1.052779,-1.092290,-0.958700,-0.894819,-1.031332,-0.933498,-0.948960


In [16]:
# Write the scores to CSV without the row index; the file name carries the congress number.
result.to_csv('../data/congress_%d_predict_topic_1.csv' % NTH, index=False)