5. Predict topics for congressional speeches. The data is in opt/cong (or you can download it via the Capitol Words API using the script you wrote); we want predictions for each Congress.
   Predict every category: if you have 9 categories, you will get 9 columns.

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Congress number; substituted into the input and output CSV file names below.
NTH = 112

## Read Congressional speech dataset

In [3]:
#df = pd.read_csv('../cong/cong_107_reg.csv', nrows=10)
# Read just the first 10 rows: enough to inspect which columns are available.
preview_csv = '../cong/capitolwords_clean_%d_dw.csv' % NTH
df = pd.read_csv(preview_csv, nrows=10)
df.columns

Index([u'biocong', u'speaker_state', u'speaker_raw', u'speaker_first',
       u'congress', u'title', u'origin_url', u'number', u'id', u'volume',
       u'chamber', u'session', u'speaker_last', u'pages', u'speaker_party',
       u'date', u'bills', u'bioguide_id', u'order', u'speaking',
       u'capitolwords_url', u'bills1', u'bills1n', u'chamb', u'bills_unique1',
       u'major', u'minor', u'billnum', u'idno', u'cong', u'state', u'cd',
       u'statenm', u'party', u'name', u'dwnom1', u'dwnom2', u'bootse1',
       u'bootse2', u'corr12', u'logl', u'nchoices', u'errors', u'gmp',
       u'fullname', u'bioguide_id_1'],
      dtype='object')

In [4]:
# Reload the full file, keeping only the columns used for prediction.
speech_csv = '../cong/capitolwords_clean_%d_dw.csv' % NTH
df = pd.read_csv(speech_csv, usecols=['party', 'date', 'speaking'])
df

Unnamed: 0,date,speaking,party
0,2011-05-13,mr speaker rise today seek congression recogni...,100
1,2012-06-06,mr speaker rise today celebr life evelyn weins...,100
2,2012-11-16,mr speaker rise today heavi heart tragic two c...,100
3,2012-02-29,mr speaker rise today recognit februari nation...,100
4,2011-10-13,mr speaker rise support korea colombia panama ...,100
5,2011-07-27,mr speaker tuesday juli inadvert vote aye roll...,100
6,2011-04-08,mr speaker rise today congratul chabad port wa...,100
7,2012-05-31,motion recommit speaker say motion thank mr sp...,100
8,2011-12-20,mr speaker rise today seek congression recogni...,100
9,2011-06-22,mr speaker rise today celebr recognit annivers...,100


In [5]:
import re
import textwrap

def insert_chars_split_marker(text, cc=2500):
    """Strip digits from ``text`` and mark chunk boundaries with ``'|'``.

    The digit-free text is wrapped with ``textwrap.wrap`` into pieces of at
    most ``cc`` characters, and the pieces are joined with ``'|'`` so that a
    later ``str.split('|')`` recovers them.

    Args:
        text: Input string.
        cc: Maximum number of characters per chunk.

    Returns:
        The chunks joined by ``'|'``; an empty string if nothing is left
        after removing digits.

    Note:
        ``'|'`` is used as the delimiter, so a ``'|'`` already present in
        ``text`` will also act as a chunk boundary when the result is split.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'\d+', '', text)
    return '|'.join(textwrap.wrap(text, cc))

def insert_words_split_marker(text, wc=500):
    """Strip digits from ``text`` and mark every ``wc`` words with ``'|'``.

    The digit-free text is split on whitespace, grouped into runs of ``wc``
    words, and the runs are joined with ``'|'``. Words inside a run are
    separated by single spaces, and the result has no leading space.

    Args:
        text: Input string.
        wc: Number of words per chunk; must be at least 1.

    Returns:
        The chunks joined by ``'|'``; an empty string if no words remain.

    Raises:
        ValueError: If ``wc`` is smaller than 1.
    """
    if wc < 1:
        raise ValueError('wc must be a positive integer, got %r' % (wc,))
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    words = re.sub(r'\d+', '', text).split()
    # Slice-and-join is linear and avoids the stray leading space produced by
    # prefixing every word (including the first) with a separator.
    chunks = (' '.join(words[start:start + wc])
              for start in range(0, len(words), wc))
    return '|'.join(chunks)

In [6]:
# Insert '|' chunk markers every 2500 characters of each speech.
# Missing speeches are filled with '' first: astype(str) alone would turn NaN
# into the literal text 'nan', which would then be classified as if it were speech.
df['speaking'] = (
    df['speaking']
    .fillna('')
    .astype(str)
    .apply(insert_chars_split_marker, cc=2500)
)

In [7]:
# Explode each speech into one row per '|'-delimited chunk. stack() yields a
# (row label, chunk number) MultiIndex; rows are repeated once per chunk.
chunks = df['speaking'].str.split('|', expand=True).stack()
speech_idx = chunks.index.get_level_values(0)
chunk_idx = chunks.index.get_level_values(1)
df = (
    df.loc[speech_idx]
    .assign(chunk=chunk_idx, speaking=chunks.values)
    .reset_index(drop=True)
)

In [8]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back for older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfTransformer

## Topic-1 Model

In [9]:
# Paths of the serialized vectorizer and classifier that are loaded with joblib below.
vect_name = '../models/vec_count_bills_23gram_new.joblib'
clf_name = '../models/topic_1_bills_clf_liblinear_new.joblib'

In [10]:
# NOTE: joblib.load unpickles its input, which can execute arbitrary code --
# only load model files from a trusted source.
vect, clf = (joblib.load(path) for path in (vect_name, clf_name))
# Show the vocabulary size as a quick sanity check on the loaded vectorizer.
len(vect.vocabulary_)

686467

In [11]:
# Vectorize the speech chunks with the loaded vectorizer, apply TF-IDF
# weighting, and predict a topic for each chunk.
X = vect.transform(df.speaking)
# NOTE(review): the TF-IDF weights are fit on the speeches themselves here,
# not loaded from disk. If the classifier was trained with IDF weights from
# another corpus, the features are scaled differently at prediction time --
# TODO confirm this is intended.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)
y_pred = clf.predict(X)

In [12]:
# Raw decision-function scores for every chunk, wrapped in a DataFrame whose
# columns are labelled from clf.classes_ in the next cell.
conf = clf.decision_function(X)
conf_df = pd.DataFrame(conf)

In [13]:
# Label each decision-score column with the topic name of its class code.
topics_df = pd.read_csv('../data/topic_code.csv')
# One code -> topic lookup table instead of filtering the frame per class.
# The first occurrence wins if a code is listed more than once.
code_to_topic = topics_df.drop_duplicates(subset='code').set_index('code')['topic']
# Fail with a clear message if the classifier knows a code the CSV does not.
missing = [c for c in clf.classes_ if c not in code_to_topic.index]
if missing:
    raise KeyError('class codes missing from topic_code.csv: %r' % (missing,))
columns = [code_to_topic.loc[c] for c in clf.classes_]
conf_df.columns = columns

In [14]:
# Place the speech metadata next to the per-topic scores. concat aligns on
# the index, so both frames must share the same row labels.
meta_cols = df[['date', 'party']]
result = pd.concat([meta_cols, conf_df], axis=1)
result

Unnamed: 0,date,party,"Inflation, Prices, and Interest Rates",National Budget and Debt,"Taxation, Tax policy, and Tax Reform",Ethnic Minority and Racial Group Discrimination,Gender and Sexual Orientation Discrimination,"Voting Rights, Participation, and Related Issues",Freedom of Speech & Religion,Comprehensive health care reform,...,"International Organizations other than Finance: United Nations (UN), UNESCO, International Red Cross","Terrorism, Hijacking",Government Efficiency and Bureaucratic Oversight,Postal Service Issues (Including Mail Fraud),"Currency, Commemorative Coins, Medals, U.S. Mint","Federal Government Branch Relations and Administrative Issues, Congressional Operations","Regulation of Political Campaigns, Political Advertising, PAC regulation, Government Ethics",Census,"National Parks, Memorials, Historic Sites, and Recreation",Native American Affairs
0,2011-05-13,100,-1.078942,-1.032937,-1.092884,-1.053716,-1.072521,-1.040202,-1.038817,-1.193759,...,-1.022255,-1.040206,-1.181718,-0.983077,-0.976260,-0.929581,-1.041835,-1.015022,-0.978783,-0.935661
1,2012-06-06,100,-1.047539,-1.111827,-1.117426,-1.041326,-1.096317,-1.056339,-1.060065,-0.928951,...,-1.025389,-1.024341,-1.048436,-0.964875,-0.959667,-1.123101,-1.076010,-1.041036,-0.998228,-1.063483
2,2012-11-16,100,-1.056101,-1.074345,-0.817399,-1.050733,-1.074468,-1.054835,-1.032916,-0.994094,...,-1.041600,-1.014642,-0.966618,-1.054516,-0.910521,-0.894622,-0.976141,-1.045930,-1.002763,-0.972952
3,2012-02-29,100,-1.035872,-1.063388,-0.947986,-1.040809,-1.004251,-1.042152,-1.009708,-0.672881,...,-1.042117,-1.032631,-1.058569,-1.043380,-0.983888,-0.965626,-1.019427,-1.042333,-0.985645,-0.984259
4,2011-10-13,100,-1.050787,-1.091062,-1.161127,-1.059276,-1.044025,-1.056346,-1.043311,-1.042655,...,-1.050008,-1.078337,-1.113256,-1.049357,-1.027137,-1.016664,-1.052508,-1.033843,-0.989830,-1.053837
5,2011-07-27,100,-1.046147,-1.009391,-0.940505,-1.043174,-1.037604,-1.052538,-1.025154,-0.996242,...,-1.028279,-1.024151,-1.013365,-1.037503,-0.996539,-0.991468,-0.988207,-1.034500,-0.931379,-0.957560
6,2011-04-08,100,-1.035646,-0.995801,-0.944506,-1.055708,-1.039118,-1.057756,-1.016157,-0.994068,...,-1.041822,-1.031152,-1.066494,-1.024520,-0.981293,-1.026493,-1.010938,-1.050633,-0.948057,-0.974845
7,2012-05-31,100,-1.046147,-1.009391,-0.940505,-1.043174,-1.037604,-1.052538,-1.025154,-0.996242,...,-1.028279,-1.024151,-1.013365,-1.037503,-0.996539,-0.991468,-0.988207,-1.034500,-0.931379,-0.957560
8,2011-12-20,100,-1.045472,-1.017918,-1.028708,-1.012987,-1.091164,-1.031061,-1.079173,-1.008533,...,-0.934029,-1.041821,-1.167469,-1.053344,-0.952822,-1.082457,-1.052532,-1.037217,-0.763473,-1.095740
9,2011-06-22,100,-1.047369,-1.047848,-0.981872,-1.015108,-1.047691,-1.046153,-1.066861,-0.902185,...,-0.971265,-1.049602,-1.107660,-0.993398,-0.924353,-1.092280,-1.015242,-1.044825,-0.695528,-0.919313


In [15]:
# Write the per-chunk topic scores for this Congress to CSV (index omitted).
out_csv = '../data/congress_%d_predict_topic_1_new.csv' % NTH
result.to_csv(out_csv, index=False)