5. Predict congressional speech --- the data is in opt/cong (or you can download it via the capitolwords API using the script you wrote). We want to predict for each congress.
   Predict each category. If you have 9 categories, you will get 9 columns.

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Number of the Congress to process; substituted into the input and output CSV file names.
NTH = 107

## Read Congressional speech dataset

In [3]:
# Read just the first 10 rows so the available column names can be listed.
#df = pd.read_csv('../cong/cong_107_reg.csv', nrows=10)
df = pd.read_csv('../cong/capitolwords_clean_%d_dw.csv' % NTH, nrows=10)
df.columns

Index([u'biocong', u'speaker_state', u'speaker_raw', u'speaker_first',
       u'congress', u'title', u'origin_url', u'number', u'id', u'volume',
       u'chamber', u'session', u'speaker_last', u'pages', u'speaker_party',
       u'date', u'bills', u'bioguide_id', u'order', u'speaking',
       u'capitolwords_url', u'bills1', u'bills1n', u'chamb', u'bills_unique1',
       u'major', u'minor', u'billnum', u'idno', u'cong', u'state', u'cd',
       u'statenm', u'party', u'name', u'dwnom1', u'dwnom2', u'bootse1',
       u'bootse2', u'corr12', u'logl', u'nchoices', u'errors', u'gmp',
       u'fullname', u'bioguide_id_1'],
      dtype='object')

In [4]:
# Reload the file, keeping only the three columns used in this notebook:
# 'speaking' is the text that gets classified; 'date' and 'party' are carried into the result.
df = pd.read_csv('../cong/capitolwords_clean_%d_dw.csv' % NTH, usecols=['party', 'date', 'speaking'])
df

Unnamed: 0,date,speaking,party
0,2002-03-19,mr speaker yield minut gentleman virginia mr m...,100
1,2002-11-19,mr speaker would like submit appropri congress...,100
2,2002-06-06,mr speaker yesterday flight lo angel delay dep...,100
3,2002-03-06,mr speaker earlier today unabl cast vote two r...,100
4,2002-04-16,mr speaker thank gentleman yield time mr speak...,100
5,2002-10-01,mr speaker yield minut gentlewoman new york ms...,100
6,2001-10-16,mr speaker friday octob unavoid miss two roll ...,100
7,2002-05-09,mr chairman although disappoint rule bodi rise...,100
8,2002-10-01,mr speaker yield minut gentlewoman texa ms edd...,100
9,2002-10-01,mr speaker yield minut gentleman new jersey mr...,100


In [5]:
#df[df.speaking.str.count(' ') > 5000]

In [6]:
import re
def insert_split_marker(text, wc=1000):
    """Insert a ' | ' marker in front of every ``wc``-th word of ``text``.

    The text is split on whitespace and re-joined with single spaces. Every
    word whose zero-based position is a non-zero multiple of ``wc`` is
    preceded by ``' | '`` instead of ``' '``, so splitting the result on
    ``'|'`` yields pieces of at most ``wc`` words each.

    Parameters
    ----------
    text : str
        Whitespace-separated words.
    wc : int, optional
        Number of words between consecutive markers (default 1000).

    Returns
    -------
    str
        The re-joined text. Every word, including the first, is preceded by
        a space, so a non-empty result starts with ``' '``; empty or
        whitespace-only input gives ``''``.
    """
    # Collect the pieces and join once instead of growing a string with
    # ``+=`` inside the loop, which can re-copy the accumulated text on every
    # iteration and become quadratic on very long speeches.
    pieces = []
    for i, w in enumerate(text.split()):
        pieces.append(' | ' if i != 0 and i % wc == 0 else ' ')
        pieces.append(w)
    return ''.join(pieces)

In [7]:
# Cast every speech to str, then mark a chunk boundary in front of every 5000th word.
# Series.apply forwards the extra keyword argument (wc) to insert_split_marker.
df['speaking'] = df['speaking'].astype(str).apply(insert_split_marker, wc=5000)

In [8]:
# Split every speech at the '|' markers and reshape to one row per piece.
# expand=True gives one column per piece; stack() turns that into a long Series
# indexed by (original row label, piece position).
s = df['speaking'].str.split('|', expand=True).stack()
# Original row label of each piece.
i = s.index.get_level_values(0)
# Repeat each original row once per piece so the other columns stay attached.
new_df = df.loc[i].copy()
# Position of the piece within its speech (second level of the stacked index).
new_df['chunk'] = s.index.get_level_values(1)
# Replace the full speech text with the text of the individual piece.
new_df['speaking'] = s.values
# Renumber rows 0..n-1 so later cells can line rows up by position.
df = new_df.reset_index(drop=True)

In [9]:
# scikit-learn deprecated its bundled copy of joblib in 0.21 and removed it in 0.23.
# Try the original import first so older environments behave exactly as before,
# then fall back to the standalone joblib package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
from sklearn.feature_extraction.text import TfidfTransformer

## Minor Model

In [10]:
# Paths of the saved vectorizer and the saved minor-topic classifier (both joblib files).
vect_name = '../models/vec_count_bills_23gram.joblib'
clf_name = '../models/minor_bills_clf_liblinear.joblib'

In [11]:
# Load both saved objects. joblib.load unpickles its input, so only point it at trusted files.
vect = joblib.load(vect_name)
clf = joblib.load(clf_name)
# Number of entries in the vectorizer's vocabulary.
len(vect.vocabulary_)

244682

In [12]:
# Turn each speech chunk into a feature vector with the loaded vectorizer.
X = vect.transform(df.speaking)
# NOTE(review): this TF-IDF transformer is fitted on the speeches themselves, so its IDF
# weights come from this data rather than from whatever the classifier was trained on.
# If training used IDF weights from a different corpus, that fitted transformer should be
# saved and reused here instead -- TODO confirm against the training notebook.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)
# Predicted class label for each chunk.
y_pred = clf.predict(X)

In [13]:
# Decision scores of the classifier for every chunk, wrapped in a DataFrame;
# the columns are later labelled from clf.classes_.
conf = clf.decision_function(X)
conf_df = pd.DataFrame(conf)

In [14]:
# Label each score column with the topic name of the class it belongs to.
topics_df = pd.read_csv('../data/topic_code.csv')
columns = [
    # First topic name whose code equals the class label.
    topics_df.loc[topics_df.code == c, 'topic'].values[0]
    for c in clf.classes_
]
conf_df.columns = columns

In [15]:
# Put the date and party of each chunk next to its scores; concat with axis=1
# lines the two frames up on their index.
result = pd.concat([df[['date', 'party']], conf_df], axis=1)
result

Unnamed: 0,date,party,General Domestic Macroeconomic Issues,"Inflation, Prices, and Interest Rates","Monetary Supply, Federal Reserve Board, and the Treasury",National Budget and Debt,"Taxation, Tax policy, and Tax Reform",Industrial Policy,General (includes combinations of multiple subtopics),Ethnic Minority and Racial Group Discrimination,...,"Regulation of Political Campaigns, Political Advertising, PAC regulation, Government Ethics",Census,District of Columbia Affairs,Relief of Claims against the U.S. Government,Federal Holidays,"National Parks, Memorials, Historic Sites, and Recreation",Native American Affairs,"Natural Resources, Public Lands, and Forest Management",Water Resources Development and Research,U.S. Dependencies and Territorial Issues
0,2002-03-19,100,-1.018557,-1.056333,-1.072174,-1.089758,-1.241623,-1.068851,-1.015360,-1.061755,...,-1.029129,-1.034879,-0.916619,-1.078350,-1.128155,-0.810480,-1.007197,-0.922271,-0.815068,-1.105145
1,2002-11-19,100,-1.157178,-1.054988,-1.058498,-1.153033,-1.000309,-1.055078,-0.965598,-0.992805,...,-1.052785,-0.996482,-1.013849,-1.080406,-1.093721,-0.927944,-0.996961,-0.900779,-0.958601,-1.056654
2,2002-06-06,100,-1.156141,-1.046211,-0.935608,-1.073021,-0.601143,-1.042988,-1.035101,-1.049097,...,-1.060912,-1.029650,-1.010677,-1.060497,-1.092652,-0.974861,-0.980095,-1.015331,-1.030502,-0.967254
3,2002-03-06,100,-1.150244,-1.051307,-1.046498,-1.108208,-1.041551,-1.052608,-1.023453,-1.038581,...,-1.019367,-1.023058,-1.030191,-1.054109,-1.092018,-0.859636,-0.958021,-0.936804,-1.003412,-1.048041
4,2002-04-16,100,-1.150689,-1.056711,-1.052914,-1.128361,-1.054934,-1.057827,-1.052933,-1.013904,...,-1.021273,-1.036006,-1.004147,-1.059362,-1.116847,-0.941006,-0.976403,-0.902314,-1.042870,-1.060421
5,2002-10-01,100,-1.145204,-1.081141,-1.051138,-1.151291,-1.173384,-1.026706,-1.040366,-1.045534,...,-1.066282,-1.038446,-1.084399,-1.078157,-1.051765,-1.014592,-1.003333,-0.877249,-1.086130,-0.989775
6,2001-10-16,100,-1.150244,-1.051307,-1.046498,-1.108208,-1.041551,-1.052608,-1.023453,-1.038581,...,-1.019367,-1.023058,-1.030191,-1.054109,-1.092018,-0.859636,-0.958021,-0.936804,-1.003412,-1.048041
7,2002-05-09,100,-1.149127,-1.058372,-1.060701,-1.117135,-1.253692,-1.067468,-1.068994,-1.063004,...,-1.130025,-1.026780,-1.047380,-1.087806,-1.084416,-1.032774,-1.064862,-0.918276,-1.060011,-1.078927
8,2002-10-01,100,-1.166538,-1.053749,-1.077708,-1.115139,-0.788496,-0.938334,-1.195804,-0.988700,...,-0.970176,-1.045364,-1.076634,-1.068700,-1.095685,-0.866726,-1.002093,-0.920684,-1.041630,-1.042940
9,2002-10-01,100,-1.129382,-1.092382,-0.986378,-1.126645,-1.142905,-1.025519,-0.940783,-1.041449,...,-0.881202,-1.049155,-1.013191,-1.077529,-1.020814,-1.037514,-0.990899,-0.992721,-1.104721,-1.046388


In [16]:
# Write the scores for this Congress to CSV, leaving out the index column.
result.to_csv('../data/congress_%d_predict_minor.csv' % NTH, index=False)