5. Predict congressional speech topics. The data is in opt/cong (or you can download it via the Capitol Words API using the script you wrote); we want predictions for each congress.
   Predict a score for each category: if you have 9 categories, you will get 9 columns.

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Congress number to process; it selects the input CSV and names the output file.
NTH = 108

## Read Congressional speech dataset

In [3]:
# Alternative input file, kept for reference:
#df = pd.read_csv('../cong/cong_107_reg.csv', nrows=10)
# Read just the first 10 rows to see which columns the file provides.
df = pd.read_csv('../cong/capitolwords_clean_%d_dw.csv' % NTH, nrows=10)
df.columns

Index([u'biocong', u'speaker_state', u'speaker_raw', u'speaker_first',
       u'congress', u'title', u'origin_url', u'number', u'id', u'volume',
       u'chamber', u'session', u'speaker_last', u'pages', u'speaker_party',
       u'date', u'bills', u'bioguide_id', u'order', u'speaking',
       u'capitolwords_url', u'bills1', u'bills1n', u'chamb', u'bills_unique1',
       u'major', u'minor', u'billnum', u'idno', u'cong', u'state', u'cd',
       u'statenm', u'party', u'name', u'dwnom1', u'dwnom2', u'bootse1',
       u'bootse2', u'corr12', u'logl', u'nchoices', u'errors', u'gmp',
       u'fullname', u'bioguide_id_1'],
      dtype='object')

In [4]:
# Load only the columns needed for prediction: the speech text, plus the date
# and party that are carried through to the output.
df = pd.read_csv('../cong/capitolwords_clean_%d_dw.csv' % NTH, usecols=['party', 'date', 'speaking'])
# Preview a few rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,date,speaking,party
0,2004-04-28,want point stori washington time someon want t...,100
1,2004-05-17,mr speaker would suggest discuss held hold wee...,100
2,2004-02-24,gentleman yield mr woolsey appear televis nume...,100
3,2004-04-28,iraqi made clear want limit author interim gov...,100
4,2004-06-21,think gentleman indiana miss last comment migh...,100
5,2004-02-24,gentleman yield point,100
6,2004-02-24,mr speaker let repeat may tune tri get context...,100
7,2003-07-08,today sure colleagu acknowledg everyon observ ...,100
8,2004-11-18,mr speaker want wish everybodi aloha get smile...,100
9,2004-03-30,mr speaker gentleman might find interest regar...,100


In [5]:
# Optional check (uncomment to run): list the speeches that contain more than 5000 spaces.
#df[df.speaking.str.count(' ') > 5000]

In [6]:
import re
def insert_split_marker(text, wc=1000):
    """Return ``text`` with a ``' | '`` marker placed before every ``wc``-th word.

    The text is split on whitespace and each word is written back preceded by
    a single space, except that the words at positions ``wc``, ``2 * wc``, ...
    (counting from zero) are preceded by ``' | '`` instead. The result
    therefore starts with a space whenever ``text`` holds at least one word,
    and is ``''`` when it holds none.
    """
    pieces = []
    for position, word in enumerate(text.split()):
        # The very first word never gets a marker, even though 0 % wc == 0.
        at_boundary = position != 0 and position % wc == 0
        pieces.append((' | ' if at_boundary else ' ') + word)
    return ''.join(pieces)

In [7]:
# Mark every 5000th word boundary with ' | ' so that very long speeches can be
# split into separate chunks afterwards.
# Missing speeches are replaced by an empty string first: a bare astype(str)
# would turn NaN into the literal word 'nan', which would then be scored as if
# it had been spoken. The row itself is kept, so the row count is unchanged.
df['speaking'] = (
    df['speaking']
    .fillna('')
    .astype(str)
    .apply(lambda c: insert_split_marker(c, 5000))
)

In [8]:
# Break each speech at the '|' markers: one column per piece, then stack into
# a long Series indexed by (original row, piece number).
# NOTE(review): a literal '|' already present in a speech would also be
# treated as a split point.
s = df['speaking'].str.split('|', expand=True).stack()
# Original row label of every piece.
i = s.index.get_level_values(0)
# Repeat the source row once per piece so date/party stay attached to each one.
new_df = df.loc[i].copy()
# Position of the piece within its speech (0 for the first).
new_df['chunk'] = s.index.get_level_values(1)
# Replace the full speech text with just this piece.
new_df['speaking'] = s.values
# Renumber rows 0..n-1 so the frame has a clean default index again.
df = new_df.reset_index(drop=True)

In [9]:
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfTransformer

## Major Model

In [10]:
# Saved artifacts used for the major-topic model: a vectorizer and a classifier.
vect_name = '../models/vec_count_bills_23gram.joblib'
clf_name = '../models/major_bills_clf_liblinear.joblib'

In [11]:
# Load the saved vectorizer and classifier from disk.
# NOTE(review): joblib files are pickle-based, so only load model files that
# come from a trusted source.
vect = joblib.load(vect_name)
clf = joblib.load(clf_name)
# Vocabulary size, shown as a quick check that the vectorizer loaded.
len(vect.vocabulary_)

244682

In [12]:
# Turn each speech chunk into a feature row with the loaded vectorizer
# (presumably 2-3-gram counts, judging by the model file name — TODO confirm).
X = vect.transform(df.speaking)
# NOTE(review): the TF-IDF weights are fitted here on the speeches being
# scored, not loaded from training. If the classifier was trained on features
# weighted with IDF values from its own training corpus, these features will
# not match what it saw — confirm, and reuse the training-time transformer if
# one was saved.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)
# Hard class label for each chunk; the per-class scores are computed separately.
y_pred = clf.predict(X)

In [13]:
# Per-class decision scores for every chunk, wrapped in a DataFrame so the
# columns can be labelled afterwards.
conf = clf.decision_function(X)
conf_df = pd.DataFrame(conf)

In [14]:
# Label each score column with the topic name whose code matches the
# classifier's class, in the order the classifier reports its classes.
topics_df = pd.read_csv('../data/topic_code.csv')
columns = [
    # First topic row carrying this code; raises IndexError if there is none.
    topics_df.loc[topics_df.code == code, 'topic'].values[0]
    for code in clf.classes_
]
conf_df.columns = columns

In [15]:
# Put the date and party of every chunk next to its per-topic scores.
# concat(axis=1) aligns on the index, so fail loudly if the two frames are not
# indexed identically instead of silently producing NaN-padded rows.
assert df.index.equals(conf_df.index), 'df and conf_df must share the same index'
result = pd.concat([df[['date', 'party']], conf_df], axis=1)
# Preview a few rows rather than rendering the whole frame.
result.head(10)

Unnamed: 0,date,party,Macroeconomics,"Civil Rights, Minority Issues, and Civil Liberties",Health,Agriculture,"Labor, Employment, and Immigration",Education,Environment,Energy,...,Social Welfare,Community Development and Housing Issues,"Banking, Finance, and Domestic Commerce",Defense,"Space, Science, Technology and Communications",Foreign Trade,International Affairs and Foreign Aid,Government Operations,Public Lands and Water Management,"Other, Miscellaneous, and Human Interest"
0,2004-04-28,100,-1.065748,-0.969707,-0.693393,-0.943766,-0.846071,-0.882804,-0.872835,-0.814268,...,-0.938813,-0.971865,-0.827461,-0.878129,-0.962900,-0.932060,-0.890127,-0.848255,-0.668266,-1.142674
1,2004-05-17,100,-1.013729,-1.049250,-0.750936,-0.958367,-0.868307,-1.008341,-0.962003,-0.868486,...,-0.952234,-0.995242,-0.893193,-0.976135,-0.879230,-0.986161,-0.613436,-0.585316,-0.868746,-1.037131
2,2004-02-24,100,-1.079669,-1.014374,-0.643489,-0.949692,-0.913442,-0.886058,-0.867757,-0.863204,...,-0.943444,-0.950659,-0.794240,-0.962595,-0.895927,-0.948845,-0.906743,-0.667709,-0.780393,-1.148446
3,2004-04-28,100,-1.065753,-1.022435,-0.843450,-0.988142,-0.839898,-0.893435,-0.884169,-0.829179,...,-0.918223,-1.005657,-0.731842,-0.982199,-0.945843,-0.956671,-0.882882,-0.495335,-0.797198,-1.138971
4,2004-06-21,100,-1.065748,-0.969707,-0.693393,-0.943766,-0.846071,-0.882804,-0.872835,-0.814268,...,-0.938813,-0.971865,-0.827461,-0.878129,-0.962900,-0.932060,-0.890127,-0.848255,-0.668266,-1.142674
5,2004-02-24,100,-1.065748,-0.969707,-0.693393,-0.943766,-0.846071,-0.882804,-0.872835,-0.814268,...,-0.938813,-0.971865,-0.827461,-0.878129,-0.962900,-0.932060,-0.890127,-0.848255,-0.668266,-1.142674
6,2004-02-24,100,-1.041714,-1.009254,-0.777762,-0.966108,-0.763970,-0.912522,-0.936428,-0.732621,...,-0.860849,-1.009220,-0.864719,-0.897051,-1.018685,-0.924773,-0.925537,-0.664731,-0.648407,-1.188469
7,2003-07-08,100,-0.943582,-1.044521,-0.734144,-0.955064,-0.952856,-0.865044,-0.897367,-0.894024,...,-0.965963,-1.007746,-1.044507,-0.503613,-0.943552,-0.936704,-0.999177,-0.804100,-0.887445,-1.209135
8,2004-11-18,100,-0.818512,-0.943203,-0.731113,-1.036904,-0.789385,-0.838704,-0.893515,-0.738963,...,-0.978170,-1.045111,-0.818511,-0.848468,-0.949321,-1.065013,-0.902161,-0.961585,-0.648452,-1.135827
9,2004-03-30,100,-1.100106,-0.963480,-0.963860,-0.996628,-0.800562,-0.931920,-0.871616,-0.810589,...,-0.971090,-1.003239,-0.867052,-0.656314,-0.988185,-0.768759,-0.674709,-0.898565,-0.701051,-1.158130


In [16]:
# Save the per-chunk scores for this congress; the row index is left out of the file.
result.to_csv('../data/congress_%d_predict_major.csv' % NTH, index=False)