### Model Text/Political Party:
   
   a. Go to uk_proceedings_clean_subset.csv and create all bigrams/trigrams that are in more than 10 documents.
   
   b. Go to roberts_rules and create all bigrams/trigrams
   
   c. Remove all roberts_rules bigrams/trigrams from uk_proceedings bigrams/trigrams
   
   d. Use Regularized Logistic Regression or SVM to predict Labour/Conservative using the remaining bigrams/trigrams
   

In [1]:
import time
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../uk_proceedings_clean.zip', usecols=['PARTY', 'SPEECH_TEXT'])
df

Unnamed: 0,PARTY,SPEECH_TEXT
0,Labour,sir edward truli beauti day sun shine god heav...
1,Conservative,sir edward may say delight back leader interru...
2,Labour,sir edward sat eight speaker like endors every...
3,SPK,five year sir edward sinc abl address way firs...
4,,stand upper step take chair speakerelect wish ...
5,Labour,madam speakerelect rise congratul secur unanim...
6,Conservative,madam speakerelect delight join prime minist c...
7,Liberal Democrats,madam speakerelect give great pleasur behalf b...
8,Scottish National Party,behalf joint scottish nation partyplaid cymru ...
9,Ulster Unionist Party,great pleasur rise endors much said alreadi ex...


In [3]:
df.groupby('PARTY').agg({'PARTY': 'count'})

Unnamed: 0_level_0,PARTY
PARTY,Unnamed: 1_level_1
Alliance,278
CWM,5709
Conservative,336361
DCWM,10996
Democratic Unionist Party,6014
Green,756
Independent,1324
Independent Conservative,46
Independent Labour,44
Independent Ulster Unionist,89


In [4]:
sdf = df[df.PARTY.isin(['Labour', 'Conservative']) & df.SPEECH_TEXT.notnull() & (df.SPEECH_TEXT.str.len() >= 2000)]

In [7]:
sdf.groupby('PARTY').agg({'PARTY': 'count'})

Unnamed: 0_level_0,PARTY
PARTY,Unnamed: 1_level_1
Conservative,22299
Labour,27301


In [8]:
import random

def RandomSample(data, size):
    """Returns random sample for given size
       If training data set is too big, we can reduce its size
    """
    return data.ix[random.sample(data.index, size)]


def StratifiedSample(data, nperlabel, label='label'):
    """Returns stratified data with N per label
    """
    sample = pd.DataFrame()
    datagrp = data.groupby(label)
    sortedgrp = datagrp.size().sort_values(ascending=False)
    for i, l in enumerate(sortedgrp.index):
        if sortedgrp[l] > nperlabel:
            print("==> %-50s %6d" % (l, sortedgrp[l]))
            sample = sample.append(RandomSample(data[data[label] == l],
                                   nperlabel))
        else:
            break
    print("There are %d labels have more than %d articles" % (i + 1, nperlabel))
    print("Sample size: %s articles" % (len(sample)))
    return sample


In [9]:
N_PER_LABEL = 10000

subset = StratifiedSample(sdf, N_PER_LABEL, 'PARTY')

==> Labour                                              27301
==> Conservative                                        22299
There are 2 labels have more than 10000 articles
Sample size: 20000 articles


In [10]:
subset

Unnamed: 0,PARTY,SPEECH_TEXT,textlen
287829,Labour,come right hon friend secretari state said ear...,3531
374642,Labour,amend would mean mayor spatial develop strateg...,2464
173758,Labour,beg move bill read third time hous heard bill ...,3770
71003,Labour,pleasur welcom hon gentleman dispatch box firs...,4141
798879,Labour,pleasur respond debat right hon friend member ...,5474
585383,Labour,hon friend absolut right perhap everybodi cham...,2442
354555,Labour,one member select committe thank chairman memb...,2861
175025,Labour,thank hon gentleman intervent mix natur farm p...,3544
286000,Labour,hon gentleman ask mani question entir sure rep...,4045
767791,Labour,alway listen hon friend member walthamstow muc...,2315


In [11]:
#subset.to_csv('/tmp/subset.csv.gz', compression='gzip', index=False)

In [12]:
import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
import re
import string

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

#def tokenize(text):
#    tokens = nltk.word_tokenize(text)
#    stems = stem_tokens(tokens, stemmer)
#    return stems

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


In [13]:
with open('../roberts_rules/all_text.txt', 'rt') as f:
    text = f.read()
text = text.decode('ascii', 'ignore')
text = re.sub(r'\d+', '', text)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

In [15]:
vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3)) 
vect.fit([text])
roberts_rules = set(vect.get_feature_names())

In [16]:
X_train, X_test, y_train, y_test = train_test_split(subset.SPEECH_TEXT, subset.PARTY, test_size=0.2)

In [17]:
#all_vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=10)
all_vect = CountVectorizer(ngram_range=(2, 3), min_df=10)
all_vect.fit(X_train.astype(str))
vocab = []
i = 0
for a in all_vect.vocabulary_:
    if a not in roberts_rules:
        vocab.append(a)
    else:
        #print a
        i += 1
print("Removed {0:d}".format(i))
print("Total {0:d}".format(len(vocab)))

Removed 2558
Total 137093


In [18]:
#train_vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=10, vocabulary=vocab)
train_vect = CountVectorizer(ngram_range=(2, 3), min_df=10, vocabulary=vocab)

In [19]:
X_train = train_vect.transform(X_train.astype(str))
transformer = TfidfTransformer()
X_train = transformer.fit_transform(X_train)

In [20]:
X_test = train_vect.transform(X_test.astype(str))
transformer = TfidfTransformer()
X_test = transformer.fit_transform(X_test)

In [21]:
from sklearn.linear_model import SGDClassifier

elastic_clf = SGDClassifier(loss='log', alpha=.00002, n_iter=200, penalty="elasticnet")
t0 = time.time()
elastic_clf.fit(X_train, y_train)
t1 = time.time()
prediction_elastic = elastic_clf.predict(X_test)
t2 = time.time()
time_elastic_train = t1-t0
time_elastic_predict = t2-t1

print("Results for Elastic Net")
print("Training time: %fs; Prediction time: %fs" % (time_elastic_train, time_elastic_predict))
print(classification_report(y_test, prediction_elastic))

Results for Elastic Net
Training time: 16.385052s; Prediction time: 0.005898s
             precision    recall  f1-score   support

Conservative       0.81      0.82      0.82      2040
     Labour       0.81      0.80      0.81      1960

avg / total       0.81      0.81      0.81      4000



In [22]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print "\t%.4f\t%-20s\t\t%.4f\t%-20s" % (coef_1, fn_1, coef_2, fn_2)

In [23]:
show_most_informative_features(train_vect, elastic_clf, 100)

	-7.6289	labour member       		9.8542	friend minist       
	-5.8475	secretari state     		9.5322	hon friend minist   
	-5.7485	hope minist         		6.1936	hon friend          
	-5.4684	friend member south 		6.1585	right hon friend    
	-5.4598	hon member birmingham		5.3760	hon gentleman       
	-5.2105	right hon gentleman 		5.0759	opposit member      
	-5.1785	labour parti        		4.9505	friend member birmingham
	-5.0096	friend member       		4.8833	conserv parti       
	-4.5625	minist said         		4.7223	hon friend secretari
	-4.3310	offici report       		4.6856	friend secretari    
	-4.2526	friend member bromley		4.6657	friend secretari state
	-4.1818	friend member north 		4.6444	hon member south    
	-4.1406	hon member birkenhead		4.1273	tori govern         
	-4.0875	hon learn friend    		3.9568	govern member       
	-4.0701	hon member nottingham		3.8664	friend member leicest
	-3.9886	friend member southwest		3.8266	hon member southwest
	-3.9163	financi secretari   		3.8036	trad

In [24]:
elastic_clf.predict_proba(X_test)

array([[ 0.15689238,  0.84310762],
       [ 0.05752817,  0.94247183],
       [ 0.16306576,  0.83693424],
       ..., 
       [ 0.33880761,  0.66119239],
       [ 0.57376833,  0.42623167],
       [ 0.7484129 ,  0.2515871 ]])

In [25]:
X_test.shape

(4000, 137093)

## Save model to files

In [28]:
from sklearn.externals import joblib

#joblib.dump(train_vect, "../models/vec_count_23gram_081.joblib")
#joblib.dump(elastic_clf, "../models/clf_elasticnet_081.joblib")

['../models/clf_elasticnet_081.joblib',
 '../models/clf_elasticnet_081.joblib_01.npy',
 '../models/clf_elasticnet_081.joblib_02.npy',
 '../models/clf_elasticnet_081.joblib_03.npy',
 '../models/clf_elasticnet_081.joblib_04.npy']

In [27]:
def get_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top_a = coefs_with_fns[:n]
    top_b = coefs_with_fns[:-(n + 1):-1]
    return top_a, top_b

top_a, top_b = get_most_informative_features(train_vect, elastic_clf, 100)
top_a_df = pd.DataFrame(top_a)
top_a_df.columns = ['coef', 'term']
top_b_df = pd.DataFrame(top_b)
top_b_df.columns = ['coef', 'term']
top_a_df.to_csv('../data/top10_cons_1.csv', index=False)
top_b_df.to_csv('../data/top10_lab_1.csv', index=False)