### Model Text/Political Party:
   
   a. Go to uk_proceedings_clean_subset.csv and create all bigrams/trigrams that are in more than 10 documents.
   
   b. Go to roberts_rules and create all bigrams/trigrams
   
   c. Remove all roberts_rules bigrams/trigrams from uk_proceedings bigrams/trigrams
   
   d. Use Regularized Logistic Regression or SVM to predict Labour/Conservative using the remaining bigrams/trigrams
   

In [1]:
import time
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../uk_proceedings_clean.zip', usecols=['PARTY', 'SPEECH_TEXT'])
df

Unnamed: 0,PARTY,SPEECH_TEXT
0,Labour,sir edward truli beauti day sun shine god heav...
1,Conservative,sir edward may say delight back leader interru...
2,Labour,sir edward sat eight speaker like endors every...
3,SPK,five year sir edward sinc abl address way firs...
4,,stand upper step take chair speakerelect wish ...
5,Labour,madam speakerelect rise congratul secur unanim...
6,Conservative,madam speakerelect delight join prime minist c...
7,Liberal Democrats,madam speakerelect give great pleasur behalf b...
8,Scottish National Party,behalf joint scottish nation partyplaid cymru ...
9,Ulster Unionist Party,great pleasur rise endors much said alreadi ex...


In [3]:
df.groupby('PARTY').agg({'PARTY': 'count'})

Unnamed: 0_level_0,PARTY
PARTY,Unnamed: 1_level_1
Alliance,278
CWM,5709
Conservative,336361
DCWM,10996
Democratic Unionist Party,6014
Green,756
Independent,1324
Independent Conservative,46
Independent Labour,44
Independent Ulster Unionist,89


In [4]:
sdf = df[df.PARTY.isin(['Labour', 'Conservative']) & df.SPEECH_TEXT.notnull()]

In [5]:
sdf.groupby('PARTY').agg({'PARTY': 'count'})

Unnamed: 0_level_0,PARTY
PARTY,Unnamed: 1_level_1
Conservative,335283
Labour,441703


In [6]:
import random

def RandomSample(data, size):
    """Returns random sample for given size
       If training data set is too big, we can reduce its size
    """
    return data.ix[random.sample(data.index, size)]


def StratifiedSample(data, nperlabel, label='label'):
    """Returns stratified data with N per label
    """
    sample = pd.DataFrame()
    datagrp = data.groupby(label)
    sortedgrp = datagrp.size().sort_values(ascending=False)
    for i, l in enumerate(sortedgrp.index):
        if sortedgrp[l] > nperlabel:
            print("==> %-50s %6d" % (l, sortedgrp[l]))
            sample = sample.append(RandomSample(data[data[label] == l],
                                   nperlabel))
        else:
            break
    print("There are %d labels have more than %d articles" % (i + 1, nperlabel))
    print("Sample size: %s articles" % (len(sample)))
    return sample


In [7]:
N_PER_LABEL = 20000

subset = StratifiedSample(sdf, N_PER_LABEL, 'PARTY')

==> Labour                                             441703
==> Conservative                                       335283
There are 2 labels have more than 20000 articles
Sample size: 40000 articles


In [8]:
subset

Unnamed: 0,PARTY,SPEECH_TEXT
703630,Labour,would disservic mani million peopl affectedcer...
467190,Labour,must make progress although sixhour debat take...
384464,Labour,hon gentleman paid tribut organis work disabl ...
457242,Labour,examin potenti scenario test nuclear hypothesi...
789066,Labour,minist ask allparti financi mutual inquiri pub...
750858,Labour,last labour govern oversaw greatest renaiss ci...
47885,Labour,first point confirm said would statement tomor...
324085,Labour,grate inform hon gentleman give subject shall ...
449184,Labour,assur hon friend shall commun chairman crimin ...
664752,Labour,secretari state realis opposit parti parliamen...


In [9]:
#subset.to_csv('/tmp/subset.csv.gz', compression='gzip', index=False)

In [10]:
import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
import re
import string

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

#def tokenize(text):
#    tokens = nltk.word_tokenize(text)
#    stems = stem_tokens(tokens, stemmer)
#    return stems

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


In [11]:
with open('../roberts_rules/all_text.txt', 'rt') as f:
    text = f.read()
text = text.decode('ascii', 'ignore')
text = re.sub(r'\d+', '', text)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

In [13]:
vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3)) 
vect.fit([text])
roberts_rules = set(vect.get_feature_names())

In [14]:
X_train, X_test, y_train, y_test = train_test_split(subset.SPEECH_TEXT, subset.PARTY, test_size=0.2)

In [15]:
all_vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=10)
all_vect.fit(X_train.astype(str))
vocab = []
i = 0
for a in all_vect.vocabulary_:
    if a not in roberts_rules:
        vocab.append(a)
    else:
        #print a
        i += 1
print("Removed {0:d}".format(i))
print("Total {0:d}".format(len(vocab)))

Removed 969
Total 28246


In [16]:
train_vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=10, vocabulary=vocab)

In [17]:
X_train = train_vect.transform(X_train.astype(str))
transformer = TfidfTransformer()
X_train = transformer.fit_transform(X_train)

In [18]:
X_test = train_vect.transform(X_test.astype(str))
transformer = TfidfTransformer()
X_test = transformer.fit_transform(X_test)

In [19]:
# Perform classification with SVM, kernel=rbf
classifier_rbf = svm.SVC()
t0 = time.time()
classifier_rbf.fit(X_train, y_train)
t1 = time.time()
prediction_rbf = classifier_rbf.predict(X_test)
t2 = time.time()
time_rbf_train = t1-t0
time_rbf_predict = t2-t1

# Print results in a nice table
print("Results for SVC(kernel=rbf)")
print("Training time: %fs; Prediction time: %fs" % (time_rbf_train, time_rbf_predict))
print(classification_report(y_test, prediction_rbf))

Results for SVC(kernel=rbf)
Training time: 418.881899s; Prediction time: 97.778848s
             precision    recall  f1-score   support

Conservative       0.50      1.00      0.67      3991
     Labour       0.00      0.00      0.00      4009

avg / total       0.25      0.50      0.33      8000



  'precision', 'predicted', average, warn_for)


In [20]:
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(X_train, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(X_test)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

print("Results for SVC(kernel=linear)")
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
print(classification_report(y_test, prediction_linear))

Results for SVC(kernel=linear)
Training time: 584.533539s; Prediction time: 51.560148s
             precision    recall  f1-score   support

Conservative       0.65      0.68      0.67      3991
     Labour       0.67      0.64      0.65      4009

avg / total       0.66      0.66      0.66      8000



In [21]:
# Perform classification with SVM, kernel=linear
classifier_liblinear = svm.LinearSVC()
t0 = time.time()
classifier_liblinear.fit(X_train, y_train)
t1 = time.time()
prediction_liblinear = classifier_liblinear.predict(X_test)
t2 = time.time()
time_liblinear_train = t1-t0
time_liblinear_predict = t2-t1

print("Results for LinearSVC()")
print("Training time: %fs; Prediction time: %fs" % (time_liblinear_train, time_liblinear_predict))
print(classification_report(y_test, prediction_liblinear))

Results for LinearSVC()
Training time: 0.448868s; Prediction time: 0.001059s
             precision    recall  f1-score   support

Conservative       0.64      0.66      0.65      3991
     Labour       0.65      0.62      0.63      4009

avg / total       0.64      0.64      0.64      8000



In [30]:
from sklearn.linear_model import SGDClassifier

elastic_clf = SGDClassifier(loss='log', alpha=.00002, n_iter=200, penalty="elasticnet")
t0 = time.time()
elastic_clf.fit(X_train, y_train)
t1 = time.time()
prediction_elastic = elastic_clf.predict(X_test)
t2 = time.time()
time_elastic_train = t1-t0
time_elastic_predict = t2-t1

print("Results for Elastic Net()")
print("Training time: %fs; Prediction time: %fs" % (time_elastic_train, time_elastic_predict))
print(classification_report(y_test, prediction_elastic))

Results for Elastic Net()
Training time: 4.379103s; Prediction time: 0.001238s
             precision    recall  f1-score   support

Conservative       0.66      0.68      0.67      3991
     Labour       0.67      0.65      0.66      4009

avg / total       0.66      0.66      0.66      8000



In [31]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print "\t%.4f\t%-20s\t\t%.4f\t%-20s" % (coef_1, fn_1, coef_2, fn_2)

In [32]:
show_most_informative_features(train_vect, elastic_clf, 100)

	-4.9061	labour member       		4.5134	conserv parti       
	-3.8063	labour parti        		3.7013	friend minist       
	-3.4125	secretari state     		3.1306	friend member birmingham
	-3.2792	paymast gener       		3.1031	tori parti          
	-3.2665	leader hou          		3.0677	bedroom tax         
	-3.2570	hon member birmingham		2.8464	hon member bromley  
	-2.8507	minist agr          		2.8195	hon friend minist   
	-2.7511	financ secretari    		2.7490	wide rang           
	-2.7088	hon member rhondda  		2.6880	conserv member      
	-2.6190	hon ladi            		2.6076	right hon friend    
	-2.5641	work experi         		2.5640	tori govern         
	-2.5572	european court      		2.5035	learn gentleman     
	-2.5040	friend member south 		2.5035	hon learn gentleman 
	-2.4980	grate minist        		2.4959	hon member stone    
	-2.4526	hon learn ladi      		2.4794	hon member richmond 
	-2.4526	learn ladi          		2.4197	energi compani      
	-2.4135	friend member bromley		2.4142	hon member w

In [33]:
elastic_clf.predict_proba(X_test)

array([[ 0.57070082,  0.42929918],
       [ 0.71435708,  0.28564292],
       [ 0.38103458,  0.61896542],
       ..., 
       [ 0.75160613,  0.24839387],
       [ 0.52588482,  0.47411518],
       [ 0.31653077,  0.68346923]])

In [34]:
X_test.shape

(8000, 28246)