### Model Text/Political Party:
   
   a. Go to uk_proceedings_clean_subset.csv and create all bigrams/trigrams that are in more than 10 documents.
   
   b. Go to roberts_rules and create all bigrams/trigrams
   
   c. Remove all roberts_rules bigrams/trigrams from uk_proceedings bigrams/trigrams
   
   d. Use Regularized Logistic Regression or SVM to predict Labour/Conservative using the remaining bigrams/trigrams
   

In [1]:
import time
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../uk_proceedings_clean.zip', usecols=['PARTY', 'SPEECH_TEXT'])
df

Unnamed: 0,PARTY,SPEECH_TEXT
0,Labour,sir edward truli beauti day sun shine god heav...
1,Conservative,sir edward may say delight back leader interru...
2,Labour,sir edward sat eight speaker like endors every...
3,SPK,five year sir edward sinc abl address way firs...
4,,stand upper step take chair speakerelect wish ...
5,Labour,madam speakerelect rise congratul secur unanim...
6,Conservative,madam speakerelect delight join prime minist c...
7,Liberal Democrats,madam speakerelect give great pleasur behalf b...
8,Scottish National Party,behalf joint scottish nation partyplaid cymru ...
9,Ulster Unionist Party,great pleasur rise endors much said alreadi ex...


In [3]:
df.groupby('PARTY').agg({'PARTY': 'count'})

Unnamed: 0_level_0,PARTY
PARTY,Unnamed: 1_level_1
Alliance,278
CWM,5709
Conservative,336361
DCWM,10996
Democratic Unionist Party,6014
Green,756
Independent,1324
Independent Conservative,46
Independent Labour,44
Independent Ulster Unionist,89


In [4]:
sdf = df[df.PARTY.isin(['Labour', 'Conservative']) & df.SPEECH_TEXT.notnull()]

In [5]:
sdf.loc[:, 'textlen'] = sdf.loc[:, 'SPEECH_TEXT'].apply(lambda c: len(c))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
sdf = sdf[sdf.textlen >= 2000]

In [7]:
sdf.groupby('PARTY').agg({'PARTY': 'count'})

Unnamed: 0_level_0,PARTY
PARTY,Unnamed: 1_level_1
Conservative,22299
Labour,27301


In [8]:
import random

def RandomSample(data, size):
    """Returns random sample for given size
       If training data set is too big, we can reduce its size
    """
    return data.ix[random.sample(data.index, size)]


def StratifiedSample(data, nperlabel, label='label'):
    """Returns stratified data with N per label
    """
    sample = pd.DataFrame()
    datagrp = data.groupby(label)
    sortedgrp = datagrp.size().sort_values(ascending=False)
    for i, l in enumerate(sortedgrp.index):
        if sortedgrp[l] > nperlabel:
            print("==> %-50s %6d" % (l, sortedgrp[l]))
            sample = sample.append(RandomSample(data[data[label] == l],
                                   nperlabel))
        else:
            break
    print("There are %d labels have more than %d articles" % (i + 1, nperlabel))
    print("Sample size: %s articles" % (len(sample)))
    return sample


In [9]:
N_PER_LABEL = 10000

subset = StratifiedSample(sdf, N_PER_LABEL, 'PARTY')

==> Labour                                              27301
==> Conservative                                        22299
There are 2 labels have more than 10000 articles
Sample size: 20000 articles


In [10]:
subset

Unnamed: 0,PARTY,SPEECH_TEXT,textlen
286202,Labour,think hon ladi find professor stewart restrict...,9353
219126,Labour,grate chanc discuss nuffield report teach fore...,7089
397804,Labour,permiss mr speaker would like make statement d...,4463
180160,Labour,term strict word bill answer ye howev term ind...,4840
281188,Labour,perfectli awar statement hon friend draw atten...,2922
141670,Labour,worthwhil debat enabl right hon friend secreta...,4349
904269,Labour,sure hon gentleman know take issu serious seen...,2571
164640,Labour,hous rare heard meaner speech one made hon mem...,2162
366266,Labour,add warmest congratul hon friend member west r...,2230
426549,Labour,made point earlier accept hon gentleman say mu...,2925


In [11]:
#subset.to_csv('/tmp/subset.csv.gz', compression='gzip', index=False)

In [12]:
import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
import re
import string

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

#def tokenize(text):
#    tokens = nltk.word_tokenize(text)
#    stems = stem_tokens(tokens, stemmer)
#    return stems

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


In [13]:
with open('../roberts_rules/all_text.txt', 'rt') as f:
    text = f.read()
text = text.decode('ascii', 'ignore')
text = re.sub(r'\d+', '', text)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

In [15]:
vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3)) 
vect.fit([text])
roberts_rules = set(vect.get_feature_names())

In [16]:
X_train, X_test, y_train, y_test = train_test_split(subset.SPEECH_TEXT, subset.PARTY, test_size=0.2, random_state=42)

In [17]:
#all_vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=10)
#all_vect = CountVectorizer(ngram_range=(2, 3), min_df=10)
all_vect = CountVectorizer(ngram_range=(2, 3), min_df=10, stop_words=['friend', 'member', 'hon', 'ladi', 'gentleman'])
all_vect.fit(X_train.astype(str))
vocab = []
i = 0
for a in all_vect.vocabulary_:
    if a not in roberts_rules:
        vocab.append(a)
    else:
        #print a
        i += 1
print("Removed {0:d}".format(i))
print("Total {0:d}".format(len(vocab)))

Removed 2477
Total 132607


In [18]:
#train_vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=10, vocabulary=vocab)
#train_vect = CountVectorizer(ngram_range=(2, 3), min_df=10, vocabulary=vocab)
train_vect = CountVectorizer(ngram_range=(2, 3), min_df=10, vocabulary=vocab, stop_words=['friend', 'member', 'hon', 'ladi', 'gentleman'])

In [19]:
X_train = train_vect.transform(X_train.astype(str))
transformer = TfidfTransformer()
X_train = transformer.fit_transform(X_train)

In [20]:
X_test = train_vect.transform(X_test.astype(str))
transformer = TfidfTransformer()
X_test = transformer.fit_transform(X_test)

In [21]:
from sklearn.linear_model import SGDClassifier

elastic_clf = SGDClassifier(loss='log', alpha=.00002, n_iter=200, penalty="elasticnet")
t0 = time.time()
elastic_clf.fit(X_train, y_train)
t1 = time.time()
prediction_elastic = elastic_clf.predict(X_test)
t2 = time.time()
time_elastic_train = t1-t0
time_elastic_predict = t2-t1

print("Results for Elastic Net")
print("Training time: %fs; Prediction time: %fs" % (time_elastic_train, time_elastic_predict))
print(classification_report(y_test, prediction_elastic))

Results for Elastic Net
Training time: 17.344430s; Prediction time: 0.005730s
             precision    recall  f1-score   support

Conservative       0.76      0.77      0.77      1981
     Labour       0.78      0.76      0.77      2019

avg / total       0.77      0.77      0.77      4000



In [22]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print "\t%.4f\t%-20s\t\t%.4f\t%-20s" % (coef_1, fn_1, coef_2, fn_2)

In [23]:
show_most_informative_features(train_vect, elastic_clf, 100)

	-7.6254	secretari state     		8.7919	right secretari     
	-5.1908	labour parti        		8.7244	right secretari state
	-4.8900	financi secretari   		6.8433	right minist        
	-4.4804	hope minist         		5.5866	right home secretari
	-4.2001	offici report       		5.5717	right home          
	-3.9866	minist said         		5.1739	conserv parti       
	-3.8865	coalit govern       		4.4298	right chancellor    
	-3.6495	new labour          		4.4171	tori govern         
	-3.6122	hous adjourn        		4.3458	amend no            
	-3.5988	paymast gener       		4.1364	tori parti          
	-3.3071	minist know         		4.1152	right prime         
	-3.2794	perhap minist       		4.0966	right prime minist  
	-3.2434	red book            		3.8673	right foreign       
	-3.2359	econom secretari    		3.7398	right foreign secretari
	-3.2077	agre hous           		3.2768	way forward         
	-3.1959	govern fail         		3.2377	trade union         
	-3.1528	red tape            		3.1656	south wale    

In [24]:
elastic_clf.predict_proba(X_test)

array([[ 0.87373131,  0.12626869],
       [ 0.48098852,  0.51901148],
       [ 0.30823505,  0.69176495],
       ..., 
       [ 0.52484475,  0.47515525],
       [ 0.35799884,  0.64200116],
       [ 0.50084232,  0.49915768]])

In [25]:
X_test.shape

(4000, 132607)

## Save model to files

In [27]:
from sklearn.externals import joblib

#joblib.dump(train_vect, "../models/vec_count_23gram_077.joblib")
#joblib.dump(elastic_clf, "../models/clf_elasticnet_077.joblib")


['../models/clf_elasticnet_077.joblib',
 '../models/clf_elasticnet_077.joblib_01.npy',
 '../models/clf_elasticnet_077.joblib_02.npy',
 '../models/clf_elasticnet_077.joblib_03.npy',
 '../models/clf_elasticnet_077.joblib_04.npy']

In [28]:
def get_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top_a = coefs_with_fns[:n]
    top_b = coefs_with_fns[:-(n + 1):-1]
    return top_a, top_b

top_a, top_b = get_most_informative_features(train_vect, elastic_clf, 100)
top_a_df = pd.DataFrame(top_a)
top_a_df.columns = ['coef', 'term']
top_b_df = pd.DataFrame(top_b)
top_b_df.columns = ['coef', 'term']
top_a_df.to_csv('../data/top10_cons_3a.csv', index=False)
top_b_df.to_csv('../data/top10_lab_3a.csv', index=False)