### Model Text/Political Party:
   
   a. Go to uk_proceedings_clean_subset.csv and create all bigrams/trigrams that are in more than 10 documents.
   
   b. Go to roberts_rules and create all bigrams/trigrams
   
   c. Remove all roberts_rules bigrams/trigrams from uk_proceedings bigrams/trigrams
   
   d. Use Regularized Logistic Regression or SVM to predict Labour/Conservative using the remaining bigrams/trigrams
   

In [1]:
import time
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../uk_proceedings_clean.zip', usecols=['PARTY', 'SPEECH_TEXT'])
df

Unnamed: 0,PARTY,SPEECH_TEXT
0,Labour,sir edward truli beauti day sun shine god heav...
1,Conservative,sir edward may say delight back leader interru...
2,Labour,sir edward sat eight speaker like endors every...
3,SPK,five year sir edward sinc abl address way firs...
4,,stand upper step take chair speakerelect wish ...
5,Labour,madam speakerelect rise congratul secur unanim...
6,Conservative,madam speakerelect delight join prime minist c...
7,Liberal Democrats,madam speakerelect give great pleasur behalf b...
8,Scottish National Party,behalf joint scottish nation partyplaid cymru ...
9,Ulster Unionist Party,great pleasur rise endors much said alreadi ex...


In [3]:
df.groupby('PARTY').agg({'PARTY': 'count'})

Unnamed: 0_level_0,PARTY
PARTY,Unnamed: 1_level_1
Alliance,278
CWM,5709
Conservative,336361
DCWM,10996
Democratic Unionist Party,6014
Green,756
Independent,1324
Independent Conservative,46
Independent Labour,44
Independent Ulster Unionist,89


In [4]:
sdf = df[df.PARTY.isin(['Labour', 'Conservative']) & df.SPEECH_TEXT.notnull()]

In [5]:
sdf.loc[:, 'textlen'] = sdf.loc[:, 'SPEECH_TEXT'].apply(lambda c: len(c))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
sdf = sdf[sdf.textlen >= 2000]

In [7]:
sdf.groupby('PARTY').agg({'PARTY': 'count'})

Unnamed: 0_level_0,PARTY
PARTY,Unnamed: 1_level_1
Conservative,22299
Labour,27301


In [8]:
import random

def RandomSample(data, size):
    """Returns random sample for given size
       If training data set is too big, we can reduce its size
    """
    return data.ix[random.sample(data.index, size)]


def StratifiedSample(data, nperlabel, label='label'):
    """Returns stratified data with N per label
    """
    sample = pd.DataFrame()
    datagrp = data.groupby(label)
    sortedgrp = datagrp.size().sort_values(ascending=False)
    for i, l in enumerate(sortedgrp.index):
        if sortedgrp[l] > nperlabel:
            print("==> %-50s %6d" % (l, sortedgrp[l]))
            sample = sample.append(RandomSample(data[data[label] == l],
                                   nperlabel))
        else:
            break
    print("There are %d labels have more than %d articles" % (i + 1, nperlabel))
    print("Sample size: %s articles" % (len(sample)))
    return sample


In [9]:
N_PER_LABEL = 10000

subset = StratifiedSample(sdf, N_PER_LABEL, 'PARTY')

==> Labour                                              27301
==> Conservative                                        22299
There are 2 labels have more than 10000 articles
Sample size: 20000 articles


In [10]:
subset

Unnamed: 0,PARTY,SPEECH_TEXT,textlen
544470,Labour,debat govern draft legisl programmeth list bil...,2448
684590,Labour,absolut pleasur follow hon member midsussex mr...,4331
585115,Labour,give hon gentleman assur respect balanc struck...,2291
506719,Labour,thank hon gentleman know well understand point...,4902
284884,Labour,withdraw saidi want record point appear made c...,4521
547156,Labour,govern respons report accept recommend made im...,2453
930616,Labour,agre get away fact see whole brew industri pub...,2591
194502,Labour,grate opportun rais regul care commun scheme f...,6503
642068,Labour,apologis right hon friend secretari state oppo...,4507
653926,Labour,soldier kill afghanistan kyle adam charlen bar...,2357


In [11]:
#subset.to_csv('/tmp/subset.csv.gz', compression='gzip', index=False)

In [12]:
import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
import re
import string

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

#def tokenize(text):
#    tokens = nltk.word_tokenize(text)
#    stems = stem_tokens(tokens, stemmer)
#    return stems

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


In [13]:
with open('../roberts_rules/all_text.txt', 'rt') as f:
    text = f.read()
text = text.decode('ascii', 'ignore')
text = re.sub(r'\d+', '', text)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

In [15]:
vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3)) 
vect.fit([text])
roberts_rules = set(vect.get_feature_names())

In [16]:
X_train, X_test, y_train, y_test = train_test_split(subset.SPEECH_TEXT, subset.PARTY, test_size=0.2, random_state=42)

In [17]:
#all_vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=10)
all_vect = CountVectorizer(ngram_range=(2, 3), min_df=10)
#all_vect = CountVectorizer(ngram_range=(2, 3), min_df=10, stop_words=['friend', 'member', 'hon', 'ladi', 'gentleman'])
all_vect.fit(X_train.astype(str))
vocab = []
i = 0
for a in all_vect.vocabulary_:
    if a not in roberts_rules:
        vocab.append(a)
    else:
        #print a
        i += 1
print("Removed {0:d}".format(i))
print("Total {0:d}".format(len(vocab)))

Removed 2554
Total 136604


In [18]:
uk_media_ngram = pd.read_csv('../data/uk_media_50k_23gram.csv')

In [19]:
uk_media_vocab = uk_media_ngram[uk_media_ngram.ngram.isin(vocab)]

In [20]:
len(uk_media_ngram)

169217

In [21]:
len(uk_media_vocab)

47800

In [22]:
#train_vect = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(2, 3), min_df=10, vocabulary=vocab)
train_vect = CountVectorizer(ngram_range=(2, 3), min_df=10, vocabulary=uk_media_vocab.ngram)
#train_vect = CountVectorizer(ngram_range=(2, 3), min_df=10, vocabulary=vocab, stop_words=['friend', 'member', 'hon', 'ladi', 'gentleman'])

In [23]:
X_train = train_vect.transform(X_train.astype(str))
transformer = TfidfTransformer()
X_train = transformer.fit_transform(X_train)

In [24]:
X_test = train_vect.transform(X_test.astype(str))
transformer = TfidfTransformer()
X_test = transformer.fit_transform(X_test)

In [25]:
from sklearn.linear_model import SGDClassifier

elastic_clf = SGDClassifier(loss='log', alpha=.00002, n_iter=200, penalty="elasticnet")
t0 = time.time()
elastic_clf.fit(X_train, y_train)
t1 = time.time()
prediction_elastic = elastic_clf.predict(X_test)
t2 = time.time()
time_elastic_train = t1-t0
time_elastic_predict = t2-t1

print("Results for Elastic Net")
print("Training time: %fs; Prediction time: %fs" % (time_elastic_train, time_elastic_predict))
print(classification_report(y_test, prediction_elastic))

Results for Elastic Net
Training time: 9.302912s; Prediction time: 0.003244s
             precision    recall  f1-score   support

Conservative       0.74      0.74      0.74      1981
     Labour       0.74      0.74      0.74      2019

avg / total       0.74      0.74      0.74      4000



In [26]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print "\t%.4f\t%-20s\t\t%.4f\t%-20s" % (coef_1, fn_1, coef_2, fn_2)

In [27]:
show_most_informative_features(train_vect, elastic_clf, 100)

	-6.6304	labour member       		7.3809	friend home         
	-6.0219	offici report       		6.3868	tori govern         
	-5.2847	minist may          		4.2237	tori parti          
	-5.0986	secretari state     		4.1004	trade union         
	-4.9075	financi secretari   		4.0034	opposit parti       
	-4.7959	minist said         		3.9778	busi secretari      
	-4.5309	labour parti        		3.9404	conserv parti       
	-4.1650	agre hous           		3.6646	govern member       
	-4.0415	minist would        		3.6187	south yorkshir      
	-3.7203	declar interest     		3.4849	previou administr   
	-3.6841	govern seem         		3.3649	conserv member      
	-3.6819	good point          		3.3611	point rais          
	-3.5834	side hous           		3.3147	bedroom tax         
	-3.5536	coalit govern       		3.3115	work peopl          
	-3.5455	polit correct       		3.2947	south wale          
	-3.4824	tell us             		3.2693	greater manchest    
	-3.3178	public financ       		3.2446	unit nation       

In [28]:
elastic_clf.predict_proba(X_test)

array([[ 0.96458298,  0.03541702],
       [ 0.25061432,  0.74938568],
       [ 0.46569952,  0.53430048],
       ..., 
       [ 0.50337894,  0.49662106],
       [ 0.04627053,  0.95372947],
       [ 0.80784458,  0.19215542]])

In [29]:
X_test.shape

(4000, 47800)

## Save model to files

In [31]:
from sklearn.externals import joblib

#joblib.dump(train_vect, "../models/vec_count_ukmedia_23gram_074.joblib")
#joblib.dump(elastic_clf, "../models/clf_elasticnet_ukmedia_23gram_074.joblib")


['../models/clf_elasticnet_ukmedia_23gram_074.joblib',
 '../models/clf_elasticnet_ukmedia_23gram_074.joblib_01.npy',
 '../models/clf_elasticnet_ukmedia_23gram_074.joblib_02.npy',
 '../models/clf_elasticnet_ukmedia_23gram_074.joblib_03.npy',
 '../models/clf_elasticnet_ukmedia_23gram_074.joblib_04.npy']

In [47]:
def get_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top_a = coefs_with_fns[:n]
    top_b = coefs_with_fns[:-(n + 1):-1]
    return top_a, top_b

top_a, top_b = get_most_informative_features(train_vect, elastic_clf, 100)
top_a_df = pd.DataFrame(top_a)
top_a_df.columns = ['coef', 'term']
top_b_df = pd.DataFrame(top_b)
top_b_df.columns = ['coef', 'term']
top_a_df.to_csv('../data/top10_cons_3c.csv', index=False)
top_b_df.to_csv('../data/top10_lab_3c.csv', index=False)