In [1]:
import pickle
import pandas as pd
import numpy as np
import re

In [2]:
# Load the pre-cleaned train/test frames from their pickle files.
# NOTE: unpickling executes whatever the file encodes, so only point this at
# files you produced yourself or otherwise trust.
train, test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_clean.pkl', 'test_clean.pkl')
)

In [3]:
# Preview the first rows of the training frame loaded from 'train_clean.pkl'.
train.head()

Unnamed: 0,id,title,text_clean
0,619941,Loch Katrine,\n\n\n\n\nInfobox lake\n name Loch Katrine\n ...
1,3884222,Bhadayasa,\n\nInfobox royalty\n image FileBhadrayasha c...
2,4229879,Lee Jones (author),\n\nLee Jones is an online poker executive and...
3,5320685,School District 54 Bulkley Valley,Infobox school district\n name Scho...
4,9146365,Combing,Interlace disambiguationInterlacethe combing o...


In [4]:
# Preview the first rows of the test frame loaded from 'test_clean.pkl'.
test.head()

Unnamed: 0,id,title,text_clean
0,2936718,Meitei Christians,\n\n\nThe Meitei Christians are a Christianity...
1,17023672,Musical expression,\n\n\n\n\nMusical expression is the art of pla...
2,37590035,READ 180,\nREAD is a reading intervention program in w...
3,3191002,The Ambulance,\nInfobox film\n name The Ambulanc...
4,24154353,Sifo Company,\n\nThe Sifo Company which did business as Sif...


In [5]:
# Report (rows, columns) for the test frame, then the train frame.
# A single pre-formatted string prints identically under the Python 2 print
# statement and the Python 3 print function.
print("%s %s" % (test.shape, train.shape))

(1578, 3) (4379, 3)


In [6]:
# Tokenise the article text in both frames.
def _to_word_list(raw_text):
    """Return the lower-cased alphabetic tokens of ``raw_text`` as a list.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and then split on whitespace.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    return letters_only.lower().split()

# NOTE: 'text_clean' is overwritten in place (string -> list of words), so this
# cell is meant to be run exactly once per freshly loaded frame.
train['text_clean'] = train['text_clean'].map(_to_word_list)
test['text_clean'] = test['text_clean'].map(_to_word_list)
test.head()

Unnamed: 0,id,title,text_clean
0,2936718,Meitei Christians,"[the, meitei, christians, are, a, christianity..."
1,17023672,Musical expression,"[musical, expression, is, the, art, of, playin..."
2,37590035,READ 180,"[read, is, a, reading, intervention, program, ..."
3,3191002,The Ambulance,"[infobox, film, name, the, ambulance, image, t..."
4,24154353,Sifo Company,"[the, sifo, company, which, did, business, as,..."


In [7]:
from nltk.corpus import stopwords # Import the stop word list
# Show NLTK's English stop-word list. Parenthesising the single argument keeps
# the output unchanged on Python 2 and makes the line valid on Python 3.
print(stopwords.words("english"))

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [8]:
# Materialise the English stop-word list as a set; the filtering step that
# follows tests membership in it once per token.
stops = set(stopwords.words("english")) 

In [9]:
# Drop stop words and very short tokens, then collapse each word list back
# into one space-separated string.
def _drop_noise_words(words):
    """Join ``words`` with spaces, skipping stop words and tokens of length <= 2."""
    kept = [word for word in words if len(word) > 2 and word not in stops]
    return " ".join(kept)

# NOTE: 'text_clean' is overwritten in place (list of words -> string), so this
# cell is meant to be run exactly once after the tokenising cell.
train['text_clean'] = train['text_clean'].map(_drop_noise_words)
test['text_clean'] = test['text_clean'].map(_drop_noise_words)


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
# Word-level bag-of-words vectoriser. Tokenising, preprocessing and stop-word
# removal are left off here because the text was already cleaned by the
# earlier cells; max_features caps the vocabulary at 5000 terms.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

In [45]:
# Learn the vocabulary from the cleaned training text and build the
# document-term count matrix for the training set in one step.
train_data_features = vectorizer.fit_transform(train['text_clean'])

In [46]:
# Convert the vectoriser output to a dense NumPy array.
# The guard makes the cell safe to re-run: once the name already holds an
# ndarray there is no `.toarray()` to call, and an unguarded call would raise
# AttributeError.
if hasattr(train_data_features, "toarray"):
    train_data_features = train_data_features.toarray()

In [47]:
# (documents, vocabulary terms) of the dense training matrix.
# A single parenthesised argument prints identically on Python 2 and 3.
print(train_data_features.shape)

(4379, 5000)


In [48]:
# Vocabulary terms kept by the vectoriser; the same list is used as the column
# labels of the feature matrix.
vocab = vectorizer.get_feature_names()
# A single parenthesised argument prints identically on Python 2 and 3.
print(vocab)



In [49]:
# Wrap the dense training counts in a labelled frame: one row per article
# title, one column per vocabulary term.
feature_matrix = pd.DataFrame(
    train_data_features,
    index=train['title'],
    columns=vectorizer.get_feature_names(),
)
# A single parenthesised argument prints identically on Python 2 and 3.
print(feature_matrix.shape)

(4379, 5000)


In [33]:
# Read the POV article list and discard the 'Unnamed: 0' column that comes
# along from the CSV. The drop is chained rather than done in place, so `pov`
# is bound once to the finished frame.
pov = pd.read_csv('POV.csv').drop('Unnamed: 0', axis=1)
pov.head()

Unnamed: 0,id,title
0,5597,'Politics of Cyprus'
1,14668,'Economy of Iraq'
2,24400,'Pair programming'
3,32783,'Antisemitism and the New Testament'
4,38424,'Bikram Yoga'


In [17]:
def _unwrap_quotes(title):
    """Return ``title`` without one enclosing pair of single quotes.

    Only a leading AND trailing ``'`` are removed. Apostrophes inside the
    title are preserved, so a title that contains an apostrophe can still be
    matched exactly later on. Titles that are not wrapped in quotes are
    returned unchanged.
    """
    if len(title) >= 2 and title[0] == "'" and title[-1] == "'":
        return title[1:-1]
    return title

# Titles in POV.csv are wrapped in single quotes; strip just that wrapper.
# NOTE(review): the previous version deleted every apostrophe in the title.
# This assumes the titles in train/test keep their apostrophes - confirm
# against the data that produced the pickles.
pov['title_strip'] = pov['title'].map(_unwrap_quotes)
pov.head()

Unnamed: 0,id,title,title_strip
0,5597,'Politics of Cyprus',Politics of Cyprus
1,14668,'Economy of Iraq',Economy of Iraq
2,24400,'Pair programming',Pair programming
3,32783,'Antisemitism and the New Testament',Antisemitism and the New Testament
4,38424,'Bikram Yoga',Bikram Yoga


In [18]:
# Label both the train and the test set: an article is marked pov=True when
# its title appears among the unquoted titles of the POV list.
pov_titles = pov['title_strip']
train['pov'] = train['title'].isin(pov_titles)
test['pov'] = test['title'].isin(pov_titles)

test.head()

Unnamed: 0,id,title,text_clean,pov
0,2936718,Meitei Christians,meitei christians christianity movement based ...,True
1,17023672,Musical expression,musical expression art playing singing music e...,True
2,37590035,READ 180,read reading intervention program wide use stu...,True
3,3191002,The Ambulance,infobox film name ambulance image ambulancejpg...,False
4,24154353,Sifo Company,sifo company business sifos toys sifo novelty ...,False


In [20]:
#join features with labels
#feature_matrix = pd.concat([feature_matrix, pd.DataFrame({'pov': list(df['pov'])}, index=df['title'])], axis=1)

In [50]:
# Bag of words for the test set as a dense NumPy array. `transform` (not
# `fit_transform`) is used, so the test text is encoded with the vocabulary
# the vectoriser was fitted with earlier.
test_data_features = vectorizer.transform(test['text_clean']).toarray()

In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier (C=100) trained on the labelled
# bag-of-words frame; `fit` returns the estimator itself, so construction and
# training are chained.
svc = SVC(kernel='linear', C=100).fit(feature_matrix, train['pov'])

In [None]:
# RBF-kernel support vector classifier (C=1) trained on the same features and
# labels as the linear model, for comparison.
svc_radial = SVC(kernel='rbf', C=1).fit(feature_matrix, train['pov'])

In [54]:
# Mean accuracy of the linear SVC on the held-out test set.
# The label and value are formatted into one string so the line prints the
# same text under Python 2 and Python 3.
print("test score: %s" % svc.score(test_data_features, test['pov']))
# Train-set accuracy, left disabled as in the original cell:
#print("train score: %s" % svc.score(train_data_features, train['pov']))

test score: 0.854245880862
train score: 1.0


In [37]:
# Accuracy of the linear SVC restricted to test articles labelled POV
# (i.e. the fraction of true POV articles it predicts as POV).
# The expected-label vector is sized from the mask itself rather than a
# hard-coded row count, so the cell keeps working if the test set changes.
pov_rows = test['pov'].values.astype(bool)
svc.score(test_data_features[pov_rows], np.ones(pov_rows.sum(), dtype=bool))

0.88720770288858319

In [38]:
# Accuracy of the linear SVC restricted to test articles NOT labelled POV
# (i.e. the fraction of non-POV articles it predicts as non-POV).
# The expected-label vector is sized from the mask itself rather than a
# hard-coded row count, so the cell keeps working if the test set changes.
non_pov_rows = ~test['pov'].values.astype(bool)
svc.score(test_data_features[non_pov_rows], np.zeros(non_pov_rows.sum(), dtype=bool))

0.79553466509988247

1.0

In [28]:
# Mean accuracy of the RBF-kernel SVC on the test set and on the training set.
# Each label and value is formatted into one string so the lines print the
# same text under Python 2 and Python 3.
print("test score: %s" % svc_radial.score(test_data_features, test['pov']))
print("train score %s" % svc_radial.score(train_data_features, train['pov']))

test score: 0.777566539924
train score 0.867321306234


In [26]:
# Show the learned per-feature weights of the linear-kernel `svc`.
svc.coef_

array([[-0.0635912 , -0.18849899,  0.06561046, ...,  0.04556072,
         0.        ,  0.22078349]])