In [19]:
import os
import pickle
import re

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [45]:
# Load the pre-built train and test frames.
# NOTE: read_pickle unpickles the file, so only use files you produced yourself.
train, test = map(pd.read_pickle, ('train_bow.pkl', 'test_bow.pkl'))

In [91]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,title,text_clean,pov
0,619941,Loch Katrine,infobox lake name loch katrin imag loch katrin...,False
1,3884222,Bhadayasa,infobox royalti imag filebhadrayasha coinjpg a...,False
2,4229879,Lee Jones (author),lee jone onlin poker execut author win low lim...,False
3,5320685,School District 54 Bulkley Valley,infobox school district name school district b...,False
4,9146365,Combing,interlac disambiguationinterlaceth comb hairco...,False


In [65]:
# Load the list of articles flagged as POV and derive a bare title for matching.
# The saved index column ('Unnamed: 0') is dropped without mutating in place.
pov = pd.read_csv('POV.csv').drop(labels=['Unnamed: 0'], axis=1)
# Titles are stored wrapped in single quotes, e.g. 'Politics of Cyprus'.
# Remove only that wrapping pair: deleting every apostrophe would also mangle
# titles containing one internally, so they could never match an article title.
# NOTE(review): assumes train/test titles keep their internal apostrophes - confirm.
pov['title_strip'] = pov['title'].map(lambda x: re.sub(r"^'(.*)'$", r'\1', x))
pov.head()

Unnamed: 0,id,title,title_strip
0,5597,'Politics of Cyprus',Politics of Cyprus
1,14668,'Economy of Iraq',Economy of Iraq
2,24400,'Pair programming',Pair programming
3,32783,'Antisemitism and the New Testament',Antisemitism and the New Testament
4,38424,'Bikram Yoga',Bikram Yoga


In [66]:
# Label every article in both splits: True when its title is in the POV list.
train['pov'], test['pov'] = (
    frame['title'].isin(pov['title_strip']) for frame in (train, test)
)

test.head()

Unnamed: 0,id,title,text_clean,pov
0,2936718,Meitei Christians,meitei christian christian movement base manip...,True
1,17023672,Musical expression,music express art play sing music emot communi...,True
2,37590035,READ 180,read read interv program wide use student grad...,True
3,3191002,The Ambulance,infobox film name ambul imag ambulancejpg imag...,False
4,24154353,Sifo Company,sifo compani busi sifo toy sifo novelti compan...,False


In [47]:
# Stop early unless gensim reports a usable fast training path.
# NOTE(review): presumably FAST_VERSION == -1 marks the slow pure-Python fallback - confirm.
assert gensim.models.doc2vec.FAST_VERSION >= 0, "this will be painfully slow otherwise"

In [48]:
# MyDocs reading from a data frame
class MyDocs(object):
    """Re-iterable stream of TaggedDocument objects built from a data frame.

    Each row yields one TaggedDocument whose words are the tokens returned by
    simple_preprocess for the text column and whose single tag is the string
    form of the tag column.

    Parameters
    ----------
    frame : DataFrame or None
        Frame to read from. When None (the default) the notebook-level
        ``train`` frame is looked up each time iteration starts, which is what
        a bare ``MyDocs()`` has always done.
    text_col : int
        Positional index of the column holding the document text (default 2).
        NOTE(review): per the displayed head() this looks like `text_clean` - confirm.
    tag_col : int
        Positional index of the column used as the document tag (default 1).
        NOTE(review): per the displayed head() this looks like `title` - confirm.
    """

    def __init__(self, frame=None, text_col=2, tag_col=1):
        self.frame = frame
        self.text_col = text_col
        self.tag_col = tag_col

    def __iter__(self):
        # Resolve the frame lazily so a bare MyDocs() follows whatever `train`
        # currently is, exactly like the original global lookup.
        frame = train if self.frame is None else self.frame
        # Pull each column once instead of doing a scalar .iloc[i, j] lookup
        # per row; the iterator is walked several times during training.
        texts = frame.iloc[:, self.text_col]
        tags = frame.iloc[:, self.tag_col]
        for text, tag in zip(texts, tags):
            yield TaggedDocument(words=simple_preprocess(text), tags=['%s' % tag])

In [88]:
%%time

if not os.path.exists('models/doc2vec.model'):
    print "start training doc2vec model..."
    documents = MyDocs()
    doc2vec_model = Doc2Vec(dm=1, dbow_words=1, size=200, window=8, min_count=20, workers=1)
    doc2vec_model.build_vocab(documents)
    doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.iter)
    if not os.path.exists('models'):
        os.makedirs('models')
        doc2vec_model.save('models/doc2vec.model')
    else:
        doc2vec_model.save('models/doc2vec.model')
else:
    doc2vec_model = Doc2Vec.load('models/doc2vec.model')

CPU times: user 295 ms, sys: 591 ms, total: 885 ms
Wall time: 1.84 s


In [85]:
# The original cell called doc2vec_model.initialize_word_vectors() here. The
# call is dropped on purpose: the model reaching this cell is already trained
# or loaded from disk, so there is nothing left to initialise.
# NOTE(review): in pre-3.3 gensim the method appears to replace model.wv with a
# fresh, empty KeyedVectors, which would discard the learned vocabulary that
# infer_vector() needs - confirm against the installed version.
assert len(doc2vec_model.docvecs) > 0, "doc2vec model holds no document vectors"

In [90]:
# Smoke test: infer a document vector for a tiny list of tokens.
doc2vec_model.infer_vector(doc_words=['test', 'exam'])

array([ 0.01331308,  0.03355028,  0.00083345, -0.01781101, -0.00846532,
        0.0115648 ,  0.03717271,  0.01931503, -0.00898304, -0.01628613,
       -0.02143989, -0.01236345, -0.00175919, -0.00817022, -0.0319072 ,
        0.01063433,  0.06575609, -0.02196492, -0.0052598 ,  0.00462697,
        0.01951978, -0.0254368 ,  0.02517551,  0.0192138 , -0.10008786,
        0.00689221, -0.00562786,  0.02308743,  0.03699073,  0.02374153,
       -0.03125314,  0.03349215,  0.03653101,  0.03098488,  0.02748667,
        0.02981285,  0.03268561, -0.06556045,  0.04465526, -0.01643338,
       -0.06122965, -0.00427128,  0.03971937, -0.01968607,  0.00660813,
       -0.03867188,  0.03125178, -0.02825534,  0.02126163, -0.01786485,
        0.04203948,  0.02636727,  0.03465724, -0.03409329, -0.0070151 ,
       -0.04217396, -0.02301036, -0.00711416, -0.04873128,  0.00234991,
       -0.01196017,  0.01518714,  0.00164491, -0.00178916, -0.03028015,
       -0.02253855, -0.00187654, -0.0302928 , -0.01497698, -0.03

In [62]:
# Build the training feature matrix: one row per document, one column per
# component of its learned document vector.
n_docs = train.shape[0]
# Vectors are fetched by position, so the model must hold exactly one vector
# per row of `train`; a stale cached model would silently misalign features
# and labels otherwise.
assert len(doc2vec_model.docvecs) == n_docs, \
    "doc2vec model does not match `train`; remove the cached model and retrain"
# Stack the vectors in one step instead of building a Series per document,
# concatenating column-wise and transposing.
feature_matrix = pd.DataFrame(np.vstack([doc2vec_model.docvecs[i] for i in range(n_docs)]))

In [None]:
# (Removed an unfinished `doc2vec_model.` expression left over from interactive
# attribute completion; it was a SyntaxError and stopped "Run All" at this cell.)

In [76]:
# Baseline 1: logistic regression on the doc2vec features. C=.01 is a small
# inverse-regularisation strength, i.e. strong regularisation.
# LogisticRegression comes from the notebook's import cell.
logit = LogisticRegression(C=.01).fit(feature_matrix, train['pov'])

In [77]:
# Accuracy on the training data itself (not a held-out estimate).
logit.score(X=feature_matrix, y=train['pov'])

0.68455915943353129

In [78]:
# Baseline 2: random forest of 100 entropy-split trees on the doc2vec features.
# RandomForestClassifier comes from the notebook's import cell.
# random_state is pinned so re-running the notebook rebuilds the same forest.
forest = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
forest = forest.fit(feature_matrix, train['pov'])

In [80]:
# TODO: report a test score as well. No feature matrix is built for `test` in
# this notebook, so only the training accuracy is available here; it is not an
# estimate of generalisation.
# Written as a single-argument print() so it runs on Python 2 and Python 3.
print("train score: %s" % forest.score(feature_matrix, train['pov']))

 train score: 1.0


In [83]:
# Baseline 3: linear-kernel support vector classifier on the doc2vec features.
# SVC comes from the notebook's import cell.
svc = SVC(C=100, kernel='linear').fit(feature_matrix, train['pov'])

In [84]:
# Training accuracy of the SVC (scored on the data it was fitted on).
# Written as a single-argument print() so it runs on Python 2 and Python 3.
print("train score: %s" % svc.score(feature_matrix, train['pov']))

train score: 0.709913202376
