In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# random
from random import shuffle


%matplotlib inline
training_tokens = pd.read_pickle('../data/training_tokens.pk')
test_tokens = pd.read_pickle('../data/test_tokens.pk')
df_variants = pd.read_csv('../data/training_variants.csv',index_col=0)
df = pd.merge(df_variants,training_tokens,left_index=True,right_index=True)
df.head()

Unnamed: 0_level_0,Gene,Variation,Class,text,processed
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...,"[cyclin-depend, kinas, cdk, regul, varieti, fu..."
1,CBL,W802*,2,Abstract Background Non-small cell lung canc...,"[abstract, background, non-smal, cell, lung, c..."
2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...,"[abstract, background, non-smal, cell, lung, c..."
3,CBL,N454D,3,Recent evidence has demonstrated that acquired...,"[recent, evid, demonstr, acquir, uniparent, di..."
4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...,"[oncogen, mutat, monomer, casita, b-lineag, ly..."


In [9]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
    
    def __iter__(self):
        for source, series in self.sources.items():
            for item_no, list_ in enumerate(series):
                yield LabeledSentence(list_, [source + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, series in self.sources.items():
            for item_no, list_ in enumerate(series):
                self.sentences.append(LabeledSentence(list_, [source + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [76]:
ll = LabeledLineSentence({'train':training_tokens.processed,'test':test_tokens.processed})

model = Doc2Vec(min_count=1, window=10, size=100, negative=5, workers=8)

model.build_vocab(ll.to_array())

In [77]:
model.train(ll.sentences,total_examples=model.corpus_count,epochs=10)

436817904

In [78]:
model.save('./imdb.d2v')
model = Doc2Vec.load('./imdb.d2v')

In [79]:
training_docvecs = np.vstack([model.docvecs['train_{}'.format(i)] for i in range(training_tokens.shape[0])])

In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
lr = LogisticRegression()
-cross_val_score(lr,training_docvecs,df.Class,scoring='neg_log_loss')

array([ 2.05214121,  1.89688489,  1.87893022])

In [81]:
from sklearn.svm import SVC
svc = SVC(probability=True)
-cross_val_score(svc,training_docvecs,df.Class,scoring='neg_log_loss',n_jobs=-1)

array([ 1.7555764 ,  1.73151985,  1.73879514])

In [82]:
lr.fit(training_docvecs,df.Class)
pred = lr.predict(training_docvecs)
from sklearn.metrics import confusion_matrix
confusion_matrix(df.Class.values,pred)

array([[289,  26,   0, 145,  19,  16,  72,   1,   0],
       [ 23, 137,   1,  28,  14,  10, 239,   0,   0],
       [  5,   1,  26,  21,   2,   1,  33,   0,   0],
       [119,  17,   7, 399,  16,  12, 116,   0,   0],
       [ 40,   5,   3,  46,  64,  14,  70,   0,   0],
       [ 33,   8,   3,  33,   7, 139,  52,   0,   0],
       [ 32,  60,   6,  61,   9,  14, 770,   0,   1],
       [  2,   0,   0,   0,   0,   0,   5,  11,   1],
       [  2,   1,   0,   0,   0,   0,   4,   0,  30]])

In [83]:
svc.fit(training_docvecs,df.Class)
pred = svc.predict(training_docvecs)
from sklearn.metrics import confusion_matrix
confusion_matrix(df.Class.values,pred)

array([[487,   3,   1,  50,  18,   5,   4,   0,   0],
       [  6, 349,   0,   9,   2,   2,  84,   0,   0],
       [  4,   0,  31,  19,   4,   0,  31,   0,   0],
       [ 20,   0,   3, 643,  11,   3,   6,   0,   0],
       [ 39,   5,   4,  18, 133,  11,  32,   0,   0],
       [ 21,   5,   1,  14,   8, 213,  13,   0,   0],
       [  2,   7,   6,   8,   6,   0, 924,   0,   0],
       [  2,   1,   0,   0,   0,   0,   6,   9,   1],
       [  3,   1,   0,   2,   0,   0,   3,   0,  28]])

In [84]:
from sklearn.metrics import log_loss

In [85]:
print(log_loss(df.Class,svc.predict_proba(training_docvecs)))
print(log_loss(df.Class,lr.predict_proba(training_docvecs)))

0.502919342836
1.22059425388


In [86]:
from sklearn.model_selection import GridSearchCV
params = {'C':[2**i for i in range(-4,5)]}
gs = GridSearchCV(svc,params,scoring='neg_log_loss',n_jobs=-1)
gs.fit(training_docvecs,df.Class)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [87]:
gs.grid_scores_



[mean: -1.75708, std: 0.03146, params: {'C': 0.0625},
 mean: -1.75383, std: 0.03641, params: {'C': 0.125},
 mean: -1.73908, std: 0.03921, params: {'C': 0.25},
 mean: -1.74365, std: 0.03610, params: {'C': 0.5},
 mean: -1.74478, std: 0.01755, params: {'C': 1},
 mean: -1.75564, std: 0.02748, params: {'C': 2},
 mean: -1.76119, std: 0.03527, params: {'C': 4},
 mean: -1.77003, std: 0.04862, params: {'C': 8},
 mean: -1.77746, std: 0.04930, params: {'C': 16}]