In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# random
from random import shuffle


%matplotlib inline
training_tokens = pd.read_pickle('../data/stem-train.pk')
test_tokens = pd.read_pickle('../data/stem-test.pk')
df_variants = pd.read_csv('../data/training_variants.csv',index_col=0)
df = pd.merge(df_variants,training_tokens,left_index=True,right_index=True)
df.head()

Unnamed: 0_level_0,Gene,Variation,Class,text,processed
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...,"[cyclin-depend, kinas, cdk, regul, varieti, fu..."
1,CBL,W802*,2,Abstract Background Non-small cell lung canc...,"[abstract, background, non-smal, cell, lung, c..."
2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...,"[abstract, background, non-smal, cell, lung, c..."
3,CBL,N454D,3,Recent evidence has demonstrated that acquired...,"[recent, evid, demonstr, acquir, uniparent, di..."
4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...,"[oncogen, mutat, monomer, casita, b-lineag, ly..."


In [3]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
    
    def __iter__(self):
        for source, series in self.sources.items():
            for item_no, list_ in enumerate(series):
                yield LabeledSentence(list_, [source + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, series in self.sources.items():
            for item_no, list_ in enumerate(series):
                self.sentences.append(LabeledSentence(list_, [source + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [4]:
ll = LabeledLineSentence({'train':training_tokens.processed,'test':test_tokens.processed})

model = Doc2Vec(min_count=1, window=10, size=100, negative=5, workers=8)

model.build_vocab(ll.to_array())

In [5]:
model.train(ll.sentences,total_examples=model.corpus_count,epochs=10)

436863610

In [6]:
training_docvecs = np.vstack([model.docvecs['train_{}'.format(i)] for i in range(training_tokens.shape[0])])

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import utils
lr = LogisticRegression()
X,y = utils.shuffle(training_docvecs,df.Class.values)

-cross_val_score(lr,X,y,scoring='neg_log_loss')

array([ 1.52193308,  1.44891384,  1.50156088])

In [11]:
from sklearn.svm import SVC
svc = SVC(probability=True)
-cross_val_score(svc,X,y,scoring='neg_log_loss',n_jobs=-1)

array([ 1.05644162,  1.11092472,  1.09573461])

In [13]:
lr.fit(X,y)
pred = lr.predict(X)
from sklearn.metrics import confusion_matrix
confusion_matrix(pred,y)

array([[287,  19,   6, 110,  40,  35,  38,   0,   2],
       [ 17, 133,   1,  17,   4,  11,  55,   2,   1],
       [  0,   1,  27,   6,   4,   2,   8,   0,   0],
       [140,  31,  17, 428,  43,  36,  65,   1,   3],
       [ 25,  11,   3,  17,  68,   9,  11,   0,   0],
       [ 27,  12,   3,  13,  11, 135,  13,   0,   0],
       [ 72, 244,  32,  94,  72,  47, 762,   5,   0],
       [  0,   1,   0,   0,   0,   0,   0,  10,   0],
       [  0,   0,   0,   1,   0,   0,   1,   1,  31]])

In [14]:
svc.fit(X,y)
pred = lr.predict(X)
from sklearn.metrics import confusion_matrix
confusion_matrix(pred,y)

array([[287,  19,   6, 110,  40,  35,  38,   0,   2],
       [ 17, 133,   1,  17,   4,  11,  55,   2,   1],
       [  0,   1,  27,   6,   4,   2,   8,   0,   0],
       [140,  31,  17, 428,  43,  36,  65,   1,   3],
       [ 25,  11,   3,  17,  68,   9,  11,   0,   0],
       [ 27,  12,   3,  13,  11, 135,  13,   0,   0],
       [ 72, 244,  32,  94,  72,  47, 762,   5,   0],
       [  0,   1,   0,   0,   0,   0,   0,  10,   0],
       [  0,   0,   0,   1,   0,   0,   1,   1,  31]])

In [16]:
from sklearn.metrics import log_loss

In [17]:
print(log_loss(df.Class,svc.predict_proba(training_docvecs)))
print(log_loss(df.Class,lr.predict_proba(training_docvecs)))

0.498159767078
1.22370026015


In [18]:
from sklearn.model_selection import GridSearchCV
params = {'C':[2**i for i in range(-4,5)]}
gs = GridSearchCV(svc,params,scoring='neg_log_loss',n_jobs=-1)
gs.fit(X,y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [19]:
gs.grid_scores_



[mean: -1.26302, std: 0.01942, params: {'C': 0.0625},
 mean: -1.24138, std: 0.02096, params: {'C': 0.125},
 mean: -1.18541, std: 0.02316, params: {'C': 0.25},
 mean: -1.13122, std: 0.02496, params: {'C': 0.5},
 mean: -1.09238, std: 0.02613, params: {'C': 1},
 mean: -1.07758, std: 0.02437, params: {'C': 2},
 mean: -1.07734, std: 0.02365, params: {'C': 4},
 mean: -1.08252, std: 0.02261, params: {'C': 8},
 mean: -1.08733, std: 0.02116, params: {'C': 16}]

In [None]:
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=.1)

## Lets try this with a different number of document features