# TFIDF classification for the Kaggle cancer dataset

Using the preprocessing functions found in src/text_procesing to process the data, I run several simple logistic regression and tfidf classifications for the data.  These are simple and rely on no feature engineering so they should be taken as a benchmark for future algorithms.  

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
# Join the stemmed text (pickle) with the variant metadata (CSV indexed by
# 'ID') for each split; both merges align rows on the index.
df_train = pd.merge(
    pd.read_pickle('../data/stem-train.pk'),
    pd.read_csv('../data/training_variants.csv', index_col='ID'),
    left_index=True,
    right_index=True,
)
df_test = pd.merge(
    pd.read_pickle('../data/stem-test.pk'),
    pd.read_csv('../data/test_variants.csv', index_col='ID'),
    left_index=True,
    right_index=True,
)
# Stack train on top of test and renumber rows from 0, so the first
# df_train.shape[0] rows of df are the training rows.
df = pd.concat([df_train, df_test]).reset_index(drop=True)
df.head()

Unnamed: 0,Class,Gene,Variation,processed,text
0,1.0,FAM58A,Truncating Mutations,"[cyclin-depend, kinas, cdk, regul, varieti, fu...",Cyclin-dependent kinases (CDKs) regulate a var...
1,2.0,CBL,W802*,"[abstract, background, non-smal, cell, lung, c...",Abstract Background Non-small cell lung canc...
2,2.0,CBL,Q249E,"[abstract, background, non-smal, cell, lung, c...",Abstract Background Non-small cell lung canc...
3,3.0,CBL,N454D,"[recent, evid, demonstr, acquir, uniparent, di...",Recent evidence has demonstrated that acquired...
4,4.0,CBL,L399V,"[oncogen, mutat, monomer, casita, b-lineag, ly...",Oncogenic mutations in the monomeric Casitas B...


### Hashing Vectorizer (Similar to bag of words)

From this model, we get a slightly better score than the tfidf vectorizer.  I do not understand exactly why the bag of words does better than a more sophisticated tfidf, but the results speak for themselves.  I will analyze the results for several strengths of regularization in logistic regression (note that C is the inverse regularization strength, so smaller values of C give more strongly regularized models).

In [3]:
def return_same(x):
    """Return ``x`` unchanged (identity function).

    Passed as the ``analyzer`` callable to the vectorizers in this notebook,
    which are fed the ``processed`` column.
    """
    return x
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.utils import shuffle

# Shuffle documents and labels together. No random_state is given, so the
# ordering (and hence the scores below) can vary between runs.
X, y = shuffle(df_train.processed, df_train.Class)
print('here')

# Sweep the inverse regularization strength C over 2^-2, 2^0, 2^2, 2^4.
for i in (2 ** x for x in range(-2, 5, 2)):
    pipe = Pipeline([
        ('encoder', HashingVectorizer(analyzer=return_same)),
        ('lr', LogisticRegression(C=i)),
    ])
    # The scorer yields negated log loss per fold; negate the 5-fold mean.
    fold_scores = cross_val_score(pipe, X, y, scoring='neg_log_loss',
                                  n_jobs=-1, cv=5)
    cvs = -fold_scores.mean()
    print('regularization 2^{:d}: {:.3f}'.format(int(np.log2(i)), cvs))

here
regularization 2^-2: 1.346
regularization 2^0: 1.166
regularization 2^2: 1.047
regularization 2^4: 1.018


### Using TFIDF and logistic regression

We can see that TFIDF does not perform as well as just the bag of words model found above.

In [4]:
# Same C sweep (2^-2, 2^0, 2^2, 2^4), with a TF-IDF encoder that drops terms
# appearing in fewer than 10 documents (min_df=10).
for power in range(-2, 5, 2):
    i = 2 ** power
    pipe = Pipeline([
        ('encoder', TfidfVectorizer(analyzer=return_same, min_df=10)),
        ('lr', LogisticRegression(C=i)),
    ])
    # Scores come back as negated log loss; report the positive 5-fold mean.
    fold_scores = cross_val_score(pipe, X, y, scoring='neg_log_loss',
                                  n_jobs=-1, cv=5)
    cvs = -fold_scores.mean()
    print('regularization 2^{:d}: {:.3f}'.format(int(np.log2(i)), cvs))

regularization 2^-2: 1.336
regularization 2^0: 1.137
regularization 2^2: 1.025
regularization 2^4: 1.029


## Using tfidf on the entire dataset

Let's try fitting TFIDF on the entire dataset and then taking only the leading rows, i.e. the training set (since we have the test data, we might as well use it in the construction of our vectors).

In [6]:
# Fit TF-IDF on the combined train + test text, then keep only the leading
# rows of the matrix, which correspond to the training documents.
tfidf = TfidfVectorizer(analyzer=return_same, min_df=10)
X = tfidf.fit_transform(df.processed)[:len(df_train)]
X, y = shuffle(X, df_train.Class)

# Logistic regression alone (features are precomputed), C in 2^-2 .. 2^4.
for power in range(-2, 5, 2):
    i = 2 ** power
    lr = LogisticRegression(C=i)
    fold_scores = cross_val_score(lr, X, y, scoring='neg_log_loss',
                                  n_jobs=-1, cv=5)
    cvs = -fold_scores.mean()  # scorer is negated log loss
    print('regularization 2^{:d}: {:.3f}'.format(int(np.log2(i)), cvs))

regularization 2^-2: 1.350
regularization 2^0: 1.138
regularization 2^2: 1.012
regularization 2^4: 1.005


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.svm import SVC

In [13]:
from sklearn.decomposition import PCA

# TF-IDF over train + test text (terms in fewer than 10 documents dropped).
X = TfidfVectorizer(analyzer=return_same, min_df=10).fit_transform(df.processed)

# Project onto 1000 principal components; .toarray() hands PCA a dense array.
pca = PCA(n_components=1000)
X_pc = pca.fit_transform(X.toarray())
X_pc = X_pc[:df_train.shape[0]]  # keep only the training rows
X_pc, y = shuffle(X_pc, df_train.Class)

i = 4  # C for the logistic regression; printed below as 2^2
lr = LogisticRegression(C=i)
fold_scores = cross_val_score(lr, X_pc, y, scoring='neg_log_loss',
                              n_jobs=-1, cv=5)
cvs = -fold_scores.mean()  # scorer is negated log loss
print('regularization 2^{:d}: {:.3f}'.format(int(np.log2(i)), cvs))

regularization 2^2: 1.015


In [14]:
# Same experiment with 500 principal components (TF-IDF min_df=10).
X = TfidfVectorizer(analyzer=return_same, min_df=10).fit_transform(df.processed)
pca = PCA(n_components=500)
X_pc = pca.fit_transform(X.toarray())  # PCA is fed a dense copy of X
X_pc = X_pc[:df_train.shape[0]]        # training rows only
X_pc, y = shuffle(X_pc, df_train.Class)

i = 4  # C for the logistic regression; printed below as 2^2
lr = LogisticRegression(C=i)
fold_scores = cross_val_score(lr, X_pc, y, scoring='neg_log_loss',
                              n_jobs=-1, cv=5)
cvs = -fold_scores.mean()  # scorer is negated log loss
print('regularization 2^{:d}: {:.3f}'.format(int(np.log2(i)), cvs))

regularization 2^2: 1.044


In [15]:
# 500 principal components again, but with a stricter vocabulary cut-off:
# terms must appear in at least 50 documents (min_df=50).
X = TfidfVectorizer(analyzer=return_same, min_df=50).fit_transform(df.processed)
pca = PCA(n_components=500)
X_pc = pca.fit_transform(X.toarray())  # PCA is fed a dense copy of X
X_pc = X_pc[:df_train.shape[0]]        # training rows only
X_pc, y = shuffle(X_pc, df_train.Class)

i = 4  # C for the logistic regression; printed below as 2^2
lr = LogisticRegression(C=i)
fold_scores = cross_val_score(lr, X_pc, y, scoring='neg_log_loss',
                              n_jobs=-1, cv=5)
cvs = -fold_scores.mean()  # scorer is negated log loss
print('regularization 2^{:d}: {:.3f}'.format(int(np.log2(i)), cvs))

regularization 2^2: 1.043


In [16]:
# 250 principal components with TF-IDF min_df=50. The resulting X_pc and y
# are reused by the cells that follow.
X = TfidfVectorizer(analyzer=return_same, min_df=50).fit_transform(df.processed)
pca = PCA(n_components=250)
X_pc = pca.fit_transform(X.toarray())  # PCA is fed a dense copy of X
X_pc = X_pc[:df_train.shape[0]]        # training rows only
X_pc, y = shuffle(X_pc, df_train.Class)

i = 4  # C for the logistic regression; printed below as 2^2
lr = LogisticRegression(C=i)
fold_scores = cross_val_score(lr, X_pc, y, scoring='neg_log_loss',
                              n_jobs=-1, cv=5)
cvs = -fold_scores.mean()  # scorer is negated log loss
print('regularization 2^{:d}: {:.3f}'.format(int(np.log2(i)), cvs))

regularization 2^2: 1.071


In [23]:
# Logistic regression on the current PCA features, C = 2^i for i in -2..6.
for i in range(-2, 7, 2):
    lr = LogisticRegression(C=2 ** i)
    fold_scores = cross_val_score(lr, X_pc, y, scoring='neg_log_loss',
                                  n_jobs=-1, cv=5)
    cvs = -fold_scores.mean()  # scorer is negated log loss
    print('regularization 2^{:d}: {:.3f}'.format(i, cvs))

regularization 2^-2: 1.357
regularization 2^0: 1.172
regularization 2^2: 1.071
regularization 2^4: 1.054
regularization 2^6: 1.094


In [27]:
from sklearn.svm import SVC

# Class-balanced SVC on the current PCA features, C = 2^i for i in -2..6.
# probability=True enables predict_proba, which the log-loss scorer needs.
for i in range(-2, 7, 2):
    svc = SVC(probability=True, C=2 ** i, class_weight='balanced')
    fold_scores = cross_val_score(svc, X_pc, y, scoring='neg_log_loss',
                                  n_jobs=-1, cv=5)
    cvs = -fold_scores.mean()  # scorer is negated log loss
    print('regularization 2^{:d}: {:.3f}'.format(i, cvs))

regularization 2^-2: 1.213
regularization 2^0: 1.209
regularization 2^2: 1.207
regularization 2^4: 1.160
regularization 2^6: 1.114


In [30]:
from sklearn.svm import SVC

# Repeat of the class-balanced SVC sweep (C = 2^i, i in -2..6) against
# whatever X_pc / y are currently in the notebook namespace.
for i in range(-2, 7, 2):
    svc = SVC(probability=True, C=2 ** i, class_weight='balanced')
    fold_scores = cross_val_score(svc, X_pc, y, scoring='neg_log_loss',
                                  n_jobs=-1, cv=5)
    cvs = -fold_scores.mean()  # scorer is negated log loss
    print('regularization 2^{:d}: {:.3f}'.format(i, cvs))

regularization 2^-2: 1.835
regularization 2^0: 1.822
regularization 2^2: 1.478
regularization 2^4: 1.169
regularization 2^6: 1.123


In [34]:
from xgboost import XGBClassifier

# No class count is passed: `num_classes` is not an XGBoost parameter (the
# booster option is `num_class`), and the scikit-learn wrapper derives the
# number of classes from the labels at fit time.
xgb = XGBClassifier(n_estimators=1000, n_jobs=-1, objective='multi:softprob')
X_pc.shape

(3321, 250)

In [35]:
# 5-fold cross-validated log loss of the XGBoost model (the scorer returns
# negated log loss, hence the leading minus).
cvs = -cross_val_score(xgb, X_pc, y, scoring='neg_log_loss', n_jobs=-1, cv=5).mean()
# Label the score as xgb: no regularization sweep is run here, so the old
# 'regularization 2^i' label only echoed a stale loop variable.
print('xgb : {:.3f}'.format(cvs))

regularization 2^6: 1.263


In [40]:
# Shallow (max_depth=3) boosted trees with row and column subsampling of 0.5.
# No class count is passed: `num_classes` is not an XGBoost parameter (the
# booster option is `num_class`), and the scikit-learn wrapper derives the
# number of classes from the labels at fit time.
xgb = XGBClassifier(
    n_estimators=500,
    eval_metric='mlogloss',
    n_jobs=-1,
    objective='multi:softprob',
    subsample=.5,
    colsample_bytree=.5,
    max_depth=3,
)
# The scorer returns negated log loss; report the positive 5-fold mean.
cvs = -cross_val_score(xgb, X_pc, y, scoring='neg_log_loss', n_jobs=-1, cv=5).mean()
print('xgb : {:.3f}'.format(cvs))

xgb : 1.115


In [42]:
# 1000 shallow trees at learning_rate=0.05 with 0.5 row/column subsampling.
# No class count is passed: `num_classes` is not an XGBoost parameter (the
# booster option is `num_class`), and the scikit-learn wrapper derives the
# number of classes from the labels at fit time.
xgb = XGBClassifier(
    n_estimators=1000,
    eval_metric='mlogloss',
    n_jobs=-1,
    learning_rate=.05,
    objective='multi:softprob',
    subsample=.5,
    colsample_bytree=.5,
    max_depth=3,
)
# The scorer returns negated log loss; report the positive 5-fold mean.
cvs = -cross_val_score(xgb, X_pc, y, scoring='neg_log_loss', n_jobs=-1, cv=5).mean()
print('xgb : {:.3f}'.format(cvs))

xgb : 1.097


In [50]:
# 1000 shallow trees at learning_rate=0.02 with 0.5 row/column subsampling.
# No class count is passed: `num_classes` is not an XGBoost parameter (the
# booster option is `num_class`), and the scikit-learn wrapper derives the
# number of classes from the labels at fit time.
xgb = XGBClassifier(
    n_estimators=1000,
    eval_metric='mlogloss',
    n_jobs=-1,
    learning_rate=.02,
    objective='multi:softprob',
    subsample=.5,
    colsample_bytree=.5,
    max_depth=3,
)
# The scorer returns negated log loss; report the positive 5-fold mean.
cvs = -cross_val_score(xgb, X_pc, y, scoring='neg_log_loss', n_jobs=-1, cv=5).mean()
print('xgb : {:.3f}'.format(cvs))

xgb : 0.978


In [None]:
# Rebuild the features: TF-IDF (min_df=10) over train + test text, reduced
# to 500 principal components, keeping only the training rows.
pca = PCA(n_components=500)
X = TfidfVectorizer(analyzer=return_same, min_df=10).fit_transform(df.processed)
X_pc = pca.fit_transform(X.toarray())[:df_train.shape[0]]
X_pc, y = shuffle(X_pc, df_train.Class)

# 2000 shallow trees at learning_rate=0.01 with 0.5 row/column subsampling.
# No class count is passed: `num_classes` is not an XGBoost parameter (the
# booster option is `num_class`), and the scikit-learn wrapper derives the
# number of classes from the labels at fit time.
xgb = XGBClassifier(
    n_estimators=2000,
    eval_metric='mlogloss',
    n_jobs=-1,
    learning_rate=.01,
    objective='multi:softprob',
    subsample=.5,
    colsample_bytree=.5,
    max_depth=3,
)
# The scorer returns negated log loss; report the positive 5-fold mean.
cvs = -cross_val_score(xgb, X_pc, y, scoring='neg_log_loss', n_jobs=-1, cv=5).mean()
print('xgb : {:.3f}'.format(cvs))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline on the current PCA features.
rf = RandomForestClassifier(n_estimators=1000)
# Score the forest itself (not the earlier xgb model) and label it as such.
# The scorer returns negated log loss; report the positive 5-fold mean.
cvs = -cross_val_score(rf, X_pc, y, scoring='neg_log_loss', n_jobs=-1, cv=5).mean()
print('rf : {:.3f}'.format(cvs))