<a href="https://colab.research.google.com/github/piyush5566/ML_Projects/blob/master/genetic_mutation_classification_v2_part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import re

In [0]:
# Loading training_variants. Its a comma seperated file
data_variants = pd.read_csv('Variants_data')
# Loading training_text dataset. This is seperated by ||
data_text =pd.read_csv("Text_data",sep="\|\|",engine="python",names=["ID","TEXT"],skiprows=1)

In [0]:
# We would like to remove all stop words like a, is, an, the, ... 
# so we collecting all of them from nltk library
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:

def data_text_preprocess(total_text, ind, col):
    # Remove int values from text data as that might not be imp
    if type(total_text) is not int:
        string = ""
        # replacing all special char with space
        total_text = re.sub('[^a-zA-Z0-9\n]', ' ', str(total_text))
        # replacing multiple spaces with single space
        total_text = re.sub('\s+',' ', str(total_text))
        # bring whole text to same lower-case scale.
        total_text = total_text.lower()
        
        for word in total_text.split():
        # if the word is a not a stop word then retain that word from text
            if not word in stop_words:
                string += word + " "
        
        data_text[col][ind] = string

In [0]:
# Below code will take some time because its huge text (took 4 minute on my 16 GB RAM system), so run it and have a cup of coffee :)
for index, row in data_text.iterrows():
    if type(row['TEXT']) is str:
        data_text_preprocess(row['TEXT'], index, 'TEXT')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [0]:
#merging both gene_variations and text data based on ID
result = pd.merge(data_variants, data_text,on='ID', how='left')
result.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,2,abstract background non small cell lung cancer...
2,2,CBL,Q249E,2,abstract background non small cell lung cancer...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas b lineag...


In [0]:
result.loc[result['TEXT'].isnull(),'TEXT'] = result['Gene'] +' '+result['Variation']

In [0]:
y_true = result['Class'].values
result.Gene      = result.Gene.str.replace('\s+', '_')
result.Variation = result.Variation.str.replace('\s+', '_')
result=result.drop(columns=['Class','ID'])

In [0]:
# Splitting the data into train and test set 
X_train, test_df, y_train, y_test = train_test_split(result, y_true, stratify=y_true, test_size=0.2)
# split the train data now into train validation and cross validation
train_df, cv_df, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)



In [0]:
print(train_df.shape,y_train.shape)

(2124, 3) (2124,)


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
# one-hot encoding of Gene feature.

#gene_vectorizer = CountVectorizer()
#gene_feature_onehotCoding = gene_vectorizer.fit_transform(result['Gene'])
#gene_feature_onehotCoding.shape
gene_vectorizer = TfidfVectorizer()
train_gene_feature_TFIDF=gene_vectorizer.fit_transform(train_df['Gene'])

cv_gene_feature_TFIDF=gene_vectorizer.transform(cv_df['Gene'])
test_gene_feature_TFIDF=gene_vectorizer.transform(test_df['Gene'])

In [0]:
# one-hot encoding of variation feature.
#variation_vectorizer = CountVectorizer()
#variation_feature_onehotCoding = variation_vectorizer.fit_transform(result['Variation'])

var_vectorizer = TfidfVectorizer()
train_variation_feature_TFIDF=var_vectorizer.fit_transform(train_df['Variation'])
cv_variation_feature_TFIDF=var_vectorizer.transform(cv_df['Variation'])
test_variation_feature_TFIDF=var_vectorizer.transform(test_df['Variation'])

In [0]:
from sklearn.preprocessing import normalize
text_vectorizer = TfidfVectorizer()
train_text_feature_TFIDF=text_vectorizer.fit_transform(train_df['TEXT'])
train_text_feature_TFIDF=normalize(train_text_feature_TFIDF,axis=0)

cv_text_feature_TFIDF=text_vectorizer.transform(cv_df['TEXT'])
cv_text_feature_TFIDF=normalize(cv_text_feature_TFIDF,axis=0)

test_text_feature_TFIDF=text_vectorizer.transform(test_df['TEXT'])
test_text_feature_TFIDF=normalize(test_text_feature_TFIDF,axis=0)


In [0]:
train_gene_var_TFIDF = np.hstack((train_gene_feature_TFIDF.toarray(),train_variation_feature_TFIDF.toarray()))

train_x_TFIDF = np.hstack((train_gene_var_TFIDF, train_text_feature_TFIDF.toarray()))

cv_gene_var_TFIDF = np.hstack((cv_gene_feature_TFIDF.toarray(),cv_variation_feature_TFIDF.toarray()))

cv_x_TFIDF = np.hstack((cv_gene_var_TFIDF, cv_text_feature_TFIDF.toarray()))

test_gene_var_TFIDF = np.hstack((test_gene_feature_TFIDF.toarray(),test_variation_feature_TFIDF.toarray()))

test_x_TFIDF = np.hstack((test_gene_var_TFIDF, test_text_feature_TFIDF.toarray()))

In [0]:
train_x_TFIDF.shape

(2124, 26463)

TFIDF + RANDOM OVERSAMPLER + XGBOOST

In [0]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
train_X_ros, train_y_ros = ros.fit_sample(train_x_TFIDF, y_train)

train_X_ros.shape




(5481, 26463)

In [0]:
df_y=pd.Series(train_y_ros,name='Class')
df_y.value_counts().sort_index()

1    609
2    609
3    609
4    609
5    609
6    609
7    609
8    609
9    609
Name: Class, dtype: int64

In [0]:
import xgboost as xgb
#dtrain = xgb.DMatrix(X_train, label=y_train)
#dcv=xgb.DMatrix(X_cv,label=y_cv)


In [0]:

from sklearn.metrics.classification import accuracy_score, log_loss

model = xgb.XGBClassifier()

eval_set = [(cv_x_TFIDF, y_cv)]
model.fit(train_X_ros, train_y_ros, eval_metric="mlogloss", eval_set=eval_set, verbose=True)
# make predictions for test data




[0]	validation_0-mlogloss:2.13041
[1]	validation_0-mlogloss:2.07981
[2]	validation_0-mlogloss:2.03574
[3]	validation_0-mlogloss:1.99943
[4]	validation_0-mlogloss:1.9701
[5]	validation_0-mlogloss:1.94256
[6]	validation_0-mlogloss:1.91315
[7]	validation_0-mlogloss:1.89135
[8]	validation_0-mlogloss:1.87103
[9]	validation_0-mlogloss:1.85107
[10]	validation_0-mlogloss:1.8328
[11]	validation_0-mlogloss:1.81694
[12]	validation_0-mlogloss:1.80223
[13]	validation_0-mlogloss:1.78927
[14]	validation_0-mlogloss:1.77705
[15]	validation_0-mlogloss:1.76394
[16]	validation_0-mlogloss:1.75294
[17]	validation_0-mlogloss:1.7427
[18]	validation_0-mlogloss:1.73349
[19]	validation_0-mlogloss:1.72252
[20]	validation_0-mlogloss:1.71392
[21]	validation_0-mlogloss:1.70545
[22]	validation_0-mlogloss:1.69741
[23]	validation_0-mlogloss:1.69005
[24]	validation_0-mlogloss:1.68293
[25]	validation_0-mlogloss:1.67487
[26]	validation_0-mlogloss:1.66773
[27]	validation_0-mlogloss:1.66044
[28]	validation_0-mlogloss:1.6545

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
predict_y = model.predict_proba(train_x_TFIDF)
print("The train log loss is:",log_loss(y_train, predict_y, labels=model.classes_, eps=1e-15))
predict_y = model.predict_proba(cv_x_TFIDF)
print("The cross validation log loss is:",log_loss(y_cv, predict_y, labels=model.classes_, eps=1e-15))
predict_y = model.predict_proba(test_x_TFIDF)
print("The test log loss is:",log_loss(y_test, predict_y, labels=model.classes_, eps=1e-15))
y_pred=model.predict(test_x_TFIDF)

The train log loss is: 1.2689320914304365
The cross validation log loss is: 1.426903634556198
The test log loss is: 1.4202971918746492


In [0]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.67      0.59      0.63       114
           2       0.40      0.56      0.47        91
           3       0.18      0.33      0.24        18
           4       0.69      0.48      0.57       137
           5       0.30      0.44      0.36        48
           6       0.56      0.36      0.44        55
           7       0.52      0.52      0.52       191
           8       1.00      0.25      0.40         4
           9       0.50      0.71      0.59         7

    accuracy                           0.51       665
   macro avg       0.54      0.47      0.47       665
weighted avg       0.55      0.51      0.52       665



TFIDF + SMOTE + XGBOOST

In [0]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(ratio='minority')
train_X_ros, train_y_ros = smote.fit_sample(train_x_TFIDF, y_train)
train_X_ros.shape



(2721, 26463)

In [0]:
df_y=pd.Series(train_y_ros,name='Class')
df_y.value_counts().sort_index()

1    363
2    289
3     57
4    439
5    155
6    176
7    609
8    609
9     24
Name: Class, dtype: int64

In [0]:
#from sklearn.metrics.classification import accuracy_score, log_loss

model = xgb.XGBClassifier()

eval_set = [(cv_x_TFIDF, y_cv)]
model.fit(train_X_ros, train_y_ros, eval_metric="mlogloss", eval_set=eval_set, verbose=True)
# make predictions for test data


[0]	validation_0-mlogloss:2.10847
[1]	validation_0-mlogloss:2.03742
[2]	validation_0-mlogloss:1.97953
[3]	validation_0-mlogloss:1.92833
[4]	validation_0-mlogloss:1.88494
[5]	validation_0-mlogloss:1.847
[6]	validation_0-mlogloss:1.81364
[7]	validation_0-mlogloss:1.7852
[8]	validation_0-mlogloss:1.75658
[9]	validation_0-mlogloss:1.73104
[10]	validation_0-mlogloss:1.70931
[11]	validation_0-mlogloss:1.68885
[12]	validation_0-mlogloss:1.67061
[13]	validation_0-mlogloss:1.65366
[14]	validation_0-mlogloss:1.63724
[15]	validation_0-mlogloss:1.62171
[16]	validation_0-mlogloss:1.60788
[17]	validation_0-mlogloss:1.59339
[18]	validation_0-mlogloss:1.58242
[19]	validation_0-mlogloss:1.57027
[20]	validation_0-mlogloss:1.55836
[21]	validation_0-mlogloss:1.54735
[22]	validation_0-mlogloss:1.53682
[23]	validation_0-mlogloss:1.52738
[24]	validation_0-mlogloss:1.51774
[25]	validation_0-mlogloss:1.50857
[26]	validation_0-mlogloss:1.50084
[27]	validation_0-mlogloss:1.49238
[28]	validation_0-mlogloss:1.4854

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
predict_y = model.predict_proba(train_x_TFIDF)
print("The train log loss is:",log_loss(y_train, predict_y, labels=model.classes_, eps=1e-15))
predict_y = model.predict_proba(cv_x_TFIDF)
print("The cross validation log loss is:",log_loss(y_cv, predict_y, labels=model.classes_, eps=1e-15))
predict_y = model.predict_proba(test_x_TFIDF)
print("The test log loss is:",log_loss(y_test, predict_y, labels=model.classes_, eps=1e-15))
y_pred=model.predict(test_x_TFIDF)

The train log loss is: 1.1888824513246938
The cross validation log loss is: 1.2823418193125635
The test log loss is: 1.3124606512711012


In [0]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.69      0.51      0.59       114
           2       0.68      0.21      0.32        91
           3       0.00      0.00      0.00        18
           4       0.65      0.50      0.56       137
           5       0.33      0.33      0.33        48
           6       0.58      0.25      0.35        55
           7       0.48      0.94      0.64       191
           8       1.00      0.25      0.40         4
           9       0.50      0.29      0.36         7

    accuracy                           0.54       665
   macro avg       0.55      0.36      0.40       665
weighted avg       0.57      0.54      0.50       665



  _warn_prf(average, modifier, msg_start, len(result))
