In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import seaborn as sns
import math

In [2]:
# Load the training variants (comma-separated file).
data_variants = pd.read_csv('training/training_variants')
# Load the training text. Fields are separated by a literal "||", so the
# separator is given as a regex (raw string, hence the python engine).
# The first line of the file is skipped and explicit column names are used.
data_text = pd.read_csv('training/training_text', sep=r"\|\|", engine='python', names=["ID", "TEXT"], skiprows=1)

# Data overview

### Data Variants

In [3]:
data_variants.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


__ID__: row id used to link mutation to clinical evidence.

__Gene__: the gene where this genetic mutation is located.

__Variation__: the aminoacid change for this mutation.

__Class__: the class value 1-9 this genetic mutation has been classified on

In [5]:
data_variants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3321 entries, 0 to 3320
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         3321 non-null   int64 
 1   Gene       3321 non-null   object
 2   Variation  3321 non-null   object
 3   Class      3321 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 103.9+ KB


In [6]:
data_variants.describe()

Unnamed: 0,ID,Class
count,3321.0,3321.0
mean,1660.0,4.365854
std,958.834449,2.309781
min,0.0,1.0
25%,830.0,2.0
50%,1660.0,4.0
75%,2490.0,7.0
max,3320.0,9.0


In [7]:
data_variants.shape

(3321, 4)

In [24]:
data_variants.Class.value_counts()

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: Class, dtype: int64

We can say that it is an imbalanced set

In [8]:
data_variants.columns

Index(['ID', 'Gene', 'Variation', 'Class'], dtype='object')

### Data text

In [9]:
data_text.head()

Unnamed: 0,ID,TEXT
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


__ID__ row id used to link text(evidence) with mutation and class.

__TEXT__: the clinical evidence (research paper text) used to classify the mutation.

In [10]:
data_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3321 entries, 0 to 3320
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      3321 non-null   int64 
 1   TEXT    3316 non-null   object
dtypes: int64(1), object(1)
memory usage: 52.0+ KB


In [13]:
data_text.shape

(3321, 2)

So in short, 

__data_variants__ (ID,Gene,Variations,Class) (3321 X 4)

__data_text__ (ID,text) ( 3321 X 2)

We want to predict the class of a genetic mutation from its text, gene, and variation.

In [14]:
data_variants.Class.unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)

We can say that it's a __multi-class classification__ problem.

As it's a medical problem, __correct__ results are very important. Errors can be really costly, and hence __the time taken by the model matters less than its accuracy__.

Evaluation can be done by __Multi class log-loss__ and __Confusion matrix__.


In [29]:
import nltk
from nltk.corpus import stopwords
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, normalized_mutual_info_score
# Import from the public sklearn.metrics namespace; the private
# sklearn.metrics.classification module was deprecated and later removed.
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from scipy.sparse import hstack
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from collections import Counter, defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')
from mlxtend.classifier import StackingClassifier
from imblearn.over_sampling import SMOTE 

# Text preprocessing
Let's pre process the huge amount of text data.

In [30]:
# We would like to remove all stop words like a,is,an,the etc
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adist\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [31]:
def data_text_preprocess(total_text, ind, col):
    """Clean one text entry and write it back into the global ``data_text`` frame.

    The text is converted to ``str``, every character that is not a letter,
    digit or newline is replaced by a space, whitespace runs are collapsed to a
    single space, the result is lower-cased, and words found in the global
    ``stop_words`` set are dropped. Each kept word is followed by one space.

    Parameters
    ----------
    total_text : object
        Raw text of the entry. Values whose type is exactly ``int`` are ignored.
    ind : label
        Row label in ``data_text`` where the cleaned text is stored.
    col : str
        Column of ``data_text`` where the cleaned text is stored.

    Returns
    -------
    None
        The function works by side effect on ``data_text``.
    """
    # Plain int values carry no text; leave the cell untouched.
    if type(total_text) is not int:
        # Replace special characters with spaces, then collapse whitespace runs.
        total_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', str(total_text))
        total_text = re.sub(r'\s+', ' ', total_text)
        total_text = total_text.lower()

        # Drop stop words; every kept word is followed by a single space.
        string = ''.join(
            word + ' ' for word in total_text.split(' ') if word not in stop_words
        )

        # Single .loc write: chained indexing (frame[col][ind] = ...) may assign
        # to a temporary copy and leave the frame unchanged.
        data_text.loc[ind, col] = string

In [32]:
# Run the cleaner over every row whose TEXT entry is a genuine string
# (missing texts are not str and are skipped).
for row_label, record in data_text.iterrows():
    raw_text = record['TEXT']
    if type(raw_text) is not str:
        continue
    data_text_preprocess(raw_text, row_label, 'TEXT')

In [33]:
data_text.head()

Unnamed: 0,ID,TEXT
0,0,cyclin dependent kinases cdks regulate variety...
1,1,abstract background non small cell lung cance...
2,2,abstract background non small cell lung cance...
3,3,recent evidence demonstrated acquired uniparen...
4,4,oncogenic mutations monomeric casitas b lineag...


In [34]:
# Merging the dataframes

In [37]:
df1 = pd.merge(data_variants,data_text,on='ID',how='left')
df1.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,2,abstract background non small cell lung cance...
2,2,CBL,Q249E,2,abstract background non small cell lung cance...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas b lineag...


# Cleaning data

In [39]:
df1[df1.isnull().any(axis=1)]

Unnamed: 0,ID,Gene,Variation,Class,TEXT
1109,1109,FANCA,S1088F,1,
1277,1277,ARID5B,Truncating Mutations,1,
1407,1407,FGFR3,K508M,6,
1639,1639,FLT1,Amplification,6,
2755,2755,BRAF,G596C,7,


In [41]:
# Data imputation / handling missing data
# We could drop these rows, but since the dataset is small we impute the missing text instead.

In [44]:
# Fill each missing TEXT value with "<Gene> <Variation>" from the same row.
# The null mask must come from df1 itself (the original used an undefined name).
df1.loc[df1['TEXT'].isnull(), 'TEXT'] = df1['Gene'] + ' ' + df1['Variation']

In [50]:
df1.loc[df1['ID']==1109]

Unnamed: 0,ID,Gene,Variation,Class,TEXT
1109,1109,FANCA,S1088F,1,FANCA S1088F


In [51]:
df1[df1.isnull().any(axis=1)]

Unnamed: 0,ID,Gene,Variation,Class,TEXT


# Training, Test & Validation data

In [126]:
# Labels: the 'Class' column as a NumPy array.
y_true = df1['Class'].values
# Replace whitespace runs in Gene / Variation with '_'.
# regex=True is explicit because pandas 2.x treats the pattern literally by default.
df1.Gene = df1.Gene.str.replace(r'\s+', '_', regex=True)
df1.Variation = df1.Variation.str.replace(r'\s+', '_', regex=True)
# Feature frame without the label. Built after the replacement above so that
# df2 carries the underscored values too.
df2 = df1.drop(columns=['Class'])

In [127]:
y_true

array([1, 2, 2, ..., 1, 4, 4], dtype=int64)

In [128]:
df2.head()

Unnamed: 0,ID,Gene,Variation,TEXT
0,0,FAM58A,Truncating_Mutations,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,abstract background non small cell lung cance...
2,2,CBL,Q249E,abstract background non small cell lung cance...
3,3,CBL,N454D,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,oncogenic mutations monomeric casitas b lineag...


In [129]:
# Hold out 20% of the data as the test set, then 20% of the remainder as the
# cross-validation set. Both splits are stratified on the class label.
# A fixed random_state makes the split reproducible across kernel restarts.
X_train, test_df, y_train, y_test = train_test_split(df1, y_true, stratify=y_true, test_size=0.2, random_state=42)
train_df, cv_df, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)

In [130]:
train_df.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
1879,1879,MTOR,A1459P,7,genes encoding components pi3k akt mtor signal...
1914,1914,SMO,D384N,2,introduction uncontrolled activation hedgehog ...
580,580,SMAD4,G508S,1,smad4 dpc4 tumour suppressor1 inactivated near...
278,278,EGFR,G465E,2,abstract colorectal cancer crc third common c...
1002,1002,TSC1,G305R,3,tuberous sclerosis complex tsc autosomal domin...


In [131]:
cv_df.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
2088,2088,AGO2,Amplification,2,argonaute ago 2 catalytic engine mammalian rna...
745,745,ERBB2,L755S,7,overexpression erbb2 kinase observed one third...
1726,1726,APC,R640G,4,1 introduction familial adenomatous polyposis ...
1980,1980,CTNNB1,T41A,7,screened 75 primary hepatocellular carcinomas ...
1779,1779,CREBBP,Truncating_Mutations,1,relapsed acute lymphoblastic leukaemia leading...


In [132]:
test_df.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
462,462,TP53,G266R,4,zbp 89 widely expressed kr ppeltype zinc finge...
386,386,TP53,V274F,1,inactivation p53 function loss sensitivity fas...
3214,3214,KDM5A,Amplification,2,lysine specific demethylase 5a kdm5a enzyme re...
1988,1988,CTNNB1,S23R,7,screened 75 primary hepatocellular carcinomas ...
1132,1132,MET,X1009_splice,7,non small cell lung cancer nsclc difficult dis...


In [133]:
print("Shape of train data:",train_df.shape)
print("Shape of cross validation data:",cv_df.shape)
print("Shape of test data:",test_df.shape)

Shape of train data: (2124, 5)
Shape of cross validation data: (532, 5)
Shape of test data: (665, 5)


In [134]:
# Class distribution of the train, test and cross-validation labels.
# Each *_dist is the (unique labels, counts) pair returned by np.unique.
train_dist, test_dist, cv_dist = (
    np.unique(labels, return_counts=True) for labels in (y_train, y_test, y_cv)
)

In [135]:
# Helper that prints a class distribution, one line per class.
def show_dist(arr_dist):
    """Print the count and percentage share of every class.

    ``arr_dist`` is a pair ``(labels, counts)``, e.g. the result of
    ``np.unique(y, return_counts=True)``. Percentages are relative to the sum
    of ``counts`` and rounded to three decimals.
    """
    total = np.sum(arr_dist[1])
    for pos, label in enumerate(arr_dist[0]):
        count = arr_dist[1][pos]
        share = np.around((count / total) * 100, decimals=3)
        print('No.of data pnts in class {}:  {} ({} %)'.format(label, count, share))

In [136]:
show_dist(train_dist)

No.of data pnts in class 1:  363 (17.09 %)
No.of data pnts in class 2:  289 (13.606 %)
No.of data pnts in class 3:  57 (2.684 %)
No.of data pnts in class 4:  439 (20.669 %)
No.of data pnts in class 5:  155 (7.298 %)
No.of data pnts in class 6:  176 (8.286 %)
No.of data pnts in class 7:  609 (28.672 %)
No.of data pnts in class 8:  12 (0.565 %)
No.of data pnts in class 9:  24 (1.13 %)


In [137]:
show_dist(test_dist)

No.of data pnts in class 1:  114 (17.143 %)
No.of data pnts in class 2:  91 (13.684 %)
No.of data pnts in class 3:  18 (2.707 %)
No.of data pnts in class 4:  137 (20.602 %)
No.of data pnts in class 5:  48 (7.218 %)
No.of data pnts in class 6:  55 (8.271 %)
No.of data pnts in class 7:  191 (28.722 %)
No.of data pnts in class 8:  4 (0.602 %)
No.of data pnts in class 9:  7 (1.053 %)


In [138]:
show_dist(cv_dist)

No.of data pnts in class 1:  91 (17.105 %)
No.of data pnts in class 2:  72 (13.534 %)
No.of data pnts in class 3:  14 (2.632 %)
No.of data pnts in class 4:  110 (20.677 %)
No.of data pnts in class 5:  39 (7.331 %)
No.of data pnts in class 6:  44 (8.271 %)
No.of data pnts in class 7:  153 (28.759 %)
No.of data pnts in class 8:  3 (0.564 %)
No.of data pnts in class 9:  6 (1.128 %)


In [125]:
df1.to_csv('Clean_File.csv')