In [604]:
#Classifier choices for outcome prediction for AML patients
#KNeighborsClassifier, LogisticRegression, RandomForestClassifier, RidgeClassifier, Perceptron, LinearSVC
#LogisticRegression and LinearSVC performed best with accuracy=0.925
#Perceptron was also a good option, with no false positives

#Preprocessing of data
#Transposed the initial file format to get features as columns
#Removed features with more than 90% NAs
#Converted categorical variables to numerics
#Did not experiment with scaling or imputation for this exercise

#Cross validation 
#Used 5-fold cross validation in logistic regression

#Predictive features
#Top 4 features were
#1.['patient.days_to_last_followup']
#2.['patient.molecular_analysis_abnormality_testing_results.molecular_analysis_abnormality_testing_result_values.molecular_analysis_abnormality_testing_result']
#3.['patient.lab_procedure_abnormal_lymphocyte_result_percent_value']
#4.['patient.lab_procedure_bone_marrow_band_cell_result_percent_value']
#Although the 1st feature has the largest coefficient, its correlation with the outcome is expected,
#so it carries little new information.


In [605]:
import pandas as pd
import numpy as np
import warnings
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoLarsCV, ElasticNet
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve,train_test_split, KFold
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler, OrdinalEncoder, LabelEncoder
warnings.filterwarnings('ignore')
sns.set(style='white', context='notebook', palette='deep')
%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

In [606]:
# Read the tab-separated clinical file; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    'gdac.broadinstitute.org_LAML.Merge_Clinical.Level_1.2016012800.0.0/LAML.merged_only_clinical_clin_format.txt',
    sep='\t',
)

In [607]:
# Flip the table so the original rows become columns (features as columns).
tdf = df.T
# The first row of the transposed table supplies the column labels.
tdf.columns = tdf.iloc[0]
# Remove the row labelled 'V1' (presumably that same label row — confirm).
tdf = tdf.drop(index='V1')

In [608]:
# Pull the prediction target out of the table before it is removed below.
y = tdf.loc[:, 'patient.vital_status']

# Only the target column is dropped. 'patient.days_to_last_followup' remains a
# feature; dropping it as well is a possible alternative that is not applied.
tdf = tdf.drop(columns=['patient.vital_status'])

In [609]:
# No imputation is performed here; mostly-empty columns are dropped instead.

# Percentage of missing values in each feature column.
tdf_na = tdf.isnull().sum() / len(tdf) * 100
# Discard every feature with more than 90% NA (58 features removed).
sparse_features = tdf_na[tdf_na > 90].index
newtdf = tdf.drop(columns=sparse_features)


In [610]:
#TODO: check whether any features are skewed

#TODO: remove uuid / patient id columns

In [611]:
# Column labels of the filtered table; a later cell indexes into this array to
# map coefficient positions back to feature names.
features = newtdf.columns.values

# Replace each column, in place, with integer label codes. The values are
# passed through list() first so the encoder receives a plain Python list.
for col in features:
    col_values = list(newtdf[col].values)
    newtdf[col] = LabelEncoder().fit_transform(col_values)

# Encode the target the same way; `lbl` stays bound to the encoder fitted on y.
lbl = LabelEncoder()
y = lbl.fit_transform(list(y.values))

from sklearn.decomposition import PCA

# Fit PCA (default settings) on the encoded features and project them.
# y is handed to fit() as in the original TODO ("training data and labels").
# X is not used by the train/test split cell, which splits newtdf directly.
pca = PCA().fit(newtdf, y)
X = pca.transform(newtdf)

In [612]:
# train_test_split is already imported from sklearn.model_selection above.
# Hold out 20% of the rows for testing; random_state pins the shuffle.
train, test, y_train, y_test = train_test_split(
    newtdf,
    y,
    random_state=1,
    test_size=0.2,
)


In [613]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with library defaults, trained on the training split.
knn = KNeighborsClassifier().fit(train, y_train)
y_pred = knn.predict(test)
# Accuracy on the held-out split (displayed as the cell output).
knn.score(test, y_test)

0.775

In [614]:
# Flattened confusion matrix for the current y_pred; with two classes the
# order is (tn, fp, fn, tp).
confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

array([10,  7,  2, 21])

In [629]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with built-in 5-fold cross-validation.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases — confirm against the installed version before upgrading.
clf = LogisticRegressionCV(
    cv=5,
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
clf.fit(train, y_train)
y_pred = clf.predict(test)
# Accuracy on the held-out split (displayed as the cell output).
clf.score(test, y_test)

0.925

In [616]:
# Flattened confusion matrix for the current y_pred; with two classes the
# order is (tn, fp, fn, tp).
confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

array([15,  2,  1, 22])

In [630]:
from sklearn.metrics import f1_score

# Weighted-average F1 of the current predictions on the test split.
f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

0.9246615087040618

In [617]:
# Print the four features with the largest coefficients of the current `clf`,
# in ascending order (the last line printed is the largest).
# NOTE(review): ranking uses the signed coefficient values, not their
# magnitudes, so strongly negative coefficients are never listed.
coef_order = np.argsort(clf.coef_, axis=1).T
for idx in coef_order[-4:]:
    print(idx, features[idx])

[71] ['patient.lab_procedure_bone_marrow_band_cell_result_percent_value']
[69] ['patient.lab_procedure_abnormal_lymphocyte_result_percent_value']
[95] ['patient.molecular_analysis_abnormality_testing_results.molecular_analysis_abnormality_testing_result_values.molecular_analysis_abnormality_testing_result']
[23] ['patient.days_to_last_followup']


In [618]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees capped at depth 2, seeded for repeatability.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
).fit(train, y_train)
y_pred = clf.predict(test)
# Accuracy on the held-out split (displayed as the cell output).
clf.score(test, y_test)

0.875

In [619]:
# Flattened confusion matrix for the current y_pred; with two classes the
# order is (tn, fp, fn, tp).
confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

array([12,  5,  0, 23])

In [620]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier with library defaults, trained on the training split.
clf = RidgeClassifier()
clf.fit(train, y_train)
y_pred = clf.predict(test)
# Accuracy on the held-out split (displayed as the cell output).
clf.score(test, y_test)

0.9

In [621]:
# Flattened confusion matrix for the current y_pred; with two classes the
# order is (tn, fp, fn, tp).
confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

array([14,  3,  1, 22])

In [627]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import f1_score

# Perceptron with a fixed seed and a 1e-3 stopping tolerance.
clf = Perceptron(tol=1e-3, random_state=0).fit(train, y_train)
y_pred = clf.predict(test)
# Accuracy is evaluated here, but it is not the cell's last expression, so its
# value is not displayed.
clf.score(test, y_test)
# Displayed output of this cell: weighted-average F1.
f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

0.900501253132832

In [623]:
# Flattened confusion matrix for the current y_pred; with two classes the
# order is (tn, fp, fn, tp).
confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

array([17,  0,  4, 19])

In [624]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier with a fixed seed and tight tolerance.
clf = LinearSVC(tol=1e-5, random_state=0).fit(train, y_train)
y_pred = clf.predict(test)
# Accuracy on the held-out split (displayed as the cell output).
clf.score(test, y_test)

0.925

In [625]:
# Flattened confusion matrix for the current y_pred; with two classes the
# order is (tn, fp, fn, tp).
confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()

array([15,  2,  1, 22])