In [1]:
import numpy as np
import sklearn
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import ensemble,tree,linear_model
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier,GradientBoostingRegressor)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split,cross_val_score
# from sklearn.cross_validation import KFold
import tensorflow as tf
import warnings
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)
%reload_ext autoreload
%autoreload 2
%matplotlib inline
warnings.filterwarnings('ignore')

pd.options.display.max_columns=99

  from ._conv import register_converters as _register_converters


In [None]:
train = pd.read_csv('data/TADPOLE_TargetData_train.csv')
train.shape

In [82]:
Input_Data = pd.read_csv('data/Input_interp_filledCat.csv')
Input_Data['EXAMDATE'] = pd.to_datetime(Input_Data['EXAMDATE'], errors='coerce')
Input_Data['Ventricles'] = Input_Data['Ventricles']/Input_Data['ICV']
Input_Data = Input_Data.rename(index=str,columns={'Ventricles':'Ventricles_Norm'})
Input_Data=Input_Data[['PTID_Key','EXAMDATE','DX','ADAS13','Ventricles_Norm','MMSE']]
Input_new=Input_Data
print("Total number of missing values: ", Input_Data.isnull().sum().sum())
Input_Data.head()


0


Unnamed: 0,PTID_Key,EXAMDATE,DX,ADAS13,Ventricles_Norm,MMSE
0,1.0,2010-12-10,MCI,21.0,0.022785,27.0
1,1.0,2011-04-07,MCI,23.5,0.022753,25.5
2,1.0,2011-09-08,MCI,26.0,0.02368,24.0
3,2.0,2006-07-21,AD,27.67,0.044479,25.0
4,2.0,2007-01-16,AD,30.33,0.046498,24.0


In [84]:
list = np.unique(Input_new.DX.values)
list
# Input_new['DX'].isnull().sum()

array(['AD', 'CN', 'MCI'], dtype=object)

In [85]:
train = pd.read_csv('data/train_preprocessed.csv')
train['CN_Diag'] = train['CN_Diag'].astype('int')
train['MCI_Diag'] = train['MCI_Diag'].astype('int')
train['AD_Diag'] = train['AD_Diag'].astype('int')

# Encode one-hot encoding back to label encoding (0: CN_Diag, 1: MCI_Diag, 2: AD_Diag)
train['Diag'] = np.argmax(train[['CN_Diag','MCI_Diag','AD_Diag']].values,axis=1)

train=train.rename(index=str,columns={"ADAS13":"ADAS13_raw", "Ventricles_Norm":"Ventricles_Norm_raw", "MMSE":"MMSE_raw"})
train.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13_raw,Ventricles_Norm_raw,MMSE_raw,Diag
0,2013-03-28,8,0,1,0,5.0,0.012128,30.0,1
1,2013-10-31,8,0,1,0,5.0,0.012128,30.0,1
2,2014-04-28,8,0,1,0,5.0,0.012128,30.0,1
3,2013-02-04,18,0,1,0,9.0,0.020526,30.0,1
4,2013-09-03,18,0,1,0,10.5,0.020526,29.5,1


In [86]:
val=pd.read_csv('data/val_preprocessed.csv')
val['CN_Diag'] = val['CN_Diag'].astype('int')
val['MCI_Diag'] = val['MCI_Diag'].astype('int')
val['AD_Diag'] = val['AD_Diag'].astype('int')

# Encode one-hot encoding back to label encoding (0: CN_Diag, 1: MCI_Diag, 2: AD_Diag)
val['Diag'] = np.argmax(val[['CN_Diag','MCI_Diag','AD_Diag']].values,axis=1)

val=val.rename(index=str,columns={"ADAS13":"ADAS13_raw", "Ventricles_Norm":"Ventricles_Norm_raw", "MMSE":"MMSE_raw"})

val.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13_raw,Ventricles_Norm_raw,MMSE_raw,Diag
0,2013-02-13,5,1,0,0,16.0,0.019279,28.0,0
1,2013-08-14,5,0,0,0,19.5,0.019279,27.5,0
2,2014-02-20,5,0,1,0,23.0,0.019279,27.0,1
3,2015-03-05,5,0,0,1,29.0,0.019279,27.0,2
4,2016-04-28,5,0,0,1,26.0,0.019279,19.0,2


In [87]:
Input_new.head()

Unnamed: 0,PTID_Key,EXAMDATE,DX,ADAS13,Ventricles_Norm,MMSE
0,1.0,2010-12-10,MCI,21.0,0.022785,27.0
1,1.0,2011-04-07,MCI,23.5,0.022753,25.5
2,1.0,2011-09-08,MCI,26.0,0.02368,24.0
3,2.0,2006-07-21,AD,27.67,0.044479,25.0
4,2.0,2007-01-16,AD,30.33,0.046498,24.0


In [88]:
def prep_comp(data):   
    # Fill baseline values
    fill_features = ['ADAS13','Ventricles_Norm','MMSE','DX']
    # print(data.shape)
    ID_data = np.unique(data.PTID_Key.values).tolist()
    data['ADAS13'] = ""
    data['Ventricles_Norm'] = ""
    data['MMSE'] = ""
    data['DX'] = ""
    # print(data.shape)
    data.head()
    for ID in ID_data:
        for feature in fill_features:
    #         print(feature)
            baseline = Input_new[Input_new['PTID_Key']==ID][feature].values[-1]

            idx = data[data['PTID_Key']==ID].index.values
            data.loc[idx,feature] = baseline
    # map those values into numbers
    DX_mapping = {"CN": 0, "MCI": 1, "AD": 2}
    data['DX'] = data['DX'].map(DX_mapping)

    return data

In [89]:
train=prep_comp(train)
train.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13_raw,Ventricles_Norm_raw,MMSE_raw,Diag,ADAS13,Ventricles_Norm,MMSE,DX
0,2013-03-28,8,0,1,0,5.0,0.012128,30.0,1,6,0.0121268,28,0
1,2013-10-31,8,0,1,0,5.0,0.012128,30.0,1,6,0.0121268,28,0
2,2014-04-28,8,0,1,0,5.0,0.012128,30.0,1,6,0.0121268,28,0
3,2013-02-04,18,0,1,0,9.0,0.020526,30.0,1,9,0.01972,30,1
4,2013-09-03,18,0,1,0,10.5,0.020526,29.5,1,9,0.01972,30,1


In [90]:
val=prep_comp(val)
val.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13_raw,Ventricles_Norm_raw,MMSE_raw,Diag,ADAS13,Ventricles_Norm,MMSE,DX
0,2013-02-13,5,1,0,0,16.0,0.019279,28.0,0,13,0.0175011,29,0
1,2013-08-14,5,0,0,0,19.5,0.019279,27.5,0,13,0.0175011,29,0
2,2014-02-20,5,0,1,0,23.0,0.019279,27.0,1,13,0.0175011,29,0
3,2015-03-05,5,0,0,1,29.0,0.019279,27.0,2,13,0.0175011,29,0
4,2016-04-28,5,0,0,1,26.0,0.019279,19.0,2,13,0.0175011,29,0


In [91]:
Input_new.isnull().sum().sum()

0

In [109]:
def get_score(prediction, labels):
    print('R2: {}'.format(r2_score(prediction, labels)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction,labels))))

In [110]:
def print_score(data):
    print("ADAS13:")
    get_score(data['ADAS13'],data['ADAS13_raw'])
    
    print("Ventricles_Norm:")
    get_score(data['Ventricles_Norm'],data['Ventricles_Norm_raw'])
    
    print("MMSE:")
    get_score(data['MMSE'],data['MMSE_raw'])

In [111]:
print("Training data")
print_score(train)
print("Validation data")
print_score(val)

Training data
ADAS13:
R2: 0.6027650766761312
RMSE: 5.974455953882005
Ventricles_Norm:
R2: 0.9163458094779479
RMSE: 0.0032326207676972784
MMSE:
R2: -0.008722061408816817
RMSE: 2.5382067915670823
Validation data
ADAS13:
R2: 0.3819570551705981
RMSE: 6.896409772101716
Ventricles_Norm:
R2: 0.9399331515037006
RMSE: 0.003053187388965918
MMSE:
R2: -0.1375732153454441
RMSE: 2.561833141596738


In [104]:
def transform(y_val,predictions):
    from sklearn.preprocessing import OneHotEncoder
    ohe = OneHotEncoder()
    y_val_cls = ohe.fit_transform(y_val.reshape(-1,1)).toarray()

    predictions_cls = ohe.fit_transform(predictions.reshape(-1,1)).toarray()
    cn_cls = y_val_cls[:,0]
    mci_cls = y_val_cls[:,1]
    ad_cls = y_val_cls[:,2]

    cn_pred = predictions_cls[:,0]
    mci_pred = predictions_cls[:,1]
    ad_pred = predictions_cls[:,2]
    return cn_cls,mci_cls,ad_cls,cn_pred,mci_pred,ad_pred
def metrics(DX,y_test,predictions):
    from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score
#     pred_prob = model.predict_proba(X_test)[:,1]
    accuracy = accuracy_score(y_test, predictions)
    precision=precision_score(y_test, predictions)
    recall=recall_score(y_test, predictions)
#     roc=roc_auc_score(y_test,pred_prob)
    print("%s Accuracy: %.2f%% " % (DX,accuracy *100))
    print("%s Precision: %.2f%% " % (DX,precision *100))
    print("%s Recall: %.2f%% " % (DX,recall * 100))
#     print("%s AUC: %.2f%% " % (DX,roc *100))
    return  

In [98]:
acc = accuracy_score(train['DX'],train['Diag'])
print('Training Accuracy: %0.2f (+/- %0.2f)' % (acc.mean(), acc.std()*2))

acc = accuracy_score(val['DX'],val['Diag'])
print('Validation Accuracy: %0.2f (+/- %0.2f)' % (acc.mean(), acc.std()*2))

Training Accuracy: 0.86 (+/- 0.00)
Validation Accuracy: 0.82 (+/- 0.00)


In [105]:
cn_cls,mci_cls,ad_cls,cn_pred,mci_pred,ad_pred = transform(train['Diag'],train['DX'])
metrics('CN_Diag',cn_cls,cn_pred)
print('*'*30)
metrics('MCI_Diag',mci_cls,mci_pred)
print('*'*30)
metrics('AD_Diag',ad_cls,ad_pred)

CN_Diag Accuracy: 92.85% 
CN_Diag Precision: 91.29% 
CN_Diag Recall: 88.01% 
******************************
MCI_Diag Accuracy: 86.32% 
MCI_Diag Precision: 78.49% 
MCI_Diag Recall: 94.44% 
******************************
AD_Diag Accuracy: 91.90% 
AD_Diag Precision: 98.10% 
AD_Diag Recall: 63.39% 
