In [8]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score,roc_auc_score,make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [9]:
#df = pd.read_csv('ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv',low_memory=False)
df = pd.read_csv('oasis_longitudinal.csv',low_memory=False)

In [10]:
 df.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [11]:
#df.drop(['RID','studyID','ScanDate'],
 #       axis=1,inplace=True)
#df.head()

df = df.loc[df['Visit']==1] # use first visit data only because of the analysis we're doing
df = df.reset_index(drop=True) # reset index after filtering first visit data
df['M/F'] = df['M/F'].replace(['F','M'], [0,1]) # M/F column
df['Group'] = df['Group'].replace(['Converted'], ['Demented']) # Target variable
df['Group'] = df['Group'].replace(['Demented', 'Nondemented'], [1,0]) # Target variable
df = df.drop(['MRI ID', 'Visit', 'Hand'], axis=1) # Drop unnecessary columns

In [36]:
df_dropna = df.dropna(axis=0, how='any')
pd.isnull(df_dropna).sum()

df.groupby(['EDUC'])['SES'].median()
df["SES"].fillna(df.groupby("EDUC")["SES"].transform("median"), inplace=True)
pd.isnull(df['SES']).value_counts()

Y = df['Group'].values # Target for the model
X = df[['M/F', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']] # Features we use

# splitting into three sets
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, random_state=0)

from sklearn.preprocessing import MinMaxScaler 

# Feature scaling
scaler = MinMaxScaler().fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

Y = df_dropna['Group'].values # Target for the model
X = df_dropna[['M/F', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']] # Features we use

# splitting into three sets
X_trainval_dna, X_test_dna, Y_trainval_dna, Y_test_dna = train_test_split(
    X, Y, random_state=0)

# Feature scaling
scaler = MinMaxScaler().fit(X_trainval_dna)
X_trainval_scaled_dna = scaler.transform(X_trainval_dna)
X_test_scaled_dna = scaler.transform(X_test_dna)

In [37]:
#df.drop(['Left-WM-hypointensities','Right-WM-hypointensities','Left-non-WM-hypointensities','Right-non-WM-hypointensities'],
 #       axis=1,inplace=True)

In [38]:
#df.dtypes

In [39]:
#X = df.drop([Y,axis=]).copy(
#y = df['y']).copy()

In [40]:
#sum(y)/len(y)

In [41]:
#X_train,X_test,y_train,y_test = train_test_split(X_encoded,y,random_state=42,startify=y)

In [42]:
#sum(y_train)/len(y_train)
#sum(y_test)/len(y_test)

In [44]:
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',missing= None,seed=42,use_label_encoder=False)
clf_xgb.fit(X_trainval_scaled_dna,
           Y_trainval_dna,
           verbose=True,
           early_stopping_rounds=10,
           eval_metric='aucpr',
           eval_set=[(X_test_scaled_dna,Y_test_dna)])

[0]	validation_0-aucpr:0.87855
[1]	validation_0-aucpr:0.88180
[2]	validation_0-aucpr:0.90231
[3]	validation_0-aucpr:0.90230
[4]	validation_0-aucpr:0.90445
[5]	validation_0-aucpr:0.91300
[6]	validation_0-aucpr:0.92741
[7]	validation_0-aucpr:0.92585
[8]	validation_0-aucpr:0.93374
[9]	validation_0-aucpr:0.93860
[10]	validation_0-aucpr:0.93083
[11]	validation_0-aucpr:0.92932
[12]	validation_0-aucpr:0.93068
[13]	validation_0-aucpr:0.93687
[14]	validation_0-aucpr:0.93478
[15]	validation_0-aucpr:0.92450
[16]	validation_0-aucpr:0.92253
[17]	validation_0-aucpr:0.91450
[18]	validation_0-aucpr:0.91230
[19]	validation_0-aucpr:0.91355


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)