# Classification with Tabular Data (Cognitive Tests and Demographics)

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sp
from joblib import dump, load
from collections import namedtuple

%matplotlib inline
plt.style.use('seaborn')
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

from google.colab import drive
import os

# Mount Google Drive at /content/gdrive and make the project folder the working
# directory, so the relative "./data/..." paths used later in the notebook resolve.
drive.mount('/content/gdrive', force_remount=True)
os.chdir('/content/gdrive/MyDrive/Lucas_Thimoteo/mmml-alzheimer-diagnosis')

Mounted at /content/gdrive


In [None]:
# !rm -r /content/gdrive/MyDrive/Lucas_Thimoteo/mmml-alzheimer-diagnosis/src.egg-info
# !python3 -m pip install -e /content/gdrive/MyDrive/Lucas_Thimoteo/mmml-alzheimer-diagnosis/.

Obtaining file:///content/gdrive/MyDrive/Lucas_Thimoteo/mmml-alzheimer-diagnosis
Installing collected packages: src
  Found existing installation: src 0.1.0
    Can't uninstall 'src'. No files were found to uninstall.
  Running setup.py develop for src
Successfully installed src


In [None]:
!pip install pycaret

Collecting pycaret
[?25l  Downloading https://files.pythonhosted.org/packages/da/99/18f151991b0f06107af9723417c64e304ae2133587f85ea734a90136b4ae/pycaret-2.3.1-py3-none-any.whl (261kB)
[K     |█▎                              | 10kB 15.0MB/s eta 0:00:01[K     |██▌                             | 20kB 20.6MB/s eta 0:00:01[K     |███▊                            | 30kB 11.4MB/s eta 0:00:01[K     |█████                           | 40kB 9.0MB/s eta 0:00:01[K     |██████▎                         | 51kB 5.3MB/s eta 0:00:01[K     |███████▌                        | 61kB 5.8MB/s eta 0:00:01[K     |████████▊                       | 71kB 6.0MB/s eta 0:00:01[K     |██████████                      | 81kB 6.4MB/s eta 0:00:01[K     |███████████▎                    | 92kB 6.4MB/s eta 0:00:01[K     |████████████▌                   | 102kB 5.3MB/s eta 0:00:01[K     |█████████████▊                  | 112kB 5.3MB/s eta 0:00:01[K     |███████████████                 | 122kB 5.3MB/s eta 

In [None]:
pip install interpret

Collecting interpret
  Downloading https://files.pythonhosted.org/packages/46/ce/444e5098422d15d28db4498da608866b14d8a18a8be68630af1146c80984/interpret-0.2.4-py3-none-any.whl
Collecting interpret-core[dash,debug,decisiontree,ebm,lime,linear,notebook,plotly,required,sensitivity,shap,skoperules,treeinterpreter]>=0.2.4
[?25l  Downloading https://files.pythonhosted.org/packages/d5/b6/e90ac757fda64caaea262c9fcce2d02fb4d141236aa40ce5f62c4d66efe1/interpret_core-0.2.4-py3-none-any.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 5.3MB/s 
[?25hCollecting gevent>=1.3.6; extra == "dash"
[?25l  Downloading https://files.pythonhosted.org/packages/3e/85/df3d1fd2b60a87455475f93012861b76a411d27ba4a0859939adbe2c9dc3/gevent-21.1.2-cp37-cp37m-manylinux2010_x86_64.whl (5.6MB)
[K     |████████████████████████████████| 5.6MB 42.0MB/s 
[?25hCollecting dash-table>=4.1.0; extra == "dash"
[?25l  Downloading https://files.pythonhosted.org/packages/97/f7/f4969a926f20a55d3e5970d01b85ff9ad510dba32

# ML Experimentation - Demographics + Cognitive Tests

## Base Code

In [100]:
# from src.model_training.mri_train_test_split import train_test_split_by_subject

from pycaret.utils import enable_colab
from pycaret.classification import *
from sklearn.metrics import fbeta_score,make_scorer

from interpret.glassbox import ExplainableBoostingClassifier
# Switch PyCaret to Colab mode (prints "Colab mode enabled.").
enable_colab()

Colab mode enabled.


In [101]:
def train_test_split_by_subject(df,test_size = 0.3,labels = ['AD','CN'],label_column='MACRO_GROUP',random_seed=42):
    '''
    Splits the dataset into train and test sets at patient level (to avoid data leakage).

    The split is stratified: subjects are shuffled and split separately for each
    label, so every label contributes roughly ``test_size`` of its subjects to the
    test set. All rows of a subject go to the same side of the split.

    A subject that appears under *every* label is assigned to exactly one label
    before splitting, so it cannot end up in both sets.

    NOTE(review): with more than two labels, a subject that appears under some but
    not all labels is still split independently for each of its labels and may land
    in both sets. Confirm whether that can happen in the data.

    Parameters
    ----------
    df: Reference dataframe containing information about patients. Must contain a
        'SUBJECT' column and the column named by ``label_column``.

    test_size: Fraction of each label's subjects placed in the test set
        (rounded up). Expected to be greater than 0 and less than 1.

    labels: Labels of the classes to keep. Rows with any other label are dropped.

    label_column: Column containing the class label.

    random_seed: Seed for the subject shuffle and for the final row shuffle, so the
        same seed and data give the same split.

    Returns
    ----------
    Tuple with the shuffled train and test dataframes: df_train, df_test
    '''

    df_classes = df[df[label_column].isin(labels)]
    rng = np.random.default_rng(random_seed)

    # Unique subjects observed with each label, in order of first appearance.
    patients_by_class = [
        df_classes.loc[df_classes[label_column] == label, 'SUBJECT'].unique()
        for label in labels
    ]

    # Subjects seen under every label. They are listed in data order rather than
    # set order, because set iteration order of strings changes between Python
    # processes and would make the split irreproducible.
    shared = set(patients_by_class[0]).intersection(*patients_by_class[1:])
    patients_all_classes = [subject for subject in patients_by_class[0] if subject in shared]

    # Give each label its own slice of the shared subjects.
    patients_separated_all_classes = np.array_split(patients_all_classes, len(labels))

    train = []
    test = []
    for ii, label in enumerate(labels):
        # Shared subjects handed to another label are skipped for this one.
        assigned_elsewhere = list(set(patients_all_classes) - set(patients_separated_all_classes[ii]))
        label_rows = (df_classes[label_column] == label) & ~df_classes['SUBJECT'].isin(assigned_elsewhere)
        subjects = df_classes.loc[label_rows, 'SUBJECT'].unique()
        rng.shuffle(subjects)

        test_subjects_quantity = int(np.ceil(test_size * subjects.shape[0]))
        test_subjects = subjects[:test_subjects_quantity]
        train_subjects = subjects[test_subjects_quantity:]

        # Select by subject (not by label) so every visit of a subject stays together.
        train.append(df_classes[df_classes['SUBJECT'].isin(train_subjects)])
        test.append(df_classes[df_classes['SUBJECT'].isin(test_subjects)])

    # Seed the row shuffle too; otherwise the row order changes on every call.
    df_train = pd.concat(train).sample(frac=1, random_state=random_seed).reset_index(drop=True)
    df_test = pd.concat(test).sample(frac=1, random_state=random_seed).reset_index(drop=True)

    return df_train,df_test


In [102]:
import numpy as np
import pandas as pd

def _assign_subject_folds(df, n_splits, labels, label_column, random_seed):
    '''Return the rows of ``df`` whose label is in ``labels``, plus a FOLD column (0..n_splits-1).'''
    df_classes = df[df[label_column].isin(labels)].copy()
    df_classes['FOLD'] = 0

    rng = np.random.default_rng(random_seed)

    # Unique subjects observed with each label, in order of first appearance.
    patients_by_class = [
        df_classes.loc[df_classes[label_column] == label, 'SUBJECT'].unique()
        for label in labels
    ]

    # Subjects seen under every label, kept in data order (set order of strings is
    # not stable between Python processes) and shared out among the labels.
    shared = set(patients_by_class[0]).intersection(*patients_by_class[1:])
    patients_all_classes = [subject for subject in patients_by_class[0] if subject in shared]
    patients_separated_all_classes = np.array_split(patients_all_classes, len(labels))

    for ii, label in enumerate(labels):
        # Shared subjects handed to another label are skipped for this one.
        assigned_elsewhere = list(set(patients_all_classes) - set(patients_separated_all_classes[ii]))
        label_rows = (df_classes[label_column] == label) & ~df_classes['SUBJECT'].isin(assigned_elsewhere)
        subjects = df_classes.loc[label_rows, 'SUBJECT'].unique()
        rng.shuffle(subjects)

        # Spread this label's subjects as evenly as possible over the folds; every
        # row of a subject receives that subject's fold.
        for fold, fold_subjects in enumerate(np.array_split(subjects, n_splits)):
            df_classes.loc[df_classes['SUBJECT'].isin(fold_subjects), 'FOLD'] = fold

    return df_classes


def _iter_fold_indices(df_folds, n_splits):
    '''Yield (train_index, test_index) index labels of ``df_folds`` for each fold.'''
    for fold in range(n_splits):
        held_out = df_folds['FOLD'] == fold
        yield df_folds.index[~held_out], df_folds.index[held_out]


def stratified_fold_split_by_subject(df, n_splits=10, labels = ['AD','CN'], label_column = 'MACRO_GROUP', random_seed=42, return_indices=False):

    '''
    Assigns every subject to one of ``n_splits`` folds, at patient level, in order to avoid data leakage.

    The assignment is stratified: the subjects of each label are shuffled and
    spread evenly over the folds, and all rows of a subject share one fold. A
    subject that appears under every label is assigned to a single label first.

    Parameters
    ----------

    df: Reference dataframe containing information about patients. Must contain a
        'SUBJECT' column and the column named by ``label_column``.

    n_splits: number of folds.

    labels: Labels of the classes to keep. Rows with any other label are dropped.

    label_column: Column containing the class label.

    random_seed: Seed for the subject shuffle.

    return_indices: If True, returns an iterator of (train_index, test_index)
        pairs, one per fold. If False, returns the filtered dataframe with an extra
        'FOLD' column.

    Returns
    ----------
    return_indices=True: iterator of (train_index, test_index) tuples. The indices
        are index *labels* of the filtered dataframe; they equal row positions only
        when ``df`` has a default RangeIndex and contains no rows outside ``labels``.

    return_indices=False: copy of the filtered dataframe with the 'FOLD' column.

    Example
    ----------

    folds = stratified_fold_split_by_subject(df, n_splits=10, labels=['AD','CN'],
                                             label_column='DIAGNOSIS_BASELINE',
                                             return_indices=True)
    for train_index, test_index in folds:
        df_fold_train, df_fold_test = df.loc[train_index], df.loc[test_index]
        ...

    ---------

    df_folds = stratified_fold_split_by_subject(df, n_splits=10, labels=['AD','CN'],
                                                label_column='DIAGNOSIS_BASELINE')
    df_folds['FOLD'].value_counts()
    '''

    df_folds = _assign_subject_folds(df, n_splits, labels, label_column, random_seed)

    # This function must not contain `yield` itself: that would turn every call
    # into a generator and the dataframe branch below could never be returned.
    if return_indices:
        return _iter_fold_indices(df_folds, n_splits)
    return df_folds

class StratifiedSubjectKFold:
    '''
    Cross-validation splitter that provides train/test fold indices at patient level, in order to avoid data leakage.

    The folds are stratified: the subjects of each label are shuffled and spread
    evenly over the folds, and all rows of a subject share one fold. A subject that
    appears under every label is assigned to a single label first.

    The folds are computed from the dataframe given to the constructor; the
    ``X``, ``y`` and ``groups`` arguments of ``split`` / ``get_n_splits`` are
    accepted only for interface compatibility and are not read.

    Parameters
    ----------

    df: Reference dataframe containing information about patients. Must contain a
        'SUBJECT' column and the column named by ``label_column``. A copy is stored.

    n_splits: number of folds.

    labels: Labels of the classes to keep. Rows with any other label are dropped.

    label_column: Column containing the class label.

    random_seed: Seed for the subject shuffle.

    return_indices: If True, ``split`` returns an iterator of
        (train_index, test_index) pairs. If False, it returns the filtered
        dataframe with an extra 'FOLD' column.

    Example
    ----------

    cv = StratifiedSubjectKFold(df, n_splits=10, labels=['AD','CN'], label_column='DIAGNOSIS_BASELINE')
    for train_index, test_index in cv.split(X, y):
        df_fold_train, df_fold_test = df.loc[train_index], df.loc[test_index]
        ...

    ---------

    results = sklearn.model_selection.cross_validate(
                                ExplainableBoostingClassifier(**ebm_params),
                                X,y,
                                cv=StratifiedSubjectKFold(df, n_splits=10,labels = ['AD','CN'],label_column = 'DIAGNOSIS_BASELINE',return_indices=True),
                                scoring=my_auc,n_jobs=-1)
    '''

    def __init__(self,df,
                    n_splits=10, 
                    labels = [0,1], 
                    label_column = 'MACRO_GROUP', 
                    random_seed=42, 
                    return_indices=True):
        self.df = df.copy()
        self.n_splits = n_splits
        self.labels = labels
        self.label_column = label_column
        self.random_seed = random_seed
        self.return_indices = return_indices

    def _assign_folds(self):
        '''Return the rows whose label is in ``self.labels``, plus a FOLD column (0..n_splits-1).'''
        df_classes = self.df[self.df[self.label_column].isin(self.labels)].copy()
        df_classes['FOLD'] = 0

        rng = np.random.default_rng(self.random_seed)

        # Unique subjects observed with each label, in order of first appearance.
        patients_by_class = [
            df_classes.loc[df_classes[self.label_column] == label, 'SUBJECT'].unique()
            for label in self.labels
        ]

        # Subjects seen under every label, kept in data order (set order of strings
        # is not stable between Python processes) and shared out among the labels.
        shared = set(patients_by_class[0]).intersection(*patients_by_class[1:])
        patients_all_classes = [subject for subject in patients_by_class[0] if subject in shared]
        patients_separated_all_classes = np.array_split(patients_all_classes, len(self.labels))

        for ii, label in enumerate(self.labels):
            # Shared subjects handed to another label are skipped for this one.
            assigned_elsewhere = list(set(patients_all_classes) - set(patients_separated_all_classes[ii]))
            label_rows = (df_classes[self.label_column] == label) & ~df_classes['SUBJECT'].isin(assigned_elsewhere)
            subjects = df_classes.loc[label_rows, 'SUBJECT'].unique()
            rng.shuffle(subjects)

            # Spread this label's subjects as evenly as possible over the folds.
            for fold, fold_subjects in enumerate(np.array_split(subjects, self.n_splits)):
                df_classes.loc[df_classes['SUBJECT'].isin(fold_subjects), 'FOLD'] = fold

        return df_classes

    def _iter_fold_indices(self, df_folds):
        '''Yield (train_index, test_index) index labels of ``df_folds`` for each fold.'''
        for fold in range(self.n_splits):
            held_out = df_folds['FOLD'] == fold
            yield df_folds.index[~held_out], df_folds.index[held_out]

    def split(self, X=None, y=None, groups=None):
        '''
        Compute the folds from the dataframe given to the constructor.

        Returns an iterator of (train_index, test_index) pairs when
        ``return_indices`` is True, otherwise the filtered dataframe with a 'FOLD'
        column. The indices are index *labels* of the filtered dataframe; they
        equal row positions only when the dataframe has a default RangeIndex and
        no rows outside ``labels``.
        '''
        df_folds = self._assign_folds()

        # `split` must not contain `yield` itself, otherwise the dataframe branch
        # below could never be returned to the caller.
        if self.return_indices:
            return self._iter_fold_indices(df_folds)
        return df_folds

    def get_n_splits(self, X=None, y=None, groups=None):
        '''Return the number of folds.'''
        return self.n_splits

In [None]:

def run_tabular_data_experiment(df_adni_merge,
                                experiment_name,
                                labels = [0,1],
                                label_column = 'DIAGNOSIS',
                                n_splits = 5,
                                selected_models = None):
  '''
  Runs one PyCaret classification experiment on the tabular data.

  The data is split into train/test by subject (20% of the subjects of each label
  go to the test set). The selected models are compared with subject-level
  stratified cross-validation on the train set and sorted by AUC. Each returned
  model is then scored on the held-out test set. Progress and both result tables
  are printed.

  Parameters
  ----------
  df_adni_merge: Dataframe with the 'SUBJECT' and 'DIAGNOSIS_BASELINE' columns
      (both dropped before training), the target column and the feature columns
      listed in ``base_experiment_params`` below.

  experiment_name: Name passed to PyCaret's ``setup``.

  labels: The two (or more) values of ``label_column`` to keep.

  label_column: Name of the target column.

  n_splits: Number of cross-validation folds.

  selected_models: Models passed to ``compare_models(include=...)``. Defaults to
      ['lr','svm','lightgbm','et',ExplainableBoostingClassifier()].

  Returns
  ----------
  Tuple: df_validation_results, df_test_results, trained_models
  '''
  if selected_models is None:
    # Built per call: a default created in the signature would share one
    # ExplainableBoostingClassifier instance across every call.
    selected_models = ['lr','svm','lightgbm','et',ExplainableBoostingClassifier()]

  print("Setting up experiment...")

  base_experiment_params = {
    'categorical_features': ['MALE','HISPANIC','RACE_WHITE', 'RACE_BLACK', 'RACE_ASIAN','MARRIED', 'WIDOWED', 'DIVORCED', 'NEVER_MARRIED'],
    'numeric_features': ['AGE','YEARS_EDUCATION','CDRSB', 'ADAS11','ADAS13', 'ADASQ4', 'MMSE', 'RAVLT_immediate', 'RAVLT_learning','RAVLT_forgetting', 'RAVLT_perc_forgetting', 'TRABSCOR', 'FAQ', 'MOCA'],
    'target' : label_column,
    'transformation':True,
    'remove_multicollinearity' : False,
    'session_id':1,
    'silent':True,
    'verbose':0
  }

  # Split by subject so no subject appears in both train and test.
  df_train, df_test = train_test_split_by_subject(df_adni_merge,test_size=0.2,label_column=label_column,labels=labels)

  # Identifier and baseline-diagnosis columns are not used as features.
  non_feature_columns = ["SUBJECT",'DIAGNOSIS_BASELINE']
  base_experiment_params['data'] = df_train.drop(non_feature_columns,axis=1)
  base_experiment_params['test_data'] = df_test.drop(non_feature_columns,axis=1)
  # Cross-validation folds are also built at subject level, from the train set only.
  base_experiment_params['fold_strategy'] = StratifiedSubjectKFold(df_train,labels=labels,n_splits=n_splits,label_column=label_column)
  base_experiment_params['experiment_name'] = experiment_name
  exp_clinical = setup(**base_experiment_params)

  print("Training models...")
  trained_models = compare_models(include=selected_models,sort='AUC',n_select = 5,turbo=True,cross_validation = True,verbose=0)
  print("Models trained and validated!")
  print('-----------------------------------------------------')
  # pull() returns the last score grid produced by PyCaret (here: the comparison table).
  df_validation_results = pull().drop(['Kappa','MCC'],axis=1)
  print("Validation results: \n",df_validation_results)
  print('-----------------------------------------------------')

  # Score every selected model on the held-out test set and collect one row per model.
  test_rows = []
  for model in trained_models:
    predict_model(model,verbose=0)
    test_rows.append(pull())
  df_test_results = pd.concat(test_rows).reset_index(drop=True).drop(['Kappa','MCC'],axis=1).sort_values('AUC',ascending=False)
  
  print("Test results: \n",df_test_results)
  print('-----------------------------------------------------')

  return df_validation_results,df_test_results,trained_models

## Data prep

In [103]:
# Identifier and diagnosis columns first, then demographics, then cognitive test scores.
organized_cols = [
    'SUBJECT', 'DIAGNOSIS', 'DIAGNOSIS_BASELINE',
    'AGE', 'MALE', 'YEARS_EDUCATION', 'HISPANIC',
    'RACE_WHITE', 'RACE_BLACK', 'RACE_ASIAN',
    'MARRIED', 'WIDOWED', 'DIVORCED', 'NEVER_MARRIED',
    'CDRSB', 'ADAS11', 'ADAS13', 'ADASQ4', 'MMSE',
    'RAVLT_immediate', 'RAVLT_learning', 'RAVLT_forgetting',
    'RAVLT_perc_forgetting', 'TRABSCOR', 'FAQ', 'MOCA',
]

# Keep only those columns, drop every row with a missing value and renumber the rows.
df_adni_merge = (
    pd.read_csv("./data/tabular/COGNITIVE_DATA_PROCESSED.csv")[organized_cols]
    .dropna()
    .reset_index(drop=True)
)
df_adni_merge.shape


(6210, 26)

In [None]:
# Columns of the cleaned dataframe.
df_adni_merge.columns

Index(['SUBJECT', 'DIAGNOSIS', 'DIAGNOSIS_BASELINE', 'AGE', 'MALE',
       'YEARS_EDUCATION', 'HISPANIC', 'RACE_WHITE', 'RACE_BLACK', 'RACE_ASIAN',
       'MARRIED', 'WIDOWED', 'DIVORCED', 'NEVER_MARRIED', 'CDRSB', 'ADAS11',
       'ADAS13', 'ADASQ4', 'MMSE', 'RAVLT_immediate', 'RAVLT_learning',
       'RAVLT_forgetting', 'RAVLT_perc_forgetting', 'TRABSCOR', 'FAQ', 'MOCA'],
      dtype='object')

In [None]:
# Number of rows per baseline diagnosis.
df_adni_merge['DIAGNOSIS_BASELINE'].value_counts()

MCI    3289
CN     2510
AD      411
Name: DIAGNOSIS_BASELINE, dtype: int64

In [None]:
# Number of rows per value of DIAGNOSIS, the column used as the target in the experiments below.
df_adni_merge['DIAGNOSIS'].value_counts()

2    2902
0    2411
1     897
Name: DIAGNOSIS, dtype: int64

## CN vs AD

In [None]:
# Experiment restricted to DIAGNOSIS values 0 and 1
# (presumably 0 = CN and 1 = AD, going by the section headings - confirm against the data dictionary).
df_validation_results_cnad, df_test_results_cnad, trained_models = run_tabular_data_experiment(
    df_adni_merge,
    experiment_name='cn_ad',
    labels=[0, 1],
    label_column='DIAGNOSIS',
    n_splits=5,
    selected_models=['lr', 'svm', 'lightgbm', 'et', ExplainableBoostingClassifier()],
)

Setting up experiment...
Training models...
Models trained and validated!
-----------------------------------------------------
Validation results: 
                                     Model  Accuracy  ...      F1  TT (Sec)
lr                    Logistic Regression    0.9931  ...  0.9872     0.050
lightgbm  Light Gradient Boosting Machine    0.9938  ...  0.9887     0.126
et                 Extra Trees Classifier    0.9958  ...  0.9922     0.558
svm                   SVM - Linear Kernel    0.9916  ...  0.9843     0.032

[4 rows x 7 columns]
-----------------------------------------------------
Test results: 
                              Model  Accuracy     AUC  Recall   Prec.      F1
1  Light Gradient Boosting Machine    0.9986  0.9996  0.9949  1.0000  0.9975
2           Extra Trees Classifier    0.9972  0.9995  0.9949  0.9949  0.9949
0              Logistic Regression    0.9972  0.9991  0.9949  0.9949  0.9949
3              SVM - Linear Kernel    0.9944  0.9930  0.9899  0.9899  0.989

## MCI vs AD

In [None]:
# Experiment restricted to DIAGNOSIS values 1 and 2.
# run_tabular_data_experiment returns three values (validation results, test results,
# trained models); unpacking only two raises "too many values to unpack".
df_validation_results_mciad, df_test_results_mciad, trained_models_mciad = run_tabular_data_experiment(
    df_adni_merge,
    experiment_name='mci_ad',
    labels=[1, 2],
    label_column='DIAGNOSIS',
    n_splits=5,
    selected_models=['lr', 'svm', 'lightgbm', 'et', ExplainableBoostingClassifier()],
)

Setting up experiment...
Training models...
Models trained and validated!
Validation results: 
                              Model  Accuracy     AUC  ...   Prec.      F1  TT (Sec)
0              Logistic Regression    0.9202  0.9694  ...  0.9429  0.9477     0.064
4    ExplainableBoostingClassifier    0.9246  0.9689  ...  0.9411  0.9509     6.226
2  Light Gradient Boosting Machine    0.9243  0.9684  ...  0.9433  0.9505     0.134
3           Extra Trees Classifier    0.9232  0.9681  ...  0.9393  0.9500     0.642
1              SVM - Linear Kernel    0.9093  0.0000  ...  0.9419  0.9403     0.040

[5 rows x 7 columns]
Test results: 
                              Model  Accuracy     AUC  Recall   Prec.      F1
0              Logistic Regression    0.9485  0.9826  0.9589  0.9743  0.9665
2  Light Gradient Boosting Machine    0.9411  0.9809  0.9573  0.9665  0.9618
1    ExplainableBoostingClassifier    0.9448  0.9785  0.9604  0.9681  0.9643
3           Extra Trees Classifier    0.9374  0.9764  

## MCI vs CN

In [None]:
# Experiment restricted to DIAGNOSIS values 0 and 2.
# run_tabular_data_experiment returns three values (validation results, test results,
# trained models); unpacking only two raises "too many values to unpack".
df_validation_results_mcicn, df_test_results_mcicn, trained_models_mcicn = run_tabular_data_experiment(
    df_adni_merge,
    experiment_name='mci_cn',
    labels=[0, 2],
    label_column='DIAGNOSIS',
    n_splits=5,
    selected_models=['lr', 'svm', 'lightgbm', 'et', ExplainableBoostingClassifier()],
)

Setting up experiment...
Training models...
Models trained and validated!
Validation results: 
                              Model  Accuracy     AUC  ...   Prec.      F1  TT (Sec)
0              Logistic Regression    0.9046  0.9543  ...  0.9084  0.9136     0.080
4    ExplainableBoostingClassifier    0.9054  0.9530  ...  0.9065  0.9146    11.992
2  Light Gradient Boosting Machine    0.8996  0.9494  ...  0.9062  0.9088     0.152
3           Extra Trees Classifier    0.8937  0.9494  ...  0.8963  0.9040     0.724
1              SVM - Linear Kernel    0.9019  0.0000  ...  0.9015  0.9112     0.050

[5 rows x 7 columns]
Test results: 
                              Model  Accuracy     AUC  Recall   Prec.      F1
1    ExplainableBoostingClassifier    0.9007  0.9499  0.9297  0.8891  0.9089
0              Logistic Regression    0.9017  0.9474  0.9279  0.8919  0.9096
3           Extra Trees Classifier    0.8942  0.9462  0.9227  0.8838  0.9028
2  Light Gradient Boosting Machine    0.8895  0.9460  

# Downloading Images

In [106]:
# Load ADNIMERGE and keep only rows that have a diagnosis: NaN is not equal to
# itself, so the "DX == DX" filter drops the rows where DX is missing.
# Note: this rebinds df_adni_merge, replacing the cognitive-test dataframe loaded earlier.
df_adni_merge = pd.read_csv("./data/tabular/ADNIMERGE.csv").query("DX == DX")

In [107]:
# Keep the visit/image identification columns and only the rows that have an image id
# ("IMAGEUID == IMAGEUID" is False for NaN). The integer cast is done with .assign on the
# filtered result instead of assigning into the slice, which avoids SettingWithCopyWarning.
df_adni_merge = (
    df_adni_merge[['RID','PTID','VISCODE','SITE','COLPROT','EXAMDATE','IMAGEUID','DX','DX_bl']]
    .query("IMAGEUID == IMAGEUID")
    .assign(IMAGEUID=lambda visits: visits['IMAGEUID'].astype(int))
)

In [None]:
# Print the image ids in batches of `chunks`, one batch per iteration.
subjects = len(df_adni_merge['IMAGEUID'])  # number of rows (one image id per row)
chunks = 1000
max_count = len(range(0, subjects, chunks))
for count, i in enumerate(range(0, subjects, chunks)):
    image_chunk = df_adni_merge['IMAGEUID'].iloc[i:i + chunks].tolist()
    print(f"Images to download: {count+1}/{max_count}")
    print("Chunk size:", len(image_chunk))
    print(image_chunk)
    print("-------------------------------")

Images to download: 1/8
Chunk size: 1000
[35475, 32237, 31863, 35576, 88252, 64631, 64636, 59357, 79178, 132252, 32246, 31885, 31906, 200385, 123986, 33025, 33034, 62348, 72792, 160861, 59366, 64649, 32270, 31915, 94368, 94377, 59375, 64654, 59391, 87012, 129232, 33066, 66087, 148889, 32306, 31928, 31933, 200390, 124731, 32332, 31970, 200399, 128523, 32409, 31992, 65902, 200416, 162111, 391070, 788060, 119180, 119181, 82360, 133373, 65833, 151328, 33074, 33079, 66092, 149623, 119182, 119183, 64260, 34607, 34621, 87622, 130109, 118697, 118850, 118796, 118799, 118829, 87036, 132330, 171375, 389161, 387866, 120056, 119794, 118996, 139021, 171155, 59411, 119184, 119185, 33086, 33089, 66106, 163333, 119273, 35819, 34488, 47889, 65424, 88309, 132770, 35486, 31996, 66945, 86965, 35825, 34495, 47898, 87631, 119795, 119186, 119187, 64269, 133479, 274523, 118828, 119796, 118830, 74240, 118832, 118847, 134207, 89591, 132341, 171380, 119188, 119189, 63023, 134637, 274525, 388019, 35833, 34502, 479

In [119]:
# Build the reference table: one row per visit that has a diagnosis.
# Everything is done in one chain that creates new columns with .assign, instead of
# assigning into a column slice several times (which triggers SettingWithCopyWarning).
df_adni_merge = (
    pd.read_csv("./data/tabular/ADNIMERGE.csv")
    .query("DX == DX")  # NaN != NaN, so this drops rows without a diagnosis
    .loc[:, ['RID','PTID','VISCODE','SITE','COLPROT','EXAMDATE','IMAGEUID','DX','DX_bl']]
    .assign(
        # Visits without an image keep their row and get the placeholder id -1.
        IMAGEUID=lambda visits: visits['IMAGEUID'].fillna(-1).astype(int),
        # Harmonise the diagnosis names of both columns to AD / MCI / CN.
        DX=lambda visits: visits['DX'].replace({'Dementia': 'AD'}),
        DX_bl=lambda visits: visits['DX_bl'].replace({'LMCI': 'MCI', 'EMCI': 'MCI', 'SMC': 'CN'}),
    )
    .rename(columns={
        'DX':'DIAGNOSIS',
        'DX_bl':'DIAGNOSIS_BASELINE',
        'PTID':'SUBJECT'
    })
)

# The dataframe index is written as the first (unnamed) CSV column, as before.
df_adni_merge.to_csv("./data/tabular/ENSEMBLE_REFERENCE.csv")