In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# %matplotlib inline
# Matplotlib >= 3.6 ships the seaborn look as 'seaborn-v0_8' and newer releases no
# longer accept the bare 'seaborn' alias, so use whichever name this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
pd.set_option('display.max_rows', 100)

# NOTE(review): absolute, machine-specific project root; this is the single place to
# edit when the notebook runs on another machine.
root_path = '/home/lucas/projects/mmml-alzheimer-diagnosis/'
data_path = root_path + 'data/'

# The working directory is switched to each source folder right before importing from
# it — presumably because the project modules are not installed as a package.
os.chdir(root_path + 'src/model_training/')
from ensemble_train import prepare_mri_predictions

# After this point the working directory stays at src/model_evaluation/.
os.chdir(root_path + 'src/model_evaluation/')
from ensemble_evaluation import calculate_rocs_on_datasets,calculate_metrics_on_datasets
# NOTE(review): the wildcard import hides which names base_evaluation contributes;
# replace it with explicit names once they are known.
from base_evaluation import *

# MRI Datasets Class Distribution

In [176]:
# Per-class image counts of the MRI (CNN) datasets, one column per data split.
label = 'MACRO_GROUP'
cols = ['SUBJECT','IMAGE_DATA_ID','DATASET',label]

# AD x CN predictions: rows containing any NaN are discarded, then only the identifying
# columns are kept. .copy() makes the selection an independent frame so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df_mri = prepare_mri_predictions(data_path+'PREDICTIONS_AD_VGG19_BN.csv').dropna()[cols].copy()
df_mri['DATASET'] = df_mri['DATASET'].replace({'train_cnn':'train'})
df_mri[label] = df_mri[label].replace({0:'CN',1:'AD'})

# MCI x CN predictions, prepared the same way.
df_mri_mci = prepare_mri_predictions(data_path+'PREDICTIONS_MCI_VGG19_BN.csv').dropna()[cols].copy()
df_mri_mci['DATASET'] = df_mri_mci['DATASET'].replace({'train_cnn':'train'})
df_mri_mci[label] = df_mri_mci[label].replace({0:'CN',1:'MCI'})

# Rows that are identical in both frames (presumably the shared CN images) are kept once.
df_mris = pd.concat([df_mri,df_mri_mci]).drop_duplicates()

# Count IMAGE_DATA_ID per class (rows) and split (columns).
# NOTE(review): the renaming below is positional — it assumes exactly three DATASET
# values that sort as test < train < validation. TODO confirm against the data.
df_size = pd.pivot_table(df_mris.drop('SUBJECT',axis=1),index=[label],columns=['DATASET'],aggfunc='count').reset_index(drop=False)
df_size.columns = ['Class','Test','Train','Validation']

# Manual correction: the CN validation count is forced to 273 because some MCIxCN rows
# carry NaN CNN predictions and were removed by dropna() above.
df_size.loc[df_size['Class'] == 'CN','Validation'] = 273

# Totals for the two binary tasks. .sum() also concatenates the 'Class' strings, so
# that entry is overwritten with a readable label.
total_ad = df_size.query("Class in ('CN','AD')").sum()
total_ad['Class'] = 'AD + CN'
total_mci = df_size.query("Class in ('CN','MCI')").sum()
total_mci['Class'] = 'MCI + CN'

# DataFrame.append was removed in pandas 2.0; add both total rows with a single concat.
# infer_objects() restores integer dtype on the count columns of the new rows.
df_totals = pd.DataFrame([total_ad,total_mci]).infer_objects()
df_size = pd.concat([df_size,df_totals],ignore_index=True)
df_size = df_size[['Class','Train','Validation','Test']]
df_size


Unnamed: 0,Class,Train,Validation,Test
0,AD,488,73,65
1,CN,1234,273,283
2,MCI,499,82,81
3,AD + CN,1722,346,348
4,MCI + CN,1733,355,364


# Cognitive Tests Datasets Class Distribution

In [167]:
# Per-class record counts of the cognitive-test datasets, one column per data split.
label = 'DIAGNOSIS'
cols = ['SUBJECT','IMAGE_DATA_ID','DATASET',label]

# Rows without a DATASET value are assigned to the training split. Plain assignment is
# used instead of `df[col].fillna(..., inplace=True)`: that chained in-place call acts
# on a temporary under pandas copy-on-write and would leave the frame unchanged.
df_preds_ad = pd.read_csv(data_path+'PREDICTIONS_AD_COG_TESTS.csv')
df_preds_ad['DATASET'] = df_preds_ad['DATASET'].fillna('train')

df_preds_mci = pd.read_csv(data_path+'PREDICTIONS_MCI_COG_TESTS.csv')
df_preds_mci['DATASET'] = df_preds_mci['DATASET'].fillna('train')

# Turn the binary targets into class names.
df_preds_ad[label] = df_preds_ad[label].replace({0:'CN',1:'AD'})
df_preds_mci[label] = df_preds_mci[label].replace({0:'CN',1:'MCI'})

# Keep one row per (SUBJECT, IMAGE_DATA_ID, DATASET, DIAGNOSIS) combination.
df_preds = pd.concat([df_preds_ad,df_preds_mci]).drop_duplicates(subset=cols)[cols]

# Count IMAGE_DATA_ID per class (rows) and split (columns).
# NOTE(review): the renaming below is positional — it assumes exactly three DATASET
# values that sort as test < train < validation. TODO confirm against the data.
df_size = pd.pivot_table(df_preds.drop('SUBJECT',axis=1),index=[label],columns=['DATASET'],aggfunc='count').reset_index(drop=False)
df_size.columns = ['Class','Test','Train','Validation']

# Totals for the two binary tasks. .sum() also concatenates the 'Class' strings, so
# that entry is overwritten with a readable label.
total_ad = df_size.query("Class in ('CN','AD')").sum()
total_ad['Class'] = 'AD + CN'
total_mci = df_size.query("Class in ('CN','MCI')").sum()
total_mci['Class'] = 'MCI + CN'

# DataFrame.append was removed in pandas 2.0; add both total rows with a single concat.
# infer_objects() restores integer dtype on the count columns of the new rows.
df_totals = pd.DataFrame([total_ad,total_mci]).infer_objects()
df_size = pd.concat([df_size,df_totals],ignore_index=True)
df_size = df_size[['Class','Train','Validation','Test']]
df_size

Unnamed: 0,Class,Train,Validation,Test
0,AD,636,73,65
1,CN,1286,273,283
2,MCI,2212,82,81
3,AD + CN,1922,346,348
4,MCI + CN,3498,355,364


In [172]:
# NOTE(review): cols_to_drop is not used in this cell — presumably consumed elsewhere
# or a leftover; kept so any later reference keeps working.
cols_to_drop = ['RID','SUBJECT','VISCODE','IMAGE_DATA_ID','DATASET',
                'MARRIED','DIVORCED','NEVER_MARRIED','COGTEST_SCORE_EBM','COGTEST_SCORE_LR','COGTEST_SCORE'] 

# De-duplication key spelled out here instead of read from the global `cols`, which
# other cells rebind (the MRI cell sets it to a list containing 'MACRO_GROUP'), so the
# cell gives the same result whatever was executed before it.
cog_id_cols = ['SUBJECT','IMAGE_DATA_ID','DATASET','DIAGNOSIS']
df_cogs = pd.concat([df_preds_ad,df_preds_mci]).drop_duplicates(subset=cog_id_cols)

# Mean / std / min / max of the continuous variables, rounded to two decimals.
(
    df_cogs[['AGE','YEARS_EDUCATION','CDRSB','MMSE']]
    .describe()
    .drop(['count','25%','50%','75%'])
    .T
    .round(2)
)

Unnamed: 0,mean,std,min,max
AGE,72.45,6.96,55.0,91.4
YEARS_EDUCATION,16.26,2.63,6.0,20.0
CDRSB,1.54,2.09,0.0,17.0
MMSE,27.5,2.89,6.0,30.0


In [173]:
# Number of records per value of each binary demographic flag.
flag_cols = ['MALE','WIDOWED','RACE_WHITE','RACE_BLACK','RACE_ASIAN','HISPANIC']

# Build the table in one step and sort the value index in descending order, so the row
# order no longer depends on which values happen to occur in the first column.
df_dist = (
    pd.DataFrame({flag: df_cogs[flag].value_counts() for flag in flag_cols})
    .sort_index(ascending=False)
)

# Positional labelling: assumes each flag takes exactly two values and that the larger
# one means "yes" (e.g. 1/0) — TODO confirm against the data.
df_dist.index = ['YES','NO']
df_dist = df_dist.T
df_dist

Unnamed: 0,YES,NO
MALE,2655,2336
WIDOWED,531,4460
RACE_WHITE,4601,390
RACE_BLACK,199,4792
RACE_ASIAN,86,4905
HISPANIC,192,4799


# Ensemble Datasets Class Distribution

In [174]:
# Per-class record counts of the ensemble datasets, one column per data split.
label = 'DIAGNOSIS'
cols = ['SUBJECT','IMAGE_DATA_ID','DATASET',label]

# .copy() makes each selection an independent frame so the assignments below do not
# raise pandas' SettingWithCopyWarning.
df_ensemble_ad = pd.read_csv(data_path+'PREDICTIONS_AD_ALL_SCORES_ENSEMBLE.csv')[cols].copy()
df_ensemble_mci = pd.read_csv(data_path+'PREDICTIONS_MCI_ALL_SCORES_ENSEMBLE.csv')[cols].copy()

# NOTE(review): the columns were already selected by their upper-case names above, so
# this upper-casing changes nothing; kept as-is pending a decision on whether the
# normalisation was meant to happen before the selection.
df_ensemble_ad.columns = df_ensemble_ad.columns.str.upper()
df_ensemble_mci.columns = df_ensemble_mci.columns.str.upper()

# Turn the binary targets into class names, then keep identical rows only once.
df_ensemble_ad[label] = df_ensemble_ad[label].replace({0:'CN',1:'AD'})
df_ensemble_mci[label] = df_ensemble_mci[label].replace({0:'CN',1:'MCI'})
df_ensemble = pd.concat([df_ensemble_ad,df_ensemble_mci]).drop_duplicates()

# Count IMAGE_DATA_ID per class (rows) and split (columns).
# NOTE(review): the renaming below is positional — it assumes exactly three DATASET
# values that sort as test < train < validation. TODO confirm against the data.
df_size = pd.pivot_table(df_ensemble.drop('SUBJECT',axis=1),index=[label],columns=['DATASET'],aggfunc='count').reset_index(drop=False)
df_size.columns = ['Class','Test','Train','Validation']

# Totals for the two binary tasks. .sum() also concatenates the 'Class' strings, so
# that entry is overwritten with a readable label.
total_ad = df_size.query("Class in ('CN','AD')").sum()
total_ad['Class'] = 'AD + CN'
total_mci = df_size.query("Class in ('CN','MCI')").sum()
total_mci['Class'] = 'MCI + CN'

# DataFrame.append was removed in pandas 2.0; add both total rows with a single concat.
# infer_objects() restores integer dtype on the count columns of the new rows.
df_totals = pd.DataFrame([total_ad,total_mci]).infer_objects()
df_size = pd.concat([df_size,df_totals],ignore_index=True)
df_size = df_size[['Class','Train','Validation','Test']]
df_size


Unnamed: 0,Class,Train,Validation,Test
0,AD,155,73,65
1,CN,554,273,283
2,MCI,161,82,81
3,AD + CN,709,346,348
4,MCI + CN,715,355,364


In [175]:
# Column sums over the three individual classes only; the combined 'AD + CN' and
# 'MCI + CN' rows are left out of the selection. The string column is summed as
# well, so the 'Class' entry is the concatenation of the class names.
df_size.loc[df_size['Class'].isin(['CN','MCI','AD'])].sum()

Class         ADCNMCI
Train             870
Validation        428
Test              429
dtype: object