# Applying ML to neurodevelopmental disorders detection

In [78]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

from problem import get_cv
from problem import get_all_data
from problem import split_data

from download_data import fetch_fmri_time_series

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.multiclass import unique_labels
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.utils.multiclass import unique_labels

from nilearn.connectome import ConnectivityMeasure

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading the data

We start by downloading the data for both datasets

In [71]:
# Load the feature table and the labels through the project's `get_all_data`
# helper (imported from `problem`).
data, labels = get_all_data()

In [74]:
# Preview the first rows of the loaded feature table.
data.head()

Unnamed: 0_level_0,participants_site,participants_sex,participants_age,participants_dataset,anatomy_lh_bankssts_area,anatomy_lh_caudalanteriorcingulate_area,anatomy_lh_caudalmiddlefrontal_area,anatomy_lh_cuneus_area,anatomy_lh_entorhinal_area,anatomy_lh_fusiform_area,...,fmri_basc064,fmri_basc122,fmri_basc197,fmri_craddock_scorr_mean,fmri_harvard_oxford_cort_prob_2mm,fmri_msdl,fmri_power_2011,fmri_motions,fmri_select,repetition_time
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adhd700021,204,F,11.17,adhd200,1124,743,2886,1535,589,3731,...,data/fmri_adhd/basc064/adhd700021/run_1/adhd70...,data/fmri_adhd/basc122/adhd700021/run_1/adhd70...,data/fmri_adhd/basc197/adhd700021/run_1/adhd70...,data/fmri_adhd/craddock_scorr_mean/adhd700021/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd700021/run_1/adhd70002...,data/fmri_adhd/power_2011/adhd700021/run_1/adh...,data/fmri_adhd/motions/adhd700021/run_1/motion...,1,2.0
adhd242402,204,F,13.24,adhd200,1039,595,1966,1568,411,2769,...,data/fmri_adhd/basc064/adhd242402/run_1/adhd24...,data/fmri_adhd/basc122/adhd242402/run_1/adhd24...,data/fmri_adhd/basc197/adhd242402/run_1/adhd24...,data/fmri_adhd/craddock_scorr_mean/adhd242402/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd242402/run_1/adhd24240...,data/fmri_adhd/power_2011/adhd242402/run_1/adh...,data/fmri_adhd/motions/adhd242402/run_1/motion...,1,2.0
adhd972340,204,F,13.75,adhd200,601,492,1554,1345,455,2452,...,data/fmri_adhd/basc064/adhd972340/run_1/adhd97...,data/fmri_adhd/basc122/adhd972340/run_1/adhd97...,data/fmri_adhd/basc197/adhd972340/run_1/adhd97...,data/fmri_adhd/craddock_scorr_mean/adhd972340/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd972340/run_1/adhd97234...,data/fmri_adhd/power_2011/adhd972340/run_1/adh...,data/fmri_adhd/motions/adhd972340/run_1/motion...,1,2.0
adhd055645,204,F,11.18,adhd200,699,521,1773,1251,341,2628,...,data/fmri_adhd/basc064/adhd055645/run_1/adhd05...,data/fmri_adhd/basc122/adhd055645/run_1/adhd05...,data/fmri_adhd/basc197/adhd055645/run_1/adhd05...,data/fmri_adhd/craddock_scorr_mean/adhd055645/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd055645/run_1/adhd05564...,data/fmri_adhd/power_2011/adhd055645/run_1/adh...,data/fmri_adhd/motions/adhd055645/run_1/motion...,1,2.0
adhd436785,204,F,11.41,adhd200,799,605,2564,1688,419,2508,...,data/fmri_adhd/basc064/adhd436785/run_1/adhd43...,data/fmri_adhd/basc122/adhd436785/run_1/adhd43...,data/fmri_adhd/basc197/adhd436785/run_1/adhd43...,data/fmri_adhd/craddock_scorr_mean/adhd436785/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd436785/run_1/adhd43678...,data/fmri_adhd/power_2011/adhd436785/run_1/adh...,data/fmri_adhd/motions/adhd436785/run_1/motion...,1,2.0


In [114]:
# `labels.size` counts the label entries (assumes one label per subject).
print('Number of subjects in the dataset: {}'.format(labels.size))

Number of subjects in the dataset: 1671


In [76]:
# Number of subjects contributed by each source dataset.
data['participants_dataset'].value_counts()

abide      1150
adhd200     521
Name: participants_dataset, dtype: int64

In [77]:
# One bar per source dataset, coloured by dataset.
fig = px.histogram(
    data,
    y="participants_dataset",
    color="participants_dataset",
    title="Datasets distribution",
)
fig.show()

Age distribution for each dataset

In [75]:
# Violin plot (with inner box plot and outlier points) of age per dataset.
fig = px.violin(
    data,
    x="participants_age",
    y="participants_dataset",
    color="participants_dataset",
    box=True,
    points="outliers",
    title="Age distribution for ABIDE and ADHD datasets",
    labels={"participants_dataset": "Dataset", "participants_age": "Age"},
)
# Hide the legend and place an x-axis tick every 5 units, starting at 0.
fig.update_layout(
    showlegend=False,
    xaxis=dict(tickmode='linear', tick0=0, dtick=5),
)

fig.show()

Splitting data to train and test

In [87]:
# `split_data` (imported from `problem`) returns the training pair first and
# the test pair second: (data, labels, data, labels).
data_train, labels_train, data_test, labels_test = split_data(data, labels)

In [101]:
# Size of the training split ("training tests" in the message was a typo).
print('Number of subjects in the training set: {}'.format(labels_train.size))

Number of subjects in the training tests: 1336


In [108]:
# Size of the held-out test split (the message previously said "training").
print('Number of subjects in the test set: {}'.format(labels_test.size))

Number of subjects in the training tests: 335


## Participant features

In [83]:
# Keep only the columns whose name starts with the 'participants' prefix.
data_train_participants = data_train[
    [name for name in data_train.columns if name.startswith('participants')]
]
data_train_participants.head()

Unnamed: 0_level_0,participants_site,participants_sex,participants_age,participants_dataset
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18405996504862694428,26,M,16.57,abide
adhd107198,203,F,20.31,adhd200
adhd440078,207,M,12.17,adhd200
adhd039483,208,M,14.08,adhd200
adhd725655,204,M,8.59,adhd200


## Structural MRI features

A set of structural features has been extracted for each subject: (i) normalized brain volume computed using subcortical segmentation of FreeSurfer and (ii) cortical thickness and area for right and left hemisphere of FreeSurfer.

In [84]:
# Keep only the columns whose name starts with the 'anatomy' prefix.
data_train_anatomy = data_train[
    [name for name in data_train.columns if name.startswith('anatomy')]
]
data_train_anatomy.head()

Unnamed: 0_level_0,anatomy_lh_bankssts_area,anatomy_lh_caudalanteriorcingulate_area,anatomy_lh_caudalmiddlefrontal_area,anatomy_lh_cuneus_area,anatomy_lh_entorhinal_area,anatomy_lh_fusiform_area,anatomy_lh_inferiorparietal_area,anatomy_lh_inferiortemporal_area,anatomy_lh_isthmuscingulate_area,anatomy_lh_lateraloccipital_area,...,anatomy_SupraTentorialVolNotVentVox,anatomy_MaskVol,anatomy_BrainSegVol.to.eTIV,anatomy_MaskVol.to.eTIV,anatomy_lhSurfaceHoles,anatomy_rhSurfaceHoles,anatomy_SurfaceHoles,anatomy_EstimatedTotalIntraCranialVol,anatomy_eTIV,anatomy_select
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18405996504862694428,1059,658,3516,1872,424,3640,4916,4018,987,6046,...,1183424,1791707,0.817225,1.084659,96,77,173,1651862.0,1651862.0,1
adhd107198,976,820,2438,1497,355,2854,4140,4065,991,5323,...,950767,1466196,0.785895,1.051924,43,38,81,1393823.0,1393823.0,1
adhd440078,1234,703,2584,1895,477,3141,4648,3738,1037,4961,...,1014120,1537783,0.757436,1.003438,39,32,71,1532514.0,1532514.0,1
adhd039483,1191,568,3001,1991,621,4043,5715,3709,984,7199,...,1168096,1794146,0.825342,1.100723,74,74,148,1629970.0,1629970.0,1
adhd725655,1082,468,2219,1515,326,3111,4342,3366,1030,5623,...,958172,1445812,0.856478,1.121018,55,67,122,1289731.0,1289731.0,1


Note that the column `anatomy_select` contains a label assigned during a manual quality check (i.e. `0` and `3` reject, `1` accept, `2` accept with reservations). This column can be used during training, for instance to exclude noisy data.

In [11]:
# First values of the anatomical quality-check flag.
data_train_anatomy.loc[:, 'anatomy_select'].head()

subject_id
adhd700021    1
adhd242402    1
adhd972340    1
adhd055645    1
adhd436785    1
Name: anatomy_select, dtype: int64

## Functional MRI features

In [12]:
# Keep only the columns whose name starts with the 'fmri' prefix.
data_train_functional = data_train[
    [name for name in data_train.columns if name.startswith('fmri')]
]
data_train_functional.head()

Unnamed: 0_level_0,fmri_basc064,fmri_basc122,fmri_basc197,fmri_craddock_scorr_mean,fmri_harvard_oxford_cort_prob_2mm,fmri_msdl,fmri_power_2011,fmri_motions,fmri_select
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
adhd700021,data/fmri_adhd/basc064/adhd700021/run_1/adhd70...,data/fmri_adhd/basc122/adhd700021/run_1/adhd70...,data/fmri_adhd/basc197/adhd700021/run_1/adhd70...,data/fmri_adhd/craddock_scorr_mean/adhd700021/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd700021/run_1/adhd70002...,data/fmri_adhd/power_2011/adhd700021/run_1/adh...,data/fmri_adhd/motions/adhd700021/run_1/motion...,1
adhd242402,data/fmri_adhd/basc064/adhd242402/run_1/adhd24...,data/fmri_adhd/basc122/adhd242402/run_1/adhd24...,data/fmri_adhd/basc197/adhd242402/run_1/adhd24...,data/fmri_adhd/craddock_scorr_mean/adhd242402/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd242402/run_1/adhd24240...,data/fmri_adhd/power_2011/adhd242402/run_1/adh...,data/fmri_adhd/motions/adhd242402/run_1/motion...,1
adhd972340,data/fmri_adhd/basc064/adhd972340/run_1/adhd97...,data/fmri_adhd/basc122/adhd972340/run_1/adhd97...,data/fmri_adhd/basc197/adhd972340/run_1/adhd97...,data/fmri_adhd/craddock_scorr_mean/adhd972340/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd972340/run_1/adhd97234...,data/fmri_adhd/power_2011/adhd972340/run_1/adh...,data/fmri_adhd/motions/adhd972340/run_1/motion...,1
adhd055645,data/fmri_adhd/basc064/adhd055645/run_1/adhd05...,data/fmri_adhd/basc122/adhd055645/run_1/adhd05...,data/fmri_adhd/basc197/adhd055645/run_1/adhd05...,data/fmri_adhd/craddock_scorr_mean/adhd055645/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd055645/run_1/adhd05564...,data/fmri_adhd/power_2011/adhd055645/run_1/adh...,data/fmri_adhd/motions/adhd055645/run_1/motion...,1
adhd436785,data/fmri_adhd/basc064/adhd436785/run_1/adhd43...,data/fmri_adhd/basc122/adhd436785/run_1/adhd43...,data/fmri_adhd/basc197/adhd436785/run_1/adhd43...,data/fmri_adhd/craddock_scorr_mean/adhd436785/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd436785/run_1/adhd43678...,data/fmri_adhd/power_2011/adhd436785/run_1/adh...,data/fmri_adhd/motions/adhd436785/run_1/motion...,1


Unlike the anatomical and participant data, the available fMRI data are paths to CSV files in which the time series are stored. We show in the next section how to read and extract meaningful information from those files.

Similarly to the anatomical data, the column `fmri_select` gives information about the manual quality check.

In [13]:
# First values of the fMRI quality-check flag.
data_train_functional.loc[:, 'fmri_select'].head()

subject_id
adhd700021    1
adhd242402    1
adhd972340    1
adhd055645    1
adhd436785    1
Name: fmri_select, dtype: int64

## Evaluation function

The framework is evaluated with a cross-validation approach. The metrics used are the area under the ROC curve (ROC-AUC) and the accuracy.

In [91]:
def evaluation(X, y, feature_extractor=None, classifier=None):
    """Cross-validate a feature-extractor + classifier pipeline.

    Parameters
    ----------
    X : DataFrame-like
        Input table handed to the feature extractor.
    y : array-like
        Target labels.
    feature_extractor : transformer instance, optional
        First step of the pipeline. When omitted, the ``FeatureExtractor``
        class currently defined in the notebook is instantiated, which is the
        original behaviour. Passing it explicitly avoids depending on which
        definition cell was executed last.
    classifier : estimator instance, optional
        Final step of the pipeline. When omitted, the ``Classifier`` class
        currently defined in the notebook is instantiated.

    Returns
    -------
    dict
        The ``cross_validate`` result, holding train and test scores for the
        ``roc_auc`` and ``accuracy`` scorers (keys such as ``train_roc_auc``
        and ``test_accuracy``).
    """
    if feature_extractor is None:
        feature_extractor = FeatureExtractor()
    if classifier is None:
        classifier = Classifier()

    pipe = make_pipeline(feature_extractor, classifier)
    # The splitting strategy is delegated to the project's `get_cv` helper.
    cv = get_cv(X, y)
    results = cross_validate(pipe, X, y, scoring=('roc_auc', 'accuracy'), cv=cv,
                             verbose=1, return_train_score=True,
                             n_jobs=1)

    return results

## Using only anatomical features

#### FeatureExtractor

The available structural data can be used directly for classification. To this end, we use a feature extractor (i.e. `FeatureExtractor`) that selects only the anatomical features, dropping any information regarding the fMRI-based features.

In [93]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the anatomical columns."""

    def fit(self, X_df, y=None):
        """Nothing to learn; return ``self``.

        ``y`` is optional so that the transformer can also be fitted without
        labels (e.g. through ``fit_transform(X)``).
        """
        return self

    def transform(self, X_df):
        """Return the ``anatomy*`` columns of ``X_df`` without ``anatomy_select``."""
        # get only the anatomical information
        X = X_df[[col for col in X_df.columns if col.startswith('anatomy')]]
        # `anatomy_select` also matches the prefix but is dropped from the features
        return X.drop(columns='anatomy_select')

#### Classifier

We propose to use a logistic regression classifier preceded by a scaler, which removes the mean and scales each feature to unit variance using statistics computed on the training set.

In [94]:
class Classifier(BaseEstimator):
    """Standardise the features, then fit a logistic regression."""

    def __init__(self):
        scaler = StandardScaler()
        logistic = LogisticRegression(solver='lbfgs', max_iter=500)
        self.clf = make_pipeline(scaler, logistic)

    def fit(self, X, y):
        """Fit the scaler + logistic regression and record the class labels."""
        self.clf.fit(X, y)
        self.classes_ = unique_labels(y)
        return self

    def predict(self, X):
        """Return the predicted class label for each row of ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return the predicted class probabilities for each row of ``X``."""
        return self.clf.predict_proba(X)


We can test our pipeline locally using the `evaluation` function that we defined earlier.

In [96]:
# NOTE(review): cross-validation runs on the full `data`/`labels`, not on the
# `data_train` split created earlier — confirm this is intended.
results_anat = evaluation(data, labels)

# Mean and standard deviation of each score across the cross-validation folds.
print("\n".join([
    "Training score ROC-AUC: {:.3f} +- {:.3f}".format(
        np.mean(results_anat['train_roc_auc']),
        np.std(results_anat['train_roc_auc'])),
    "Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(
        np.mean(results_anat['test_roc_auc']),
        np.std(results_anat['test_roc_auc'])),
    "Training score accuracy: {:.3f} +- {:.3f}".format(
        np.mean(results_anat['train_accuracy']),
        np.std(results_anat['train_accuracy'])),
    "Validation score accuracy: {:.3f} +- {:.3f}".format(
        np.mean(results_anat['test_accuracy']),
        np.std(results_anat['test_accuracy'])),
]))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training score ROC-AUC: 0.800 +- 0.006
Validation score ROC-AUC: 0.661 +- 0.030 

Training score accuracy: 0.726 +- 0.008
Validation score accuracy: 0.632 +- 0.027


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s finished


## fMRI-derived features

In [97]:
def _load_fmri(fmri_filenames):
    """Load time-series extracted from the fMRI using a specific atlas."""
    return np.array([pd.read_csv(subject_filename,
                                 header=None).values
                     for subject_filename in fmri_filenames])


class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Turn the MSDL-atlas time series of each subject into connectome features."""

    def __init__(self):
        # make a transformer which will load the time series and compute the
        # connectome matrix
        self.transformer_fmri = make_pipeline(
            FunctionTransformer(func=_load_fmri, validate=False),
            ConnectivityMeasure(kind='tangent', vectorize=True))

    def fit(self, X_df, y=None):
        """Fit the connectivity transformer on the ``fmri_msdl`` files.

        ``y`` is optional so that the transformer can also be fitted without
        labels; it is simply forwarded to the inner pipeline.
        """
        # get only the time series for the MSDL atlas
        fmri_filenames = X_df['fmri_msdl']
        self.transformer_fmri.fit(fmri_filenames, y)
        return self

    def transform(self, X_df):
        """Return the vectorized connectivity features of the ``fmri_msdl`` files."""
        fmri_filenames = X_df['fmri_msdl']
        return self.transformer_fmri.transform(fmri_filenames)


In [98]:
class Classifier(BaseEstimator):
    """Standardise the features, then fit a logistic regression with ``C=1``."""

    def __init__(self):
        scaler = StandardScaler()
        logistic = LogisticRegression(C=1.)
        self.clf = make_pipeline(scaler, logistic)

    def fit(self, X, y):
        """Fit the scaler + logistic regression and record the class labels."""
        self.clf.fit(X, y)
        self.classes_ = unique_labels(y)
        return self

    def predict(self, X):
        """Return the predicted class label for each row of ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return the predicted class probabilities for each row of ``X``."""
        return self.clf.predict_proba(X)


In [100]:
# NOTE(review): cross-validation runs on the full `data`/`labels`, not on the
# `data_train` split created earlier — confirm this is intended.
results = evaluation(data, labels)

# Mean and standard deviation of each score across the cross-validation folds.
print("\n".join([
    "Training score ROC-AUC: {:.3f} +- {:.3f}".format(
        np.mean(results['train_roc_auc']),
        np.std(results['train_roc_auc'])),
    "Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(
        np.mean(results['test_roc_auc']),
        np.std(results['test_roc_auc'])),
    "Training score accuracy: {:.3f} +- {:.3f}".format(
        np.mean(results['train_accuracy']),
        np.std(results['train_accuracy'])),
    "Validation score accuracy: {:.3f} +- {:.3f}".format(
        np.mean(results['test_accuracy']),
        np.std(results['test_accuracy'])),
]))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training score ROC-AUC: 1.000 +- 0.000
Validation score ROC-AUC: 0.599 +- 0.037 

Training score accuracy: 1.000 +- 0.001
Validation score accuracy: 0.584 +- 0.025


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.4min finished


## More elaborate pipeline: combining anatomy and fMRI

In [109]:
def _load_fmri(fmri_filenames):
    """Load time-series extracted from the fMRI using a specific atlas."""
    return np.array([pd.read_csv(subject_filename,
                                 header=None).values
                     for subject_filename in fmri_filenames])


class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Build one table holding both connectome and anatomical features."""

    def __init__(self):
        # make a transformer which will load the time series and compute the
        # connectome matrix
        self.transformer_fmri = make_pipeline(
            FunctionTransformer(func=_load_fmri, validate=False),
            ConnectivityMeasure(kind='tangent', vectorize=True))

    def fit(self, X_df, y=None):
        """Fit the connectivity transformer on the ``fmri_msdl`` files.

        ``y`` is optional so that the transformer can also be fitted without
        labels; it is simply forwarded to the inner pipeline.
        """
        fmri_filenames = X_df['fmri_msdl']
        self.transformer_fmri.fit(fmri_filenames, y)
        return self

    def transform(self, X_df):
        """Return a DataFrame of ``connectome_<i>`` and ``anatomy*`` columns.

        The result is indexed like ``X_df``; the ``anatomy_select`` column is
        dropped from the anatomical part.
        """
        fmri_filenames = X_df['fmri_msdl']
        X_connectome = self.transformer_fmri.transform(fmri_filenames)
        # Reuse the input index so both parts align row-wise when concatenated.
        X_connectome = pd.DataFrame(X_connectome, index=X_df.index)
        X_connectome.columns = ['connectome_{}'.format(i)
                                for i in range(X_connectome.columns.size)]
        # get the anatomical information
        X_anatomy = X_df[[col for col in X_df.columns
                          if col.startswith('anatomy')]]
        X_anatomy = X_anatomy.drop(columns='anatomy_select')
        # concatenate both matrices
        return pd.concat([X_connectome, X_anatomy], axis=1)


We will create a classifier (i.e. a random forest classifier) which will use both connectome and anatomical features.

In [110]:
class Classifier(BaseEstimator):
    """Random forest fitted on the combined connectome + anatomical features."""

    def __init__(self):
        # A fixed seed makes the forest, and therefore the reported scores,
        # reproducible from one run of the notebook to the next.
        self.clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                          random_state=42)

    def fit(self, X, y):
        """Fit the forest and record the class labels."""
        self.clf.fit(X, y)
        self.classes_ = unique_labels(y)
        return self

    def predict(self, X):
        """Return the predicted class label for each row of ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return the predicted class probabilities for each row of ``X``."""
        return self.clf.predict_proba(X)


In [111]:
# NOTE(review): cross-validation runs on the full `data`/`labels`, not on the
# `data_train` split created earlier — confirm this is intended.
results = evaluation(data, labels)

# Mean and standard deviation of each score across the cross-validation folds.
print("\n".join([
    "Training score ROC-AUC: {:.3f} +- {:.3f}".format(
        np.mean(results['train_roc_auc']),
        np.std(results['train_roc_auc'])),
    "Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(
        np.mean(results['test_roc_auc']),
        np.std(results['test_roc_auc'])),
    "Training score accuracy: {:.3f} +- {:.3f}".format(
        np.mean(results['train_accuracy']),
        np.std(results['train_accuracy'])),
    "Validation score accuracy: {:.3f} +- {:.3f}".format(
        np.mean(results['test_accuracy']),
        np.std(results['test_accuracy'])),
]))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training score ROC-AUC: 1.000 +- 0.000
Validation score ROC-AUC: 0.678 +- 0.017 

Training score accuracy: 1.000 +- 0.000
Validation score accuracy: 0.627 +- 0.020


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.7min finished


We can propose a more complex classifier than the previous one. We will train two single classifiers independently on the sMRI-derived and fMRI-derived features. Then, a meta-classifier will be used to combine both sources of information. We leave out some data to be able to train the meta-classifier.

In [112]:
class Classifier(BaseEstimator):
    """Two-level stacked classifier over connectome and anatomical features.

    Two scaled logistic regressions are fitted separately on the
    ``connectome*`` and ``anatomy*`` columns. A third logistic regression (the
    meta-classifier) is fitted on their concatenated class probabilities,
    computed on a held-out part of the data passed to ``fit``.
    """

    def __init__(self):
        self.clf_connectome = make_pipeline(StandardScaler(),
                                            LogisticRegression(C=1.))
        self.clf_anatomy = make_pipeline(StandardScaler(),
                                         LogisticRegression(C=1.))
        self.meta_clf = LogisticRegression(C=1.)

    @staticmethod
    def _split_features(X):
        """Return the ``(connectome, anatomy)`` column subsets of ``X``."""
        X_connectome = X[[col for col in X.columns
                          if col.startswith('connectome')]]
        X_anatomy = X[[col for col in X.columns if col.startswith('anatomy')]]
        return X_connectome, X_anatomy

    def _meta_features(self, X):
        """Stack both base classifiers' probabilities (connectome first)."""
        X_connectome, X_anatomy = self._split_features(X)
        y_connectome_pred = self.clf_connectome.predict_proba(X_connectome)
        y_anatomy_pred = self.clf_anatomy.predict_proba(X_anatomy)
        return np.concatenate([y_connectome_pred, y_anatomy_pred], axis=1)

    def fit(self, X, y):
        """Fit the base classifiers on 67% of the rows and the meta-classifier
        on the remaining 33%.

        Parameters
        ----------
        X : pandas.DataFrame
            Table with ``connectome*`` and ``anatomy*`` columns.
        y : array-like
            Target labels, one per row of ``X``.
        """
        # The indices below are positional; coercing to an array keeps
        # `y[indices]` positional even if a pandas Series is passed in.
        y = np.asarray(y)
        train_idx, validation_idx = train_test_split(range(y.size),
                                                     test_size=0.33,
                                                     shuffle=True,
                                                     random_state=42)
        X_train, X_validation = X.iloc[train_idx], X.iloc[validation_idx]
        y_train, y_validation = y[train_idx], y[validation_idx]

        X_connectome_train, X_anatomy_train = self._split_features(X_train)
        self.clf_connectome.fit(X_connectome_train, y_train)
        self.clf_anatomy.fit(X_anatomy_train, y_train)

        # The meta-classifier only sees predictions made on rows that the
        # base classifiers were not fitted on.
        self.meta_clf.fit(self._meta_features(X_validation), y_validation)
        self.classes_ = unique_labels(y)
        return self

    def predict(self, X):
        """Return the meta-classifier's predicted label for each row of ``X``."""
        return self.meta_clf.predict(self._meta_features(X))

    def predict_proba(self, X):
        """Return the meta-classifier's class probabilities for each row of ``X``."""
        return self.meta_clf.predict_proba(self._meta_features(X))


In [113]:
# NOTE(review): cross-validation runs on the full `data`/`labels`, not on the
# `data_train` split created earlier — confirm this is intended.
results = evaluation(data, labels)

# Mean and standard deviation of each score across the cross-validation folds.
print("\n".join([
    "Training score ROC-AUC: {:.3f} +- {:.3f}".format(
        np.mean(results['train_roc_auc']),
        np.std(results['train_roc_auc'])),
    "Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(
        np.mean(results['test_roc_auc']),
        np.std(results['test_roc_auc'])),
    "Training score accuracy: {:.3f} +- {:.3f}".format(
        np.mean(results['train_accuracy']),
        np.std(results['train_accuracy'])),
    "Validation score accuracy: {:.3f} +- {:.3f}".format(
        np.mean(results['test_accuracy']),
        np.std(results['test_accuracy'])),
]))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training score ROC-AUC: 0.890 +- 0.024
Validation score ROC-AUC: 0.660 +- 0.046 

Training score accuracy: 0.806 +- 0.037
Validation score accuracy: 0.618 +- 0.025


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.6min finished
