# Applying ML to neurodevelopmental discorders detection

In [1]:
from problem import get_cv
from problem import get_all_data

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.svm import SVC
from sklearn.utils.multiclass import unique_labels
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV, cross_validate
from sklearn.utils.multiclass import unique_labels

from nilearn.connectome import ConnectivityMeasure

## Loading the data

We start by downloading the data for both datasets

In [2]:
data, labels = get_all_data()

In [3]:
data.head()

Unnamed: 0_level_0,participants_site,participants_sex,participants_age,participants_dx,participants_dataset,anatomy_lh_bankssts_area,anatomy_lh_caudalanteriorcingulate_area,anatomy_lh_caudalmiddlefrontal_area,anatomy_lh_cuneus_area,anatomy_lh_entorhinal_area,...,fmri_basc197,fmri_basc325,fmri_basc444,fmri_craddock_scorr_mean,fmri_harvard_oxford_cort_prob_2mm,fmri_msdl,fmri_power_2011,fmri_motions,fmri_select,repetition_time
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adhd700021,204,F,11.17,1,adhd200,1124,743,2886,1535,589,...,data/fmri_adhd/basc197/adhd700021/run_1/adhd70...,data/fmri_adhd/basc325/adhd700021/run_1/adhd70...,data/fmri_adhd/basc444/adhd700021/run_1/adhd70...,data/fmri_adhd/craddock_scorr_mean/adhd700021/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd700021/run_1/adhd70002...,data/fmri_adhd/power_2011/adhd700021/run_1/adh...,data/fmri_adhd/motions/adhd700021/run_1/motion...,1,2.0
adhd242402,204,F,13.24,1,adhd200,1039,595,1966,1568,411,...,data/fmri_adhd/basc197/adhd242402/run_1/adhd24...,data/fmri_adhd/basc325/adhd242402/run_1/adhd24...,data/fmri_adhd/basc444/adhd242402/run_1/adhd24...,data/fmri_adhd/craddock_scorr_mean/adhd242402/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd242402/run_1/adhd24240...,data/fmri_adhd/power_2011/adhd242402/run_1/adh...,data/fmri_adhd/motions/adhd242402/run_1/motion...,1,2.0
adhd972340,204,F,13.75,0,adhd200,601,492,1554,1345,455,...,data/fmri_adhd/basc197/adhd972340/run_1/adhd97...,data/fmri_adhd/basc325/adhd972340/run_1/adhd97...,data/fmri_adhd/basc444/adhd972340/run_1/adhd97...,data/fmri_adhd/craddock_scorr_mean/adhd972340/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd972340/run_1/adhd97234...,data/fmri_adhd/power_2011/adhd972340/run_1/adh...,data/fmri_adhd/motions/adhd972340/run_1/motion...,1,2.0
adhd055645,204,F,11.18,0,adhd200,699,521,1773,1251,341,...,data/fmri_adhd/basc197/adhd055645/run_1/adhd05...,data/fmri_adhd/basc325/adhd055645/run_1/adhd05...,data/fmri_adhd/basc444/adhd055645/run_1/adhd05...,data/fmri_adhd/craddock_scorr_mean/adhd055645/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd055645/run_1/adhd05564...,data/fmri_adhd/power_2011/adhd055645/run_1/adh...,data/fmri_adhd/motions/adhd055645/run_1/motion...,1,2.0
adhd436785,204,F,11.41,1,adhd200,799,605,2564,1688,419,...,data/fmri_adhd/basc197/adhd436785/run_1/adhd43...,data/fmri_adhd/basc325/adhd436785/run_1/adhd43...,data/fmri_adhd/basc444/adhd436785/run_1/adhd43...,data/fmri_adhd/craddock_scorr_mean/adhd436785/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd436785/run_1/adhd43678...,data/fmri_adhd/power_2011/adhd436785/run_1/adh...,data/fmri_adhd/motions/adhd436785/run_1/motion...,1,2.0


In [4]:
print('Number of subjects in the dataset: {}'.format(labels.size))

Number of subjects in the dataset: 1671


In [5]:
# Number of subjects for each dataset
data.participants_dataset.value_counts()

participants_dataset
abide      1150
adhd200     521
Name: count, dtype: int64

In [6]:
# Number of control vs neurodevelopmental disorder subjects for merged data
data.participants_dx.value_counts()

participants_dx
0    939
1    732
Name: count, dtype: int64

In [7]:
# Number of control vs neurodevelopmental disorder subjects for ADHD data
adhd = data[data['participants_dataset'] == 'adhd200']
adhd.participants_dx.value_counts()

participants_dx
0    338
1    183
Name: count, dtype: int64

In [8]:
# Number of control vs neurodevelopmental disorder subjects for ABIDE data
abide = data[data['participants_dataset'] == 'abide']
abide.participants_dx.value_counts()

participants_dx
0    601
1    549
Name: count, dtype: int64

## Participant features

In [9]:
data_participants = data[[col for col in data.columns if col.startswith('participants')]]
data_participants.head()

Unnamed: 0_level_0,participants_site,participants_sex,participants_age,participants_dx,participants_dataset
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
adhd700021,204,F,11.17,1,adhd200
adhd242402,204,F,13.24,1,adhd200
adhd972340,204,F,13.75,0,adhd200
adhd055645,204,F,11.18,0,adhd200
adhd436785,204,F,11.41,1,adhd200


## Structural MRI features

A set of structural features have been extracted for each subject: (i) normalized brain volume computed using subcortical segmentation of FreeSurfer and (ii) cortical thickness and area for right and left hemisphere of FreeSurfer.

In [10]:
data_anatomy = data[[col for col in data.columns if col.startswith('anatomy')]]
data_anatomy.head()

Unnamed: 0_level_0,anatomy_lh_bankssts_area,anatomy_lh_caudalanteriorcingulate_area,anatomy_lh_caudalmiddlefrontal_area,anatomy_lh_cuneus_area,anatomy_lh_entorhinal_area,anatomy_lh_fusiform_area,anatomy_lh_inferiorparietal_area,anatomy_lh_inferiortemporal_area,anatomy_lh_isthmuscingulate_area,anatomy_lh_lateraloccipital_area,...,anatomy_SupraTentorialVolNotVentVox,anatomy_MaskVol,anatomy_BrainSegVol.to.eTIV,anatomy_MaskVol.to.eTIV,anatomy_lhSurfaceHoles,anatomy_rhSurfaceHoles,anatomy_SurfaceHoles,anatomy_EstimatedTotalIntraCranialVol,anatomy_eTIV,anatomy_select
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adhd700021,1124,743,2886,1535,589,3731,4220,3491,890,4989,...,1034028,1544179,0.81054,1.055481,46,51,97,1463009.0,1463009.0,1
adhd242402,1039,595,1966,1568,411,2769,4235,2698,885,5246,...,942270,1413593,0.819411,1.071676,61,70,131,1319049.0,1319049.0,1
adhd972340,601,492,1554,1345,455,2452,3761,2273,837,3741,...,788484,1279934,0.76916,1.063212,91,61,152,1203837.0,1203837.0,1
adhd055645,699,521,1773,1251,341,2628,4026,2882,746,3951,...,823592,1303815,0.806114,1.079301,42,42,84,1208018.0,1208018.0,1
adhd436785,799,605,2564,1688,419,2508,5048,3616,1215,5496,...,1023783,1532056,0.843271,1.094796,59,81,140,1399398.0,1399398.0,1


Note that the column `anatomy_select` contain a label affected during a manual quality check (i.e. `0` and `3` reject, `1` accept, `2` accept with reserve). This column can be used during training to exclude noisy data for instance.

In [11]:
data_anatomy['anatomy_select'].head()

subject_id
adhd700021    1
adhd242402    1
adhd972340    1
adhd055645    1
adhd436785    1
Name: anatomy_select, dtype: int64

## Functional MRI features

In [12]:
data_functional = data[[col for col in data.columns if col.startswith('fmri')]]
data_functional.head()

Unnamed: 0_level_0,fmri_basc064,fmri_basc122,fmri_basc197,fmri_basc325,fmri_basc444,fmri_craddock_scorr_mean,fmri_harvard_oxford_cort_prob_2mm,fmri_msdl,fmri_power_2011,fmri_motions,fmri_select
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
adhd700021,data/fmri_adhd/basc064/adhd700021/run_1/adhd70...,data/fmri_adhd/basc122/adhd700021/run_1/adhd70...,data/fmri_adhd/basc197/adhd700021/run_1/adhd70...,data/fmri_adhd/basc325/adhd700021/run_1/adhd70...,data/fmri_adhd/basc444/adhd700021/run_1/adhd70...,data/fmri_adhd/craddock_scorr_mean/adhd700021/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd700021/run_1/adhd70002...,data/fmri_adhd/power_2011/adhd700021/run_1/adh...,data/fmri_adhd/motions/adhd700021/run_1/motion...,1
adhd242402,data/fmri_adhd/basc064/adhd242402/run_1/adhd24...,data/fmri_adhd/basc122/adhd242402/run_1/adhd24...,data/fmri_adhd/basc197/adhd242402/run_1/adhd24...,data/fmri_adhd/basc325/adhd242402/run_1/adhd24...,data/fmri_adhd/basc444/adhd242402/run_1/adhd24...,data/fmri_adhd/craddock_scorr_mean/adhd242402/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd242402/run_1/adhd24240...,data/fmri_adhd/power_2011/adhd242402/run_1/adh...,data/fmri_adhd/motions/adhd242402/run_1/motion...,1
adhd972340,data/fmri_adhd/basc064/adhd972340/run_1/adhd97...,data/fmri_adhd/basc122/adhd972340/run_1/adhd97...,data/fmri_adhd/basc197/adhd972340/run_1/adhd97...,data/fmri_adhd/basc325/adhd972340/run_1/adhd97...,data/fmri_adhd/basc444/adhd972340/run_1/adhd97...,data/fmri_adhd/craddock_scorr_mean/adhd972340/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd972340/run_1/adhd97234...,data/fmri_adhd/power_2011/adhd972340/run_1/adh...,data/fmri_adhd/motions/adhd972340/run_1/motion...,1
adhd055645,data/fmri_adhd/basc064/adhd055645/run_1/adhd05...,data/fmri_adhd/basc122/adhd055645/run_1/adhd05...,data/fmri_adhd/basc197/adhd055645/run_1/adhd05...,data/fmri_adhd/basc325/adhd055645/run_1/adhd05...,data/fmri_adhd/basc444/adhd055645/run_1/adhd05...,data/fmri_adhd/craddock_scorr_mean/adhd055645/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd055645/run_1/adhd05564...,data/fmri_adhd/power_2011/adhd055645/run_1/adh...,data/fmri_adhd/motions/adhd055645/run_1/motion...,1
adhd436785,data/fmri_adhd/basc064/adhd436785/run_1/adhd43...,data/fmri_adhd/basc122/adhd436785/run_1/adhd43...,data/fmri_adhd/basc197/adhd436785/run_1/adhd43...,data/fmri_adhd/basc325/adhd436785/run_1/adhd43...,data/fmri_adhd/basc444/adhd436785/run_1/adhd43...,data/fmri_adhd/craddock_scorr_mean/adhd436785/...,data/fmri_adhd/harvard_oxford_cort_prob_2mm/ad...,data/fmri_adhd/msdl/adhd436785/run_1/adhd43678...,data/fmri_adhd/power_2011/adhd436785/run_1/adh...,data/fmri_adhd/motions/adhd436785/run_1/motion...,1


Unlike the anatomical and participants data, the available data are filename to CSV files in which the time-series information are stored. We show in the next section how to read and extract meaningful information from those data.

Similarly to the anatomical data, the column `fmri_select` gives information about the manual quality check.

In [13]:
data_functional['fmri_select'].head()

subject_id
adhd700021    1
adhd242402    1
adhd972340    1
adhd055645    1
adhd436785    1
Name: fmri_select, dtype: int64

## Evaluations functions 

The framework is evaluated with a cross-validation approach. The metrics used are the AUC under the ROC and the accuracy.

In [14]:
def evaluation(X, y):
    pipe = make_pipeline(FeatureExtractor(), Classifier())
    cv = get_cv(X, y)
    results = cross_validate(pipe, X, y, scoring=('roc_auc', 'accuracy'), cv=cv,
                             verbose=1, return_train_score=True,
                             n_jobs=1)
    
    return results

## Using only anatomical features

#### FeatureExtractor

The available structural data can be used directly to make some classification. In this regard, we will use a feature extractor (i.e. `FeatureExtractor`). This extractor will only select only the anatomical features, dropping any information regarding the fMRI-based features.

In [15]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X_df, y):
        return self

    def transform(self, X_df):
        # get only the anatomical information
        X = X_df[[col for col in X_df.columns if col.startswith('anatomy')]]
        return X.drop(columns='anatomy_select')

#### Classifier

We propose to use a logistic classifier preceded from a scaler which will remove the mean and standard deviation computed on the training set.

In [16]:
class Classifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.clf = make_pipeline(StandardScaler(), LogisticRegression(C=1., solver='liblinear'))

    def fit(self, X, y):
        self.clf.fit(X, y)
        self.classes_ = unique_labels(y)
        return self
        
    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)


We can test locally our pipeline using `evaluation` function that we defined earlier.

In [17]:
# splitting data in train and test sets 
X_train, X_test, y_train, y_test = train_test_split(data,labels, 
                                                    test_size=0.3, 
                                                    random_state=101) 

# creating a pipe using the make_pipeline method 
pipe = make_pipeline(FeatureExtractor(), 
                     Classifier()) 
  
#fitting data into the model 
pipe.fit(X_train, y_train)
  
# predicting values 
y_pred = pipe.predict(X_test) 
  
# calculating accuracy score 
accuracy = accuracy_score(y_pred,y_test)
roc_auc = roc_auc_score(y_pred,y_test)
print('accuracy score : ',accuracy)
print('roc_auc score : ',roc_auc)

accuracy score :  0.6095617529880478
roc_auc score :  0.6023325437557573


In [18]:
results_anat = evaluation(data, labels)

print("Training score ROC-AUC: {:.3f} +- {:.3f}".format(np.mean(results_anat['train_roc_auc']),
                                                        np.std(results_anat['train_roc_auc'])))
print("Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(np.mean(results_anat['test_roc_auc']),
                                                          np.std(results_anat['test_roc_auc'])))

print("Training score accuracy: {:.3f} +- {:.3f}".format(np.mean(results_anat['train_accuracy']),
                                                         np.std(results_anat['train_accuracy'])))
print("Validation score accuracy: {:.3f} +- {:.3f}".format(np.mean(results_anat['test_accuracy']),
                                                           np.std(results_anat['test_accuracy'])))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training score ROC-AUC: 0.800 +- 0.006
Validation score ROC-AUC: 0.661 +- 0.030 

Training score accuracy: 0.726 +- 0.008
Validation score accuracy: 0.632 +- 0.027


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.0s finished


## fMRI-derived features

In [19]:
def _load_fmri(fmri_filenames):
    """Load time-series extracted from the fMRI using a specific atlas."""
    return np.array([pd.read_csv(subject_filename,
                                 header=None).values
                     for subject_filename in fmri_filenames], dtype=object)


class FeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        # make a transformer which will load the time series and compute the
        # connectome matrix
        self.transformer_fmri = make_pipeline(
            FunctionTransformer(func=_load_fmri, validate=False),
            ConnectivityMeasure(kind='tangent', vectorize=True))
        
    def fit(self, X_df, y):
        # get only the time series for the MSDL atlas
        fmri_filenames = X_df['fmri_msdl']
        self.transformer_fmri.fit(fmri_filenames, y)
        return self

    def transform(self, X_df):
        fmri_filenames = X_df['fmri_msdl']
        return self.transformer_fmri.transform(fmri_filenames)


In [20]:
class Classifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.clf = make_pipeline(StandardScaler(), LogisticRegression(C=1., solver='liblinear'))

    def fit(self, X, y):
        self.clf.fit(X, y)
        self.classes_ = unique_labels(y)
        return self
       
    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)


In [21]:
results = evaluation(data, labels)

print("Training score ROC-AUC: {:.3f} +- {:.3f}".format(np.mean(results['train_roc_auc']),
                                                        np.std(results['train_roc_auc'])))
print("Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(np.mean(results['test_roc_auc']),
                                                          np.std(results['test_roc_auc'])))

print("Training score accuracy: {:.3f} +- {:.3f}".format(np.mean(results['train_accuracy']),
                                                         np.std(results['train_accuracy'])))
print("Validation score accuracy: {:.3f} +- {:.3f}".format(np.mean(results['test_accuracy']),
                                                           np.std(results['test_accuracy'])))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training score ROC-AUC: 1.000 +- 0.000
Validation score ROC-AUC: 0.598 +- 0.037 

Training score accuracy: 1.000 +- 0.001
Validation score accuracy: 0.584 +- 0.025


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.9min finished


## More elaborate pipeline: combining anatomy and fMRI

In [22]:
def _load_fmri(fmri_filenames):
    """Load time-series extracted from the fMRI using a specific atlas."""
    return np.array([pd.read_csv(subject_filename,
                                 header=None).values
                     for subject_filename in fmri_filenames], dtype=object)


class FeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        # make a transformer which will load the time series and compute the
        # connectome matrix
        self.transformer_fmri = make_pipeline(
            FunctionTransformer(func=_load_fmri, validate=False),
            ConnectivityMeasure(kind='tangent', vectorize=True))
    
    def fit(self, X_df, y):
        fmri_filenames = X_df['fmri_msdl']
        self.transformer_fmri.fit(fmri_filenames, y)
        return self

    def transform(self, X_df):
        fmri_filenames = X_df['fmri_msdl']
        X_connectome = self.transformer_fmri.transform(fmri_filenames)
        X_connectome = pd.DataFrame(X_connectome, index=X_df.index)
        X_connectome.columns = ['connectome_{}'.format(i)
                                for i in range(X_connectome.columns.size)]
        # get the anatomical information
        X_anatomy = X_df[[col for col in X_df.columns
                          if col.startswith('anatomy')]]
        X_anatomy = X_anatomy.drop(columns='anatomy_select')
        # concatenate both matrices
        return pd.concat([X_connectome, X_anatomy], axis=1)


We will create a classifier (i.e. a random forest classifier) which will used both connectome and anatomical features.

In [23]:
class Classifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    def fit(self, X, y):
        self.clf.fit(X, y)
        self.classes_ = unique_labels(y)
        return self
    
    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)


In [24]:
results = evaluation(data, labels)

print("Training score ROC-AUC: {:.3f} +- {:.3f}".format(np.mean(results['train_roc_auc']),
                                                        np.std(results['train_roc_auc'])))
print("Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(np.mean(results['test_roc_auc']),
                                                          np.std(results['test_roc_auc'])))

print("Training score accuracy: {:.3f} +- {:.3f}".format(np.mean(results['train_accuracy']),
                                                         np.std(results['train_accuracy'])))
print("Validation score accuracy: {:.3f} +- {:.3f}".format(np.mean(results['test_accuracy']),
                                                           np.std(results['test_accuracy'])))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training score ROC-AUC: 1.000 +- 0.000
Validation score ROC-AUC: 0.672 +- 0.018 

Training score accuracy: 1.000 +- 0.000
Validation score accuracy: 0.613 +- 0.014


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.0min finished


We can propose a
more complex classifier than the previous one. We will train 2 single classifiers independetly on the sMRI-derived and fMRI-derived features. Then, a meta-classifier will be used to combine both information. We left out some data to be able to train the meta-classifier.

In [25]:
class Classifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.clf_connectome = make_pipeline(StandardScaler(),
                                            LogisticRegression(C=1., solver='liblinear'))
        self.clf_anatomy = make_pipeline(StandardScaler(),
                                         LogisticRegression(C=1., solver='liblinear'))
        self.meta_clf = LogisticRegression(C=1., solver='liblinear')

    def fit(self, X, y):
        X_anatomy = X[[col for col in X.columns if col.startswith('anatomy')]]
        X_connectome = X[[col for col in X.columns
                          if col.startswith('connectome')]]
        train_idx, validation_idx = train_test_split(range(y.size),
                                                     test_size=0.33,
                                                     shuffle=True,
                                                     random_state=42)
        X_anatomy_train = X_anatomy.iloc[train_idx]
        X_anatomy_validation = X_anatomy.iloc[validation_idx]
        X_connectome_train = X_connectome.iloc[train_idx]
        X_connectome_validation = X_connectome.iloc[validation_idx]
        y_train = y[train_idx]
        y_validation = y[validation_idx]

        self.clf_connectome.fit(X_connectome_train, y_train)
        self.clf_anatomy.fit(X_anatomy_train, y_train)

        y_connectome_pred = self.clf_connectome.predict_proba(
            X_connectome_validation)
        y_anatomy_pred = self.clf_anatomy.predict_proba(
            X_anatomy_validation)

        self.meta_clf.fit(
            np.concatenate([y_connectome_pred, y_anatomy_pred], axis=1),
            y_validation)
        self.classes_ = unique_labels(y)
        return self
    
    def predict(self, X):
        X_anatomy = X[[col for col in X.columns if col.startswith('anatomy')]]
        X_connectome = X[[col for col in X.columns
                          if col.startswith('connectome')]]

        y_anatomy_pred = self.clf_anatomy.predict_proba(X_anatomy)
        y_connectome_pred = self.clf_connectome.predict_proba(X_connectome)

        return self.meta_clf.predict(
            np.concatenate([y_connectome_pred, y_anatomy_pred], axis=1))

    def predict_proba(self, X):
        X_anatomy = X[[col for col in X.columns if col.startswith('anatomy')]]
        X_connectome = X[[col for col in X.columns
                          if col.startswith('connectome')]]

        y_anatomy_pred = self.clf_anatomy.predict_proba(X_anatomy)
        y_connectome_pred = self.clf_connectome.predict_proba(X_connectome)

        return self.meta_clf.predict_proba(
            np.concatenate([y_connectome_pred, y_anatomy_pred], axis=1))


In [26]:
results = evaluation(data, labels)

print("Training score ROC-AUC: {:.3f} +- {:.3f}".format(np.mean(results['train_roc_auc']),
                                                        np.std(results['train_roc_auc'])))
print("Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(np.mean(results['test_roc_auc']),
                                                          np.std(results['test_roc_auc'])))

print("Training score accuracy: {:.3f} +- {:.3f}".format(np.mean(results['train_accuracy']),
                                                         np.std(results['train_accuracy'])))
print("Validation score accuracy: {:.3f} +- {:.3f}".format(np.mean(results['test_accuracy']),
                                                           np.std(results['test_accuracy'])))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training score ROC-AUC: 0.890 +- 0.024
Validation score ROC-AUC: 0.660 +- 0.046 

Training score accuracy: 0.805 +- 0.035
Validation score accuracy: 0.615 +- 0.025


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.9min finished
