In [35]:
%reload_ext autoreload
%autoreload 3
import math
import numpy as np
import xarray as xr
import pandas as pd
from copy import deepcopy


import skopt
from skopt.space import Real, Categorical, Integer

from sklearn import feature_selection, svm, preprocessing, model_selection, ensemble
from sklearn import neighbors, decomposition
from sklearn.pipeline import Pipeline

from nilearn import datasets as nilean_datasets, plotting as nilearn_plotting

import matplotlib.pyplot as plt
import seaborn as sns; sns.set('paper')
import plotly.express as px
from tqdm import tqdm

import xgboost
import lightgbm

from python.acnets.datasets.connectivity import load_connectivity


In [37]:
# Load the features (X), the labels (y) and the feature names through the
# project's `load_connectivity` helper.
# NOTE(review): X is later indexed as X[train] and read via X.shape[1], so it
# is presumably a 2-D array with one row per sample -- confirm against the
# loader.
X, y, feature_names = load_connectivity(
  parcellation='dosenbach2007',
  kind='tangent',
  vectorize=True,
  return_y=True,
  only_diagonal=False,
  binarization_threshold=1.0,
  return_feature_names=True,
  discard_diagonal=False,
  discard_cerebellum=False,)

# encode y as integers (y_encoder is kept so the labels can be decoded later)
y_encoder = preprocessing.LabelEncoder()
y = y_encoder.fit_transform(y)

# test/train splits
# The split is done on row positions rather than on X itself, so `train` and
# `test` are index lists that can be applied to both X and y.
# `random_state` is fixed so that the held-out 20% is the same after every
# kernel restart; without it each run evaluates on a different test set.
train, test = model_selection.train_test_split(
  range(len(X)),
  test_size=0.2,
  shuffle=True,
  stratify=y,
  random_state=0,)

Binarizing connectivity matrix... done!


In [39]:
# Base pipeline shared by every search space.
# The 'zerovar' and 'reduce' slots start as no-op 'passthrough' steps and the
# 'model' slot holds a default SVC; the parameter spaces in this notebook
# replace these slots by name.
pipe = Pipeline(
  steps=[
    ('zerovar', 'passthrough'),
    ('reduce', 'passthrough'),
    ('model', svm.SVC()),
  ],
  verbose=False,
)

# parameter grids

# Search space: PCA in the 'reduce' slot followed by a random forest in the
# 'model' slot. Keys follow the sklearn `<step>__<param>` convention.
rfc_param_space = dict(
  # -- pipeline steps (single-choice categoricals pin the step) --
  reduce=Categorical([decomposition.PCA()]),
  model=Categorical([
    ensemble.RandomForestClassifier(
      bootstrap=True,
      oob_score=True,
      warm_start=False,
      n_jobs=-1,
    ),
  ]),
  # -- PCA hyper-parameters --
  reduce__n_components=Real(0.9, 0.999),
  reduce__whiten=Categorical([False, True]),
  # -- random forest hyper-parameters --
  model__n_estimators=Integer(10, 1000),
  model__criterion=Categorical(['entropy', 'gini']),
  model__max_features=Categorical([None, 'sqrt', 'log2']),
  model__max_depth=Integer(10, 100),
)

# Search space: variance-threshold filter in the 'zerovar' slot, no
# dimensionality reduction, and an SVC (with probability estimates enabled)
# in the 'model' slot.
svc_param_space = dict(
  # -- pipeline steps (one-element lists pin the step) --
  zerovar=[feature_selection.VarianceThreshold()],
  reduce=['passthrough'],
  model=[svm.SVC(probability=True)],
  # -- SVC hyper-parameters; C and gamma are sampled on a log scale --
  model__C=Real(0.1, 10000, prior='log-uniform'),
  model__gamma=Real(0.1, 10, prior='log-uniform'),
  model__degree=Integer(1, 8),
  model__kernel=Categorical(['linear', 'poly', 'rbf']),
)


# Search space: XGBoost classifier in the 'model' slot. The 'zerovar' and
# 'reduce' slots are not listed here, so they keep whatever `pipe` defines.
# NOTE(review): objective='binary:logistic' assumes y has exactly two
# classes -- confirm against the labels returned by the loader.
xgb_param_space = dict(
  model=[
    xgboost.XGBClassifier(
      verbosity=0,
      use_label_encoder=False,
      objective='binary:logistic',
      eval_metric='auc',
    ),
  ],
  # Dimensions that are commented out below are candidates that are
  # currently excluded from the search.
  # model__colsample_bylevel=Real(.6, .7),
  # model__colsample_bytree=Real(.6, .7),
  model__gamma=Real(0.01, 1.0),
  model__learning_rate=Real(0.0001, 1),
  # model__max_delta_step=Real(.1, 10),
  model__max_depth=Integer(4, 100),
  # model__min_child_weight=Integer(10, 500),
  model__n_estimators=Integer(10, 1000),
  # model__reg_alpha=Real(0, 1),
  # model__reg_lambda=Real(0, 1),
  # model__subsample=Real(.5, .8),
  # model__objective=['binary:hinge', 'binary:logistic'],
  # model__tree_method=['exact', 'hist'],
)

# Search space: LightGBM classifier in the 'model' slot, using all cores.
gbm_param_space = dict(
  model=Categorical([
    lightgbm.LGBMClassifier(
      metric='auc',
      objective='binary',
      n_jobs=-1,
    ),
  ]),
  # -- tree size / shape --
  model__n_estimators=Integer(1, 100),
  model__min_child_samples=Integer(1, 10),
  model__max_depth=Integer(-1, 10),
  model__num_leaves=Integer(2, 10),
  # Candidates currently excluded from the search:
  # model__early_stopping_rounds=[30],
  # model__min_data_in_leaf=[30, 50, 100, 300, 400],
  # -- regularisation --
  model__reg_alpha=Real(0, 0.5),
  model__reg_lambda=Real(0, 0.5),
)

# Search space: sklearn gradient boosting classifier in the 'model' slot.
# The upper bound of max_features is read from X, so this definition must be
# evaluated after the data has been loaded.
gbc_param_space = dict(
  model=Categorical([ensemble.GradientBoostingClassifier()]),
  model__max_depth=Integer(1, 5),
  # learning rate sampled on a log scale between 1e-4 and 1
  model__learning_rate=Real(1e-4, 1, prior='log-uniform'),
  model__max_features=Integer(1, X.shape[1]),
  model__min_samples_split=Integer(2, 30),
  model__min_samples_leaf=Integer(1, 16),
)

In [40]:
# Bayesian hyper-parameter search over `pipe`.
# Each enabled (space, 32) entry gives that search space a budget of 32
# iterations; the commented-out entries are spaces currently left out.
# n_points=2 evaluates two candidate settings per optimisation step.
# random_state is fixed so that the sequence of sampled candidates (and the
# reported best parameters) is the same on every run.
opt = skopt.BayesSearchCV(
    pipe,[
        # (svc_param_space, 32),
        # (gbc_param_space, 32),
        # (gbm_param_space, 32),
        (xgb_param_space, 32),
        # (rfc_param_space, 32)
     ],
    n_points=2,
    scoring='roc_auc',
    random_state=0,
    verbose=0,
)

# Fit on the training rows only. The DeadlineStopper callback ends the search
# once a 10-second time budget is used up, so fewer than 32 iterations may
# actually run; raise the budget for a real search.
opt.fit(
    X[train], y[train], callback=[
      skopt.callbacks.DeadlineStopper(10),
    ]
)

print('best params:', opt.best_params_)
# best_score_ is the mean cross-validated ROC AUC of the best candidate on
# the training rows (validation folds), not a resubstitution "train" score,
# so it is labelled as a CV score.
print('cv    score:', opt.best_score_)
# ROC AUC of the refitted best estimator on the held-out rows.
print('test  score:', opt.score(X[test], y[test]))

In [34]:
# Report the hyper-parameters selected by the search, then estimate the ROC
# AUC of the selected pipeline with repeated stratified 5-fold CV.
print('best params:', opt.best_params_)

# random_state fixes the 20 x 5 fold assignments so the reported mean and
# standard deviation are reproducible across runs.
# NOTE(review): the cross-validation runs on the full X, y, which includes
# the rows the hyper-parameters were tuned on (X[train]) as well as the
# held-out rows, so this estimate is not independent of the model selection
# step.
cv_scores = model_selection.cross_val_score(
    opt.best_estimator_, X, y, n_jobs=-1, scoring='roc_auc',
    cv=model_selection.RepeatedStratifiedKFold(
        n_splits=5, n_repeats=20, random_state=0))

print(f'5-Fold AUC (20 repeats): {cv_scores.mean():.3f}'
      f' \N{plus-minus sign} {cv_scores.std():.3f}')

# Disabled permutation test, kept for reference. It previously referred to
# `grid`, which is not defined in this notebook; the fitted search object is
# `opt`.
# obs_score, rnd_scores, pvalue = model_selection.permutation_test_score(
#     opt.best_estimator_, X, y, n_jobs=-1, scoring='roc_auc',
#     cv=model_selection.StratifiedKFold(n_splits=5, shuffle=True),
#     n_permutations=100,
# )

best params: OrderedDict([('model', RandomForestClassifier(max_depth=45, max_features='log2', n_estimators=978,
                       n_jobs=-1, oob_score=True)), ('model__criterion', 'gini'), ('model__max_depth', 45), ('model__max_features', 'log2'), ('model__n_estimators', 978), ('reduce', PCA(n_components=0.9092497810146796, whiten=True)), ('reduce__n_components', 0.9092497810146796), ('reduce__whiten', True)])
5-Fold AUC (20 repeats): 0.831 ± 0.157
