In [1]:
import numpy as np
from mriqc_learn.datasets import abide
from mriqc_learn.model_selection import split

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from mriqc_learn.models import preprocess as pp
from sklearn.pipeline import Pipeline

In [3]:
# Load features and rating targets; the second pair returned by the loader is
# discarded (split_strategy="none" presumably puts every sample in the first
# pair -- confirm against the loader).
(train_x, targets), (_, _) = abide.load_data(split_strategy="none")
# Carry the site label as a column of the feature table.
train_x["site"] = targets.site

# Massage targets
# Ratings are coded -1 (discard), 0 (doubtful) and 1 (accept), as reported by
# the percentages printed below.
ratings = targets["rater_3"].to_numpy()
print(f"Discard={100 * (ratings == -1).sum() / len(ratings)}")
print(f"Doubtful={100 * (ratings == 0).sum() / len(ratings)}")
print(f"Accept={100 * (ratings == 1).sum() / len(ratings)}")

# Binarize: accept -> 1, doubtful and discard -> 0.
# A fresh array is built instead of editing `ratings` in place, because the
# array handed out by pandas may be a view of the DataFrame (and is read-only
# when pandas Copy-on-Write is active), so in-place writes could fail.
train_y = (ratings > 0).astype(ratings.dtype)

Discard=14.168937329700272
Doubtful=1.5440508628519527
Accept=84.28701180744777


In [4]:
# Two independent, identically configured LeavePSitesOut splitters (P=1):
# `inner_cv` is used for the hyperparameter search and `outer_cv` for the
# outer cross-validation loop.
inner_cv, outer_cv = (split.LeavePSitesOut(1, robust=True) for _ in range(2))

# Display how many folds the splitter yields for this dataset.
inner_cv.get_n_splits(X=train_x, y=train_y)

14

In [5]:
# size_{x,y,z} and spacing_{x,y,z} columns, removed by the first pipeline step.
geometry_columns = [
    f"{prefix}_{ax}" for prefix in ("size", "spacing") for ax in "xyz"
]

# Preprocessing chain followed by the classifier. The "site" column is kept
# until the "drop_site" step, right before the SVC; presumably the preceding
# site-aware steps rely on it -- confirm against mriqc_learn.
steps = [
    ("drop_ft", pp.DropColumns(drop=geometry_columns)),
    ("scale", pp.SiteRobustScaler()),
    ("site_pred", pp.SiteCorrelationSelector()),
    # ("print1", pp.PrintColumns()),
    ("winnow", pp.NoiseWinnowFeatSelect(use_classifier=True)),
    ("drop_site", pp.DropColumns(drop=["site"])),
    # ("print2", pp.PrintColumns()),
    ("svc", SVC()),
]

pipe = Pipeline(steps)

In [6]:
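# Disabled sanity check: fit the bare pipeline once on the full dataset
# (no cross-validation, default hyperparameters).
# clf = pipe.fit(train_x, y=train_y)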

In [7]:
# Hyperparameter grid, keyed as "<pipeline step>__<parameter>".
# The four two-valued preprocessing switches give 16 candidates; the SVC
# settings are held fixed at a single value each.
p_grid = [
    dict(
        scale__unit_variance=[True, False],
        scale__with_centering=[True, False],
        site_pred__disable=[False, True],
        winnow__disable=[False, True],
        svc__kernel=["rbf"],
        svc__C=[10],
        svc__gamma=[0.1],
    )
]

In [None]:
# Nested CV with parameter optimization
clf = GridSearchCV(
    estimator=pipe,
    param_grid=p_grid,
    cv=inner_cv,
    verbose=3,
    n_jobs=30,
    scoring="roc_auc",
)
clf.fit(train_x, y=train_y)

Fitting 14 folds for each of 16 candidates, totalling 224 fits


In [None]:
# Outer loop of the nested cross-validation: the grid search `clf` is
# evaluated on each `outer_cv` fold and scored by ROC AUC.
nested_score = cross_val_score(
    clf,
    train_x,
    train_y,
    scoring="roc_auc",
    cv=outer_cv,
    n_jobs=30,
    verbose=3,
)

# Average ROC AUC across the outer folds.
np.mean(nested_score)

In [None]:
# Inspect the results recorded by the grid search for every parameter
# candidate (`cv_results_` is only available once `clf` has been fitted).
clf.cv_results_