In [1]:
import tensorflow as tf

2023-12-03 13:16:20.698622: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV



In [3]:
# Global plot styling: draw a light dashed grid on every axes.
plt.rc("axes", grid=True)
plt.rc("grid", alpha=0.5, linestyle="--")


## Import data

In [4]:
# Read the semicolon-separated feature table and index it by FILE_ID.
df = pd.read_csv("../data/FS_features_ABIDE_males.csv", sep=";").set_index("FILE_ID")

# AGE_AT_SCAN is the regression target: keep it aside as `y`, then remove it
# (together with SEX) from the feature matrix.
y = df["AGE_AT_SCAN"]
df = df.drop(columns=["AGE_AT_SCAN", "SEX"])

In [5]:
from sklearn.pipeline import Pipeline


# Baseline model: power-transform the features, project them onto 20
# principal components, then fit a Lasso regression.
linear_regressor = Pipeline(
    steps=[
        ("scaler", PowerTransformer()),
        ("principal_components", PCA(n_components=20, svd_solver="full")),
        ("model", Lasso()),
    ]
)

# Candidate Lasso regularisation strengths: the integers 1 through 19.
param_distr = {"model__alpha": np.arange(1, 20)}


In [393]:
# Exhaustive 3-fold search over the Lasso alphas, scored by R^2; train scores
# are kept so the train/test gap can be inspected afterwards.
search = GridSearchCV(
    estimator=linear_regressor,
    param_grid=param_distr,
    scoring="r2",
    cv=3,
    return_train_score=True,
)
search.fit(X=df.values, y=y.values)
print("Done")


Done


In [398]:
# Tabulate the grid-search results, one row per candidate.
results_regressor = pd.DataFrame(search.cv_results_)

# A candidate counts as "not overfitting" when its mean train R^2 exceeds its
# mean test R^2 by less than 0.03.
no_overfit = (
    results_regressor["mean_train_score"]
    .sub(results_regressor["mean_test_score"])
    .lt(0.03)
)

# Persist the surviving candidates, best mean test score first.
results_regressor.loc[no_overfit].sort_values(
    by="mean_test_score", ascending=False
).to_csv("best_model_regressor.csv", index=False)

In [399]:
# Show the top-ranked regressor candidate as a single column (one row per field).
pd.read_csv("best_model_regressor.csv").iloc[:1].transpose()

Unnamed: 0,0
mean_fit_time,0.483502
std_fit_time,0.008364
mean_score_time,0.013183
std_score_time,0.000285
param_model__alpha,12
params,{'model__alpha': 12}
split0_test_score,0.428186
split1_test_score,0.565683
split2_test_score,0.487599
mean_test_score,0.493822


## Neural network

In [297]:
from keras.layers import Dense, Dropout
from keras.models import Sequential
from scikeras.wrappers import KerasRegressor
from itertools import product
from sklearn.model_selection import RandomizedSearchCV


In [322]:
def twoLayerFeedForward(hidden_layer_dim, meta):
    """Build an (uncompiled) feed-forward regression network.

    Despite the name, one ReLU hidden layer is created per entry of
    ``hidden_layer_dim``, so the depth is set by the caller.

    Parameters
    ----------
    hidden_layer_dim : iterable of int
        Width of each hidden layer, in order.
    meta : mapping
        Must provide ``meta["X_shape_"]``; its element at index 1 is used as
        the number of input features.

    Returns
    -------
    Sequential
        The hidden layers followed by a single-unit output layer with no
        activation.
    """
    n_features = meta["X_shape_"][1]
    network = Sequential()

    for position, units in enumerate(hidden_layer_dim):
        if position > 0:
            # Every hidden layer after the first is followed by 10% dropout;
            # the first hidden layer is not.
            network.add(Dense(units, activation="relu"))
            network.add(Dropout(0.1))
            continue
        # Only the first layer declares the input shape.
        network.add(Dense(units, activation="relu", input_shape=(n_features,)))

    # Single linear output unit for the regression target.
    network.add(Dense(1))
    return network

# The notebook only imports members of keras submodules (and `tensorflow as
# tf`), so the bare name `keras` is undefined on a fresh kernel. Import the
# callback class explicitly instead of relying on leftover kernel state.
from keras.callbacks import EarlyStopping

# Stop training when the validation loss has not improved for 3 consecutive
# epochs (any decrease counts, min_delta=0) and restore the best weights seen.
callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    restore_best_weights=True,
)


# scikit-learn compatible wrapper around the network builder.
# - hidden_layer_dim is forwarded to twoLayerFeedForward (default: one hidden
#   layer of width 2); the search overrides it per candidate.
# - 20% of each training set is held out as validation data, which provides
#   the "val_loss" monitored by the early-stopping callback.
mlp = KerasRegressor(
    model=twoLayerFeedForward,
    hidden_layer_dim=[2],
    loss="mse",
    epochs=200,
    validation_split=0.2,
    callbacks=[callback],
    verbose=0,
)

In [323]:
# Allowed widths for a single hidden layer.
valid_entries = [2, 4, 8]

# Every architecture with 1 to 4 hidden layers whose widths are drawn from
# valid_entries, as tuples, ordered by depth (3 + 9 + 27 + 81 = 120 in total).
hidden_layers = [
    widths
    for depth in range(1, 5)
    for widths in product(valid_entries, repeat=depth)
]

In [329]:
# Same preprocessing as the linear baseline (power transform followed by a
# 20-component PCA), with the Keras regressor as the final step.
model = Pipeline(
    steps=[
        ("scaler", PowerTransformer()),
        ("principal_components", PCA(n_components=20, svd_solver="full")),
        ("mlp", mlp),
    ]
)

# Search space for the randomized search, routed to the "mlp" pipeline step.
# Each learning rate is listed exactly once: a repeated value only creates
# duplicate candidates and makes that rate more likely to be sampled.
# NOTE(review): add a third distinct rate (e.g. 0.0001) if a wider sweep was
# intended.
params = {
    "mlp__hidden_layer_dim": hidden_layers,
    "mlp__optimizer__learning_rate": [0.001, 0.01],
}

In [330]:
# Randomized 3-fold search over 60 sampled candidates, scored by R^2.
# refit=False: only cv_results_ is used afterwards, so no final model is fit.
# random_state pins which candidates get sampled so a re-run evaluates the
# same 60 configurations (network training itself remains stochastic).
gs = RandomizedSearchCV(
    model,
    params,
    n_iter=60,
    scoring="r2",
    cv=3,
    refit=False,
    return_train_score=True,
    random_state=42,
)

In [331]:
# Run the search on the raw NumPy arrays of features and target.
gs.fit(X=df.values, y=y.values)

In [332]:
# Tabulate the randomized-search results, one row per sampled candidate.
df_results = pd.DataFrame(data=gs.cv_results_)

In [337]:
# Keep candidates whose mean train R^2 exceeds their mean test R^2 by less than 0.03.
no_overfit = (
    df_results["mean_train_score"]
    .sub(df_results["mean_test_score"])
    .lt(0.03)
)

In [342]:
# Persist the non-overfitting candidates, best mean test score first.
df_results.loc[no_overfit].sort_values(
    by="mean_test_score", ascending=False
).to_csv("best_model_1.csv", index=False)

## Plot

In [347]:
# Load the ranking written by this notebook ("best_model_1.csv") rather than
# "best_model.csv", which no cell here produces and which would therefore be
# stale or missing after Restart & Run All.
best_model = pd.read_csv("best_model_1.csv")

# Show the top-ranked candidate as a single column (one row per field).
best_model.head(1).T

Unnamed: 0,0
mean_fit_time,1.267042
std_fit_time,0.08602
mean_score_time,0.07543
std_score_time,0.000804
param_mlp__optimizer__learning_rate,0.01
param_mlp__hidden_layer_dim,"(4, 2, 8, 4)"
params,"{'mlp__optimizer__learning_rate': 0.01, 'mlp__..."
split0_test_score,0.657957
split1_test_score,0.754671
split2_test_score,0.558742
