In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, ParameterGrid, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import ElasticNet
# Apply seaborn's default plot styling globally; this is a module-level side effect
# that affects every matplotlib figure created after this cell runs.
sns.set()

In [5]:
# Read the beta event amplitude table; the first CSV column is used as the row index.
df = pd.read_csv('../data/beta_event_amplitude.csv', index_col=0)

# Human-readable column labels, assigned positionally (the last one is the target).
label_names = [
    'L2 prox gbar',
    'L5 prox gbar',
    'L2 dist gbar',
    'L5 dist gbar',
    'Prox variance',
    'Dist variance',
    'Prox mean time',
    'Dist mean time',
    'Amplitude',
]
df.columns = label_names

# Pairs of numbers keyed by the raw parameter names.
# NOTE(review): presumably (lower, upper) bounds for each parameter — confirm.
# Not referenced by any other visible cell.
data_dict = {
    'gbar_evprox_1_L2Pyr_ampa': (1e-10, 1e-1),
    'gbar_evprox_1_L5Pyr_ampa': (1e-10, 1e-1),
    'gbar_evdist_1_L2Pyr_ampa': (1e-10, 1e-1),
    'gbar_evdist_1_L5Pyr_ampa': (1e-10, 1e-1),
    'sigma_t_evprox_1': (1, 100),
    'sigma_t_evdist_1': (1, 100),
    't_evprox_1': (200, 300),
    't_evdist_1': (200, 300),
    'amplitude': (-10000, 0),
}

# Feature matrix: every column except the target.
X = df.drop(columns='Amplitude')
# Target kept as a 2-D array with a single column (selected as a one-column frame).
y = df[['Amplitude']].to_numpy()

In [8]:
# Same pipeline applies to both questions
# Features to standardize: every labelled column except the target.
std_ftrs = np.array([name for name in label_names if name != 'Amplitude'])

# collect all the encoders
preprocessor = ColumnTransformer(transformers=[('std', StandardScaler(), std_ftrs)])

# Preprocessing-only pipeline (no estimator step attached).
clf = Pipeline(steps=[('preprocessor', preprocessor)])

In [3]:
def MLpipe_KFold_RMSE(X, y, preprocessor, ML_algo, param_grid, nr_states=5):
    '''
    Tune and evaluate a regressor over several random other/test splits.

    For each of `nr_states` random states the data is split into other/test (80/20),
    and a grid search with a shuffled 4-fold KFold is run on `other`.
    The RMSE is minimized in cross-validation. The fitted search object is then
    used to predict the held-out test set and the test RMSE is recorded.

    Parameters
    ----------
    X : feature matrix passed to `train_test_split` and the pipeline.
    y : target values aligned with `X`.
    preprocessor : transformer placed as the first pipeline step.
    ML_algo : estimator placed as the last pipeline step.
    param_grid : grid handed to `GridSearchCV`; keys must address the pipeline
        steps created by `make_pipeline`.
    nr_states : number of random states (independent splits) to run. Defaults to 5.

    Returns
    -------
    list
        The fitted `GridSearchCV` objects, one per random state, in loop order.

    Raises
    ------
    ValueError
        If `nr_states` is smaller than 1.
    '''
    if nr_states < 1:
        raise ValueError(f'nr_states must be at least 1, got {nr_states}')

    test_scores = np.zeros(nr_states)
    final_models = []

    for i in range(nr_states):
        # the same seed drives both the hold-out split and the fold shuffling
        seed = 42 * i

        # first split to separate out the test set
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, train_size=0.8, random_state=seed)

        # K folds to separate out the training and validation sets
        kf = KFold(n_splits=4, shuffle=True, random_state=seed)

        pipe = make_pipeline(preprocessor, ML_algo)

        grid = GridSearchCV(pipe, param_grid=param_grid,
                            scoring='neg_root_mean_squared_error',
                            cv=kf, return_train_score=True, n_jobs=-1, verbose=True)
        grid.fit(X_other, y_other)

        # save the model
        final_models.append(grid)

        # calculate and save the test score
        y_test_pred = grid.predict(X_test)
        # `squared=False` was deprecated and later removed from mean_squared_error.
        # Taking the square root of the per-output MSE and averaging gives the same
        # RMSE without depending on that argument.
        per_output_mse = mean_squared_error(y_test, y_test_pred, multioutput='raw_values')
        test_scores[i] = np.mean(np.sqrt(per_output_mse))

    best_idx = np.argmin(test_scores)
    # This is the mean and spread over all splits, not the single best score.
    print(f'Test RMSE (mean±std): {np.mean(test_scores)}±{np.std(test_scores)}')
    print(f'Best params: {final_models[best_idx].best_params_}')

    return final_models

In [None]:
def make_train_val_plots(results_dict):
    '''
    Placeholder: currently ignores `results_dict`, draws nothing and returns None.
    '''
    return None

In [9]:
# Hyperparameter grid for the ElasticNet step of the pipeline. The 'elasticnet__'
# prefix is the step name that make_pipeline derives from the estimator class.
param_grid = {
    # max_iter must be an integer: a float such as 1e6 is rejected by
    # scikit-learn's parameter validation in recent releases.
    'elasticnet__max_iter': [1_000_000],
    'elasticnet__alpha': np.logspace(-2, 2, 10),
    'elasticnet__l1_ratio': np.linspace(0.1, 1.0, 10),
    'elasticnet__random_state': [123],
}
res = MLpipe_KFold_RMSE(X, y, preprocessor, ElasticNet(), param_grid)

Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Best Test Score: 912.4482520501784±3.9391288288940376
Best params: {'elasticnet__alpha': 0.027825594022071243, 'elasticnet__l1_ratio': 1.0, 'elasticnet__max_iter': 1000000.0, 'elasticnet__random_state': 123}


In [12]:
# Scratch check: one log-spaced sample whose start and stop exponents are both 1.
np.logspace(start=1, stop=1, num=1)

array([10.])