In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dill
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, ParameterGrid, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, ElasticNet
# Switch matplotlib figures to seaborn's default styling for the whole notebook
sns.set()

In [18]:
# Read the results table; the first CSV column becomes the index
df = pd.read_csv('../data/beta_event_amplitude.csv', index_col=0)

# Display names, assigned to the columns positionally
label_names = [
    'L2 prox gbar', 'L5 prox gbar', 'L2 dist gbar', 'L5 dist gbar',
    'Prox variance', 'Dist variance', 'Prox mean time', 'Dist mean time',
    'Amplitude',
]
df.columns = label_names

# (low, high) pair keyed by the original parameter name
# NOTE(review): presumably the sampling bounds of each parameter -- confirm
data_dict = {
    'gbar_evprox_1_L2Pyr_ampa': (1e-10, 1e-1),
    'gbar_evprox_1_L5Pyr_ampa': (1e-10, 1e-1),
    'gbar_evdist_1_L2Pyr_ampa': (1e-10, 1e-1),
    'gbar_evdist_1_L5Pyr_ampa': (1e-10, 1e-1),
    'sigma_t_evprox_1': (1, 100),
    'sigma_t_evdist_1': (1, 100),
    't_evprox_1': (200, 300),
    't_evdist_1': (200, 300),
    'amplitude': (-10000, 0),
}

# Features are every column except the target; the target stays 2-D (n, 1)
is_target = df.columns == 'Amplitude'
X = df.loc[:, ~is_target]
y = df.loc[:, is_target].to_numpy()

In [19]:
# Same pipeline applies to both questions
# Every column except the target is standardized
std_ftrs = np.array([name for name in label_names if name != 'Amplitude'])

# collect all the encoders (only a standard scaler is used here)
scaler_step = ('std', StandardScaler(), std_ftrs)
preprocessor = ColumnTransformer(transformers=[scaler_step])

# Preprocessing-only pipeline: it has no estimator step
clf = Pipeline(steps=[('preprocessor', preprocessor)])

In [20]:
# Build one preprocessed train/validation/test split per random state
data_split_nsr = []
for nsr in range(5):
    seed = 123 * nsr

    # 60% train; the remaining 40% is halved into test and validation
    X_train, X_other, y_train, y_other = train_test_split(
        X, y, test_size=0.4, random_state=seed)
    X_test, X_val, y_test, y_val = train_test_split(
        X_other, y_other, test_size=0.5, random_state=seed)

    # Fit the preprocessing on the training portion only, then reuse it
    X_train_prep = clf.fit_transform(X_train)
    X_val_prep = clf.transform(X_val)
    X_test_prep = clf.transform(X_test)

    split_dict = dict(
        X_train=X_train_prep, y_train=y_train,
        X_val=X_val_prep, y_val=y_val,
        X_test=X_test_prep, y_test=y_test,
    )
    data_split_nsr.append(split_dict)

# Save splitting regime
with open('../results/data_split_nsr.pkl', 'wb') as file:
    dill.dump(data_split_nsr, file)

In [21]:
# Read the pickled list of preprocessed splits back from disk
with open('../results/data_split_nsr.pkl', mode='rb') as fh:
    data_split_nsr = dill.load(fh)

In [37]:
def MLpipe_R2(ML_algo, param_grid, X_train, y_train, X_val, y_val, X_test, y_test, random_model=False):
    """Sweep a parameter grid for a regressor and assess it via R2 score.

    For each random state, every parameter combination in ``param_grid`` is
    fit on the training data and scored on the training and validation data.
    The combination with the highest validation R2 is then refit on the
    training data and scored on the test data.

    Parameters
    ----------
    ML_algo : callable
        Zero-argument factory (e.g. an estimator class) returning an object
        that provides ``set_params``, ``fit`` and ``predict``.
    param_grid : dict or list of dict
        Grid specification handed to ``ParameterGrid``.
    X_train, y_train : array-like
        Data used for fitting.
    X_val, y_val : array-like
        Data used to choose the best parameter combination.
    X_test, y_test : array-like
        Data used to score the chosen combination.
    random_model : bool, default False
        If True, the sweep is repeated for five random states
        (``123 * k`` for ``k = 0..4``) and ``random_state`` is injected into
        every parameter set; otherwise the sweep runs once and no
        ``random_state`` is set.

    Returns
    -------
    list of dict
        One dict per random state with keys ``random_state``,
        ``train_scores``, ``validation_scores``, ``params``, ``best_idx``,
        ``best_params``, ``best_model`` and ``test_score``.
    """
    pg = ParameterGrid(param_grid)

    # Only loop through random states if model is non-deterministic
    num_random_states = 5 if random_model else 1

    # Store score_dict across random states
    results = list()

    for nsr in range(num_random_states):
        print(f'Random State: {nsr}')
        random_state = 123 * nsr

        # Store results from parameter sweep
        score_dict = {
            'random_state': random_state,
            'train_scores': list(),
            'validation_scores': list(),
            'params': list(),
            'best_idx': None,
            'best_params': None,
            'best_model': None,
            'test_score': None
        }

        # Loop through params in parameter grid and store train/validation scores
        for grid_params in pg:
            params = dict(grid_params)
            if random_model:
                params['random_state'] = random_state

            # A fresh estimator per grid point: set_params only touches the
            # keys it is given, so reusing one object could carry settings
            # over from a previous grid point.
            reg = ML_algo()
            reg.set_params(**params)
            reg.fit(X_train, y_train)

            score_dict['train_scores'].append(r2_score(y_train, reg.predict(X_train)))
            score_dict['validation_scores'].append(r2_score(y_val, reg.predict(X_val)))
            score_dict['params'].append(params)

        # Find best parameters from validation scores, and calculate test score.
        # Take them from the recorded list (not the raw grid) so the injected
        # random_state is part of what gets reported and stored.
        best_idx = int(np.argmax(score_dict['validation_scores']))
        best_params = score_dict['params'][best_idx]
        print(f'Best Params: {best_params}')

        # Refit on a new estimator so each random state keeps its own fitted
        # model instead of every result pointing at one shared object.
        best_model = ML_algo()
        best_model.set_params(**best_params)
        best_model.fit(X_train, y_train)
        test_score = r2_score(y_test, best_model.predict(X_test))
        print(f'Test Score: {test_score}')

        score_dict['test_score'] = test_score
        score_dict['best_idx'] = best_idx
        score_dict['best_params'] = best_params
        score_dict['best_model'] = best_model

        results.append(score_dict)

    return results

In [38]:
def make_train_val_plots(results_dict):
    """Placeholder for train/validation plotting; currently does nothing.

    ``results_dict`` is accepted but not used, and the function returns None.
    """
    # TODO: plotting not implemented yet
    return None

In [39]:
# Linear Regression
# An empty grid means a single fit with the estimator's default parameters
param_grid = {}
res_list = []
for idx in range(5):
    split = data_split_nsr[idx]
    X_train, X_val, X_test = (split[key] for key in ('X_train', 'X_val', 'X_test'))
    y_train, y_val, y_test = (split[key] for key in ('y_train', 'y_val', 'y_test'))

    res = MLpipe_R2(LinearRegression, param_grid,
                    X_train, y_train, X_val, y_val, X_test, y_test,
                    random_model=False)
    res_list.append(res)

# Linear regression results
with open('../results/linear_regression_results.pkl', 'wb') as file:
    dill.dump(res_list, file)


Random State: 0
Best Params: {}
Test Score: 0.21723733204942763


In [9]:
# Elastic net sweep over regularization strength (alpha) and L1/L2 mix (l1_ratio)
# NOTE(review): MLpipe_KFold_RMSE is not defined in the cells shown here, and
# this cell's execution count is out of order -- it relies on kernel state from
# an earlier session. Confirm the helper is defined before running.
# max_iter is an integer: scikit-learn documents it as int, and recent
# releases reject a float such as 1e6 during parameter validation.
param_grid = {'max_iter': [1_000_000], 'alpha': np.logspace(-2, 2, 10),
              'l1_ratio': np.linspace(0.1, 1.0, 10),
              'random_state': [123]}
res = MLpipe_KFold_RMSE(X, y, preprocessor, ElasticNet(), param_grid)

Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Fitting 4 folds for each of 100 candidates, totalling 400 fits
Best Test Score: 912.4482520501784±3.9391288288940376
Best params: {'elasticnet__alpha': 0.027825594022071243, 'elasticnet__l1_ratio': 1.0, 'elasticnet__max_iter': 1000000.0, 'elasticnet__random_state': 123}


In [12]:
# Scratch check: a single-point logspace from 10**1 to 10**1
np.logspace(1, 1, num=1)

array([10.])