# Various models for ML-CUP dataset

## About the dataset

The dataset is made up of 11 columns of continuous real numbers. The last two columns represent the targets.<br>
So our goal is to develop a multivariate regression model.

In [None]:
# numerical, plotting and data-handling stack used by the cells below
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings 
# silence every warning for the rest of the session
warnings.filterwarnings('ignore')
import time
# wall-clock reference: the elapsed time is printed by the last cell of the notebook
start=time.time()

In [None]:
from keras.layers import Dense, Input, Dropout
from keras.models import Sequential
from keras import Model
from keras.optimizers import Adam

In [None]:
# Read the training set: lines starting with '#' are comments, there is no
# header row and the first column is used as the index.
df = pd.read_csv(
    'ML-CUP22-TR.csv',
    header=None,
    index_col=0,
    comment='#',
    skip_blank_lines=True,
)
df.index.name = None  # drop the index label

n_rows, n_cols = df.shape
print(f'Columns number: {n_cols}')
print(f'Rows number: {n_rows}')

# give the two target columns readable names
df = df.rename(columns={10: "Target_1", 11: "Target_2"})
df.head()

In [None]:
# total number of NaN cells over the whole frame
missing_total = df.isna().sum().sum()
print(f'Missing values number: {missing_total}')

Let's check the pairplots.

In [None]:
# grid of pairwise plots over all columns of df (attributes and targets)
sns.pairplot(df)
plt.show()

Let's compare linear and rank correlations of the attributes with regard to the targets.

In [None]:
# Correlation of every attribute (rows: all columns but the last two) with the
# two targets (last two columns): Pearson first, Spearman second.
corr_matrix1, corr_matrix2 = (
    df.corr(method=kind).iloc[:-2, -2:] for kind in ('pearson', 'spearman')
)

# show both tables side by side
pd.concat([corr_matrix1, corr_matrix2], axis=1)

## Data preparation
- divide attributes from targets 
- transform to numpy array

In [None]:
# separate the two target columns from the attributes
target_names = ["Target_1", "Target_2"]
y_df = df[target_names]
X_df = df.drop(columns=target_names)

# plain numpy arrays for the model classes
X, y = X_df.to_numpy(), y_df.to_numpy()

# Workflow

- ## Validation schema
K-Fold CV (K=5) for assessment with internal hold out (25%) for model selection. 
Final model selection with hold-out (20%) and retrain. 

- ## Common framework
 In order to compare results obtained starting from different hypermodels, we developed a class as a way of sharing methods among them. Thus, each architecture (treated as an instance of the class) follows the same path for model selection and assessment. 
 - `SKLEARN_module.SklNet`: class for Sklearn models  
 - `KERAS_module.KerasNet`: class for Keras models
 - separate handling for Pytorch models




- ## Hyperparameters space searching strategy: **bayesian optimization**. <br>
   Because it is a good trade-off between randomness and computational efficiency. The number of performed trials is set to 15% of the size of the hyperparameter space (computed by the auxiliary function `tools.get_search_spaze_size`). 

- ## Regularization techniques
    Preventing overfitting has been a crucial topic in our analysis. 
    Namely, we set an artificially high number of epochs (500) for training so that it continues until **early stopping** based on the validation score.
    Moreover, we decided to **decrease the learning rate** when reaching a plateau on the validation loss, so as to help approach a local minimum.
    Other regularization techniques relative to each model will be explained in the relative section.

- ## Algorithms and hypermodels 
    In this analysis the following models will be compared using the aforementioned procedure:
    - ridge regression 
    - KNN regressor 
    - SVR 
    - deep Regression (batch and minibatch)
    - RandNN 
    - CNN
    - Cascade Correlation
    - linear regression (pytorch)
    - shallow regression (pytorch)

In [None]:
#import classes and auxiliary functions
from SKLEARN_module import SklNet
from KERAS_module import KerasNet
import tools_for_classes as tools
import models_list as models_importer
from keras.utils.layer_utils import count_params
from sklearn.metrics import  make_scorer
from sklearn.exceptions import ConvergenceWarning

First we'll initialize three lists containing, for each model, the MEE score, (an estimation of) the number of effective free parameters and the model name.
We'll compare the performances of the models with these metrics in the end and decide the best one.

In [None]:
# one entry per model, filled in by the cells below:
# [mean, std] of the test MEE, complexity estimate, model name
final_MEE, final_dimension, final_names = [], [], []

# Ridge Regression 
Construct two linear regressors (one for each target) with penalty term (`alpha`) to bound weights norm.

In [None]:
# Search space handed to SklNet as param_grid.
hyperParameters_SKLEARN = dict(
    estimator__alpha=[0.001, 0.00001, 0.7, 0.9, 1, 2],  # penalty term (regularization)
    estimator__eta0=[0.01, 0.1, 0.001, 1],              # init learning rate
)

# custom scorer built on the MEE metric; greater_is_better=False marks it as a loss
scoring = score = make_scorer(tools.MEE_metric, greater_is_better=False)

modelName = 'ridge'
# NOTE(review): the other cells pass 'reg' or 'regression' as mode; confirm that
# SklNet treats 'regressione' the same way.
mode = 'regressione' 
model = models_importer.build_RidgeRegressor(max_iter=500)

net = SklNet(
    modelName=modelName,
    mode=mode,
    model=model,
    X=X,
    y=y,
    param_grid=hyperParameters_SKLEARN,
    scoring=scoring,
)

(best_model, best_model_MEE_val, best_params,
 mean_test_error, stdev_test_error, best_training_err) = net.train()

# complexity estimate used for the final comparison: 2 * number of attributes
n_free_params = 2 * X.shape[1]

for title, value in (
    ("Best Params", best_params),
    ("Training Error", best_training_err),
    ("Validation Error", best_model_MEE_val),
    ("Test_MEE ", f'{mean_test_error} +/- {stdev_test_error}'),
    ("Effective free parameters", n_free_params),
):
    print(title)
    print(value)

# append results for final plot
final_MEE.append([mean_test_error, stdev_test_error])
final_dimension.append(n_free_params)
final_names.append(modelName)

# KNN

The number of neighbours ranges over the odd values from 5 to 33 to manage model complexity.

In [None]:
# Search space handed to SklNet as param_grid.
hyperParameters_SKLEARN = {
    "model__estimator__n_neighbors": np.arange(5, 35, 2),  # odd k from 5 to 33
    "model__estimator__weights": ["uniform", "distance"],
    "model__estimator__metric": ["euclidean", "cityblock"],
}

# define a custom score for GridSearchCV
score = make_scorer(tools.MEE_metric, greater_is_better=False)
scoring = score

model = models_importer.build_KNN_Pipe_Reg()
mode = 'reg' 
modelName = 'knn'

net = SklNet(modelName, mode, model, X, y, hyperParameters_SKLEARN, scoring)

best_model, best_model_MEE_val, best_params, mean_test_error, stdev_test_error, best_training_err = net.train()

print("Best Params")
print(best_params)
print("Training Error")
print(best_training_err)
print("Validation Error")
print(best_model_MEE_val)
print("Test_MEE ")
print(f'{mean_test_error} +/- {stdev_test_error}')
print("Effective free parameters")

# Complexity estimate: number of training patterns / k.
# During training 80% of the data is seen; the count is derived from the data
# set instead of the former hard-coded 1194 (same value for 1492 patterns).
n_train_patterns = round(0.8 * len(X))
knn_complexity = n_train_patterns / best_params['model__estimator__n_neighbors']
print(knn_complexity)

# append results for final plot
final_MEE.append([mean_test_error, stdev_test_error])
final_dimension.append(knn_complexity)
final_names.append(modelName)

# SVR

In [None]:
# Search space handed to SklNet as param_grid.
hyperParameters_SKLEARN = dict(
    estimator__C=[0.01, 0.1, 0.3, 0.7, 1.0],        # penalty terms (slack variables)
    estimator__epsilon=[0.01, 0.1, 0.9, 1.5, 2],    # size of the eps-tube
    estimator__kernel=['linear', 'rbf'],
)

# custom scorer built on the MEE metric; greater_is_better=False marks it as a loss
scoring = score = make_scorer(tools.MEE_metric, greater_is_better=False)

modelName = 'svr'
mode = 'reg' 
model = models_importer.build_SVR(500)

net = SklNet(modelName, mode, model, X, y, hyperParameters_SKLEARN, scoring)

# convergence warnings are suppressed only while the search is running
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=ConvergenceWarning)
    (best_model, best_model_MEE_val, best_params,
     mean_test_error, stdev_test_error, best_training_err) = net.train()

for title, value in (
    ("Best Params", best_params),
    ("Training Error", best_training_err),
    ("Validation Error", best_model_MEE_val),
    ("Test_MEE ", f'{mean_test_error} +/- {stdev_test_error}'),
):
    print(title)
    print(value)

chosen_kernel = best_params['estimator__kernel']
if chosen_kernel == 'rbf':
    # complexity expected to be infinite: -1 is a placeholder that the final
    # comparison cell replaces with the maximum complexity found
    complexity = -1
elif chosen_kernel == 'linear':
    complexity = 2 * X.shape[1]
print("Effective free parameters")
print(complexity)

# append results for final plot
final_MEE.append([mean_test_error, stdev_test_error])
final_dimension.append(complexity)
final_names.append(modelName)

# deep Regression (keras)
 We selected 5 hyperparameters: `units` and `depth` to decide the net's structure,`learning_rate` and `decay` to tweak the optimizer's parameters and `dropout` for regularization purposes.
 

We decided to use different batch sizes to compare our model's performance varying this hyperparameter. We will try  mini batch with size 64 and batch (batch_size=len(X)).


In [None]:
# mini batch size 64
mode = 'regression'
modelName = 'deep_mb'

# user-defined settings and candidate values passed to the hypermodel
hyperp = dict(
    output_units=2,
    units=[4, 5, 6, 7],  # number of units in each hidden layer
    dropout=[1e-3, 0.0],
    learning_rate=[1e-2, 1e-3],
    decay=[0.0, 1e-3],
    depth=[2, 4, 1],  # number of hidden layers
    activation_hidden='relu',
    activation_output='linear',
    metric='MSE',
)

models_importer.set_input_size(X.shape[1])         # set input units
models_importer.set_hyperp(hyperp)                 # pass user defined hps to the model
modelBuilder = models_importer.get_deepNN          # hypermodel function definition
tot_trials = tools.get_search_spaze_size(hyperp)   # total combinations of hps

# auxiliary parameters for hps search
tunerParameters = dict(
    directory='tuner',
    project_name=modelName,
    batch_size=64,
    max_trials=0.15 * tot_trials,
)

# hps search, train and test
net = KerasNet(
    modelName=modelName,
    mode=mode,
    X=X,
    y=y,
    tunerParameters=tunerParameters,
    modelBuilder=modelBuilder,
)
(best_model, best_model_MEE_val, best_params,
 mean_test_error, stdev_test_error, best_training_err) = net.train()

# number of trainable weights of the selected model, used as complexity estimate
n_free_params = count_params(best_model.trainable_weights)

for title, value in (
    ("Best params", best_params.values),
    ("Training Error", best_training_err),
    ("Validation Error", best_model_MEE_val),
    ("Test_MEE ", f'{mean_test_error} +/- {stdev_test_error}'),
    ("Effective free parameters", n_free_params),
):
    print(title)
    print(value)

# append results for final plot
final_MEE.append([mean_test_error, stdev_test_error])
final_dimension.append(n_free_params)
final_names.append(modelName)

In [None]:
# batch: every update uses the whole data set
mode='regression'
hyperp = {
                'output_units' : 2,
                'units' : [4,5,6,7], #number of units in each hidden layer
                'dropout' : [1e-3, 0.0],
                'learning_rate': [1e-2, 1e-3],
                'decay': [0.0, 1e-3],
                'depth':[2,4,1], #number of hidden layers
                'activation_hidden': 'relu',
                'activation_output':'linear',
                'metric': 'MSE'
            }


models_importer.set_input_size(len(X[0])) #set input units
models_importer.set_hyperp(hyperp) #pass user defined hps to the model
modelBuilder = models_importer.get_deepNN #hypermodel function definition

tot_trials = tools.get_search_spaze_size(hyperp) #total combinations of hps
modelName='deep_b'

#auxiliary parameters for hps search
tunerParameters = {
            'directory' : 'tuner',
            'project_name':'deep_b',
            # batch size follows the data set instead of the former hard-coded 1492
            'batch_size': len(X),
            'max_trials' : 0.15*tot_trials
}

# hps search, train and test
net = KerasNet(modelName=modelName, mode=mode, X=X, y=y, tunerParameters=tunerParameters, modelBuilder=modelBuilder )
best_model, best_model_MEE_val, best_params, mean_test_error, stdev_test_error, best_training_err = net.train()

# number of trainable weights of the selected model, used as complexity estimate
n_free_params = count_params(best_model.trainable_weights)

print("Best params")
print(best_params.values)   
print("Training Error")
print(best_training_err)
print("Validation Error")
print(best_model_MEE_val)
print("Test_MEE ")
print(f'{mean_test_error} +/- {stdev_test_error}')
print("Effective free parameters")
print(n_free_params)

#append results for final plot
final_MEE.append([mean_test_error, stdev_test_error])
final_dimension.append(n_free_params)
final_names.append(modelName)

# RandNN

In [None]:
mode = 'reg'
modelName='RandNN'

# user-defined settings and candidate values passed to the hypermodel
hyperp = {
                'units' : [5,7,10],
                'output_units' : 2, 
                'dropout' : [1e-3, 0.0],
                'learning_rate': [1e-2, 1e-3],
                'decay': [0.0, 1e-3],
                'depth':[2,6 ,1],
                'activation_hidden': 'relu',
                'activation_output':'linear',
                'metric': 'MSE'
            }

# Size of THIS model's search space. The cell used to reuse the tot_trials
# left over from the deep-regression cell, whose search space is different.
tot_trials = tools.get_search_spaze_size(hyperp)

#auxiliary parameters for hps search
trainingParameters_KERAS = {
            'directory' : 'tuner',
            # one tuner project per model, so that the RandNN, CNN and Cascade
            # Correlation searches no longer share 'regr_adam_kfold'.
            # NOTE(review): assumes KerasNet forwards directory/project_name to
            # the tuner - confirm.
            'project_name': modelName,
            'batch_size': 64,
            'max_trials' : 0.15*tot_trials,
        }

models_importer.set_input_size(len(X[0])) #set input units
models_importer.set_hyperp(hyperp) #pass user defined hps to the model
modelBuilder = models_importer.get_RandNN #hypermodel function definition

# hps search, train and test
net = KerasNet(modelName=modelName, mode=mode, X=X, y=y, tunerParameters=trainingParameters_KERAS,
                        modelBuilder=modelBuilder)

best_model, best_model_MEE_val, best_params, mean_test_error, stdev_test_error, best_training_err = net.train()

# number of trainable weights of the selected model, used as complexity estimate
n_free_params = count_params(best_model.trainable_weights)

print("Best params")
print(best_params.values)
print("Training Error")
print(best_training_err)
print("Validation Error")
print(best_model_MEE_val)
print("Test_MEE ")
print(f'{mean_test_error} +/- {stdev_test_error}')
print("Effective free parameters")
print(n_free_params)

#append results for final plot
final_MEE.append([mean_test_error, stdev_test_error])
final_dimension.append(n_free_params)
final_names.append(modelName)


# CNN

When implementing a CNN architecture, we implicitly assume that the attributes' position has some kind of significance 
(e.g. columns lying next to each other are somehow more related than those further apart). 
Note that this is not directly implied by the data. However, the net could pick up on non-linear correlations between 
"close" attributes.
Although this model is mostly used for datasets that have an underlying pattern-like structure (e.g. images), we tried it
anyway for experimentation's sake. 

In [None]:
mode = 'reg'
modelName='cnn'

#auxiliary parameters for hps search
trainingParameters_KERAS = {
            'directory' : 'tuner',
            # one tuner project per model, so that the RandNN, CNN and Cascade
            # Correlation searches no longer share 'regr_adam_kfold'.
            # NOTE(review): assumes KerasNet forwards directory/project_name to
            # the tuner - confirm.
            'project_name': modelName,
            'batch_size': 64,
            'max_trials' : 18,  #all of the possibilities
        }

# user-defined settings and candidate values passed to the hypermodel
hyperp = {
                'units' : [10,20,30],
                'output_units' : 2, 
                'learning_rate': [1e-2, 1e-3],
                'decay': [0.0, 1e-4, 1e-3],
                'activation_hidden': 'relu',
                'activation_output':'linear',
                'metric': 'MSE'
            }
models_importer.set_input_size(len(X[0])) #set input units
models_importer.set_hyperp(hyperp) #pass user defined hps to the model
modelBuilder = models_importer.get_CNN #hypermodel function definition

# hps search, train and test
net = KerasNet(modelName=modelName, mode=mode, X=X, y=y, tunerParameters=trainingParameters_KERAS,
                        modelBuilder=modelBuilder)

best_model, best_model_MEE_val, best_params, mean_test_error, stdev_test_error, best_training_err = net.train()

# number of trainable weights of the selected model, used as complexity estimate
n_free_params = count_params(best_model.trainable_weights)

print("Best params")
print(best_params.values)
print("Training Error")
print(best_training_err)
print("Validation Error")
print(best_model_MEE_val)
print("Test_MEE ")
print(f'{mean_test_error} +/- {stdev_test_error}')
print("Effective free parameters")
print(n_free_params)

#append results for final plot
final_MEE.append([mean_test_error, stdev_test_error])
final_dimension.append(n_free_params)
final_names.append(modelName)

# Cascade Correlation

Library source code at https://github.com/mike-gimelfarb/cascade-correlation-neural-networks .  
Follows canonical cascade correlation algorithm.

In [None]:
mode = 'reg'
modelName='cc'

#auxiliary parameters for hps search
trainingParameters_KERAS = {
            'directory' : 'tuner',
            # one tuner project per model, so that the RandNN, CNN and Cascade
            # Correlation searches no longer share 'regr_adam_kfold'.
            # NOTE(review): assumes KerasNet forwards directory/project_name to
            # the tuner - confirm.
            'project_name': modelName,
            'batch_size': 64,
            'max_trials' : 1,    #no hyperparameters
        }

hyperp = {}


models_importer.set_input_size(len(X[0])) #set input units
models_importer.set_hyperp(hyperp)
modelBuilder = models_importer.get_CC_units

net = KerasNet(modelName=modelName, mode=mode, X=X, y=y, tunerParameters=trainingParameters_KERAS,
                        modelBuilder=modelBuilder)

best_model, best_model_MEE_val, best_params, mean_test_error, stdev_test_error, best_training_error = net.train()
print(best_params)

# Complexity estimate from the helper, computed once. Input and output sizes
# come from the data instead of the former hard-coded n_in=9 / n_out=2.
n_free_params = tools.get_param_cc(n_in=X.shape[1], n_out=y.shape[1], n_hid=sum(best_params))

print("Training error")
print(best_training_error)
print("Validation Error")
print(best_model_MEE_val)
print("Test error ")
print(f'{mean_test_error} +/- {stdev_test_error}')
print("Effective free parameters")
print(n_free_params)

#append results for final plot
final_MEE.append([mean_test_error, stdev_test_error])
final_dimension.append(n_free_params)
final_names.append(modelName)

# Pytorch

Finally, we used Pytorch on two simple architectures:
- a one layer linear regressor 
- a shallow net with 4 hidden units and a linear activation function 

The validation strategy is the same as the aforementioned one; however, for the final retraining, we chose to train over the whole dataset for the mean number of epochs obtained from the cross validation. 


In [None]:
import torch
from torch import nn

from sklearn.model_selection import KFold, ShuffleSplit, train_test_split
from tools_for_Pytorch import EarlyStopping, weights_init_uniform_fan_in, count_parameters
from tools_for_classes import MEE_metric, save_plot

In [None]:
# Rebind X and y (numpy arrays so far) to float32 torch tensors built from
# the attribute and target frames.
X, y = (
    torch.from_numpy(frame.to_numpy(dtype=np.float32))
    for frame in (X_df, y_df)
)

In [None]:
# Per-fold results filled in by train(). train() only appends to these lists,
# so no `global` declaration is needed (at module level it was a no-op anyway).
test_mse_list = []
test_mee_list = []
epochs_list = []


def train(model, optimizer, X_train, y_train, X_val, y_val, X_test, y_test, name=None):

    '''Train `model` on the whole training split until early stopping (at most 500 epochs).

    Each epoch performs one forward/backward pass with MSE loss on
    (X_train, y_train), one optimizer step, and then evaluates MSE and MEE on
    the validation and test splits without gradient tracking. The validation
    MSE and the model are handed to an EarlyStopping instance, and the loop
    stops as soon as its `early_stop` flag is set.

    Side effects:
        - appends the last-epoch test MSE (a Python float), the last-epoch test
          MEE and the 0-based index of the last epoch run to the module-level
          lists test_mse_list, test_mee_list and epochs_list;
        - prints progress every 10 epochs and a final summary;
        - if `name` is truthy, plots training and validation MSE per epoch and
          saves the figure through save_plot('Pytorch-plots', name).

    Returns None.
    '''

    loss_fn = nn.MSELoss()
    early_stopping = EarlyStopping()

    torch.manual_seed(42)

    epochs = 500
    epoch_count = []

    # histories hold plain floats, so no tensor (and no autograd reference)
    # outlives the epoch that produced it
    train_mse_values = []
    val_mse_values = []
    test_mse_values = []

    train_mee_values = []
    val_mee_values = []
    test_mee_values = []

    for epoch in range(epochs):

        # train mode
        model.train()

        # 1. Forward pass on train data
        train_pred = model(X_train)

        # 2. Calculate the loss
        train_loss = loss_fn(train_pred, y_train)
        train_mee = MEE_metric(y_train.numpy(), train_pred.detach().numpy())

        # 3. Zero grad of the optimizer
        optimizer.zero_grad()

        # 4. Backpropagation
        train_loss.backward()

        # 5. Progress the optimizer
        optimizer.step()

        train_mse = train_loss.item()

        # evaluation mode
        model.eval()

        # make predictions with model without gradient tracking
        with torch.inference_mode():

            # 1. Forward pass on validation and test data
            val_pred = model(X_val)
            test_pred = model(X_test)

            # 2. Calculate mse and mee on validation and test data
            val_loss = loss_fn(val_pred, y_val)
            test_mse = loss_fn(test_pred, y_test).item()
            val_mee = MEE_metric(y_val.numpy(), val_pred.numpy())
            test_mee = MEE_metric(y_test.numpy(), test_pred.numpy())

        val_mse = val_loss.item()

        epoch_count.append(epoch)
        train_mse_values.append(train_mse)
        val_mse_values.append(val_mse)
        test_mse_values.append(test_mse)

        train_mee_values.append(train_mee)
        val_mee_values.append(val_mee)
        test_mee_values.append(test_mee)

        # early_stopping needs the validation loss to check if it has decreased;
        # it still receives the loss tensor, exactly as before
        early_stopping(val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        if epoch % 10 == 0:
            print(f"Epoch is {epoch:<3} | Training MSE: {train_mse:.3f} | Validation MSE: {val_mse:.3f} | Trainining MEE: {train_mee:.3f} | Val MEE: {val_mee:.3f}")

    print(f"Epoch is {epoch:<3} \nTraining MSE: {train_mse:.3f} | Validation MSE: {val_mse:.3f} | Test MSE: {test_mse:.3f}")
    print(f"Training MEE: {train_mee:.3f} | Validation MEE: {val_mee:.3f} | Test MEE: {test_mee:.3f}")

    # results of the last epoch run; epochs_list receives its 0-based index
    test_mse_list.append(test_mse_values[-1])
    test_mee_list.append(test_mee_values[-1])
    epochs_list.append(epoch_count[-1])

    if name:
        plt.subplots()  # fresh figure for this run
        plt.plot(epoch_count, train_mse_values, label="Training MSE")
        plt.plot(epoch_count, val_mse_values, label="Validation MSE", linestyle='dashed')
        plt.title(name  + " TR and VL MSE")
        plt.ylabel("MSE")
        plt.xlabel("Epochs")
        plt.legend()
        folder = 'Pytorch-plots'
        save_plot(folder, name)
        plt.show()


In [None]:
def final_retraining(model, optimizer, X, y, epochs, name=None):
    
    '''Retrain `model` on the whole data set (X, y) for a fixed number of epochs.

    Each epoch performs one forward/backward pass with MSE loss and one
    optimizer step. No validation and no early stopping are involved.

    Parameters:
        model, optimizer: the network to train and the optimizer bound to its parameters.
        X, y: tensors with all attributes and targets.
        epochs: number of epochs to run; must be at least 1.
        name: if truthy, the training MSE curve is plotted and saved through
            save_plot('Pytorch-plots', name).

    Raises:
        ValueError: if `epochs` is smaller than 1 (previously this crashed with
            an unbound `epoch` at the final summary).

    Returns None; progress is printed every 10 epochs and once at the end.
    '''

    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    loss_fn = nn.MSELoss()

    epoch_count = []
    # histories hold plain floats, so no tensor (and no autograd reference)
    # outlives the epoch that produced it
    train_mse_values = []
    train_mee_values = []
    
    for epoch in range(epochs):

        # train mode
        model.train()

        # 1. Forward pass on train data
        train_pred = model(X)
        
        # 2. Calculate the loss
        train_loss = loss_fn(train_pred, y)
        train_mee = MEE_metric(y.numpy(), train_pred.detach().numpy())

        # 3. Zero grad of the optimizer
        optimizer.zero_grad()
        
        # 4. Backpropagation
        train_loss.backward()
        
        # 5. Progress the optimizer
        optimizer.step()

        train_mse = train_loss.item()

        epoch_count.append(epoch)
        train_mse_values.append(train_mse)
        train_mee_values.append(train_mee)
            
        if epoch % 10 == 0:
            print(f"Epoch is {epoch:<3} | Training MSE: {train_mse:.3f} | Trainining MEE: {train_mee:.3f}")

    print(f"Epoch is {epoch:<3} | Training MSE: {train_mse:.3f} | Training MEE: {train_mee:.3f}")

    if name: 
        plt.subplots()  # fresh figure for this run
        plt.plot(epoch_count, train_mse_values, label="Training MSE")
        plt.title(name  + " TR MSE")
        plt.ylabel("MSE")
        plt.xlabel("Epochs")
        plt.legend()
        folder = 'Pytorch-plots'
        save_plot(folder, name)
        plt.show()

In [None]:
RANDOM_STATE = 42
K = 5                    # number of folds used in k-fold
VAL_SPLIT = 1 / (K - 1)  # validation split in k-fold: a quarter of each development set

# weight initialiser applied to every freshly built model
w_init = weights_init_uniform_fan_in

# splitters for the outer k-fold and the inner hold-out
inner_holdout = ShuffleSplit(n_splits=1, test_size=VAL_SPLIT, random_state=RANDOM_STATE)
outer_kfold = KFold(n_splits=K, shuffle=True, random_state=RANDOM_STATE)


## 1 Linear Layer

In [None]:
# fresh accumulators for this model; train() appends to all three lists
test_mse_list = []
test_mee_list = []
epochs_list = []

name = 'pt_LR'

# layer sizes derived from the data instead of the former hard-coded 9 and 2
n_inputs, n_outputs = X.shape[1], y.shape[1]

# outer loop for k times
for i, (dev_idx, test_idx) in enumerate(outer_kfold.split(X)):
    X_dev, X_test = X[dev_idx], X[test_idx]
    y_dev, y_test = y[dev_idx], y[test_idx]

    print(f'\nFOLD N. {i+1}')

    # inner hold-out: the last VAL_SPLIT share of the development set (no reshuffling)
    X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=VAL_SPLIT, shuffle=False)

    # a new, freshly initialised model and optimizer for every fold
    model = nn.Sequential(nn.Linear(in_features=n_inputs, out_features=n_outputs))

    # weights initialization
    model.apply(w_init)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    train(model, optimizer, X_train, y_train, X_val, y_val, X_test, y_test, name=name)

# computes the mean over the folds
mean_test_mee_error = np.mean(test_mee_list)
stdev_test_mee_error = np.std(test_mee_list)
mean_epochs = int(np.ceil(np.mean(epochs_list)))

print(f'\nMean epoch count: {mean_epochs} +/- {np.std(epochs_list):.0f}')
print(f'Test MEE: {mean_test_mee_error:.3f} +/- {stdev_test_mee_error:.3f}')

# counted on the model of the last fold; every fold uses the same architecture
parameters_count = count_parameters(model)
print(f'Effective free paramters: {parameters_count}')

# append results for final plot
final_MEE.append([mean_test_mee_error, stdev_test_mee_error])
final_dimension.append(parameters_count)
final_names.append(name)


In [None]:
# Final model: same single linear layer, with sizes derived from the data
# instead of the former hard-coded 9 and 2.
model = nn.Sequential(nn.Linear(in_features=X.shape[1], out_features=y.shape[1]))

# weights initialization
model.apply(w_init)

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# retrain on the whole data set for the mean epoch value found in cross validation
final_retraining(model, optimizer, X=X, y=y, epochs=mean_epochs, name=name)

## 4 hidden units

In [None]:
# fresh accumulators for this model; train() appends to all three lists
test_mse_list = []
test_mee_list = []
epochs_list = []

name = 'pt_4hid'

# input/output sizes derived from the data instead of the former hard-coded 9 and 2
n_inputs, n_outputs = X.shape[1], y.shape[1]
n_hidden = 4

# outer loop for k times
for i, (dev_idx, test_idx) in enumerate(outer_kfold.split(X)):
    X_dev, X_test = X[dev_idx], X[test_idx]
    y_dev, y_test = y[dev_idx], y[test_idx]

    print(f'\nFOLD N. {i+1}')

    # inner hold-out: the last VAL_SPLIT share of the development set (no reshuffling)
    X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=VAL_SPLIT, shuffle=False)

    # a new, freshly initialised model and optimizer for every fold;
    # two stacked linear layers with no non-linearity in between
    model = nn.Sequential(
        nn.Linear(in_features=n_inputs, out_features=n_hidden),
        nn.Linear(in_features=n_hidden, out_features=n_outputs),
    )

    # weights initialization
    model.apply(w_init)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    train(model, optimizer, X_train, y_train, X_val, y_val, X_test, y_test, name=name)

# computes the mean over the folds
mean_test_mee_error = np.mean(test_mee_list)
stdev_test_mee_error = np.std(test_mee_list)
mean_epochs = int(np.ceil(np.mean(epochs_list)))

print(f'\nMean epoch count: {mean_epochs} +/- {np.std(epochs_list):.0f}')
print(f'Test MEE: {mean_test_mee_error:.3f} +/- {stdev_test_mee_error:.3f}')

# counted on the model of the last fold; every fold uses the same architecture
parameters_count = count_parameters(model)
print(f'Effective free parameters: {parameters_count}')

# append results for final plot
final_MEE.append([mean_test_mee_error, stdev_test_mee_error])
final_dimension.append(parameters_count)
final_names.append(name)

In [None]:
# Final model: same two-layer architecture with 4 hidden units, with input and
# output sizes derived from the data instead of the former hard-coded 9 and 2.
model = nn.Sequential(
        nn.Linear(in_features=X.shape[1], out_features=4),
        nn.Linear(in_features=4, out_features=y.shape[1]),
        )

# weights initialization
model.apply(w_init)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# retrain on the whole data set for the mean epoch value found in cross validation
final_retraining(model, optimizer, X=X, y=y, epochs=mean_epochs, name=name)



# Final comparison 

Now let's compare the models we have just built in relation to their complexity.

In [None]:
# The SVR cell stores -1 as a placeholder when the rbf kernel is selected.
# Replace the placeholder with the largest complexity found. When the linear
# kernel won there is no placeholder, and the former final_dimension.index(-1)
# raised ValueError in that case.
max_dimension = max(final_dimension)
final_dimension[:] = [max_dimension if dim == -1 else dim for dim in final_dimension]

#unpack mean and std
final_mean, final_std = zip(*final_MEE)

# a single figure (the extra plt.figure() only opened an empty one)
fig, ax = plt.subplots(figsize=(10,10))

ax.set_xlabel('Effective free parameters')
ax.set_ylabel('MEE')

# one point with error bar per model, annotated with the model name
for model_name, dimension, mee_mean, mee_std in zip(final_names, final_dimension, final_mean, final_std):
    ax.errorbar(dimension, mee_mean, yerr=mee_std, label=model_name, fmt='.')
    ax.annotate(model_name, (dimension, mee_mean))

folder = 'ML-CUP-plots'
tools.save_plot(folder, 'final')
plt.show()
print(f'Elapsed time: {time.time()-start}')
