In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from cup_helpers import read_ds
from sklearn.model_selection import train_test_split
from numpy import mean
from sklearn.metrics import make_scorer
import tensorflow as tf
from cup_helpers import SEED
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor


# Goal: report the TR/VALIDATION and the internal TEST errors in the original scale, i.e. as MEE for the 2023 cup (see next slides).




In [2]:
# Locations of the ML-CUP23 CSV files, relative to the working directory.
# TR_PATH is the file loaded through read_ds further down; TS_PATH is only
# declared here and is not read by any of the cells shown.
TR_PATH, TS_PATH = (
    "./data/ML-CUP23-TR.csv",
    "./data/ML-CUP23-TS.csv",
)

In [3]:
# Load the TR file. read_ds returns a pair that is unpacked into X and y —
# presumably inputs first and targets second (TODO confirm in cup_helpers).
X, y = read_ds(TR_PATH)

In [4]:
# Shuffled 5-fold splitter shared by every grid search in this notebook.
# NOTE(review): the seed is a literal 42 here, while the other seeded calls
# use cup_helpers.SEED — confirm whether the two are meant to coincide.
cv_strategy = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [5]:
# Hold out 20% of the loaded data as the internal test set; the other 80% is
# what the grid searches are fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
)

In [6]:
def mee(y_true, y_pred):
    """Mean Euclidean Error (MEE) between targets and predictions.

    Every row is one sample: the Euclidean (L2) norm of the per-sample
    difference is computed, and the norms are averaged over the samples.

    Parameters
    ----------
    y_true, y_pred : array-like
        Anything ``np.asarray`` can turn into a float array (lists, tuples,
        NumPy arrays, pandas objects, ...). A 1-D input is read as one scalar
        value per sample, a 0-D input as a single sample.

    Returns
    -------
    numpy.float64
        The average of the per-sample norms. For inputs with more than two
        dimensions the norm is taken along axis 1 and the mean along axis 0,
        so an array is returned in that case.
    """
    # np.asarray instead of .astype so that plain Python sequences work too.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    # Turn 1-D inputs into column vectors (one sample per row). Doing this for
    # both arguments keeps an (n,) array paired with an (n, 1) array aligned
    # row by row instead of broadcasting to an (n, n) matrix.
    if y_true.ndim == 1:
        y_true = y_true[:, np.newaxis]
    if y_pred.ndim == 1:
        y_pred = y_pred[:, np.newaxis]

    l2_norms = np.linalg.norm(y_pred - y_true, axis=1)
    return np.mean(l2_norms, axis=0)
      
# Scorers handed to GridSearchCV, keyed by the name used for refit and for the
# cv_results_ columns. MEE is an error, hence greater_is_better=False; the
# reporting code takes abs() of the resulting scores.
custom_scores = dict(
    mee=make_scorer(mee, greater_is_better=False),
)

In [7]:
def list_grids(grids_dict):
    """Return the sub-grids stored in ``grids_dict`` as a list.

    The keys are only labels and are dropped; the values come back in the
    dictionary's iteration order.
    """
    return list(grids_dict.values())

In [8]:
# Search space for the SVR model, which the notebook wraps in
# MultiOutputRegressor: every key carries the "estimator__" prefix that
# scikit-learn uses to address parameters of a wrapped estimator.
GRID_SVR = {
    # Kernels that do not take a polynomial degree.
    "linear_rbf_sigmoid": {
        "estimator__kernel": ["linear", "rbf", "sigmoid"],
        "estimator__C": [0.0001, 0.1, 10, 10000],
        "estimator__gamma": ["scale", "auto", 1e-4, 1e-1, 1e1, 1e4],
        "estimator__tol": [1e-4, 1e-8, 1e-2, 1e-1],
        "estimator__epsilon": [1e-4, 1e-1, 1e1, 1e4],
        "estimator__max_iter": [10000],
    },
    # Polynomial kernel, searched separately because it adds "degree".
    # (The kernel key used to be written twice here; one entry is enough.)
    "poly": {
        "estimator__kernel": ["poly"],
        "estimator__C": [0.0001, 0.1, 10, 10000],
        "estimator__degree": [2, 3, 5, 7, 9],
        "estimator__gamma": ["scale", "auto", 0.0001, 0.1, 10, 10000],
        "estimator__tol": [0.0001, 1e-8, 0.01, 0.1],
        "estimator__epsilon": [0.0001, 0.1, 10, 10000],
        "estimator__max_iter": [10000],
    },
}


# Plain linear regression has a single configuration; it is still expressed as
# a grid so that it goes through the same search code as the other models.
GRID_LINEAR_REGRESSION = dict(
    unique_entry=dict(fit_intercept=[True]),
)

# Lasso search space, split by coordinate-selection rule: only the "random"
# rule is given a random_state.
GRID_LASSO = dict(
    cyclic_selec=dict(
        selection=["cyclic"],
        alpha=[0.0001, 0.1, 10.0, 10000.0],
        positive=[False, True],
    ),
    random_selec=dict(
        selection=["random"],
        alpha=[0.0001, 0.1, 10.0, 10000.0],
        positive=[False, True],
        random_state=[SEED],
    ),
)

# Ridge search space, split by solver family so that each family only receives
# the extra parameters that are set for it.
GRID_RIDGE = {
    # Solvers searched with alpha only. 'sag' and 'saga' are deliberately not
    # listed here: they are covered, seeded, by "sag_saga" below, and keeping
    # them in both places fitted every alpha twice (once without a seed).
    "others": {
        "solver": ["svd", "lsqr", "sparse_cg", "cholesky"],
        "alpha": [0.0001, 0.1, 10.0, 10000.0],
    },
    # 'lbfgs' is searched with positive=True only.
    "lbfgs_positive": {
        "solver": ["lbfgs"],
        "alpha": [0.0001, 0.1, 10.0, 10000.0],
        "positive": [True],
    },
    # Solvers that are given a fixed random_state for repeatable results.
    "sag_saga": {
        "solver": ["sag", "saga"],
        "alpha": [0.0001, 0.1, 10.0, 10000.0],
        "random_state": [SEED],
    },
}

# KNeighborsRegressor search space (a single sub-grid).
# NOTE(review): "p" is the Minkowski exponent — it presumably has no effect on
# the "euclidean" runs, which would make those combinations redundant; confirm
# against the scikit-learn documentation.
GRID_KNR = dict(
    unique_entry=dict(
        n_neighbors=[5, 10, 15, 20, 40, 60, 100, 150, 200],
        p=[1, 2, 10, 1000, 100000],
        algorithm=["auto", "ball_tree", "kd_tree", "brute"],
        metric=["euclidean", "minkowski"],
        weights=["distance", "uniform"],
    ),
)
# RandomForestRegressor search space (a single sub-grid).
# Every candidate value is listed once: repeated values (90 in n_estimators,
# 8 in max_depth) made the search fit identical configurations several times.
GRID_RFR = {
    "unique_entry": {
        "n_estimators": [85, 90, 95],
        "criterion": ["absolute_error", "friedman_mse", "squared_error"],
        "max_depth": [7, 8, 9],
        "min_samples_split": [4, 6],
        "min_samples_leaf": [1],
        "min_weight_fraction_leaf": [0, 0.001],
        "ccp_alpha": [0.001, 0.002],
    }
}

# Lookup from the model_name string passed to the search helpers to the
# dictionary of sub-grids explored for that model.
params_map = dict(
    SVR=GRID_SVR,
    linear_regression=GRID_LINEAR_REGRESSION,
    lasso_regression=GRID_LASSO,
    ridge_regression=GRID_RIDGE,
    knr=GRID_KNR,
    rfr=GRID_RFR,
)

In [9]:
def execute_gridesearch(X, y, model, model_name):
    """Run the cross-validated grid search for one model and print its errors.

    Parameters
    ----------
    X, y
        Inputs and targets passed to ``GridSearchCV.fit``.
    model
        Estimator to tune.
    model_name : str
        Key of ``params_map`` selecting the sub-grids to explore.

    Returns
    -------
    GridSearchCV
        The fitted search object, refitted on ``(X, y)`` with the configuration
        that ranks best on the "mee" scorer.

    Raises
    ------
    KeyError
        If ``model_name`` is not a key of ``params_map``.
    """
    param_grids = list_grids(params_map[model_name])

    # The splitter itself is passed as cv (instead of a one-shot generator from
    # cv_strategy.split): its fixed random_state yields the same folds.
    search = GridSearchCV(
        model,
        param_grids,
        scoring=custom_scores,
        refit="mee",
        cv=cv_strategy,
        n_jobs=-1,
        return_train_score=True,
    ).fit(X, y)

    results = search.cv_results_
    best = search.best_index_
    # The "mee" scorer is declared with greater_is_better=False, so the stored
    # means are sign-flipped errors; abs() restores the error scale.
    print("mean validation error", abs(results["mean_test_mee"][best]))
    print("std validation error", abs(results["std_test_mee"][best]))
    print("mean training error", abs(results["mean_train_mee"][best]))
    print("std training error", abs(results["std_train_mee"][best]))
    return search



In [10]:

def do_sklearn_GridSearchCV(X, y, X_test, y_test, model, model_name):
    """Tune ``model`` by grid search on ``(X, y)`` and print its test MEE.

    Parameters
    ----------
    X, y
        Data the grid search is cross-validated and refitted on.
    X_test, y_test
        Held-out data, used only to score the refitted best estimator.
    model
        Estimator to tune.
    model_name : str
        Key of ``params_map``; also used to label the printed report.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    print("Model Used: " + model_name)
    grid = execute_gridesearch(X, y, model, model_name)
    print("Model used: " + model_name + ", best parameters: " + str(grid.best_params_))
    y_pred = grid.best_estimator_.predict(X_test)
    # refit_time_ covers only the final refit of the best configuration on the
    # whole of (X, y), not the duration of the search — label it accordingly.
    print("Refit time: " + str(grid.refit_time_) + " seconds")
    print("Test MEE:", mee(y_test, y_pred))
    print("------------------------------------------------------------------------------------------------------")


In [11]:
def grid_search_model(model, model_name, X_train, y_train, X_test, y_test):
    """Tune ``model`` on the training split and report its scores.

    Forwards to ``do_sklearn_GridSearchCV``, which takes the same values in a
    different order (data first, model last); keywords make that mapping
    explicit. Returns ``None``.
    """
    do_sklearn_GridSearchCV(
        X=X_train,
        y=y_train,
        X_test=X_test,
        y_test=y_test,
        model=model,
        model_name=model_name,
    )

# SVR

In [None]:
# SVR is wrapped in MultiOutputRegressor, which is why the keys of GRID_SVR
# carry the "estimator__" prefix.
grid_search_model(
    model=MultiOutputRegressor(SVR()),
    model_name="SVR",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Linear Regression

## Linear regression without regularization

In [None]:
# Unregularised baseline; its grid holds a single configuration.
grid_search_model(
    model=LinearRegression(),
    model_name="linear_regression",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## Linear regression with L1 regularization

In [None]:
# L1-regularised linear model, searched over GRID_LASSO.
grid_search_model(
    model=Lasso(),
    model_name="lasso_regression",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## Linear regression with L2 regularization (Tikhonov regularization)

In [None]:
# L2-regularised linear model, searched over GRID_RIDGE.
grid_search_model(
    model=Ridge(),
    model_name="ridge_regression",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# KNR

In [None]:
# k-nearest-neighbours regressor, searched over GRID_KNR.
grid_search_model(
    model=KNeighborsRegressor(),
    model_name="knr",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# RFR CV

In [None]:
# Random forest, searched over GRID_RFR. The forest is seeded with SEED so
# that the reported errors are repeatable across runs, in line with the other
# seeded steps of this notebook.
grid_search_model(
    model=RandomForestRegressor(random_state=SEED),
    model_name="rfr",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)