# This notebook aims to help set the hyperparameters to the ideal value to do this we use the training and tests sets to fit the hypterparameters, leaving a validation set to study once we have picked the best features (from the feature selection notebook) and hyperparameters. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import logging
from sklearn import tree
from sklearn.preprocessing import PolynomialFeatures, normalize, StandardScaler, MinMaxScaler
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor

from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [2]:
dataTrain= pd.read_csv('../Data Scraping/DataTrainSet.csv')
dataVal = pd.read_csv('../Data Scraping/DataValidateSet.csv')

features = ['player','pos_x','age_x','team_id_x','g_x','mp_x','fg_per_poss','fga_per_poss','fg_pct_x'
              ,'fg3_per_poss','fg3a_per_poss','fg3_pct_x','fg2_per_poss','fg2a_per_poss','fg2_pct_x','ft_per_poss'
              ,'fta_per_poss','ft_pct_x','orb_per_poss','drb_per_poss','trb_per_poss','ast_per_poss','stl_per_poss'
              ,'blk_per_poss','tov_per_poss','pf_per_poss','pts_per_poss','off_rtg','def_rtg','per','ts_pct'
              ,'fg3a_per_fga_pct','fta_per_fga_pct','orb_pct','drb_pct','trb_pct','ast_pct','stl_pct','blk_pct'
              ,'tov_pct','usg_pct','ows','dws','ws','ws_per_48','obpm','dbpm','bpm','vorp'
              ,'mp_per_g','fg_per_g','fga_per_g','fg3_per_g','fg3a_per_g'
              ,'fg2_per_g','fg2a_per_g','efg_pct','ft_per_g','fta_per_g','orb_per_g','drb_per_g','trb_per_g'
              ,'ast_per_g','stl_per_g','blk_per_g','tov_per_g','pf_per_g','pts_per_g','fg_per_mp','fga_per_mp'
              ,'fg3_per_mp','fg3a_per_mp','fg2_per_mp','fg2a_per_mp','ft_per_mp','fta_per_mp','orb_per_mp'
              ,'drb_per_mp','trb_per_mp','ast_per_mp','stl_per_mp','blk_per_mp','tov_per_mp','pf_per_mp','pts_per_mp'
              ,'votes_first','points_won']

feat = features[4:-2]
XTrainDF = dataTrain[feat]
XValDF = dataVal[feat]
XTrainDF = XTrainDF.fillna(0)
XValDF = XValDF.fillna(0)

XTrain = XTrainDF.to_numpy()
XVal = XValDF.to_numpy()

yTrain = dataTrain['points_won'].to_numpy()
yVal = dataVal['points_won'].to_numpy()


In [11]:
def pipeline(XTrain, yTrain, XVal, yVal, estimators, params, filename, poly_fit):
    
    minimal_error, best_estimator = None, None
    
    for estimator in estimators:
        
        try: 
            print(f"Starting with estimator: {estimator.__name__}")
            logging.info(f"Starting with estimator: {estimator.__name__}")

            for index, cur_params in enumerate(params[estimator.__name__]):
                print(cur_params)
                regressor = estimator(**cur_params)

                # To collect MSE over each split
                errors = []

                # to collect accuracies
                accuracies = []
                top_1_accs = []

                # Get train data
                train_x = XTrain
                train_y = yTrain
                
                # Validate over one season only
                val_x = XVal
                val_y = yVal
                #val_y = val_y.reshape(val_y.shape[0], )

                

                if poly_fit is not None:
                    train_x = poly_fit.fit_transform(train_x)
                    val_x = poly_fit.fit_transform(val_x)

                
                shuffle_x, shuffle_y = shuffle(train_x, train_y)
                

                regressor.fit(shuffle_x, shuffle_y)
                predicted_y = regressor.predict(val_x)

                sorted_indices = np.argsort(predicted_y)[::-1]
                correct_indices = np.arange(len(val_y))

                curr_error = mean_squared_error(val_y, predicted_y)
                errors.append(curr_error)
                mean_error = np.average(errors)
            
                logging.info(f"Params: {cur_params}, MSE over all splits is: {mean_error:.4f}")
                print(
                    f"Params: {cur_params}, MSE over all splits is: {mean_error:.4f}")

                if minimal_error is None or mean_error < minimal_error:
                    minimal_error = mean_error
                    best_estimator = estimator(**cur_params)
        except Exception:
            print(f"Exception: {estimator}")
            continue
            
    return best_estimator

In [15]:
estimators = [Ridge]

params = {Ridge.__name__: 
    [
        {
            'alpha': 1.0, 'solver':'svd'
        },
        {
            'alpha': 10.0, 'solver':'svd'
        },
        {
            'alpha': 30.0, 'solver':'svd'
        },
        {
            'alpha': 50.0, 'solver':'svd'
        },
    ],
         }

In [16]:
best_estimator = pipeline(
    XTrain = XTrain,
    yTrain = yTrain,
    XVal = XVal,
    yVal = yVal, 
    estimators=estimators,
    params=params,
    filename="log_reg_poly_2.txt",
    # scaler=MinMaxScaler(),
    poly_fit=PolynomialFeatures(degree=2, interaction_only=True),
)

TypeError: pipeline() got an unexpected keyword argument 'XVal'

In [7]:
estimators = [RandomForestRegressor]

params = {RandomForestRegressor.__name__: 
    [ 
        {
            'n_estimators':100, 
            'min_samples_split':2,
            'min_weight_fraction_leaf':0
        },
        {
            'n_estimators':100, 
            'min_samples_split':2,
            'warm_start':'True'
        },
        {
            'n_estimators':100, 
            'min_samples_split':2,
            'min_impurity_decrease':0.0
        },
    
    ],
         }

best_estimator = pipeline(
    data_frame=train_data,
    estimators=estimators,
    params=params,
    filename="log_reg_poly_2.txt",
    # scaler=MinMaxScaler(),
    poly_fit=PolynomialFeatures(degree=1, interaction_only=True),
)

NameError: name 'train_data' is not defined

In [163]:
estimators = [SGDRegressor]

params = {SGDRegressor.__name__: 
    [ 
        {
            'loss':'squared_epsilon_insensitive',
            'epsilon':0.0001,
            'learning_rate':'adaptive',
            'eta0':0.01
            
        },
        {
            'loss':'squared_epsilon_insensitive',
            'epsilon':0.0001,
            'learning_rate':'adaptive',
            'eta0':0.01,
            'warm_start': 'True',
        },
        {
            'loss':'squared_epsilon_insensitive',
            'epsilon':0.0001,
            'learning_rate':'adaptive',
            'eta0':0.01
        },
    
    ],
         }

best_estimator = pipeline(
    XTrain = XTrain,
    yTrain = yTrain,
    XVal, = XVal,
    yVal = yVal, 
    estimators=estimators,
    params=params,
    filename="log_reg_poly_2.txt",
    # scaler=MinMaxScaler(),
    poly_fit=PolynomialFeatures(degree=2, interaction_only=True),
)

Starting with estimator: SGDRegressor
{'loss': 'squared_epsilon_insensitive', 'epsilon': 0.0001, 'learning_rate': 'adaptive', 'eta0': 0.01}
Params: {'loss': 'squared_epsilon_insensitive', 'epsilon': 0.0001, 'learning_rate': 'adaptive', 'eta0': 0.01}, MSE over all splits is: 0.0175
{'loss': 'squared_epsilon_insensitive', 'epsilon': 0.0001, 'learning_rate': 'adaptive', 'eta0': 0.01, 'warm_start': 'True'}
Params: {'loss': 'squared_epsilon_insensitive', 'epsilon': 0.0001, 'learning_rate': 'adaptive', 'eta0': 0.01, 'warm_start': 'True'}, MSE over all splits is: 0.0178
{'loss': 'squared_epsilon_insensitive', 'epsilon': 0.0001, 'learning_rate': 'adaptive', 'eta0': 0.01}
Params: {'loss': 'squared_epsilon_insensitive', 'epsilon': 0.0001, 'learning_rate': 'adaptive', 'eta0': 0.01}, MSE over all splits is: 0.0174


In [169]:
estimators = [GradientBoostingRegressor]

params = {GradientBoostingRegressor.__name__: 
    [ 
        {
            'loss':'ls',
            'learning_rate':0.5,
            'n_estimators':100
        },
        {
            'loss':'ls',
            'learning_rate':0.5,
            'n_estimators':300
        },
        {
            'loss':'ls',
            'learning_rate':0.1,
            'n_estimators':100
        },
        {
            'loss':'ls',
            'learning_rate':0.1,
            'n_estimators':300
        },
    
    ],
         }

best_estimator = pipeline(
    XTrain = XTrain,
    yTrain = yTrain,
    XVal, = XVal,
    yVal = yVal, 
    estimators=estimators,
    params=params,
    filename="log_reg_poly_2.txt",
    # scaler=MinMaxScaler(),
    poly_fit=PolynomialFeatures(degree=2, interaction_only=True),
)

Starting with estimator: GradientBoostingRegressor
{'loss': 'ls', 'learning_rate': 0.5, 'n_estimators': 100}
Params: {'loss': 'ls', 'learning_rate': 0.5, 'n_estimators': 100}, MSE over all splits is: 0.0001
{'loss': 'ls', 'learning_rate': 0.5, 'n_estimators': 300}
Params: {'loss': 'ls', 'learning_rate': 0.5, 'n_estimators': 300}, MSE over all splits is: 0.0000
{'loss': 'ls', 'learning_rate': 0.1, 'n_estimators': 100}
Params: {'loss': 'ls', 'learning_rate': 0.1, 'n_estimators': 100}, MSE over all splits is: 0.0026
{'loss': 'ls', 'learning_rate': 0.1, 'n_estimators': 300}
Params: {'loss': 'ls', 'learning_rate': 0.1, 'n_estimators': 300}, MSE over all splits is: 0.0003


In [181]:
estimators = [MLPRegressor]

params = {MLPRegressor.__name__: 
    [ 
        {
            'hidden_layer_sizes':(1000,),
            'activation':'relu',
            'solver':'lbfgs'
        },
        {
            'hidden_layer_sizes':(1000,),
            'activation':'relu',
            'solver':'lbfgs'
        },
        {
            'hidden_layer_sizes':(1000,),
            'activation':'relu',
            'solver':'lbfgs'
        },
    
    ],
         }

best_estimator = pipeline(
    XTrain = XTrain,
    yTrain = yTrain,
    XVal, = XVal,
    yVal = yVal, 
    estimators=estimators,
    params=params,
    filename="log_reg_poly_2.txt",
    # scaler=MinMaxScaler(),
    poly_fit=PolynomialFeatures(degree=2, interaction_only=True),
)

Starting with estimator: MLPRegressor
{'hidden_layer_sizes': (1000,), 'activation': 'relu', 'solver': 'lbfgs'}
Params: {'hidden_layer_sizes': (1000,), 'activation': 'relu', 'solver': 'lbfgs'}, MSE over all splits is: 0.0094
{'hidden_layer_sizes': (1000,), 'activation': 'relu', 'solver': 'lbfgs'}
Params: {'hidden_layer_sizes': (1000,), 'activation': 'relu', 'solver': 'lbfgs'}, MSE over all splits is: 0.0097
{'hidden_layer_sizes': (1000,), 'activation': 'relu', 'solver': 'lbfgs'}
Params: {'hidden_layer_sizes': (1000,), 'activation': 'relu', 'solver': 'lbfgs'}, MSE over all splits is: 0.0098
