In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, median_absolute_error, mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.svm import LinearSVR, SVR
from sklearn.neural_network import MLPRegressor
from sklearn.cross_decomposition import PLSRegression

In [2]:
# Cross-validation setup shared by every grid search in this notebook.
n_folds = 10  # number of cross-validation folds

# Multi-metric scoring, expressed with scikit-learn's built-in scorer names.
# The "neg_*" scorers return the negated error so that larger is always better;
# the summary code in later cells flips the sign back when reading `cv_results_`.
# The RMSE entry used to be built with
# `make_scorer(mean_squared_error, greater_is_better=False, squared=False)`;
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# so that call fails on current releases. The built-in scorer yields the same
# negated RMSE and works on old and new versions alike.
scoring = {'R2': 'r2',
           'MAD': 'neg_median_absolute_error',   # median absolute error (key kept for the result columns)
           'RMSE': 'neg_root_mean_squared_error'}

In [3]:
# Read the modelling table. 'result_regr' is the regression target; both result
# columns are removed from the predictor matrix.
df = pd.read_csv('mm_data.csv')
X = df.drop(['result_class', 'result_regr'], axis=1)
Y = df.loc[:, 'result_regr']
X.shape

(1334, 35)

### Elastic Net Regression

In [4]:
# Elastic net without an intercept; the search tunes the L1/L2 mix and the penalty strength.
# Alternative kept for reference (disabled): centre and scale the features first.
# pipe = Pipeline(steps=[('center_scale', StandardScaler()), ('enet', ElasticNet(fit_intercept=False, random_state=321))])
pipe = Pipeline([('enet', ElasticNet(random_state=321, fit_intercept=False))])
parameters = dict(
    enet__l1_ratio=[0.1, 0.25, 0.5, 0.75, 0.9],   # 1 = pure L1 penalty, 0 = pure L2 penalty
    enet__alpha=[.001, .005, 0.1, 0.5, 1, 2, 4],  # overall regularization strength (default = 1)
)
reg = GridSearchCV(estimator=pipe, param_grid=parameters, scoring=scoring,
                   cv=n_folds, n_jobs=-1, refit=False).fit(X, Y)

# Summarise each configuration pessimistically: the mean CV score shifted by half a
# standard deviation in the unfavourable direction (errors up, R2 down). The error
# scorers are negated, hence the sign flip. Rows are ranked by MAD, then RMSE.
cv_out = (
    pd.DataFrame(reg.cv_results_)
    .assign(
        RMSE_metric=lambda res: (0.5 * res['std_test_RMSE'] - res['mean_test_RMSE']).round(2),
        MAD_metric=lambda res: (0.5 * res['std_test_MAD'] - res['mean_test_MAD']).round(2),
        R2_metric=lambda res: (res['mean_test_R2'] - 0.5 * res['std_test_R2']).round(3),
    )
    .sort_values(by=['MAD_metric', 'RMSE_metric'])
)
cv_out.head(6)[['param_enet__l1_ratio', 'param_enet__alpha',
                'R2_metric', 'RMSE_metric', 'MAD_metric']]

Unnamed: 0,param_enet__l1_ratio,param_enet__alpha,R2_metric,RMSE_metric,MAD_metric
10,0.1,0.1,0.428,10.96,7.55
11,0.25,0.1,0.428,10.96,7.55
21,0.25,1.0,0.434,10.91,7.56
12,0.5,0.1,0.429,10.96,7.56
19,0.9,0.5,0.432,10.92,7.57
22,0.5,1.0,0.433,10.92,7.57


In [5]:
# Refit the elastic net on every row with one of the configurations listed by the
# search (l1_ratio=0.25, alpha=1), then show the ten non-zero coefficients of
# largest absolute value.
enet = ElasticNet(alpha=1, l1_ratio=0.25, random_state=321, fit_intercept=False).fit(X, Y)
enet_coeffs = pd.DataFrame({'feature': X.columns, 'coefficient': enet.coef_})
enet_coeffs = enet_coeffs[enet_coeffs['coefficient'].ne(0)]  # drop features the penalty zeroed out
# Positional order by descending magnitude; only the first ten positions are kept.
enet_coeffs.iloc[np.argsort(-enet_coeffs['coefficient'].abs().to_numpy())[:10]]

Unnamed: 0,feature,coefficient
0,srs_diff,1.030827
1,rank_diff,0.37981
3,pace_diff,-0.241265
4,fg3_diff_a,0.236867
14,stl_diff_a,0.207467
23,opp_tov_pct_diff,0.16288
11,ast_pct_diff,-0.158816
29,opp_blk_pct_diff,0.144491
5,fg3_diff_b,0.140102
9,opp_fta_diff,-0.134614


### Principal Component Regression

In [6]:
# Principal component regression: whitened PCA scores feed an intercept-free OLS fit.
# With svd_solver='full', a fractional n_components keeps as many components as are
# needed to explain that share of the variance.
pipe = Pipeline([
    ('pca', PCA(whiten=True, svd_solver='full')),
    ('ols', LinearRegression(fit_intercept=False)),
])
parameters = dict(pca__n_components=[0.95, 0.96, 0.97, 0.98, 0.99])
reg = GridSearchCV(estimator=pipe, param_grid=parameters, scoring=scoring,
                   cv=n_folds, n_jobs=-1, refit=False).fit(X, Y)

# Summarise each configuration pessimistically: the mean CV score shifted by half a
# standard deviation in the unfavourable direction (errors up, R2 down). The error
# scorers are negated, hence the sign flip. Rows are ranked by MAD, then RMSE.
cv_out = (
    pd.DataFrame(reg.cv_results_)
    .assign(
        RMSE_metric=lambda res: (0.5 * res['std_test_RMSE'] - res['mean_test_RMSE']).round(2),
        MAD_metric=lambda res: (0.5 * res['std_test_MAD'] - res['mean_test_MAD']).round(2),
        R2_metric=lambda res: (res['mean_test_R2'] - 0.5 * res['std_test_R2']).round(3),
    )
    .sort_values(by=['MAD_metric', 'RMSE_metric'])
)
cv_out.head(6)[['param_pca__n_components',
                'R2_metric', 'RMSE_metric', 'MAD_metric']]

Unnamed: 0,param_pca__n_components,R2_metric,RMSE_metric,MAD_metric
4,0.99,0.433,10.91,7.51
2,0.97,0.436,10.85,7.58
1,0.96,0.437,10.84,7.59
3,0.98,0.433,10.9,7.63
0,0.95,0.413,11.17,7.74


### PCA - Linear SVM

In [7]:
# Linear support vector regression (squared epsilon-insensitive loss, no intercept)
# on whitened PCA scores; the search covers the retained variance and the C parameter.
LinSVR = LinearSVR(loss='squared_epsilon_insensitive', random_state=321, fit_intercept=False)
pipe = Pipeline([('pca', PCA(whiten=True, svd_solver='full')), ('LinSVR', LinSVR)])
parameters = dict(
    pca__n_components=[0.95, 0.96, 0.97, 0.98, 0.99],
    LinSVR__C=[.0001, .0003, .001, .003, 0.01, 0.03, .1, .3, 1],  # regularization parameter (default = 1)
)
reg = GridSearchCV(estimator=pipe, param_grid=parameters, scoring=scoring,
                   cv=n_folds, n_jobs=-1, refit=False).fit(X, Y)

# Summarise each configuration pessimistically: the mean CV score shifted by half a
# standard deviation in the unfavourable direction (errors up, R2 down). The error
# scorers are negated, hence the sign flip. Rows are ranked by MAD, then RMSE.
cv_out = (
    pd.DataFrame(reg.cv_results_)
    .assign(
        RMSE_metric=lambda res: (0.5 * res['std_test_RMSE'] - res['mean_test_RMSE']).round(2),
        MAD_metric=lambda res: (0.5 * res['std_test_MAD'] - res['mean_test_MAD']).round(2),
        R2_metric=lambda res: (res['mean_test_R2'] - 0.5 * res['std_test_R2']).round(3),
    )
    .sort_values(by=['MAD_metric', 'RMSE_metric'])
)
cv_out.head(6)[['param_pca__n_components', 'param_LinSVR__C',
                'R2_metric', 'RMSE_metric', 'MAD_metric']]

Unnamed: 0,param_pca__n_components,param_LinSVR__C,R2_metric,RMSE_metric,MAD_metric
19,0.99,0.003,0.432,10.98,7.31
14,0.99,0.001,0.405,11.31,7.31
15,0.95,0.003,0.411,11.24,7.36
18,0.98,0.003,0.432,10.98,7.41
13,0.98,0.001,0.404,11.31,7.45
12,0.97,0.001,0.407,11.27,7.47


### Nonlinear SVM

In [8]:
# Kernel support vector regression on whitened PCA scores; the search covers the
# retained variance, the kernel and the C parameter.
# Alternative kept for reference (disabled): SVR on the raw features, no PCA step.
# pipe = Pipeline(steps=[('SVR', SVR())])
pipe = Pipeline([('pca', PCA(whiten=True, svd_solver='full')), ('SVR', SVR())])
parameters = dict(
    pca__n_components=[0.95, 0.96, 0.97, 0.98, 0.99],
    SVR__kernel=['poly', 'rbf', 'sigmoid'],
    SVR__C=[.06, .125, .25, .5, 1, 2, 4, 8],  # regularization parameter (default = 1)
)
reg = GridSearchCV(estimator=pipe, param_grid=parameters, scoring=scoring,
                   cv=n_folds, n_jobs=-1, refit=False).fit(X, Y)

# Summarise each configuration pessimistically: the mean CV score shifted by half a
# standard deviation in the unfavourable direction (errors up, R2 down). The error
# scorers are negated, hence the sign flip. Rows are ranked by MAD, then RMSE.
cv_out = (
    pd.DataFrame(reg.cv_results_)
    .assign(
        RMSE_metric=lambda res: (0.5 * res['std_test_RMSE'] - res['mean_test_RMSE']).round(2),
        MAD_metric=lambda res: (0.5 * res['std_test_MAD'] - res['mean_test_MAD']).round(2),
        R2_metric=lambda res: (res['mean_test_R2'] - 0.5 * res['std_test_R2']).round(3),
    )
    .sort_values(by=['MAD_metric', 'RMSE_metric'])
)
cv_out.head(6)[['param_pca__n_components', 'param_SVR__kernel', 'param_SVR__C',
                'R2_metric', 'RMSE_metric', 'MAD_metric']]

Unnamed: 0,param_pca__n_components,param_SVR__kernel,param_SVR__C,R2_metric,RMSE_metric,MAD_metric
86,0.96,sigmoid,2.0,0.39,11.23,7.35
70,0.95,sigmoid,1.0,0.395,11.35,7.37
57,0.97,sigmoid,0.5,0.426,11.03,7.38
87,0.97,sigmoid,2.0,0.386,11.25,7.38
56,0.96,sigmoid,0.5,0.426,11.03,7.39
55,0.95,sigmoid,0.5,0.399,11.36,7.39


### Artificial Neural Network

In [9]:
# Small single-hidden-layer MLP regressor on whitened PCA scores; the search covers
# the retained variance, the activation function and the hidden-layer width.
pca = PCA(whiten=True, svd_solver='full')
ann = MLPRegressor(random_state=321, max_iter=2000)
pipe = Pipeline([('pca', pca), ('ann', ann)])
parameters = dict(
    pca__n_components=[0.96, 0.97, 0.98, 0.99],
    ann__activation=['logistic', 'relu'],
    ann__hidden_layer_sizes=[(2,), (3,), (4,), (5,), (6,)],
)
reg = GridSearchCV(estimator=pipe, param_grid=parameters, scoring=scoring,
                   cv=n_folds, n_jobs=-1, refit=False).fit(X, Y)

# Summarise each configuration pessimistically: the mean CV score shifted by half a
# standard deviation in the unfavourable direction (errors up, R2 down). The error
# scorers are negated, hence the sign flip. Rows are ranked by MAD, then RMSE.
cv_out = (
    pd.DataFrame(reg.cv_results_)
    .assign(
        RMSE_metric=lambda res: (0.5 * res['std_test_RMSE'] - res['mean_test_RMSE']).round(2),
        MAD_metric=lambda res: (0.5 * res['std_test_MAD'] - res['mean_test_MAD']).round(2),
        R2_metric=lambda res: (res['mean_test_R2'] - 0.5 * res['std_test_R2']).round(3),
    )
    .sort_values(by=['MAD_metric', 'RMSE_metric'])
)
cv_out.head(6)[['param_pca__n_components', 'param_ann__activation', 'param_ann__hidden_layer_sizes',
                'R2_metric', 'RMSE_metric', 'MAD_metric']]

Unnamed: 0,param_pca__n_components,param_ann__activation,param_ann__hidden_layer_sizes,R2_metric,RMSE_metric,MAD_metric
23,0.99,relu,"(2,)",0.375,11.41,7.47
35,0.99,relu,"(5,)",0.42,11.03,7.48
31,0.99,relu,"(4,)",0.425,10.97,7.52
15,0.99,logistic,"(5,)",0.372,11.49,7.53
24,0.96,relu,"(3,)",0.43,10.89,7.54
33,0.97,relu,"(5,)",0.427,10.93,7.55


### Partial Least Squares

In [10]:
# Partial least squares regression; the search tries 1 through 10 components.
pls = PLSRegression()
parameters = dict(n_components=list(range(1, 11)))
reg = GridSearchCV(estimator=pls, param_grid=parameters, scoring=scoring,
                   cv=n_folds, n_jobs=-1, refit=False).fit(X, Y)

# Summarise each configuration pessimistically: the mean CV score shifted by half a
# standard deviation in the unfavourable direction (errors up, R2 down). The error
# scorers are negated, hence the sign flip. Rows are ranked by MAD, then RMSE.
cv_out = (
    pd.DataFrame(reg.cv_results_)
    .assign(
        RMSE_metric=lambda res: (0.5 * res['std_test_RMSE'] - res['mean_test_RMSE']).round(2),
        MAD_metric=lambda res: (0.5 * res['std_test_MAD'] - res['mean_test_MAD']).round(2),
        R2_metric=lambda res: (res['mean_test_R2'] - 0.5 * res['std_test_R2']).round(3),
    )
    .sort_values(by=['MAD_metric', 'RMSE_metric'])
)
cv_out.head(6)[['param_n_components',
                'R2_metric', 'RMSE_metric', 'MAD_metric']]

Unnamed: 0,param_n_components,R2_metric,RMSE_metric,MAD_metric
3,4,0.399,11.4,7.51
7,8,0.424,11.08,7.57
8,9,0.424,11.05,7.58
6,7,0.411,11.18,7.61
2,3,0.387,11.5,7.69
9,10,0.424,11.02,7.71
