# Regularization: Use Ridge, LASSO and ElasticNet

In [66]:
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model

In [67]:
df = pd.read_csv(os.path.join("data", "db", "CleanedObservations.csv"))
df.head()


Unnamed: 0,Player Id,Year,Position,Adjusted Salary,Log Adjusted Salary,Adjusted Team Payroll,Batting_Career_Num_Seasons,Batting_Career_G,Batting_Career_SB,Batting_Career_RBI,...,Num_Post_Season_Appearances,Num_All_Star_Appearances,0.0,1B,2B,3B,C,MULTIPLE,P,SS
0,blanche01,2011,C,0.296,-1.218,0.221,0.522,0.297,0.008,0.158,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,bloomwi01,2011,MULTIPLE,0.266,-1.323,0.221,0.348,0.257,0.176,0.086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,blumge01,2011,MULTIPLE,0.399,-0.918,0.221,0.478,0.464,0.032,0.28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,branyru01,2011,1B,0.296,-1.218,0.221,0.522,0.341,0.024,0.271,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,demelsa01,2011,P,0.123,-2.092,0.221,0.0,0.012,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [68]:
# Get groups for CV
df.sort(['Player Id'], inplace=True)
player_ids = list(df['Player Id'].values)
groups = [player_ids.index(row['Player Id']) for index, row in df.iterrows()]

# X and y
y = np.asarray(df['Adjusted Salary'])
df = df.drop(['Player Id', 'Year', 'Log Adjusted Salary', 'Adjusted Salary', 'Position'], axis=1)
X = np.array(df)


  from ipykernel import kernelapp as app


## Ridge Regression

In [69]:
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10, 20, 100, 1000]
grid = {'alpha' : alpha_ridge}
regr = linear_model.Ridge()
regr = GridSearchCV(regr, grid, cv=GroupKFold(n_splits=5))
regr.fit(X, y, groups)

print("The best parameters are %s with a score of %0.2f"
      % (regr.best_params_, regr.best_score_))


The best parameters are {'alpha': 5} with a score of 0.62


In [70]:
regr = linear_model.Ridge(alpha=5)
regr.fit(X, y, groups)
regr.coef_

array([  0.56405102,  -0.19560965,  -6.04104233,  -1.16274783,
         3.00955534,  -1.33747773,   1.39889694,   2.47920027,
         2.85589117,   3.15226343,   1.39206338,   0.56139271,
        -1.3751761 ,  -2.29517226,   0.16138792,   5.73231245,
         3.52067645,   2.63769614,  -0.01526996,   0.28550994,
         5.11995647, -10.59877112,  -2.31141929,   0.        ,
         0.        ,  -0.20547784,   0.465587  ,   0.65918312,
         2.5841653 ,   0.17174075,   0.18745543,   2.21081241,
        -0.39057587,   0.28963831,   0.55677403,  -0.38383355,
        -0.03542169,  -0.09825079,  -0.08823198,   0.14990154])

In [71]:
regr.intercept_ 

-0.16080331004328285

In [72]:
ridge_coefficients = {}
for i, col in enumerate(df.columns):
    ridge_coefficients[col] = regr.coef_[i]

In [73]:
ridge_coefficients

{'0.0': -0.39057587143910538,
 '1B': 0.28963831227462217,
 '2B': 0.5567740265667418,
 '3B': -0.38383354872414599,
 'Adjusted Team Payroll': 0.56405102067908308,
 'Batting_Career_2B': 3.1522634262159572,
 'Batting_Career_3B': 1.392063383072341,
 'Batting_Career_AVG': -1.3374777290310322,
 'Batting_Career_G': -6.0410423257153951,
 'Batting_Career_H': 2.8558911690964304,
 'Batting_Career_HR': 0.56139270540886621,
 'Batting_Career_Num_Seasons': -0.19560965106335085,
 'Batting_Career_OBP': 0.16138792259196344,
 'Batting_Career_PSN': 1.3988969443216634,
 'Batting_Career_R': -2.2951722609059919,
 'Batting_Career_RBI': 3.0095553387131413,
 'Batting_Career_SB': -1.1627478339088211,
 'Batting_Career_SLG': 2.4792002682158638,
 'Batting_Career_TB': -1.3751761008729613,
 'C': -0.035421691264489852,
 'Fielding_Career_A': -0.20547783542690334,
 'Fielding_Career_E': 0.65918312303112425,
 'Fielding_Career_FPCT': 0.17174074638196138,
 'Fielding_Career_G': 2.5841653004703784,
 'Fielding_Career_PO': 0.465

In [86]:
print("{} variables selected.".format(sum([ridge_coefficients[var] != 0 for var in ridge_coefficients])))

38 variables selected.


## LASSO

In [77]:
alpha_lasso = [1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10, 20, 100, 1000, 10000]
grid = {'alpha' : alpha_lasso}
regr = linear_model.Lasso()
regr = GridSearchCV(regr, grid, cv=GroupKFold(n_splits=5))
regr.fit(X, y, groups)

print("The best parameters are %s with a score of %0.2f"
      % (regr.best_params_, regr.best_score_))



The best parameters are {'alpha': 0.001} with a score of 0.62


In [75]:
regr = linear_model.Lasso(alpha=0.001)
regr.fit(X, y, groups)
lasso_coefficients = {}
for i, col in enumerate(df.columns):
    lasso_coefficients[col] = regr.coef_[i]

In [76]:
lasso_coefficients

{'0.0': -0.28592675529735534,
 '1B': 0.26181856934547587,
 '2B': 0.35404526164078254,
 '3B': -0.12171969885924695,
 'Adjusted Team Payroll': 0.54231569987963457,
 'Batting_Career_2B': 1.3545359477186045,
 'Batting_Career_3B': 0.53306092429719154,
 'Batting_Career_AVG': -0.24680673375303361,
 'Batting_Career_G': -0.0,
 'Batting_Career_H': 0.0,
 'Batting_Career_HR': 1.284782720712615,
 'Batting_Career_Num_Seasons': -1.3358330510491565,
 'Batting_Career_OBP': -0.0,
 'Batting_Career_PSN': 0.32905664259505402,
 'Batting_Career_R': -0.0,
 'Batting_Career_RBI': 0.0085320537162230231,
 'Batting_Career_SB': -0.0,
 'Batting_Career_SLG': 0.98473350955245387,
 'Batting_Career_TB': 0.52052229008278961,
 'C': 0.018720803576232986,
 'Fielding_Career_A': -0.0,
 'Fielding_Career_E': 0.0033981909511692017,
 'Fielding_Career_FPCT': 0.0,
 'Fielding_Career_G': 1.1812941142468409,
 'Fielding_Career_PO': 0.070324444252947163,
 'Fielding_G': 0.0,
 'Fielding_Num_Seasons': 0.0,
 'MULTIPLE': -0.00011525840125545

In [85]:
print("{} variables selected.".format(sum([lasso_coefficients[var] != 0 for var in lasso_coefficients])))

25 variables selected.


## Elastic Net

In [78]:
regr = linear_model.ElasticNet()
grid = {'alpha' : alpha_lasso,
        'l1_ratio' : [i / 10 for i in range(10+1)]}
regr = GridSearchCV(regr, grid, cv=GroupKFold(n_splits=5))
regr.fit(X, y, groups)

print("The best parameters are %s with a score of %0.2f"
      % (regr.best_params_, regr.best_score_))


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


The best parameters are {'l1_ratio': 0.6, 'alpha': 0.001} with a score of 0.62


In [79]:
regr = linear_model.ElasticNet(alpha=0.001, l1_ratio=0.6)
regr.fit(X, y, groups)
en_coefficients = {}
for i, col in enumerate(df.columns):
    en_coefficients[col] = regr.coef_[i]

In [80]:
en_coefficients

{'0.0': -0.3047726423733812,
 '1B': 0.25748750699647138,
 '2B': 0.35889132154662995,
 '3B': -0.14988674024090118,
 'Adjusted Team Payroll': 0.54625359519308125,
 'Batting_Career_2B': 0.84243456942856298,
 'Batting_Career_3B': 0.59913899613210242,
 'Batting_Career_AVG': -0.35894026434519272,
 'Batting_Career_G': -0.14729811028642528,
 'Batting_Career_H': 0.0,
 'Batting_Career_HR': 0.78105719697275222,
 'Batting_Career_Num_Seasons': -1.1793233106665073,
 'Batting_Career_OBP': 0.0,
 'Batting_Career_PSN': 0.52551886237215906,
 'Batting_Career_R': -0.0,
 'Batting_Career_RBI': 0.61225141161728402,
 'Batting_Career_SB': -0.17972207609872703,
 'Batting_Career_SLG': 1.1473103292071138,
 'Batting_Career_TB': 0.69807716951694032,
 'C': 0.0,
 'Fielding_Career_A': -0.0,
 'Fielding_Career_E': 0.071400782449289851,
 'Fielding_Career_FPCT': 0.0,
 'Fielding_Career_G': 1.1490235733182153,
 'Fielding_Career_PO': 0.17053361139204987,
 'Fielding_G': 0.0,
 'Fielding_Num_Seasons': 0.0,
 'MULTIPLE': -0.015179

In [87]:
print("{} variables selected.".format(sum([en_coefficients[var] != 0 for var in en_coefficients])))

28 variables selected.


## Comparison with manual feature selection

Below I use the features selected for their statistical significance. This is the model created with statsmodels:

In [94]:
predictors = ['Batting_Career_TB', 'Pitching_Career_IP', 'Pitching_Career_SO',
                          'Num_All_Star_Appearances', '0.0', '1B', '2B']
X = np.asarray(pd.DataFrame(df, columns=predictors))

In [95]:
scores = cross_val_score(regr, X, y, groups, cv=GroupKFold(n_splits=5))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.62 (+/- 0.15)


In [90]:
X = np.array(df)
regr = linear_model.ElasticNet(alpha=0.001, l1_ratio=0.6)
scores = cross_val_score(regr, X, y, groups, cv=GroupKFold(n_splits=5))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Accuracy: 0.62 (+/- 0.19)


In [91]:
df.head()

Unnamed: 0,Adjusted Team Payroll,Batting_Career_Num_Seasons,Batting_Career_G,Batting_Career_SB,Batting_Career_RBI,Batting_Career_AVG,Batting_Career_PSN,Batting_Career_SLG,Batting_Career_H,Batting_Career_2B,...,Num_Post_Season_Appearances,Num_All_Star_Appearances,0.0,1B,2B,3B,C,MULTIPLE,P,SS
412,0.434,0.217,0.087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1857,0.31,0.13,0.043,0.0,0.0,0.125,0.0,0.042,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2363,0.289,0.174,0.067,0.0,0.0,0.125,0.0,0.042,0.0,0.0,...,0.077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
215,0.332,0.0,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
727,0.246,0.043,0.017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
