In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV,LinearRegression 
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import RepeatedKFold,GridSearchCV
%matplotlib inline

**Read & inspect data**

In [2]:
df = pd.read_csv('Hitters.csv')
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [3]:
df.dropna(inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263 entries, 1 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    object 
 14  Division   263 non-null    object 
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    object 
dtypes: float64

In [5]:
object_col= df.dtypes[df.dtypes == 'object'].index.to_list()

In [6]:
df= pd.get_dummies(df,drop_first= True)

In [7]:
df.columns

Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',
       'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists',
       'Errors', 'Salary', 'League_N', 'Division_W', 'NewLeague_N'],
      dtype='object')

In [8]:
x= df.drop(['Salary'],axis=1)

In [9]:
y= df.Salary

In [10]:
x_train,x_test,y_train,y_test= train_test_split(x,y, test_size= 0.3,random_state= 123)

In [11]:
x_train.shape

(184, 19)

In [12]:
x_test.shape

(79, 19)

# Linear Regression

In [13]:
lm=LinearRegression(normalize=True)

In [14]:
lm.fit(x_train,y_train)

LinearRegression(normalize=True)

In [15]:
pd.Series(lm.coef_,index= x.columns)

AtBat           -0.563506
Hits            -0.354348
HmRun           -7.293162
Runs            -0.006273
RBI              3.088396
Walks            4.504213
Years          -26.135771
CAtBat          -0.382344
CHits            1.628746
CHmRun           1.681093
CRuns            0.518174
CRBI            -0.102791
CWalks          -0.166230
PutOuts          0.322546
Assists          0.483734
Errors          -6.563007
League_N        80.763985
Division_W    -138.026423
NewLeague_N    -44.193046
dtype: float64

In [16]:
lm.score(x_train,y_train)

0.5845361088369815

In [17]:
lm.score(x_test,y_test)
# Overfitting

0.13869206595056605

In [18]:
mean_squared_error(y_test,lm.predict(x_test))

149149.64336733537

# Ridge Regression

The Ridge() function has an alpha argument ( λ , but with a different name!) that is used to tune the model

*We fit a ridge regression model on the training set, and evaluate its MSE on the test set, using λ=0* >> Result will be similar with using Linear Regression

In [19]:
ridge_0= Ridge(alpha = 0, normalize = True)

In [20]:
ridge_0.fit(x_train, y_train)

Ridge(alpha=0, normalize=True)

In [21]:
y0_predict= ridge_0.predict(x_train)

In [22]:
mean_squared_error(y_train,y0_predict) 

89345.15719488387

In [23]:
ridge_0.score(x_train,y_train)

0.5845361088369815

In [24]:
ridge_0.score(x_test,y_test)
# overfitting on test data

0.13869206595061478

In [25]:
mean_squared_error(y_test,ridge_0.predict(x_test))

149149.64336732693

In [26]:
pd.Series(ridge_0.coef_,index= x.columns)

AtBat           -0.563506
Hits            -0.354348
HmRun           -7.293162
Runs            -0.006273
RBI              3.088396
Walks            4.504213
Years          -26.135771
CAtBat          -0.382344
CHits            1.628746
CHmRun           1.681093
CRuns            0.518174
CRBI            -0.102791
CWalks          -0.166230
PutOuts          0.322546
Assists          0.483734
Errors          -6.563007
League_N        80.763985
Division_W    -138.026423
NewLeague_N    -44.193046
dtype: float64

*We fit a ridge regression model on the training set, and evaluate its MSE on the test set, using  λ=6*

In [27]:
ridge_4= Ridge(alpha = 6, normalize = True)
#we'll want to standardize the variables so that they are on the same scale. 
#To do this, we can use the normalize = True parameter

In [28]:
ridge_4.fit(x_train, y_train)

Ridge(alpha=6, normalize=True)

In [29]:
y4_predict= ridge_4.predict(x_train)

In [30]:
pd.Series(ridge_4.coef_,index= x.columns)

AtBat           0.076770
Hits            0.294408
HmRun           0.831747
Runs            0.496614
RBI             0.498261
Walks           0.749548
Years           2.569006
CAtBat          0.009155
CHits           0.035550
CHmRun          0.268082
CRuns           0.071786
CRBI            0.072486
CWalks          0.074921
PutOuts         0.057635
Assists         0.003983
Errors         -0.458950
League_N        6.016955
Division_W    -27.727768
NewLeague_N     5.773661
dtype: float64

In [31]:
mean_squared_error(y_train,y4_predict) 

135920.4902049868

In [32]:
ridge_4.score(x_train,y_train)
# reduce the performance on train data

0.36795616547885623

In [33]:
ridge_4.score(x_test,y_test)
# improving the overfitting on test data

0.3386451100301352

In [34]:
mean_squared_error(y_test,ridge_4.predict(x_test))
# an improvement on MSE vs Linear model

114524.48314796004

*Instead of arbitrarily choosing alpha  =4 , it would be better to use cross-validation to choose the tuning parameter alpha. We can do this using the cross-validated ridge regression function, RidgeCV()*

In [35]:
cv = RepeatedKFold(n_splits=2, n_repeats=3, random_state=1)
ridgecv = RidgeCV(alphas = np.arange(0, 100, 1),cv=cv,
                  scoring = 'r2', normalize = True)

In [36]:
ridgecv.fit(x_train, y_train)

RidgeCV(alphas=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
        cv=RepeatedKFold(n_repeats=3, n_splits=2, random_state=1),
        normalize=True, scoring='r2')

In [37]:
ridgecv.alpha_
# optimal alpha

1

In [38]:
ridge_op= Ridge(alpha = ridgecv.alpha_, normalize = True)

In [39]:
ridge_op.fit(x_train,y_train)

Ridge(alpha=1, normalize=True)

In [40]:
ridge_op.score(x_train,y_train)

0.49187659195272704

In [41]:
ridge_op.score(x_test,y_test)

0.353411616208574

In [42]:
mean_squared_error(y_test,ridge_op.predict(x_test))

111967.41959005104

*grid search alpha values and discover what works best for a dataset*

In [43]:
model = Ridge()

In [44]:
# define model evaluation method
cv = RepeatedKFold(n_splits=2, n_repeats=3, random_state=1)
grid = dict()
grid['alpha'] = np.arange(0, 100, 1)
grid

{'alpha': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])}

In [45]:
# define search
search = GridSearchCV(model, grid, 
                      scoring='r2', cv=cv)

In [46]:
# perform the search
results = search.fit(x_train, y_train)

In [47]:
results.best_score_

0.30300778026157255

In [48]:
results.best_params_

{'alpha': 99}

In [49]:
ridge_99= Ridge(alpha=99, normalize= True)

In [50]:
ridge_99.fit(x_train,y_train)

Ridge(alpha=99, normalize=True)

In [51]:
ridge_99.score(x_train,y_train)

0.06046449666678755

In [52]:
ridge_99.score(x_test,y_test)

0.05810861088071728

# Lasso regression

In [53]:
lassocv = LassoCV(alphas = None, cv = 10, max_iter = 100000, normalize = True)

In [54]:
lassocv.fit(x_train, y_train)

LassoCV(cv=10, max_iter=100000, normalize=True)

In [59]:
lassocv.alpha_
# optimal alpha_

0.39980283411400025

In [56]:
lasso = Lasso(max_iter = 10000, normalize = True)

In [57]:
lasso.set_params(alpha=lassocv.alpha_)

Lasso(alpha=0.39980283411400025, max_iter=10000, normalize=True)

In [61]:
lasso.fit(x_train, y_train)

Lasso(alpha=0.39980283411400025, max_iter=10000, normalize=True)

In [62]:
lasso.score(x_train,y_train)

0.5558880739900794

In [63]:
lasso.score(x_test,y_test)

0.23017582941824644

In [66]:
pd.Series(lasso.coef_, index=x.columns)
#the lasso has a substantial advantage over ridge regression in that the resulting coefficient estimates are sparse. 
#Here we see that 6 of the 19 coefficient estimates are exactly zero

AtBat           -0.000000
Hits             0.142440
HmRun           -2.649994
Runs             0.000000
RBI              0.000000
Walks            2.865954
Years          -27.219902
CAtBat          -0.000000
CHits            0.036275
CHmRun           0.000000
CRuns            0.574750
CRBI             0.601100
CWalks          -0.086848
PutOuts          0.313247
Assists          0.162286
Errors          -4.369054
League_N        30.147192
Division_W    -146.138872
NewLeague_N      0.000000
dtype: float64