In [179]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns',100)

In [192]:
final_df = pd.read_pickle('final_df.pickle')
final_df = final_df[final_df.tm != 'TOT']
final_df = pd.get_dummies(data=final_df, columns=['pos'])

final_df = final_df.groupby(['season','player']).sum().reset_index()
final_df.rename(columns={'year':'draft_year'}, inplace=True)
final_df = final_df.drop(['rk', 'from', 'to', 'per'],axis=1)

In [194]:
y = final_df['ws']
X = final_df.drop(['player','gs','ws','ftsy_pts','ws/48'],axis=1)

scale = MinMaxScaler()
transformed = scale.fit_transform(X)
X = pd.DataFrame(transformed, columns = X.columns)

  return self.partial_fit(X, y)


In [195]:
# Perform test train split
X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

### Base prediction: Win share actual - mean win share

In [218]:
np.mean((final_df.ws-final_df.ws.mean())**2)

10.012270980790706

### First Models

In [196]:
# Build a Ridge, Lasso and regular linear regression model. 
# Note that in scikit learn, the regularization parameter is denoted by alpha (and not lambda)
ridge1 = Ridge(alpha=0.5)
ridge1.fit(X_train, y_train)

lasso1 = Lasso(alpha=0.5)
lasso1.fit(X_train, y_train)

lin = LinearRegression()
lin.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [197]:
# Create preditions for training and test sets 
y_h_ridge_train1 = ridge1.predict(X_train)
y_h_ridge_test1 = ridge1.predict(X_test)

y_h_lasso_train1 = lasso1.predict(X_train)
y_h_lasso_test1 = lasso1.predict(X_test)

y_h_lin_train = lin.predict(X_train)
y_h_lin_test = lin.predict(X_test)

In [198]:
# Print errors 
print('Train Error Ridge Model', np.sum((y_train - y_h_ridge_train1)**2))
print('Test Error Ridge Model', np.sum((y_test - y_h_ridge_test1)**2))
print('\n')

print('Train Error Lasso Model', np.sum((y_train - y_h_lasso_train1)**2))
print('Test Error Lasso Model', np.sum((y_test - y_h_lasso_test1)**2))
print('\n')

print('Train Error Unpenalized Linear Model', np.sum((y_train - lin.predict(X_train))**2))
print('Test Error Unpenalized Linear Model', np.sum((y_test - lin.predict(X_test))**2))

Train Error Ridge Model 44.789669629150744
Test Error Ridge Model 19.008512927811644


Train Error Lasso Model 99917.86182071158
Test Error Lasso Model 44008.702344193094


Train Error Unpenalized Linear Model 28.132605617363456
Test Error Unpenalized Linear Model 11.636560883181387


In [199]:
print('Training r^2:', lin.score(X_train, y_train))
print('Testing r^2:', lin.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, lin.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, lin.predict(X_test)))

Training r^2: 0.9997184426777681
Testing r^2: 0.9997355816227331
Training MSE: 0.0027959258216421643
Testing MSE: 0.0026980201444890767


In [200]:
print('Training r^2:', ridge1.score(X_train, y_train))
print('Testing r^2:', ridge1.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, ridge1.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, ridge1.predict(X_test)))

Training r^2: 0.999551735107087
Testing r^2: 0.9995680682468742
Training MSE: 0.004451368478349309
Testing MSE: 0.00440726012701406


In [201]:
print('Training r^2:', lasso1.score(X_train, y_train))
print('Testing r^2:', lasso1.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, lasso1.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, lasso1.predict(X_test)))

Training r^2: 0.0
Testing r^2: -1.2785245767643616e-05
Training MSE: 9.930218825353963
Testing MSE: 10.203733444051263


In [202]:
# Print coefficients 
print('Ridge parameter coefficients:', ridge1.coef_)
print('Lasso parameter coefficients:', lasso1.coef_)
print('Linear model parameter coefficients:', lin.coef_)

Ridge parameter coefficients: [-2.74905222e-03  8.62908057e-02  1.48406968e-01  4.56529820e-01
  7.41611665e-01 -1.09762534e+00 -1.35629668e-02  6.53973421e-01
 -7.10735115e-01 -5.48509012e-04  5.41998432e-01 -8.56349846e-01
  2.06554741e-02  2.98559019e-02  8.43930131e-01 -3.06912624e-01
  1.64748145e-02  1.48691073e-01  5.18754354e-02  8.22362775e-02
  4.94949054e-01 -1.86937148e-01 -1.72761550e-01 -8.82398827e-01
 -7.31349454e-03  9.05307926e-01  9.73544529e-03 -2.68274139e-02
  1.52105821e-02 -1.12390753e-01 -1.98413881e-02  9.64055039e-02
 -8.02709951e-02  1.19037642e-01  1.44727012e-01 -1.08551279e-02
  2.97824400e-02  1.76164681e+01  1.34377116e+01  1.29891810e-01
 -2.44367009e-01  1.20198585e-02  1.23027958e+00 -1.50565402e-01
 -1.65814743e-02  2.09236136e-02 -3.20524069e-02 -2.94117174e-02
  4.14563106e-02 -8.38407667e-02 -3.78977844e-02]
Lasso parameter coefficients: [-0. -0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0. 

### Second set of Models

In [203]:
# Build a Ridge, Lasso and regular linear regression model. 
# Note that in scikit learn, the regularization parameter is denoted by alpha (and not lambda)
ridge2 = Ridge(alpha=0.3)
ridge2.fit(X_train, y_train)

lasso2 = Lasso(alpha=0.3)
lasso2.fit(X_train, y_train)

lin = LinearRegression()
lin.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [204]:
# Create preditions for training and test sets 
y_h_ridge_train2 = ridge2.predict(X_train)
y_h_ridge_test2 = ridge2.predict(X_test)

y_h_lasso_train2 = lasso2.predict(X_train)
y_h_lasso_test2 = lasso2.predict(X_test)

y_h_lin_train = lin.predict(X_train)
y_h_lin_test = lin.predict(X_test)

In [205]:
# Print errors 
print('Train Error Ridge Model', np.sum((y_train - y_h_ridge_train2)**2))
print('Test Error Ridge Model', np.sum((y_test - y_h_ridge_test2)**2))
print('\n')

print('Train Error Lasso Model', np.sum((y_train - y_h_lasso_train2)**2))
print('Test Error Lasso Model', np.sum((y_test - y_h_lasso_test2)**2))
print('\n')

print('Train Error Unpenalized Linear Model', np.sum((y_train - lin.predict(X_train))**2))
print('Test Error Unpenalized Linear Model', np.sum((y_test - lin.predict(X_test))**2))

Train Error Ridge Model 35.325590958610796
Test Error Ridge Model 14.807650322799967


Train Error Lasso Model 59914.212023413435
Test Error Lasso Model 26697.93393361184


Train Error Unpenalized Linear Model 28.132605617363456
Test Error Unpenalized Linear Model 11.636560883181387


In [206]:
print('Training r^2:', ridge2.score(X_train, y_train))
print('Testing r^2:', ridge2.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, ridge2.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, ridge2.predict(X_test)))

Training r^2: 0.9996464536939151
Testing r^2: 0.9996635247382112
Training MSE: 0.0035107921843183063
Testing MSE: 0.003433259986737762


In [207]:
print('Training r^2:', lasso2.score(X_train, y_train))
print('Testing r^2:', lasso2.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, lasso2.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, lasso2.predict(X_test)))

Training r^2: 0.40036535078261604
Testing r^2: 0.39334100186706633
Training MSE: 5.954503281992987
Testing MSE: 6.190107566337083


In [208]:
# Print coefficients 
print('Ridge parameter coefficients:', ridge2.coef_)
print('Lasso parameter coefficients:', lasso2.coef_)
print('Linear model parameter coefficients:', lin.coef_)

Ridge parameter coefficients: [-2.21676793e-03  8.13812336e-02  9.72727057e-02  3.31936966e-01
  5.68309153e-01 -8.53520517e-01 -1.85901544e-02  5.27499299e-01
 -5.65103178e-01 -1.16778502e-04  4.06418759e-01 -6.60883875e-01
  2.50487086e-02  2.99662704e-02  6.59974177e-01 -2.84639635e-01
  7.97787459e-03  1.23764589e-01  2.16097533e-02  5.16973703e-02
  3.75214591e-01 -1.42994720e-01 -1.11349807e-01 -6.36811204e-01
 -7.88639157e-03  7.00801817e-01 -2.16848414e-02 -2.13319925e-02
  2.51068467e-02 -1.16919599e-01 -3.06921317e-02  9.99654608e-02
 -6.22898548e-02  1.03051108e-01  1.16054546e-01 -8.59703006e-03
  3.38250719e-02  1.81331714e+01  1.37686838e+01  1.04701251e-01
 -1.95840603e-01  9.49893159e-03  8.33628543e-01 -1.05283926e-01
 -1.53964922e-02  7.87164778e-03 -2.66772193e-02 -1.99414483e-02
  2.66717385e-02 -5.81310693e-02 -2.75644709e-02]
Lasso parameter coefficients: [ 0.         -0.          0.          0.          0.          0.
  0.          0.          0.          0.     

### IDK

In [209]:
# Build a Ridge, Lasso and regular linear regression model. 
# Note that in scikit learn, the regularization parameter is denoted by alpha (and not lambda)
ridge3 = Ridge(alpha=0.1)
ridge3.fit(X_train, y_train)

lasso3 = Lasso(alpha=0.1)
lasso3.fit(X_train, y_train)

lin = LinearRegression()
lin.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [210]:
# Create preditions for training and test sets 
y_h_ridge_train3 = ridge3.predict(X_train)
y_h_ridge_test3 = ridge3.predict(X_test)

y_h_lasso_train3 = lasso3.predict(X_train)
y_h_lasso_test3 = lasso3.predict(X_test)

y_h_lin_train = lin.predict(X_train)
y_h_lin_test = lin.predict(X_test)

In [211]:
# Print errors 
print('Train Error Ridge Model', np.sum((y_train - y_h_ridge_train3)**2))
print('Test Error Ridge Model', np.sum((y_test - y_h_ridge_test3)**2))
print('\n')

print('Train Error Lasso Model', np.sum((y_train - y_h_lasso_train3)**2))
print('Test Error Lasso Model', np.sum((y_test - y_h_lasso_test3)**2))
print('\n')

print('Train Error Unpenalized Linear Model', np.sum((y_train - lin.predict(X_train))**2))
print('Test Error Unpenalized Linear Model', np.sum((y_test - lin.predict(X_test))**2))

Train Error Ridge Model 29.183840315201646
Test Error Ridge Model 12.084314004360543


Train Error Lasso Model 14863.512411572308
Test Error Lasso Model 6695.079707846835


Train Error Unpenalized Linear Model 28.132605617363456
Test Error Unpenalized Linear Model 11.636560883181387


In [212]:
print('Training r^2:', ridge3.score(X_train, y_train))
print('Testing r^2:', ridge3.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, ridge3.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, ridge3.predict(X_test)))

Training r^2: 0.9997079216890412
Testing r^2: 0.999725407297612
Training MSE: 0.0029004015419600124
Testing MSE: 0.002801834918701726


In [213]:
print('Training r^2:', lasso3.score(X_train, y_train))
print('Testing r^2:', lasso3.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, lasso3.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, lasso3.predict(X_test)))

Training r^2: 0.8512426893377405
Testing r^2: 0.8478672410351186
Training MSE: 1.477192646747397
Testing MSE: 1.5523022740196697


In [214]:
# Print coefficients 
print('Ridge parameter coefficients:', ridge3.coef_)
print('Lasso parameter coefficients:', lasso3.coef_)
print('Linear model parameter coefficients:', lin.coef_)

Ridge parameter coefficients: [-2.01178220e-03  5.67392496e-02  3.80038742e-02  1.65521655e-01
  2.79884905e-01 -4.31283258e-01 -4.65316045e-02  2.91027353e-01
 -3.04783445e-01  8.31119976e-04  1.89575702e-01 -3.26176677e-01
  3.28645935e-02  5.72630249e-02  3.50501843e-01 -1.88983431e-01
 -2.11312417e-03  6.81871938e-02  2.11550140e-03  2.08782224e-02
  1.94192526e-01 -8.34912780e-02 -1.89440535e-02 -2.94428901e-01
 -7.24617942e-03  3.56141981e-01 -3.37209143e-02 -1.63649527e-02
  3.79085911e-02 -1.30481326e-01 -7.31298822e-02  1.37126767e-01
 -4.06119458e-02  7.03509799e-02  4.87111888e-02 -1.65466666e-02
  3.17596929e-02  1.88139083e+01  1.41273904e+01  4.36102761e-02
 -1.00835144e-01 -7.92558999e-03  3.70289966e-01 -4.80114122e-02
 -1.06675339e-02 -1.19068648e-03 -2.64762035e-02 -1.36991148e-02
  7.40926748e-03 -3.48047954e-02 -2.02164592e-02]
Lasso parameter coefficients: [ 0.         -0.          0.          2.88296045  0.          0.
  0.          0.          0.         -0.     