In [127]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold


# 3.0 Model Evaluation and Tuning

## 3.0 Get Data to be Modeled


The helper functions below are adapted from David Ziganto's error-metric functions; they compute the training (in-sample) and validation (out-of-sample) mean squared error of a model.


In [90]:
def calc_train_error(X_train, y_train, model):
    '''Return the in-sample mean squared error (MSE) of an already fit model.

    X_train -- feature rows handed unchanged to ``model.predict``.
    y_train -- true target values for the rows in ``X_train``.
    model   -- fitted estimator exposing a ``predict`` method.

    The value returned is the MSE, not its square root.
    '''
    predictions = model.predict(X_train)
    return mean_squared_error(y_train, predictions)
    
def calc_validation_error(X_test, y_test, model):
    '''Return the out-of-sample mean squared error (MSE) of an already fit model.

    X_test -- held-out feature rows handed unchanged to ``model.predict``.
    y_test -- true target values for the rows in ``X_test``.
    model  -- fitted estimator exposing a ``predict`` method.

    The value returned is the MSE, not its square root.
    '''
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions)
    
def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''Fit ``model`` on the training split and return its train/validation errors.

    The model is fit in place on ``(X_train, y_train)``; the same fitted object
    is then scored on the training rows and on the held-out rows.

    Returns a ``(train_error, validation_error)`` tuple. Both entries are
    whatever ``calc_train_error`` / ``calc_validation_error`` return, which is
    the mean squared error (not the RMSE).
    '''
    model.fit(X_train, y_train)
    return (
        calc_train_error(X_train, y_train, model),
        calc_validation_error(X_test, y_test, model),
    )

In [270]:
def cross_validate_errors(modeldata, modeltarget, model, folds):
    """
    Cross-validate ``model`` and print its mean training and validation error.

    For every split yielded by ``folds.split(modeldata, modeltarget)`` the model
    is refit on the training rows (through ``calc_metrics``) and scored on both
    the training rows and the held-out rows. The per-fold errors are averaged
    and a single summary line, rounded to 6 decimals, is printed.

    modeldata, modeltarget -- row-aligned objects supporting positional row
                              selection through ``.iloc``.
    model                  -- estimator passed to ``calc_metrics``, which calls
                              ``fit`` on it; it is therefore left fit on the
                              training rows of the last split.
    folds                  -- splitter whose ``split(X, y)`` yields
                              ``(train_idx, val_idx)`` positional indices.

    Returns None; the result is only printed.
    """
    train_errors = []
    val_errors = []
    for train_idx, val_idx in folds.split(modeldata, modeltarget):
        train_e, val_e = calc_metrics(
            modeldata.iloc[train_idx],
            modeltarget.iloc[train_idx],
            modeldata.iloc[val_idx],
            modeltarget.iloc[val_idx],
            model,
        )
        train_errors.append(train_e)
        val_errors.append(val_e)

    terror = np.mean(train_errors)
    verror = np.mean(val_errors)
    # Print rather than `return print(...)`: the function's result is None either way.
    print(str(round(terror, 6)) + ' Training Error, ' + str(round(verror, 6)) + ' Valid error')

In [193]:
# Load the pickled modeling frame and renumber its rows from 0, discarding the stored index.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
dataraw = pd.read_pickle('data/modeldata.pkl').reset_index(drop=True)

In [194]:
# Confirm the dimensions (rows, columns) of the loaded frame.
dataraw.shape

(998, 10)

In [195]:
# Derive the log10-transformed target column from avgVORP.
# avgVORP is shifted by 10 (1e1) before the log; presumably this keeps the
# argument positive, since avgVORP contains negative values.
# NOTE(review): this only works if every avgVORP is greater than -10 — confirm against the data.
dataraw['logavgVORP'] = np.log10(dataraw['avgVORP'] + 1e1)

In [379]:
# Preview the first rows, including the derived logavgVORP column.
dataraw.head()

Unnamed: 0,draftnbr,Age,height,FG%,3P%,avgPTS,avgMP,avgTRB,avgAST,avgVORP,logavgVORP
0,1,19.0,6.833333,0.623,0.15,14.2,32.0,10.4,1.3,3.4,1.127105
1,2,19.0,6.583333,0.491,0.255,11.9,31.1,7.4,1.9,0.433333,1.018423
2,3,19.0,6.416667,0.445,0.339,14.8,34.2,6.7,2.2,1.316667,1.053719
3,4,21.0,6.333333,0.453,0.348,9.7,20.4,1.9,2.0,-0.3125,0.986212
4,5,21.0,6.833333,0.525,0.5,9.8,18.7,7.3,1.0,-0.244444,0.989252


In [355]:
# Feature matrix: the six predictor columns used by the models below.
modeldata = dataraw.loc[:, ['draftnbr', 'Age', 'avgTRB', 'avgAST', 'avgPTS', 'FG%']]
# Target kept as a one-column DataFrame (list selector) rather than a Series.
modeltarget = dataraw.loc[:, ['logavgVORP']]

In [285]:
# Candidate regularization strengths, one per decade from 1e-7 up to 1e1.
alpha = [1e-7,1e-6,1e-5,1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1]
# 10-fold splitter; shuffling with a fixed random_state keeps fold membership reproducible.
folds = KFold(n_splits=10, shuffle=True, random_state=15)

In [372]:
# Lasso regularized with alpha[3], i.e. the 1e-4 entry of the alpha grid.
lasso = Lasso(alpha=alpha[3], fit_intercept=True, random_state=77)
# Unregularized least-squares model; not referenced in the cells visible here.
reg = LinearRegression()

In [376]:
# Print the lasso model's training and validation error averaged over the folds.
cross_validate_errors(modeldata, modeltarget, lasso,folds)

0.001349 Training Error, 0.001377 Valid error


In [377]:
# Refit the lasso on the full data set, then display its fitted intercept.
lasso.fit(modeldata, modeltarget)
lasso.intercept_

array([ 1.07904533])

In [378]:
# Fitted coefficients of the lasso model refit on the full data set.
lasso.coef_

array([-0.00138467, -0.00388794,  0.00474214,  0.00860353, -0.00072074,
        0.01850763])