# Minnesota T-Pups Plus/Minus Modeling

##### By: Mitch Brinkman

## Package Import

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import patsy
from patsy import dmatrices
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge #ordinary linear regression + w/ ridge regularization
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.model_selection import KFold

In [None]:
from bball_func import*

## Pickle Time

In [None]:
# Show floats in notebook output with thousands separators and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)

In [None]:
# Load the previously saved data table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
avg_data_table = pd.read_pickle('final_data_table.pickle')

In [None]:
avg_data_table.head()

In [None]:
avg_data_table.info()

## NBA team Dummies

In [None]:
# One-hot encode the opponent column ('opp') and append the resulting
# indicator columns to the right-hand side of the main table.
dummies = pd.get_dummies(avg_data_table['opp'])
avg_data_table = pd.concat([avg_data_table, dummies], axis='columns')

In [None]:
# make_integers is not defined in this notebook (presumably provided by the
# bball_func star import). NOTE(review): it appears to cast the listed team
# indicator columns of avg_data_table to integers — confirm against its definition.
make_integers(['ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET',
       'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'NOH', 'NOP',
       'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA',
       'WAS'],avg_data_table)

In [None]:
# Predictors for the dummy-only model: the 31 opponent indicator columns.
dummy_X = avg_data_table[[
    'ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CHO', 'CLE', 'DAL',
    'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM',
    'MIA', 'MIL', 'NOH', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI',
    'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]]

# Target for the dummy-only model.
dummy_y = avg_data_table['plus_minus']

In [None]:
dummy_X.info()

In [None]:
# Split the opponent-dummy data in two stages with fixed seeds:
# first set aside 20% as a test set ...
(dummy_X_train_val, dummy_X_test,
 dummy_y_train_val, dummy_y_test) = train_test_split(
    dummy_X, dummy_y, test_size=0.2, random_state=21)

# ... then divide the remaining rows 75/25 into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    dummy_X_train_val, dummy_y_train_val, test_size=0.25, random_state=5)

In [None]:
X_train.columns

In [None]:
X_train.shape

In [None]:
#checking on validation shape

X_val.shape

In [None]:
dummy_X_test.shape

In [None]:
# Names of the 31 opponent indicator columns, in the same order as the dummy table.
selected_columns1 = (
    "ATL BOS BRK CHA CHI CHO CLE DAL DEN DET "
    "GSW HOU IND LAC LAL MEM MIA MIL NOH NOP "
    "NYK OKC ORL PHI PHO POR SAC SAS TOR UTA "
    "WAS"
).split()

In [None]:
# Fit an L1-penalized regression on the opponent dummies; the fitted
# coefficients are inspected afterwards to decide which dummies to keep.
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train[selected_columns1], y_train)

In [None]:
# Pair each team dummy with its fitted Lasso coefficient for inspection.
list(zip(selected_columns1, lasso_model.coef_))

# CHI, LAL, PHO & SAC were kept as positive-coefficient dummy variables; the rest were discarded after the Lasso fit.

In [None]:
# drop_columns is not defined in this notebook (presumably provided by the
# bball_func star import). NOTE(review): it appears to remove the listed columns
# from avg_data_table in place — confirm against its definition.
# The list is every team dummy except CHI, LAL, PHO and SAC, plus the raw 'opp' column.
drop_columns(['ATL', 'BOS', 'BRK', 'CHA', 'CHO', 'CLE', 'DAL', 'DEN', 'DET',
       'GSW', 'HOU', 'IND', 'LAC', 'MEM', 'MIA', 'MIL', 'NOH', 'NOP',
       'NYK', 'OKC', 'ORL', 'PHI', 'POR', 'SAS', 'TOR', 'UTA',
       'WAS','opp'],avg_data_table)

In [None]:
avg_data_table.columns

In [None]:
#resaving my final data table back to pickle form for safe keeping

avg_data_table.to_pickle('wolves_data_table.pickle')

In [None]:
# Pairwise relationship grid for every column of the table, saved to disk.
#
# sns.pairplot always builds its own figure, so calling plt.figure(figsize=...)
# beforehand does not size the plot — it only leaves an empty figure behind.
# The grid is therefore saved through the PairGrid object that pairplot returns.
# (Use pairplot's `height` / `aspect` arguments if the panels need resizing.)
sns.set(font_scale=1.4)
pair_grid = sns.pairplot(avg_data_table)
pair_grid.savefig('pairplot.png')

## Feature Engineering

In [None]:
avg_data_table.info()

In [None]:

# Interaction terms, stored as new columns named after their two factors:
# '_2p_avg' x 'days_rest', and 'opp_2p_avg' x 'opp_3p_avg'.
avg_data_table['days_rest*2p_avg'] = avg_data_table['_2p_avg'].mul(avg_data_table['days_rest'])
avg_data_table['opp_2p_avg*opp_3p_avg'] = avg_data_table['opp_2p_avg'].mul(avg_data_table['opp_3p_avg'])

## Train - Test Split

In [None]:
# Separate the target column from the predictors.
y = avg_data_table['plus_minus']
X = avg_data_table.drop(columns=['plus_minus'])

# Reserve 20% of the rows for final testing; X and y are rebound to the other 80%.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=365)

In [None]:
# X and y are already the non-test 80% of the data: the preceding cell placed the
# final hold-out in X_test / y_test. Splitting off a second "test" set here would
# overwrite that hold-out with rows that are still part of X, so a model later
# fit on all of X would be scored on data it was trained on.
# Keep the existing hold-out and only carve out a validation set:
# 25% of the remaining 80% is 20% of all rows, i.e. a 60/20/20 train/val/test split.
X_train_val, y_train_val = X, y
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=48)

In [None]:
# Feature subset of interest for modelling.
# The names have to match columns that exist in avg_data_table: the feature
# engineering cell reads '_2p_avg' and creates 'days_rest*2p_avg', so those
# spellings are used here rather than '2p_avg' and '2p_avg*days_rest'.
# NOTE(review): '3p_avg' is left as written — confirm whether the table names it '_3p_avg'.
selected_columns = [
    'home', 'orb_pct_avg', '_2p_avg', '3p_avg',
    'ft_avg', 'ast_avg', 'tov_avg', 'pf_avg', 'opp_2p_avg', 'opp_3p_avg',
    'opp_ft_avg', 'opp_tov_avg', 'opp_pf_avg', 'drb_avg', 'days_rest',
    'days_rest*2p_avg', 'opp_2p_avg*opp_3p_avg', 'CHI', 'LAL', 'PHO', 'SAC',
]

## Train & Validation

In [None]:
# The three candidate models: plain linear, ridge, and linear on polynomial features.
lm = LinearRegression()
lm_reg = Ridge(alpha=1)
lm_poly = LinearRegression()

# Standardized copies of the features for the ridge model. The scaler is fit on
# the training rows only and then applied unchanged to validation and test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.to_numpy())
X_val_scaled = scaler.transform(X_val.to_numpy())
X_test_scaled = scaler.transform(X_test.to_numpy())

# Degree-2 polynomial expansion of train, validation and test features,
# likewise fit on the training rows only.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train.to_numpy())
X_val_poly = poly.transform(X_val.to_numpy())
X_test_poly = poly.transform(X_test.to_numpy())

In [None]:
# Fit each candidate on its own version of the training features ...
lm.fit(X_train, y_train)
lm_reg.fit(X_train_scaled, y_train)
lm_poly.fit(X_train_poly, y_train)

# ... then report validation R^2 for each, to decide which model to carry forward.
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}')
print(f'Ridge Regression val R^2: {lm_reg.score(X_val_scaled, y_val):.3f}')
print(f'Degree 2 polynomial regression val R^2: {lm_poly.score(X_val_poly, y_val):.3f}')

In [None]:
def split_and_validate(X, y):
    '''
    Fit a linear regression on standardized features and print how it validates.

    X must expose .values and .columns (e.g. a DataFrame); y is the matching
    target. The rows are split 80/20 into train/validation with a fixed seed,
    the scaler is fit on the training part only, and the validation R^2 and
    one coefficient per feature are printed. Nothing is returned.
    '''
    features_train, features_val, target_train, target_val = train_test_split(
        X, y, test_size=0.2, random_state=24
    )

    # standardize using statistics learned from the training rows only
    standardizer = StandardScaler()
    scaled_train = standardizer.fit_transform(features_train.values)
    scaled_val = standardizer.transform(features_val.values)

    # fit on the training rows, score on the validation rows
    regressor = LinearRegression()
    regressor.fit(scaled_train, target_train)
    r_squared = regressor.score(scaled_val, target_val)

    # report the score followed by one coefficient per feature
    print('\nValidation R^2 score was:', r_squared)
    print('Feature coefficient results: \n')
    for name, weight in zip(X.columns, regressor.coef_):
        print(name, ':', f'{weight:.2f}')

In [None]:
split_and_validate(X,y)

### Cross Validation

In [None]:
# Convert to plain NumPy arrays so the KFold index arrays can select rows
# positionally (X[train_ind]). Column names are not carried over to the arrays.
X, y = np.array(X), np.array(y)

In [None]:
# 5-fold cross validation (shuffled, fixed seed) comparing plain linear
# regression against ridge regression on standardized features.
kf = KFold(n_splits=5, shuffle=True, random_state=21)

# per-fold validation R^2 for each model
cv_lm_r2s = []
cv_lm_reg_r2s = []

for train_ind, val_ind in kf.split(X, y):

    X_train, X_val = X[train_ind], X[val_ind]
    y_train, y_val = y[train_ind], y[val_ind]

    # plain linear regression on the raw features
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    cv_lm_r2s.append(lm.score(X_val, y_val))

    # ridge regression; the scaler is refit on each fold's training rows
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    lm_reg = Ridge(alpha=1)
    lm_reg.fit(X_train_scaled, y_train)
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))

# per-fold scores for both models
print('Simple regression scores: ', cv_lm_r2s)
print('Ridge scores: ', cv_lm_reg_r2s, '\n')

# mean and spread of the cross-validated R^2 for each model
print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}')

## Test

In [None]:
# Final results: refit the linear model on all of X / y and score it on X_test / y_test.
# X and y were converted to NumPy arrays for cross validation, so the test data is
# passed as arrays too; scoring a DataFrame with a model that was fit on bare
# arrays makes scikit-learn warn about mismatched feature names.
lm.fit(X, y)
print(f'Linear Regression test R^2: {lm.score(np.asarray(X_test), np.asarray(y_test)):.3f}')

## OLS Model

In [None]:
avg_data_table.info()

In [None]:
# Ordinary least squares with statsmodels, to get a full coefficient summary.
# sm.OLS takes the response and design matrix directly; `data=` belongs to the
# formula interface (smf.ols) and is not an sm.OLS argument, so it is dropped.
# statsmodels does not include an intercept by default, so a constant column is
# added explicitly (it is skipped if X already contains one).
X_ols = sm.add_constant(X)
avg_model = sm.OLS(y, X_ols)

results = avg_model.fit()

# summarize our model
results.summary()

In [None]:
results.params