In [1]:
import numpy as np
import re
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn import metrics

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

%matplotlib inline

In [2]:
team_lookup = pickle.load(open('teams_lookup.pickle', "rb" ))
df = pickle.load(open('final_19.pickle', "rb" ))

In [22]:
def TVT_split(df, train_min = 12, val_min = 24, test_min = 28):
    
    df_train = df[(df.GP >= train_min) & (df.GP_vs >= train_min) & (df.GP <= (val_min -1))] 
    df_val = df[(df.GP >= val_min) & (df.GP <= (test_min -1))] 
    df_test = df[df.GP >= test_min]
    
    total = len(df)
    train = len(df_train)
    val = len(df_val)
    test = len(df_test)
    mature = train + val + test
    immature = total - mature
    
    
    splits = [['dataset','games','percent of total'],['total', total ,'1'],['immature', immature, immature/total],
               ['mature', mature, mature/total], ['dataset','games','percent of mature'], ['train', train, train/mature],
               ['val', val, val/mature], ['test', test, test/mature]]
    print(splits)
    return df_train, df_val, df_test

In [23]:
df_train, df_val, df_test = TVT_split(df)

[['dataset', 'games', 'percent of total'], ['total', 11206, '1'], ['immature', 4426, 0.39496698197394253], ['mature', 6780, 0.6050330180260575], ['dataset', 'games', 'percent of mature'], ['train', 4046, 0.596755162241888], ['val', 1394, 0.2056047197640118], ['test', 1340, 0.1976401179941003]]


In [59]:
cols = df.columns.tolist()
target = 'Spread'
all_predictors = ['Home', 'Away', 'Wins', 'Tm', 'Opp', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB',
                   'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%',
                   'STL%', 'BLK%', 'OeFG%', 'OTOV%', 'ORB%', 'OFT/FGA', 'DeFG%', 'DTOV%', 'DRB%', 'DFT/FGA', 'P_S', 'FG_S',
                   'FGA_S', 'FG%_S', '3P_S', '3PA_S', '3P%_S', 'FT_S', 'FTA_S', 'FT%_S', 'TRB_S', 'AST_S', 'STL_S', 'BLK_S',
                   'TOV_S', 'PF_S', 'FTr_S', '3PAr_S', 'TS%_S', 'TRB%_S', 'AST%_S', 'STL%_S', 'BLK%_S',
                   'Home_vs', 'Away_vs', 'Wins_vs', 'Tm_vs', 'Opp_vs', 'FG_vs', 'FGA_vs', 'FG%_vs', '3P_vs', '3PA_vs',
                   '3P%_vs', 'FT_vs', 'FTA_vs', 'FT%_vs', 'ORB_vs', 'TRB_vs', 'AST_vs', 'STL_vs', 'BLK_vs', 'TOV_vs',
                   'PF_vs', 'ORtg_vs', 'DRtg_vs', 'Pace_vs', 'FTr_vs', '3PAr_vs', 'TS%_vs', 'TRB%_vs', 'AST%_vs',
                   'STL%_vs', 'BLK%_vs', 'OeFG%_vs', 'OTOV%_vs', 'ORB%_vs', 'OFT/FGA_vs', 'DeFG%_vs', 'DTOV%_vs',
                   'DRB%_vs', 'DFT/FGA_vs', 'P_S_vs', 'FG_S_vs', 'FGA_S_vs', 'FG%_S_vs', '3P_S_vs', '3PA_S_vs',
                   '3P%_S_vs', 'FT_S_vs', 'FTA_S_vs', 'FT%_S_vs', 'TRB_S_vs', 'AST_S_vs', 'STL_S_vs', 'BLK_S_vs',
                   'TOV_S_vs', 'PF_S_vs', 'FTr_S_vs', '3PAr_S_vs', 'TS%_S_vs', 'TRB%_S_vs', 'AST%_S_vs', 'STL%_S_vs',
                   'BLK%_S_vs']
info_cols = ['url','Team','Date','Opponent','Win_Loss','GP','GP_vs']

## Initial Model - OLS using all stats

In [71]:
scaler = StandardScaler()

In [81]:
# Create your model
model = sm.OLS(df_train[target], sm.add_constant(scaler.fit_transform(df_train[all_predictors])))

# Fit your model to your training set
fit = model.fit()

In [82]:
# Print summary statistics of the model's performance
fit.summary(xname = ['const'] + all_predictors)

0,1,2,3
Dep. Variable:,Spread,R-squared:,0.347
Model:,OLS,Adj. R-squared:,0.328
Method:,Least Squares,F-statistic:,17.72
Date:,"Wed, 15 Jul 2020",Prob (F-statistic):,2.33e-278
Time:,00:06:07,Log-Likelihood:,-15469.0
No. Observations:,4046,AIC:,31180.0
Df Residuals:,3927,BIC:,31930.0
Df Model:,118,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0788,0.177,-0.446,0.655,-0.425,0.268
Home,0.8447,0.827,1.021,0.307,-0.777,2.467
Away,-0.7596,0.827,-0.918,0.359,-2.382,0.863
Wins,0.7088,0.521,1.359,0.174,-0.313,1.731
Tm,-36.2420,75.430,-0.480,0.631,-184.128,111.644
Opp,79.5928,79.664,0.999,0.318,-76.594,235.780
FG,-33.3656,109.196,-0.306,0.760,-247.452,180.721
FGA,2.2978,4.715,0.487,0.626,-6.946,11.542
FG%,-23.2144,16.061,-1.445,0.148,-54.703,8.274

0,1,2,3
Omnibus:,18.604,Durbin-Watson:,1.105
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.407
Skew:,-0.001,Prob(JB):,3.04e-06
Kurtosis:,3.388,Cond. No.,1.36e+16


## Ridge Regression

In [98]:
model = sm.OLS(df_train[target], df_train[all_predictors])
fit_ridge = model.fit_regularized(method='elastic_net',L1_wt = 0)

In [103]:
print(fit_ridge.summary())

NotImplementedError: 