# Predict Point Totals Using Linear Regression

## Imports

In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Read in Compact Data

In [68]:
# Load regular season compact results
reg_season_df = pd.read_csv('./data/MDataFiles_Stage1/MRegularSeasonCompactResults.csv')
reg_season_df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0
...,...,...,...,...,...,...,...,...
161547,2019,132,1153,69,1222,57,N,0
161548,2019,132,1209,73,1426,64,N,0
161549,2019,132,1277,65,1276,60,N,0
161550,2019,132,1387,55,1382,53,N,0


## Create Features

In [69]:
## Create Features

# Find average win score
avgWinScore = reg_season_df.groupby(['Season', 'WTeamID'])['WScore'].mean()
df_win_score = pd.DataFrame(index=avgWinScore.index, data=avgWinScore)
df_win_score.reset_index(inplace=True)
df_win_score = df_win_score.rename(columns={'WTeamID': 'TeamID', 'WScore': 'AvgWinScore'})

# Find average loss score
avgLossScore = reg_season_df.groupby(['Season', 'LTeamID'])['LScore'].mean()
df_loss_score = pd.DataFrame(index=avgLossScore.index, data=avgLossScore)
df_loss_score.reset_index(inplace=True)
df_loss_score = df_loss_score.rename(columns={'LTeamID': 'TeamID', 'LScore': 'AvgLossScore'})

train_df = pd.merge(df_win_score, df_loss_score, how='inner', on=['TeamID', 'Season'])

# Find average win total score
avgWinTotalScore = reg_season_df.groupby(['Season', 'WTeamID'])['WScore'].mean() + reg_season_df.groupby(['Season', 'WTeamID'])['LScore'].mean()
df_total_win_score = pd.DataFrame(index=avgWinTotalScore.index, data=avgWinTotalScore)
df_total_win_score.reset_index(inplace=True)
df_total_win_score.columns = ['Season', 'TeamID', 'AvgWinTotalScore']
train_df = pd.merge(train_df, df_total_win_score, how='inner', on=['TeamID', 'Season'])

# Find average loss total score
avgLossTotalScore = reg_season_df.groupby(['Season', 'LTeamID'])['WScore'].mean() + reg_season_df.groupby(['Season', 'LTeamID'])['LScore'].mean()
df_total_loss_score = pd.DataFrame(index=avgLossTotalScore.index, data=avgLossTotalScore)
df_total_loss_score.reset_index(inplace=True)
df_total_loss_score.columns = ['Season', 'TeamID', 'AvgLossTotalScore']

train_df = pd.merge(train_df, df_total_loss_score, how='inner', on=['TeamID', 'Season'])

# Find Average Win Margin
avgWinMargin = reg_season_df.groupby(['Season', 'WTeamID'])['WScore'].mean() - reg_season_df.groupby(['Season', 'WTeamID'])['LScore'].mean()
df_win_margin = pd.DataFrame(index=avgWinMargin.index, data=avgWinMargin)
df_win_margin.reset_index(inplace=True)
df_win_margin.columns = ['Season', 'TeamID', 'AvgWinMargin']

train_df = pd.merge(train_df, df_win_margin, how='inner', on=['TeamID', 'Season'])

# Find Average Loss Margin
avgLossMargin = reg_season_df.groupby(['Season', 'LTeamID'])['WScore'].mean() - reg_season_df.groupby(['Season', 'LTeamID'])['LScore'].mean()
df_loss_margin = pd.DataFrame(index=avgLossMargin.index, data=avgLossMargin)
df_loss_margin.reset_index(inplace=True)
df_loss_margin.columns = ['Season', 'TeamID', 'AvgLossMargin']

train_df = pd.merge(train_df, df_loss_margin, how='inner', on=['TeamID', 'Season'])

train_df

Unnamed: 0,Season,TeamID,AvgWinScore,AvgLossScore,AvgWinTotalScore,AvgLossTotalScore,AvgWinMargin,AvgLossMargin
0,1985,1102,71.000000,61.000000,132.000000,131.947368,10.000000,9.947368
1,1985,1103,70.222222,55.142857,132.888889,120.142857,7.555556,9.857143
2,1985,1104,72.095238,60.111111,131.000000,125.000000,13.190476,4.777778
3,1985,1106,75.100000,69.142857,140.700000,151.571429,9.500000,13.285714
4,1985,1108,85.842105,74.000000,157.842105,158.666667,13.842105,10.666667
...,...,...,...,...,...,...,...,...
11219,2019,1462,76.944444,65.800000,143.333333,141.466667,10.555556,9.866667
11220,2019,1463,83.761905,72.285714,153.857143,156.857143,13.666667,12.285714
11221,2019,1464,78.100000,71.200000,147.700000,155.250000,8.500000,12.850000
11222,2019,1465,80.083333,71.500000,146.250000,154.428571,13.916667,11.428571


In [70]:
# Need to "randomize" the winning and losing teams for training
random_season_df = reg_season_df
sample = random_season_df.sample(frac=0.5)

temp = sample['WTeamID'].copy(deep=True)
sample['WTeamID'] = sample['LTeamID'].copy(deep=True)
sample['LTeamID'] = temp.copy(deep=True)

temp = sample['WScore'].copy(deep=True)
sample['WScore'] = sample['LScore'].copy(deep=True)
sample['LScore'] = temp.copy(deep=True)

random_season_df.iloc[sample.index.tolist(), :] = sample

# Create a result column "Total" that is the total points scored by each team in a single game.
# This will be used to train our model.
random_season_df['Total'] = random_season_df['WScore'] + random_season_df['LScore']

# Add training data features for strong team
train_df = train_df.rename(columns={'TeamID': 'WTeamID'})
current_df = pd.merge(random_season_df, train_df, how='left', on=['Season', 'WTeamID'])
current_df = current_df.rename(columns=
                               {
                                   'AvgWinScore': 'WAvgWinScore',
                                   'AvgLossScore': 'WAvgLossScore',
                                   'AvgWinMargin': 'WAvgWinMargin',
                                   'AvgLossMargin': 'WAvgLossMargin',
                                   'AvgWinTotalScore': 'WAvgWinTotalScore',
                                   'AvgLossTotalScore': 'WAvgLossTotalScore'
                               })

# Add training data features for week team
train_df = train_df.rename(columns={'WTeamID': 'LTeamID'})
current_df = pd.merge(current_df, train_df, how='inner', on=['Season', 'LTeamID'])
current_df = current_df.rename(columns=
                               {
                                   'AvgWinScore': 'LAvgWinScore',
                                   'AvgLossScore': 'LAvgLossScore',
                                   'AvgWinMargin': 'LAvgWinMargin',
                                   'AvgLossMargin': 'LAvgLossMargin',
                                   'AvgWinTotalScore': 'LAvgWinTotalScore',
                                   'AvgLossTotalScore': 'LAvgLossTotalScore'
                               })

# Reset train_df
train_df = train_df.rename(columns={'LTeamID': 'TeamID'})

# Subtract relative columns (WNumWins - LNumWins, WAvgWinMargin - LAvgWinMargin, etc.)
current_df['AvgWinScoreSum'] = current_df['WAvgWinScore'] + current_df['LAvgWinScore']
current_df['AvgLossScoreSum'] = current_df['WAvgLossScore'] + current_df['LAvgLossScore']
current_df['WAvgScore'] = (current_df['WAvgWinScore'] + current_df['WAvgLossScore']) / 2
current_df['LAvgScore'] = (current_df['LAvgWinScore'] + current_df['LAvgLossScore']) / 2
current_df['AvgTotalScore'] = (current_df['WAvgWinTotalScore'] + current_df['WAvgLossTotalScore'] + current_df['LAvgWinTotalScore'] + current_df['LAvgLossTotalScore']) / 4

# Drop unneeded columns
current_df = current_df.drop(['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], axis=1)
cols = current_df.columns.tolist()
current_df = current_df[cols[0:3] + cols[4:] + [cols[3]]]
current_df = current_df.drop(['Season', 'WTeamID', 'LTeamID'], axis=1)
current_df

Unnamed: 0,WAvgWinScore,WAvgLossScore,WAvgWinTotalScore,WAvgLossTotalScore,WAvgWinMargin,WAvgLossMargin,LAvgWinScore,LAvgLossScore,LAvgWinTotalScore,LAvgLossTotalScore,LAvgWinMargin,LAvgLossMargin,AvgWinScoreSum,AvgLossScoreSum,WAvgScore,LAvgScore,AvgTotalScore,Total
0,92.800000,75.000000,167.640000,160.000000,17.960000,10.000000,73.217391,53.875000,129.086957,115.5,17.347826,7.750000,166.017391,128.875000,83.900000,63.546196,143.056739,145
1,66.230769,58.062500,126.076923,123.437500,6.384615,7.312500,73.217391,53.875000,129.086957,115.5,17.347826,7.750000,139.448161,111.937500,62.146635,63.546196,123.525345,147
2,74.714286,69.636364,138.214286,148.818182,11.214286,9.545455,73.217391,53.875000,129.086957,115.5,17.347826,7.750000,147.931677,123.511364,72.175325,63.546196,132.904856,144
3,92.800000,75.000000,167.640000,160.000000,17.960000,10.000000,73.217391,53.875000,129.086957,115.5,17.347826,7.750000,166.017391,128.875000,83.900000,63.546196,143.056739,143
4,64.083333,55.307692,119.916667,121.692308,8.250000,11.076923,73.217391,53.875000,129.086957,115.5,17.347826,7.750000,137.300725,109.182692,59.695513,63.546196,121.548983,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161308,79.666667,64.230769,150.666667,141.769231,8.666667,13.307692,80.800000,72.733333,148.600000,154.2,13.000000,8.733333,160.466667,136.964103,71.948718,76.766667,148.808974,127
161309,76.058824,71.583333,144.470588,150.833333,7.647059,7.666667,80.800000,72.733333,148.600000,154.2,13.000000,8.733333,156.858824,144.316667,73.821078,76.766667,149.525980,150
161310,64.000000,57.423077,121.500000,134.153846,6.500000,19.307692,80.800000,72.733333,148.600000,154.2,13.000000,8.733333,144.800000,130.156410,60.711538,76.766667,139.613462,111
161311,84.266667,73.000000,157.866667,157.416667,10.666667,11.416667,80.800000,72.733333,148.600000,154.2,13.000000,8.733333,165.066667,145.733333,78.633333,76.766667,154.520833,178


## Train Regression Model

In [73]:
# Train on regular season results using XGBoost regression model
X = current_df.iloc[:, :-1]
y = current_df.iloc[:, -1]

train_X, test_X, train_y, test_y = train_test_split(X, y, 
                      test_size = 0.2, random_state = 42) 

d_train = xgb.DMatrix(train_X, train_y) # format data properly
params = {
    'booster': 'gblinear',
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'eta': '.01'
}
watchlist= [(d_train, "train")]
num_boost_round = 10000
model = xgb.train(params=params, dtrain=d_train, num_boost_round=num_boost_round, evals=watchlist, early_stopping_rounds=50, verbose_eval=50)
print("Finished Training")

model.save_model('./models/totals.model')

[0]	train-mae:117.38301
[20]	train-mae:14.62803
[40]	train-mae:14.52291
[60]	train-mae:14.51323
[80]	train-mae:14.50104
[100]	train-mae:14.48904
[120]	train-mae:14.47728
[140]	train-mae:14.46574
[160]	train-mae:14.45445
[180]	train-mae:14.44339
[200]	train-mae:14.43257
[220]	train-mae:14.42197
[240]	train-mae:14.41157
[260]	train-mae:14.40135
[280]	train-mae:14.39133
[300]	train-mae:14.38156
[320]	train-mae:14.37194
[340]	train-mae:14.36253
[360]	train-mae:14.35331
[380]	train-mae:14.34427
[400]	train-mae:14.33542
[420]	train-mae:14.32673
[440]	train-mae:14.31825
[460]	train-mae:14.30992
[480]	train-mae:14.30173
[500]	train-mae:14.29373
[520]	train-mae:14.28587
[540]	train-mae:14.27815
[560]	train-mae:14.27060
[580]	train-mae:14.26321
[600]	train-mae:14.25596
[620]	train-mae:14.24882
[640]	train-mae:14.24184
[660]	train-mae:14.23499
[680]	train-mae:14.22824
[700]	train-mae:14.22161
[720]	train-mae:14.21514
[740]	train-mae:14.20875
[760]	train-mae:14.20255
[780]	train-mae:14.19640
[800]

[6360]	train-mae:13.81690
[6380]	train-mae:13.81656
[6400]	train-mae:13.81623
[6420]	train-mae:13.81590
[6440]	train-mae:13.81557
[6460]	train-mae:13.81524
[6480]	train-mae:13.81489
[6500]	train-mae:13.81455
[6520]	train-mae:13.81422
[6540]	train-mae:13.81389
[6560]	train-mae:13.81356
[6580]	train-mae:13.81323
[6600]	train-mae:13.81288
[6620]	train-mae:13.81257
[6640]	train-mae:13.81225
[6660]	train-mae:13.81193
[6680]	train-mae:13.81159
[6700]	train-mae:13.81127
[6720]	train-mae:13.81094
[6740]	train-mae:13.81061
[6760]	train-mae:13.81027
[6780]	train-mae:13.80995
[6800]	train-mae:13.80964
[6820]	train-mae:13.80931
[6840]	train-mae:13.80898
[6860]	train-mae:13.80867
[6880]	train-mae:13.80835
[6900]	train-mae:13.80802
[6920]	train-mae:13.80772
[6940]	train-mae:13.80737
[6960]	train-mae:13.80705
[6980]	train-mae:13.80673
[7000]	train-mae:13.80643
[7020]	train-mae:13.80609
[7040]	train-mae:13.80579
[7060]	train-mae:13.80547
[7080]	train-mae:13.80516
[7100]	train-mae:13.80484
[7120]	train

## Test Regression Model

In [74]:
# Test model
d_test = xgb.DMatrix(test_X)
pred = model.predict(d_test)
print("Predictions: ", pred)

rmse = mean_squared_error(test_y, pred, squared=False)
print("Root Mean Squared Error : %f \n" %(rmse))

mae = mean_absolute_error(test_y, pred)
print("Mean Absolute Error : %f \n" %(mae))

r2 = r2_score(test_y, pred)
print("R-Squared : %f \n" %(r2))
# R-squared of 0 indicates that it is equivalent to guessing the average value over time (mean total points)

Predictions:  [131.15569 134.65544 144.4498  ... 136.43274 152.13007 141.17555]
Root Mean Squared Error : 17.538127 

Mean Absolute Error : 13.677145 

R-Squared : 0.330071 



Using regular season compact results, we achieve a rmse of 17.538, mae of 13.677, and r2 of 0.33