In [1]:
import pandas as pd
# Read the games-and-odds table; the relative path is resolved against the
# kernel's working directory.
df = pd.read_csv(filepath_or_buffer='baseGamesAndOdds.csv')
# Print the column summary (dtypes and non-null counts) as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16979 entries, 0 to 16978
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              16979 non-null  int64  
 1   GAME_DATE_EST   16979 non-null  object 
 2   GAME_ID         16979 non-null  int64  
 3   SEASON          16979 non-null  int64  
 4   HOME_TEAM_ID    16979 non-null  int64  
 5   HOME_W%         16979 non-null  float64
 6   AWAY_TEAM_ID    16979 non-null  int64  
 7   AWAY_W%         16979 non-null  float64
 8   SPREAD          8602 non-null   float64
 9   HOME_TEAM_WINS  16979 non-null  int64  
 10  MARGIN          16979 non-null  float64
dtypes: float64(4), int64(6), object(1)
memory usage: 1.4+ MB


In [2]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

## Algo: Choose the team with the higher current win %

In [3]:
# Baseline pick: predict a home win (1) when the home team's current win
# percentage is at least the away team's, otherwise predict an away win (0).
# Because the comparison is >=, equal win percentages go to the home team.
# The comparison already yields a boolean Series, so it is cast directly
# instead of being routed through np.where(cond, True, False).
df['W%_PRED'] = (df['HOME_W%'] >= df['AWAY_W%']).astype('int32')
df.head()

Unnamed: 0,ID,GAME_DATE_EST,GAME_ID,SEASON,HOME_TEAM_ID,HOME_W%,AWAY_TEAM_ID,AWAY_W%,SPREAD,HOME_TEAM_WINS,MARGIN,W%_PRED
0,0,2019-04-10,21801220,2018,1610612737,0.354,1610612754,0.585,-5.25,0,-1.0,0
1,1,2019-04-10,21801221,2018,1610612751,0.512,1610612748,0.476,-4.58,1,19.0,1
2,2,2019-04-10,21801222,2018,1610612766,0.476,1610612753,0.512,-3.5,0,-8.0,0
3,3,2019-04-10,21801223,2018,1610612752,0.207,1610612765,0.5,8.58,0,-26.0,0
4,4,2019-04-10,21801224,2018,1610612755,0.622,1610612741,0.268,-6.42,1,16.0,1


In [4]:
# Score the win-% baseline against the actual game result.
y_actual, y_pred_algo = df['HOME_TEAM_WINS'], df['W%_PRED']

# Confusion matrix: rows are the actual class, columns the predicted class.
print(confusion_matrix(y_actual, y_pred_algo))

# accuracy  = (tp + tn) / (p + n)
# precision = tp / (tp + fp)
# recall    = tp / (tp + fn)
for report_line, metric in (
    ('Accuracy: %f', accuracy_score),
    ('Precision: %f', precision_score),
    ('Recall: %f', recall_score),
):
    print(report_line % metric(y_actual, y_pred_algo))

[[5091 1822]
 [3058 7008]]
Accuracy: 0.712586
Precision: 0.793658
Recall: 0.696205


## Algo: Choose the team that Vegas favors

In [5]:
# Keep only the games that have a betting line. Restricting dropna to the
# SPREAD column matches that intent: a missing value in some other column can
# no longer silently remove a game. (In the data as loaded, SPREAD is the only
# column with nulls, so the surviving rows are the same.)
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on pandas versions that flag
# frames derived from a row filter.
df = df.dropna(subset=['SPREAD']).copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8602 entries, 0 to 8608
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              8602 non-null   int64  
 1   GAME_DATE_EST   8602 non-null   object 
 2   GAME_ID         8602 non-null   int64  
 3   SEASON          8602 non-null   int64  
 4   HOME_TEAM_ID    8602 non-null   int64  
 5   HOME_W%         8602 non-null   float64
 6   AWAY_TEAM_ID    8602 non-null   int64  
 7   AWAY_W%         8602 non-null   float64
 8   SPREAD          8602 non-null   float64
 9   HOME_TEAM_WINS  8602 non-null   int64  
 10  MARGIN          8602 non-null   float64
 11  W%_PRED         8602 non-null   int32  
dtypes: float64(4), int32(1), int64(6), object(1)
memory usage: 840.0+ KB


In [6]:
# Vegas pick: predict a home win (1) when SPREAD is zero or negative, otherwise
# an away win (0). A spread of exactly 0 is therefore counted as a home pick.
# The boolean comparison is cast directly rather than going through
# np.where(cond, True, False), which produced the same values.
df['VEGAS_PRED'] = (df['SPREAD'] <= 0).astype('int32')
df.head(50)

Unnamed: 0,ID,GAME_DATE_EST,GAME_ID,SEASON,HOME_TEAM_ID,HOME_W%,AWAY_TEAM_ID,AWAY_W%,SPREAD,HOME_TEAM_WINS,MARGIN,W%_PRED,VEGAS_PRED
0,0,2019-04-10,21801220,2018,1610612737,0.354,1610612754,0.585,-5.25,0,-1.0,0,1
1,1,2019-04-10,21801221,2018,1610612751,0.512,1610612748,0.476,-4.58,1,19.0,1,1
2,2,2019-04-10,21801222,2018,1610612766,0.476,1610612753,0.512,-3.5,0,-8.0,0,1
3,3,2019-04-10,21801223,2018,1610612752,0.207,1610612765,0.5,8.58,0,-26.0,0,0
4,4,2019-04-10,21801224,2018,1610612755,0.622,1610612741,0.268,-6.42,1,16.0,1,1
5,5,2019-04-10,21801225,2018,1610612763,0.402,1610612744,0.695,5.42,1,15.0,0,0
6,6,2019-04-10,21801226,2018,1610612749,0.732,1610612760,0.598,4.08,0,-11.0,1,0
7,7,2019-04-10,21801227,2018,1610612759,0.585,1610612742,0.402,-14.0,1,11.0,1,1
8,8,2019-04-10,21801228,2018,1610612743,0.659,1610612750,0.439,-12.67,1,4.0,1,1
9,9,2019-04-10,21801229,2018,1610612746,0.585,1610612762,0.61,-7.33,1,6.0,0,1


In [7]:
# Score the Vegas pick against the actual game result. Only the rows that
# survived the earlier dropna (games with a SPREAD value) are evaluated here.
y_actual, y_pred_algo = df['HOME_TEAM_WINS'], df['VEGAS_PRED']

# Confusion matrix: rows are the actual class, columns the predicted class.
print(confusion_matrix(y_actual, y_pred_algo))

# accuracy  = (tp + tn) / (p + n)
# precision = tp / (tp + fp)
# recall    = tp / (tp + fn)
for report_line, metric in (
    ('Accuracy: %f', accuracy_score),
    ('Precision: %f', precision_score),
    ('Recall: %f', recall_score),
):
    print(report_line % metric(y_actual, y_pred_algo))

[[1886 1666]
 [1031 4019]]
Accuracy: 0.686468
Precision: 0.706948
Recall: 0.795842


## Adding a column for whether the home team beats the spread
- Potentially another classification target?

In [8]:
# Label each game 1 when the home margin is strictly greater than the negated
# spread, else 0. A margin exactly equal to -SPREAD is labelled 0 because the
# comparison is strict.
# The boolean comparison is cast directly rather than going through
# np.where(cond, True, False), which produced the same values.
df['HOME_TEAM_BEATS_SPREAD'] = (df['MARGIN'] > -df['SPREAD']).astype('int32')
df.head(50)

Unnamed: 0,ID,GAME_DATE_EST,GAME_ID,SEASON,HOME_TEAM_ID,HOME_W%,AWAY_TEAM_ID,AWAY_W%,SPREAD,HOME_TEAM_WINS,MARGIN,W%_PRED,VEGAS_PRED,HOME_TEAM_BEATS_SPREAD
0,0,2019-04-10,21801220,2018,1610612737,0.354,1610612754,0.585,-5.25,0,-1.0,0,1,0
1,1,2019-04-10,21801221,2018,1610612751,0.512,1610612748,0.476,-4.58,1,19.0,1,1,1
2,2,2019-04-10,21801222,2018,1610612766,0.476,1610612753,0.512,-3.5,0,-8.0,0,1,0
3,3,2019-04-10,21801223,2018,1610612752,0.207,1610612765,0.5,8.58,0,-26.0,0,0,0
4,4,2019-04-10,21801224,2018,1610612755,0.622,1610612741,0.268,-6.42,1,16.0,1,1,1
5,5,2019-04-10,21801225,2018,1610612763,0.402,1610612744,0.695,5.42,1,15.0,0,0,1
6,6,2019-04-10,21801226,2018,1610612749,0.732,1610612760,0.598,4.08,0,-11.0,1,0,0
7,7,2019-04-10,21801227,2018,1610612759,0.585,1610612742,0.402,-14.0,1,11.0,1,1,0
8,8,2019-04-10,21801228,2018,1610612743,0.659,1610612750,0.439,-12.67,1,4.0,1,1,0
9,9,2019-04-10,21801229,2018,1610612746,0.585,1610612762,0.61,-7.33,1,6.0,0,1,0


In [9]:
# Score the win-% baseline against the beats-the-spread label.
# NOTE(review): W%_PRED was derived as a "home team wins" pick, while the
# target here is "home team beats the spread" -- confirm that scoring one
# against the other is the intended baseline.
y_actual, y_pred_algo = df['HOME_TEAM_BEATS_SPREAD'], df['W%_PRED']

# Confusion matrix: rows are the actual class, columns the predicted class.
print(confusion_matrix(y_actual, y_pred_algo))

# accuracy  = (tp + tn) / (p + n)
# precision = tp / (tp + fp)
# recall    = tp / (tp + fn)
for report_line, metric in (
    ('Accuracy: %f', accuracy_score),
    ('Precision: %f', precision_score),
    ('Recall: %f', recall_score),
):
    print(report_line % metric(y_actual, y_pred_algo))

[[2302 2062]
 [1833 2405]]
Accuracy: 0.547198
Precision: 0.538393
Recall: 0.567485
