In [1]:
import pandas as pd
import numpy as np
import pprint
import os
import matplotlib.pyplot as plt
import datetime

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

pd.set_option('display.max_columns', None)

## Research Question
Can we predict game outcomes using all past seasons' team statistics?
For instance, if teams A and B are playing, can we use all the past seasons' statistics for team A and B to predict who will win?

#### Loading Each Dataset

In [2]:
# loading each dataset into a dictionary
datasets = {}
for file_name in os.listdir('archive'):
    if file_name.endswith('.csv'):
        datasets[file_name.split('.')[0]] = pd.read_csv('archive/' + file_name, low_memory=False)

list(datasets.keys())

['teams', 'players', 'games', 'ranking', 'games_details']

#### Cleaning and Organizing


In [3]:
# only keeping the team id, abbreviation, nickname, city, and arena capacity for each team in the teams dataset
# replacing NaN and 0 values with the mean
datasets['teams'] = datasets['teams'][['TEAM_ID', 'ABBREVIATION', 'NICKNAME', 'CITY', 'ARENACAPACITY']].fillna(0).replace(0, datasets['teams']["ARENACAPACITY"].mean().round(0))

datasets['game_details'] = datasets['games_details'][["GAME_ID", "TEAM_ID", "MIN", "FG_PCT", "FG3_PCT", "FT_PCT", 
                                                    "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PTS", "PLUS_MINUS"]].dropna()
datasets['game_details']['MIN'] = datasets['game_details']['MIN'].str.split(':').apply(lambda x: float(x[0]) + float(x[1])/60).round(3)

datasets['ranking'] = datasets['ranking'][["TEAM_ID", "SEASON_ID", "STANDINGSDATE", "CONFERENCE", "G", "W_PCT"]]
datasets['ranking']["STANDINGSDATE"] = pd.to_datetime(datasets['ranking']["STANDINGSDATE"])

datasets['games'] = datasets['games'].drop(['GAME_STATUS_TEXT', "TEAM_ID_home", "TEAM_ID_away"], axis=1)
datasets['games']["GAME_DATE_EST"] = pd.to_datetime(datasets['games']["GAME_DATE_EST"])

#### Dataset Preprocessing

In [4]:
# aggregating the win percentages by team on that date
# first joining for the home team
datasets['games'] = datasets['games'].merge(datasets['ranking'], left_on=['HOME_TEAM_ID', 'GAME_DATE_EST'], right_on=['TEAM_ID', 'STANDINGSDATE'], how='left', suffixes=('', '_home_ranking')).rename(columns={'W_PCT': 'HOME_TEAM_W_PCT', 'CONFERENCE': 'HOME_TEAM_CONFERENCE'}).drop(['TEAM_ID', 'STANDINGSDATE', 'G'], axis=1)
# now we do the same for the away team
datasets['games'] = datasets['games'].merge(datasets['ranking'], left_on=['VISITOR_TEAM_ID', 'GAME_DATE_EST'], right_on=['TEAM_ID', 'STANDINGSDATE'], how='left', suffixes=('', '_away_ranking')).rename(columns={'W_PCT': 'VISITOR_TEAM_W_PCT', 'CONFERENCE': 'VISITOR_TEAM_CONFERENCE'}).drop(['TEAM_ID', 'STANDINGSDATE', 'G', 'SEASON_ID_away_ranking', 'SEASON_ID'], axis=1)

#fixing datatypes
datasets['games']['SEASON'] = datasets['games']['SEASON'].astype('str')

# nan values only for numerical variables, so we can fill them in with average
datasets['games'].fillna(datasets['games'].select_dtypes(include=['float64', 'int64']).mean(), inplace=True)

In [5]:
# now we have compiled a comprehensive dataset that we can do feature engineering on to make our models
# taking out the identifiers: game_id, home_id, visitor_team_id and copying the dataset
df = datasets['games'].copy().drop(['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'GAME_DATE_EST'], axis=1)
df.head()

Unnamed: 0,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,HOME_TEAM_CONFERENCE,HOME_TEAM_W_PCT,VISITOR_TEAM_CONFERENCE,VISITOR_TEAM_W_PCT
0,2022,126.0,0.484,0.926,0.382,25.0,46.0,117.0,0.478,0.815,0.321,23.0,44.0,1,West,0.613,West,0.323
1,2022,120.0,0.488,0.952,0.457,16.0,40.0,112.0,0.561,0.765,0.333,20.0,37.0,1,West,0.543,East,0.364
2,2022,114.0,0.482,0.786,0.313,22.0,37.0,106.0,0.47,0.682,0.433,20.0,46.0,1,East,0.667,East,0.71
3,2022,113.0,0.441,0.909,0.297,27.0,49.0,93.0,0.392,0.735,0.261,15.0,46.0,1,East,0.6,East,0.235
4,2022,108.0,0.429,1.0,0.378,22.0,47.0,110.0,0.5,0.773,0.292,20.0,47.0,0,East,0.5,East,0.419


In [6]:
# normalizing and ohe the data
categorical_columns = ['SEASON', 'HOME_TEAM_CONFERENCE', 'VISITOR_TEAM_CONFERENCE']
numerical_columns = ['PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home', 'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away', 'HOME_TEAM_W_PCT', 'VISITOR_TEAM_W_PCT']
target = df.pop('HOME_TEAM_WINS')

scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore')

transformer = ColumnTransformer([('scaler', scaler, numerical_columns), ('ohe', ohe, categorical_columns)], remainder='passthrough')

transformed_df = transformer.fit_transform(df)

ohe_columns = transformer.named_transformers_['ohe'].get_feature_names_out(input_features=categorical_columns)
all_columns = numerical_columns + list(ohe_columns)

# Convert the numpy array to a DataFrame
df_transformed = pd.DataFrame(transformed_df, columns=all_columns)

# adding back the target variable to the dataset
df_transformed['HOME_TEAM_WINS'] = target

df_transformed.head()

Unnamed: 0,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_W_PCT,VISITOR_TEAM_W_PCT,SEASON_2003,SEASON_2004,SEASON_2005,SEASON_2006,SEASON_2007,SEASON_2008,SEASON_2009,SEASON_2010,SEASON_2011,SEASON_2012,SEASON_2013,SEASON_2014,SEASON_2015,SEASON_2016,SEASON_2017,SEASON_2018,SEASON_2019,SEASON_2020,SEASON_2021,SEASON_2022,HOME_TEAM_CONFERENCE_East,HOME_TEAM_CONFERENCE_West,VISITOR_TEAM_CONFERENCE_East,VISITOR_TEAM_CONFERENCE_West,HOME_TEAM_WINS
0,1.69919,0.410891,1.648311,0.233954,0.419333,0.396775,1.218498,0.509733,0.544237,-0.260829,0.291614,0.289059,0.51402,-0.904342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1
1,1.246794,0.481598,1.907018,0.909851,-1.316714,-0.51038,0.845829,2.00686,0.059811,-0.150969,-0.290883,-0.784524,0.160757,-0.697327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1
2,0.794397,0.375537,0.255273,-0.38787,-0.159349,-0.963957,0.398626,0.365432,-0.744336,0.764533,-0.290883,0.595796,0.786538,1.04968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1
3,0.718998,-0.349213,1.479156,-0.532062,0.805122,0.850352,-0.570313,-1.041506,-0.230844,-0.81013,-1.261712,0.595796,0.448414,-1.348668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1
4,0.342001,-0.561336,2.384631,0.197907,-0.159349,0.547967,0.696761,0.906562,0.137319,-0.526325,-0.290883,0.749165,-0.056248,-0.419624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0


In [12]:
# checking for correlation between the variables and the target variable
df_transformed.corr()['HOME_TEAM_WINS'].sort_values(ascending=False)[:12]

HOME_TEAM_WINS                  1.000000
FG_PCT_home                     0.431611
HOME_TEAM_W_PCT                 0.396118
PTS_home                        0.394714
AST_home                        0.301019
FG3_PCT_home                    0.300957
REB_home                        0.245151
FT_PCT_home                     0.092896
HOME_TEAM_CONFERENCE_West       0.041981
VISITOR_TEAM_CONFERENCE_East    0.035941
SEASON_2012                     0.013025
SEASON_2010                     0.012279
Name: HOME_TEAM_WINS, dtype: float64

### Models

In [9]:
# first we split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df_transformed.drop('HOME_TEAM_WINS', axis=1), df_transformed['HOME_TEAM_WINS'], test_size=0.2, random_state=42)

In [14]:
# now we fit a logistic regression frequentist GLM model
frequentist_model = sm.Logit(y_train, X_train).fit()
print(frequentist_model.summary())

Optimization terminated successfully.
         Current function value: 0.001785
         Iterations 22
                           Logit Regression Results                           
Dep. Variable:         HOME_TEAM_WINS   No. Observations:                21336
Model:                          Logit   Df Residuals:                    21300
Method:                           MLE   Df Model:                           35
Date:                Mon, 04 Dec 2023   Pseudo R-squ.:                  0.9974
Time:                        01:26:29   Log-Likelihood:                -38.091
converged:                       True   LL-Null:                       -14479.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
PTS_home                       258.4194     49.272      5.245      0.00

  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


In [13]:
predicted_frequentist = frequentist_model.predict(X_test)
predicted_frequentist = np.where(predicted_frequentist > 0.5, 1, 0)
RMSE_frequentist = np.sqrt(np.mean((predicted_frequentist - y_test)**2))
print("RMSE: ", RMSE_frequentist)

RMSE:  0.041072774693139975


  return 1/(1+np.exp(-X))
