In [28]:
import pandas as pd
import numpy as np
import pprint
import os
import matplotlib.pyplot as plt
import pymc as pm
import arviz as az
import datetime, time

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import statsmodels.api as sm

pd.set_option('display.max_columns', None)

#### Research Question 1: How well can we predict the winner of basketball games using key categorical and numerical data (# assists, conference, win percentage etc.)?

#### Loading and Cleaning up the data

In [2]:
# loading the data
datasets = {}
for file_name in os.listdir('archive'):
    if file_name.endswith('.csv'):
        datasets[file_name.split('.')[0]] = pd.read_csv('archive/' + file_name, low_memory=False)

list(datasets.keys())

['teams', 'players', 'games', 'ranking', 'games_details']

In [3]:
# cleaning each dataset in the dictionary, keeping only relevant columns

# filling missing values with the mean for stadiums capacity
datasets['teams'] = datasets['teams'][['TEAM_ID', 'ABBREVIATION', 'NICKNAME', 'CITY', 'ARENACAPACITY']].fillna(0).replace(0, datasets['teams']["ARENACAPACITY"].mean().round(0))

# dropping missing values and converting the minutes played into a float
datasets['game_details'] = datasets['games_details'][["GAME_ID", "TEAM_ID", "MIN", "FG_PCT", "FG3_PCT", "FT_PCT", 
                                                    "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PTS", "PLUS_MINUS"]].dropna()
datasets['game_details']['MIN'] = datasets['game_details']['MIN'].str.split(':').apply(lambda x: float(x[0]) + float(x[1])/60).round(3)

# converting the date into a datetime object
datasets['ranking'] = datasets['ranking'][["TEAM_ID", "SEASON_ID", "STANDINGSDATE", "CONFERENCE", "G", "W_PCT"]]
datasets['ranking']["STANDINGSDATE"] = pd.to_datetime(datasets['ranking']["STANDINGSDATE"])

# converting the date into a datetime object
datasets['games'] = datasets['games'].drop(['GAME_STATUS_TEXT', "TEAM_ID_home", "TEAM_ID_away"], axis=1)
datasets['games']["GAME_DATE_EST"] = pd.to_datetime(datasets['games']["GAME_DATE_EST"])

#### Data Preprocessing

In [4]:
# combining the win percentage of the home and away team based on the date of the game
# home teams
datasets['games'] = datasets['games'].merge(datasets['ranking'], left_on=['HOME_TEAM_ID', 'GAME_DATE_EST'], right_on=['TEAM_ID', 'STANDINGSDATE'], how='left', suffixes=('', '_home_ranking')).rename(columns={'W_PCT': 'HOME_TEAM_W_PCT', 'CONFERENCE': 'HOME_TEAM_CONFERENCE'}).drop(['TEAM_ID', 'STANDINGSDATE', 'G'], axis=1)

# away teams
datasets['games'] = datasets['games'].merge(datasets['ranking'], left_on=['VISITOR_TEAM_ID', 'GAME_DATE_EST'], right_on=['TEAM_ID', 'STANDINGSDATE'], how='left', suffixes=('', '_away_ranking')).rename(columns={'W_PCT': 'VISITOR_TEAM_W_PCT', 'CONFERENCE': 'VISITOR_TEAM_CONFERENCE'}).drop(['TEAM_ID', 'STANDINGSDATE', 'G', 'SEASON_ID_away_ranking', 'SEASON_ID'], axis=1)

# creating a new column for the difference in win percentage between the home and away team
datasets['games']['W_PCT_DIFF'] = datasets['games']['HOME_TEAM_W_PCT'] - datasets['games']['VISITOR_TEAM_W_PCT']

# fixing data type for the season year for possible ohe later
datasets['games']['SEASON'] = datasets['games']['SEASON'].astype('str')

# replacing nan values with the average for that column
datasets['games'].fillna(datasets['games'].select_dtypes(include=['float64', 'int64']).mean(), inplace=True)

datasets['games'].head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,HOME_TEAM_CONFERENCE,HOME_TEAM_W_PCT,VISITOR_TEAM_CONFERENCE,VISITOR_TEAM_W_PCT,W_PCT_DIFF
0,2022-12-22,22200477,1610612740,1610612759,2022,126.0,0.484,0.926,0.382,25.0,46.0,117.0,0.478,0.815,0.321,23.0,44.0,1,West,0.613,West,0.323,0.29
1,2022-12-22,22200478,1610612762,1610612764,2022,120.0,0.488,0.952,0.457,16.0,40.0,112.0,0.561,0.765,0.333,20.0,37.0,1,West,0.543,East,0.364,0.179
2,2022-12-21,22200466,1610612739,1610612749,2022,114.0,0.482,0.786,0.313,22.0,37.0,106.0,0.47,0.682,0.433,20.0,46.0,1,East,0.667,East,0.71,-0.043
3,2022-12-21,22200467,1610612755,1610612765,2022,113.0,0.441,0.909,0.297,27.0,49.0,93.0,0.392,0.735,0.261,15.0,46.0,1,East,0.6,East,0.235,0.365
4,2022-12-21,22200468,1610612737,1610612741,2022,108.0,0.429,1.0,0.378,22.0,47.0,110.0,0.5,0.773,0.292,20.0,47.0,0,East,0.5,East,0.419,0.081


In [5]:
# now we can drop the columns we don't need i.e. the identifiers and the win percentages
df = datasets['games'].copy().drop(['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'GAME_DATE_EST', 'HOME_TEAM_W_PCT', 'VISITOR_TEAM_W_PCT'], axis=1)
df.head()

Unnamed: 0,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,HOME_TEAM_CONFERENCE,VISITOR_TEAM_CONFERENCE,W_PCT_DIFF
0,2022,126.0,0.484,0.926,0.382,25.0,46.0,117.0,0.478,0.815,0.321,23.0,44.0,1,West,West,0.29
1,2022,120.0,0.488,0.952,0.457,16.0,40.0,112.0,0.561,0.765,0.333,20.0,37.0,1,West,East,0.179
2,2022,114.0,0.482,0.786,0.313,22.0,37.0,106.0,0.47,0.682,0.433,20.0,46.0,1,East,East,-0.043
3,2022,113.0,0.441,0.909,0.297,27.0,49.0,93.0,0.392,0.735,0.261,15.0,46.0,1,East,East,0.365
4,2022,108.0,0.429,1.0,0.378,22.0,47.0,110.0,0.5,0.773,0.292,20.0,47.0,0,East,East,0.081


In [6]:
# now we can standardize the numerical variables and one hot encode the categorical variables
categorical_columns = ['SEASON', 'HOME_TEAM_CONFERENCE', 'VISITOR_TEAM_CONFERENCE']
numerical_columns = ['PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home', 'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away', 'W_PCT_DIFF']
target_column = df.pop('HOME_TEAM_WINS')

scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# applying the transformations to the dataframe
transformer = ColumnTransformer([('scaler', scaler, numerical_columns), ('ohe', ohe, categorical_columns)], remainder='passthrough')
transformed_df = transformer.fit_transform(df)

# converting the transformed data back to a df
ohe_columns = transformer.named_transformers_['ohe'].get_feature_names_out(input_features=categorical_columns)
all_columns = numerical_columns + list(ohe_columns)
transformed_df = pd.DataFrame(transformed_df, columns=all_columns)
df_transformed = pd.concat([transformed_df, target_column], axis=1)

df_transformed.head()

Unnamed: 0,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,W_PCT_DIFF,SEASON_2003,SEASON_2004,SEASON_2005,SEASON_2006,SEASON_2007,SEASON_2008,SEASON_2009,SEASON_2010,SEASON_2011,SEASON_2012,SEASON_2013,SEASON_2014,SEASON_2015,SEASON_2016,SEASON_2017,SEASON_2018,SEASON_2019,SEASON_2020,SEASON_2021,SEASON_2022,HOME_TEAM_CONFERENCE_East,HOME_TEAM_CONFERENCE_West,VISITOR_TEAM_CONFERENCE_East,VISITOR_TEAM_CONFERENCE_West,HOME_TEAM_WINS
0,1.69919,0.410891,1.648311,0.233954,0.419333,0.396775,1.218498,0.509733,0.544237,-0.260829,0.291614,0.289059,0.918867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1
1,1.246794,0.481598,1.907018,0.909851,-1.316714,-0.51038,0.845829,2.00686,0.059811,-0.150969,-0.290883,-0.784524,0.555849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1
2,0.794397,0.375537,0.255273,-0.38787,-0.159349,-0.963957,0.398626,0.365432,-0.744336,0.764533,-0.290883,0.595796,-0.170187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1
3,0.718998,-0.349213,1.479156,-0.532062,0.805122,0.850352,-0.570313,-1.041506,-0.230844,-0.81013,-1.261712,0.595796,1.164149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1
4,0.342001,-0.561336,2.384631,0.197907,-0.159349,0.547967,0.696761,0.906562,0.137319,-0.526325,-0.290883,0.749165,0.235347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0


In [7]:
# counting the number of datapoints for all seasons 2003-2022
season_columns = [col for col in df_transformed.columns if col.startswith('SEASON_')]
season_counts = df_transformed[season_columns].sum(axis=0).sort_values(ascending=False)
season_counts / len(df_transformed)

SEASON_2005    0.053691
SEASON_2013    0.053504
SEASON_2008    0.053429
SEASON_2009    0.053391
SEASON_2010    0.053316
SEASON_2012    0.053241
SEASON_2006    0.053204
SEASON_2014    0.053166
SEASON_2015    0.053091
SEASON_2007    0.052904
SEASON_2016    0.052679
SEASON_2021    0.052079
SEASON_2003    0.051929
SEASON_2017    0.051817
SEASON_2018    0.051667
SEASON_2004    0.051067
SEASON_2020    0.047580
SEASON_2019    0.046530
SEASON_2011    0.041393
SEASON_2022    0.020322
dtype: float64

##### The season statistics are somewhat equal for all seasons, thus they do not seem statistically significant, so we can drop these

In [8]:
df_transformed.corr()['HOME_TEAM_WINS'].sort_values(ascending=False)

HOME_TEAM_WINS                  1.000000
W_PCT_DIFF                      0.482869
FG_PCT_home                     0.431611
PTS_home                        0.394714
AST_home                        0.301019
FG3_PCT_home                    0.300957
REB_home                        0.245151
FT_PCT_home                     0.092896
HOME_TEAM_CONFERENCE_West       0.041981
VISITOR_TEAM_CONFERENCE_East    0.035941
SEASON_2012                     0.013025
SEASON_2010                     0.012279
SEASON_2007                     0.011459
SEASON_2005                     0.008566
SEASON_2004                     0.008457
SEASON_2009                     0.007131
SEASON_2008                     0.006930
SEASON_2015                     0.003311
SEASON_2011                     0.003022
SEASON_2006                     0.002031
SEASON_2022                     0.000444
SEASON_2018                     0.000363
SEASON_2017                    -0.002162
SEASON_2016                    -0.002998
SEASON_2003     

In [9]:
# dropping the season columns
df_transformed.drop(season_columns, axis=1, inplace=True)
df_transformed.head()

# also drop the points scored by the home team and the away team as they are highly correlated with the target variable
df_transformed.drop(['PTS_home', 'PTS_away'], axis=1, inplace=True)

## Modeling

#### Frequentist GLM

In [10]:
# splitting the data into train and test sets
X = sm.add_constant(df_transformed.drop('HOME_TEAM_WINS', axis=1))
y = df_transformed['HOME_TEAM_WINS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Frequentist GLM model
frequentist_model = sm.Logit(y_train, X_train).fit()
frequentist_model.summary()

Optimization terminated successfully.
         Current function value: 0.280343
         Iterations 8


0,1,2,3
Dep. Variable:,HOME_TEAM_WINS,No. Observations:,18669.0
Model:,Logit,Df Residuals:,18654.0
Method:,MLE,Df Model:,14.0
Date:,"Mon, 04 Dec 2023",Pseudo R-squ.:,0.5873
Time:,21:56:57,Log-Likelihood:,-5233.7
converged:,True,LL-Null:,-12682.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.4382,5.36e+05,8.18e-07,1.000,-1.05e+06,1.05e+06
FG_PCT_home,1.4497,0.043,33.739,0.000,1.365,1.534
FT_PCT_home,0.4321,0.026,16.455,0.000,0.381,0.484
FG3_PCT_home,0.5805,0.029,19.731,0.000,0.523,0.638
AST_home,0.2847,0.033,8.592,0.000,0.220,0.350
REB_home,0.6059,0.032,19.039,0.000,0.544,0.668
FG_PCT_away,-1.4171,0.042,-33.732,0.000,-1.499,-1.335
FT_PCT_away,-0.4100,0.026,-15.564,0.000,-0.462,-0.358
FG3_PCT_away,-0.5386,0.029,-18.630,0.000,-0.595,-0.482


In [12]:
# calculating the RMSE for the training and test sets
y_test_pred = frequentist_model.predict(X_test)
predicted_outcome = np.where(y_test_pred > 0.5, 1, 0)
RMSE = np.sqrt(np.mean((y_test - predicted_outcome)**2))
print(f'RMSE: {RMSE}')

RMSE: 0.35597516826303355


In [13]:
# instead of 2 columns for home and away metrics, we combine them into one by calcluating 
# the difference between the home and away metrics
df_diff = df_transformed.copy()
df_diff['FG_PCT_DIFF'] = df_transformed['FG_PCT_home'] - df_transformed['FG_PCT_away']
df_diff['FT_PCT_DIFF'] = df_transformed['FT_PCT_home'] - df_transformed['FT_PCT_away']
df_diff['FG3_PCT_DIFF'] = df_transformed['FG3_PCT_home'] - df_transformed['FG3_PCT_away']
df_diff['AST_DIFF'] = df_transformed['AST_home'] - df_transformed['AST_away']
df_diff['REB_DIFF'] = df_transformed['REB_home'] - df_transformed['REB_away']
df_diff.drop(['FG_PCT_home', 'FG_PCT_away', 'FT_PCT_home', 'FT_PCT_away', 'FG3_PCT_home', 'FG3_PCT_away', 'AST_home', 'AST_away', 'REB_home', 'REB_away'], axis=1, inplace=True)

# renormalizing the numerical columns with the new columns
numerical_columns = ['FG_PCT_DIFF', 'FT_PCT_DIFF', 'FG3_PCT_DIFF', 'AST_DIFF', 'REB_DIFF', 'W_PCT_DIFF']
scaler = StandardScaler()
df_diff[numerical_columns] = scaler.fit_transform(df_diff[numerical_columns])

df_diff.head()

Unnamed: 0,W_PCT_DIFF,HOME_TEAM_CONFERENCE_East,HOME_TEAM_CONFERENCE_West,VISITOR_TEAM_CONFERENCE_East,VISITOR_TEAM_CONFERENCE_West,HOME_TEAM_WINS,FG_PCT_DIFF,FT_PCT_DIFF,FG3_PCT_DIFF,AST_DIFF,REB_DIFF
0,0.918867,0.0,1.0,0.0,1.0,1,-0.071398,0.795346,0.349352,0.09774,0.078879
1,0.555849,0.0,1.0,1.0,0.0,1,-1.101755,1.33068,0.749015,-0.785041,0.200752
2,-0.170187,1.0,0.0,1.0,0.0,1,0.007299,0.720093,-0.813679,0.10066,-1.142185
3,1.164149,1.0,0.0,1.0,0.0,1,0.50007,1.231841,0.196336,1.581695,0.186407
4,0.235347,1.0,0.0,1.0,0.0,0,-1.060319,1.618906,0.511359,0.10066,-0.147335


In [14]:
# re-splitting the data into train and test sets
X_diff = sm.add_constant(df_diff.drop('HOME_TEAM_WINS', axis=1))
y_diff = df_diff['HOME_TEAM_WINS']
X_train_diff, X_test_diff, y_train_diff, y_test_diff = train_test_split(X_diff, y_diff, test_size=0.3, random_state=42)

# fitting the model
frequentist_model_diff = sm.Logit(y_train_diff, X_train_diff).fit()
frequentist_model_diff.summary()

Optimization terminated successfully.
         Current function value: 0.280465
         Iterations 21


0,1,2,3
Dep. Variable:,HOME_TEAM_WINS,No. Observations:,18669.0
Model:,Logit,Df Residuals:,18658.0
Method:,MLE,Df Model:,10.0
Date:,"Mon, 04 Dec 2023",Pseudo R-squ.:,0.5871
Time:,21:57:00,Log-Likelihood:,-5236.0
converged:,True,LL-Null:,-12682.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.4384,,,,,
W_PCT_DIFF,1.4583,0.037,39.093,0.000,1.385,1.531
HOME_TEAM_CONFERENCE_East,0.2044,,,,,
HOME_TEAM_CONFERENCE_West,0.2340,,,,,
VISITOR_TEAM_CONFERENCE_East,0.2157,,,,,
VISITOR_TEAM_CONFERENCE_West,0.2227,,,,,
FG_PCT_DIFF,1.9811,0.044,45.487,0.000,1.896,2.067
FT_PCT_DIFF,0.5844,0.027,21.844,0.000,0.532,0.637
FG3_PCT_DIFF,0.7905,0.030,26.308,0.000,0.732,0.849


In [15]:
# calculating the RMSE for the training and test sets
y_test_pred_diff = frequentist_model_diff.predict(X_test_diff)
predicted_outcome = np.where(y_test_pred_diff > 0.5, 1, 0)
RMSE = np.sqrt(np.mean((y_test_diff - predicted_outcome)**2))
print(f'RMSE: {RMSE}')

RMSE: 0.35544818771019693


In [16]:
# AIC and BIC for the two models
print(f'Frequentist model AIC: {frequentist_model.aic}')
print(f'Frequentist model BIC: {frequentist_model.bic}')
print(f'Frequentist model with difference AIC: {frequentist_model_diff.aic}')
print(f'Frequentist model with difference BIC: {frequentist_model_diff.bic}')

Frequentist model AIC: 10497.4392666059
Frequentist model BIC: 10614.95856170446
Frequentist model with difference AIC: 10494.01031395076
Frequentist model with difference BIC: 10580.19113035637


#### Bayesian GLM

In [17]:
# renaming the columns for the pymc model
df_pymc = df_diff.copy().rename(columns={'FG_PCT_DIFF': 'fg', 'FT_PCT_DIFF': 'ft', 
                                        'FG3_PCT_DIFF': 'fg3', 'AST_DIFF': 'ast', 
                                        'REB_DIFF': 'reb', 'W_PCT_DIFF': 'w_pct',
                                        'HOME_TEAM_WINS': 'win', 
                                        'HOME_TEAM_CONFERENCE_East': 'h_east',
                                        'HOME_TEAM_CONFERENCE_West': 'h_west', 
                                        'VISITOR_TEAM_CONFERENCE_East': 'v_east',
                                        'VISITOR_TEAM_CONFERENCE_West': 'v_west'})
df_pymc.head()

Unnamed: 0,w_pct,h_east,h_west,v_east,v_west,win,fg,ft,fg3,ast,reb
0,0.918867,0.0,1.0,0.0,1.0,1,-0.071398,0.795346,0.349352,0.09774,0.078879
1,0.555849,0.0,1.0,1.0,0.0,1,-1.101755,1.33068,0.749015,-0.785041,0.200752
2,-0.170187,1.0,0.0,1.0,0.0,1,0.007299,0.720093,-0.813679,0.10066,-1.142185
3,1.164149,1.0,0.0,1.0,0.0,1,0.50007,1.231841,0.196336,1.581695,0.186407
4,0.235347,1.0,0.0,1.0,0.0,0,-1.060319,1.618906,0.511359,0.10066,-0.147335


In [None]:
with pm.Model() as logistic_model:
    # first i define the priors
    intercept = pm.Normal('intercept', mu=0, sigma=10)
    beta = pm.Normal('beta', mu=0, sigma=10, shape=len(df_pymc.columns)-1)
    # make the model
    eq = intercept + pm.math.dot(df_pymc.drop('win', axis=1), beta)
    # calculate the likelihood
    inv = pm.math.invlogit(eq)
    likelihood = pm.Bernoulli('likelihood', p=inv, observed=df_pymc['win'])

    # default taking too long so approximating with ADVI
    approx = pm.fit(n=30000, method='advi')
    advi_trace = approx.sample(draws=500)

    trace = pm.sample(500, tune=1000, cores=4, target_accept=0.8, return_inferencedata=True)

### Non-parametric method 1: Random Forest

In [18]:
# here we make use of our diff dataset to predict the outcome of a game using a random
# forest classifier
df_random_forest = df_diff.copy()
y = df_random_forest.pop('HOME_TEAM_WINS')
X = df_random_forest

# splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [29]:
# defining the rf classifier and calculating the total runtime
random_forest = RandomForestClassifier(random_state=42)
start = time.time()
random_forest.fit(X_train, y_train)

# making predictions
y_pred = random_forest.predict(X_test)
end = time.time()
print(f"Runtime: {end - start} seconds")

Runtime: 1.7046799659729004 seconds


In [23]:
# all my evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Report:\n {report}')

Accuracy: 0.8689077730567358
Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84      3230
           1       0.89      0.89      0.89      4772

    accuracy                           0.87      8002
   macro avg       0.86      0.86      0.86      8002
weighted avg       0.87      0.87      0.87      8002



#### Non-parametric method 2: Neural Networks

In [24]:
# similar to rf, we use the diff dataset to predict the outcome of a game using a neural
# network classifier
df_nn = df_diff.copy()
y = df_nn.pop('HOME_TEAM_WINS')
X = df_nn

# splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [30]:
# first i define the nn classifier
nn = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000, random_state=42)
start = time.time()
nn.fit(X_train, y_train)

# making predictions
y_pred = nn.predict(X_test)
end = time.time()
print(f"Runtime: {end - start} seconds")

Runtime: 47.34456706047058 seconds


In [31]:
# all my evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Report:\n {report}')

Accuracy: 0.8422894276430892
Report:
               precision    recall  f1-score   support

           0       0.79      0.83      0.81      3230
           1       0.88      0.85      0.87      4772

    accuracy                           0.84      8002
   macro avg       0.84      0.84      0.84      8002
weighted avg       0.84      0.84      0.84      8002

