In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the joined game-level table from a CSV in the current working directory.
# NOTE(review): the preview shows "Unnamed: 0"-style columns, which usually means a
# saved index was read back as data — confirm before relying on column positions.
all_data = pd.read_csv('gamelvl_joined.csv')
# Preview the first rows to sanity-check the load.
all_data.head()

Unnamed: 0.1,Unnamed: 0,year,game_ID,home_numberGame,home_ID_team,home_isB2B,home_isB2BFirst,home_isB2BSecond,home_name,home_countDaysRest,...,away_points,away_plusminus,away_ind_Win,away_ind_SecondB2B,away_ind_overtime,away_offensive_rating,away_defensive_rating,away_RAPTOR_offensive,away_RAPTOR_defensive,away_RAPTOR_net
0,1,2018,21700002,1,1610612744,False,True,False,GSW,120,...,122,1,1,0,0,114.7,106.1,6.1,2.5,8.7
1,2,2018,21700001,1,1610612739,False,True,False,CLE,120,...,99,-3,0,0,0,107.6,103.9,-1.0,4.7,3.7
2,3,2018,21700010,1,1610612762,False,True,False,UTA,120,...,96,-10,0,0,0,112.5,111.0,3.9,-2.3,1.5
3,4,2018,21700011,1,1610612759,False,False,False,SAS,120,...,99,-8,0,0,0,113.4,111.1,4.8,-2.4,2.3
4,5,2018,21700007,2,1610612738,True,False,True,BOS,0,...,108,8,1,0,0,109.8,110.1,1.2,-1.5,-0.3


In [3]:
# Derived per-game features that may help predict the spread of NBA games.
# The columns are appended to all_data in a fixed order; later cells slice
# columns by position/range, so the insertion order below matters.

def _estimated_possessions(fga, fta, off_reb, turnovers):
    """Possession estimate: FGA + 0.44 * FTA - offensive rebounds + turnovers."""
    return fga + .44 * fta - off_reb + turnovers


home_possessions = _estimated_possessions(
    all_data["home_fieldgoal_attempts"],
    all_data["home_freethrow_attempts"],
    all_data["home_off_reb"],
    all_data["home_turnovers"],
)
away_possessions = _estimated_possessions(
    all_data["away_fieldgoal_attempts"],
    all_data["away_freethrow_attempts"],
    all_data["away_off_reb"],
    all_data["away_turnovers"],
)

# Every value is computed from the original columns only, so building them
# up front and assigning afterwards gives the same result as assigning one by one.
engineered_columns = {
    # Target: home points minus away points (negative when the away team scores more).
    "spread": all_data["home_points"] - all_data["away_points"],
    # Free-throw rate: free-throw attempts per field-goal attempt.
    "home_ftr": all_data["home_freethrow_attempts"] / all_data["home_fieldgoal_attempts"],
    "away_ftr": all_data["away_freethrow_attempts"] / all_data["away_fieldgoal_attempts"],
    # Pace: estimated possessions divided by 48 (presumably regulation minutes).
    "home_pace": home_possessions / 48,
    "away_pace": away_possessions / 48,
    # Turnover rate: turnovers per estimated possession.
    "home_turnover_rate": all_data["home_turnovers"] / home_possessions,
    "away_turnover_rate": all_data["away_turnovers"] / away_possessions,
    # Assist-to-turnover ratio.
    "home_atr": all_data["home_assists"] / all_data["home_turnovers"],
    "away_atr": all_data["away_assists"] / all_data["away_turnovers"],
    # Human-readable key, e.g. "HOU @ GSW"; used later as a merge key.
    "matchup": all_data["away_name"] + " @ " + all_data["home_name"],
}
for column_name, column_values in engineered_columns.items():
    all_data[column_name] = column_values

all_data.head()

Unnamed: 0.1,Unnamed: 0,year,game_ID,home_numberGame,home_ID_team,home_isB2B,home_isB2BFirst,home_isB2BSecond,home_name,home_countDaysRest,...,spread,home_ftr,away_ftr,home_pace,away_pace,home_turnover_rate,away_turnover_rate,home_atr,away_atr,matchup
0,1,2018,21700002,1,1610612744,False,True,False,GSW,120,...,-1,0.2625,0.195876,2.088333,2.2575,0.169593,0.11997,2.0,2.153846,HOU @ GSW
1,2,2018,21700001,1,1610612739,False,True,False,CLE,120,...,3,0.301205,0.284091,2.125,2.125,0.166667,0.117647,1.117647,2.0,BOS @ CLE
2,3,2018,21700010,1,1610612762,False,True,False,UTA,120,...,10,0.197531,0.155844,2.021667,2.005833,0.154575,0.218114,1.866667,1.0,DEN @ UTA
3,4,2018,21700011,1,1610612759,False,False,False,SAS,120,...,8,0.213483,0.270588,2.0075,2.023333,0.114155,0.133855,2.0,1.769231,MIN @ SAS
4,5,2018,21700007,2,1610612738,True,False,True,BOS,0,...,-8,0.230769,0.394737,2.171667,2.045833,0.143899,0.152749,1.533333,1.266667,MIL @ BOS


In [4]:
def get_averages(year:int, data=None) -> pd.DataFrame:
    """Build season-average features for every away-at-home pairing in one season.

    For the games of ``year`` this computes each team's mean of the numeric
    ``home*`` columns over its home games and each team's mean of the numeric
    ``away*`` columns over its away games, then pairs every away team with
    every *other* home team.

    Parameters
    ----------
    year : int
        Value of the ``year`` column to filter on.
    data : pandas.DataFrame, optional
        Game-level table with at least ``year``, ``home_name`` and
        ``away_name`` columns. Defaults to the notebook-level ``all_data``.

    Returns
    -------
    pandas.DataFrame
        One row per ordered (away, home) pair of distinct teams, holding the
        away-side averages, the home-side averages and a ``matchup`` column
        formatted as ``"<away_name> @ <home_name>"``. The row index is the
        one left over from filtering the cross join (not reset).
    """
    if data is None:
        data = all_data  # fall back to the notebook-level game table

    season_games = data[data['year'] == year]

    # numeric_only=True is required on pandas >= 2.0, where averaging a frame
    # that still contains string columns (team names, matchup) raises TypeError.
    # mean() skips NA values by default.
    home_means = season_games.groupby('home_name').mean(numeric_only=True)

    # Keep only the columns containing "home": averages when playing at home.
    home_columns = [col for col in home_means.columns if 'home' in col]
    average_values_by_home_team = home_means[home_columns].reset_index()

    # Same for the away side.
    # NOTE(review): only the away side drops teams that have a NaN in ANY
    # averaged column (dropna runs before the column filter); the home side
    # does not. Kept as-is to preserve results — confirm this is intended.
    away_means = season_games.groupby('away_name').mean(numeric_only=True).dropna()
    away_columns = [col for col in away_means.columns if 'away' in col]
    average_values_by_away_team = away_means[away_columns].reset_index()

    # Every away team against every home team, minus a team "playing itself".
    joined_averages = pd.merge(average_values_by_away_team, average_values_by_home_team, how='cross')
    different_teams = joined_averages['away_name'] != joined_averages['home_name']
    joined_averages = joined_averages[different_teams].copy()
    joined_averages['matchup'] = joined_averages['away_name'] + ' @ ' + joined_averages['home_name']

    # The team-name columns are now encoded in 'matchup'.
    return joined_averages.drop(columns=['away_name', 'home_name'])

# Build the matchup-average table for each season and stack them.
# The per-season frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies all earlier rows on every
# iteration, and concatenating onto an empty DataFrame can emit a
# FutureWarning in recent pandas versions.
yearly_averages = []
for year in range(2018,2025):
    # .assign returns a new frame with 'year' appended as the last column.
    year_averages = get_averages(year).assign(year=year)
    yearly_averages.append(year_averages)

full_averages_table = pd.concat(yearly_averages, ignore_index=True)

full_averages_table


Unnamed: 0,away_numberGame,away_ID_team,away_isB2B,away_isB2BFirst,away_isB2BSecond,away_countDaysRest,away_countDaysNextGame,away_isWin,away_fieldgoal_made,away_fieldgoal_attempts,...,home_defensive_rating,home_RAPTOR_offensive,home_RAPTOR_defensive,home_RAPTOR_net,home_ftr,home_pace,home_turnover_rate,home_atr,matchup,year
0,40.707317,1.610613e+09,0.463415,0.146341,0.170732,4.121951,1.073171,0.195122,37.878049,86.146341,...,110.6,-1.7,-2.0,-3.8,0.268236,2.110163,0.142100,1.676886,ATL @ BKN,2018
1,40.707317,1.610613e+09,0.463415,0.146341,0.170732,4.121951,1.073171,0.195122,37.878049,86.146341,...,103.9,-1.0,4.7,3.7,0.241794,2.067764,0.136055,1.958005,ATL @ BOS,2018
2,40.707317,1.610613e+09,0.463415,0.146341,0.170732,4.121951,1.073171,0.195122,37.878049,86.146341,...,109.1,0.8,-0.5,0.3,0.321661,2.128374,0.131674,1.709707,ATL @ CHA,2018
3,40.707317,1.610613e+09,0.463415,0.146341,0.170732,4.121951,1.073171,0.195122,37.878049,86.146341,...,110.9,-4.9,-2.2,-7.1,0.204270,2.101545,0.133281,1.956472,ATL @ CHI,2018
4,40.707317,1.610613e+09,0.463415,0.146341,0.170732,4.121951,1.073171,0.195122,37.878049,86.146341,...,111.9,4.2,-3.3,1.0,0.279776,2.097012,0.135101,1.868456,ATL @ CLE,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6085,35.000000,1.610613e+09,0.368421,0.184211,0.131579,4.342105,1.210526,0.210526,42.684211,92.631579,...,117.9,-6.6,-2.4,-9.0,0.240264,2.096820,0.146045,1.641533,WAS @ POR,2024
6086,35.000000,1.610613e+09,0.368421,0.184211,0.131579,4.342105,1.210526,0.210526,42.684211,92.631579,...,115.9,1.6,-0.4,1.2,0.246741,2.150768,0.126110,2.522138,WAS @ SAC,2024
6087,35.000000,1.610613e+09,0.368421,0.184211,0.131579,4.342105,1.210526,0.210526,42.684211,92.631579,...,117.3,-5.1,-1.8,-6.9,0.230536,2.162741,0.146028,2.087578,WAS @ SAS,2024
6088,35.000000,1.610613e+09,0.368421,0.184211,0.131579,4.342105,1.210526,0.210526,42.684211,92.631579,...,118.6,-2.6,-3.1,-5.7,0.234390,2.107457,0.138769,2.180957,WAS @ TOR,2024


In [5]:
# Keys and target from the game-level table. Selected by name rather than by
# hard-coded column position so the cell keeps working if columns are added
# to or removed from all_data upstream.
game_targets = all_data[['year', 'matchup', 'spread']]

# Attach the season-average features of the two teams to every game.
# NOTE(review): the averages are taken over the whole season, i.e. they include
# the game being predicted and later games — possible target leakage; confirm.
merged_table = pd.merge(game_targets, full_averages_table, on=['year', 'matchup'], how='inner')


def _column_span(columns, first, stop=None, include_stop=False):
    """Names of `columns` from `first` up to `stop` (end of the index when `stop` is None)."""
    start = columns.get_loc(first)
    if stop is None:
        end = len(columns)
    else:
        end = columns.get_loc(stop) + (1 if include_stop else 0)
    return list(columns[start:end])


# Box-score averages stop right before "*_points", which skips the outcome-style
# columns sitting between "*_points" and "*_offensive_rating".
feature_columns = (
    _column_span(merged_table.columns, "away_fieldgoal_made", "away_points")
    + _column_span(merged_table.columns, "away_offensive_rating", "away_atr", include_stop=True)
    + _column_span(merged_table.columns, "home_fieldgoal_made", "home_points")
    + _column_span(merged_table.columns, "home_offensive_rating")
)

final_table = merged_table[['year', 'matchup', 'spread'] + feature_columns]
final_table.columns

Index(['year', 'matchup', 'spread', 'away_fieldgoal_made',
       'away_fieldgoal_attempts', 'away_fieldgoal_percent',
       'away_fieldgoal3_made', 'away_fieldgoal3_attempts',
       'away_fieldgoal3_percent', 'away_freethrow_percent',
       'away_fieldgoal2_made', 'away_fieldgoal2_attempts', 'away_pctFG2',
       'away_freethrow_made', 'away_freethrow_attempts', 'away_off_reb',
       'away_def_reb', 'away_total_reb', 'away_assists', 'away_steals',
       'away_blocks', 'away_turnovers', 'away_personalfouls',
       'away_offensive_rating', 'away_defensive_rating',
       'away_RAPTOR_offensive', 'away_RAPTOR_defensive', 'away_RAPTOR_net',
       'away_ftr', 'away_pace', 'away_turnover_rate', 'away_atr',
       'home_fieldgoal_made', 'home_fieldgoal_attempts',
       'home_fieldgoal_percent', 'home_fieldgoal3_made',
       'home_fieldgoal3_attempts', 'home_fieldgoal3_percent',
       'home_freethrow_percent', 'home_fieldgoal2_made',
       'home_fieldgoal2_attempts', 'home_pctFG2',

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score


# Every column after the first three of final_table is used as a model feature.
# NOTE(review): this relies on the first three columns being 'year', 'matchup'
# and 'spread' (as in the column listing printed by the previous cell).
predictors = final_table.columns[3:]

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline. random_state is fixed so that a
# "Restart Kernel -> Run All" reproduces the same scores.
model = RandomForestRegressor(n_estimators=200, min_samples_split=100, random_state=42)

# X represents the features and y represents the spread
X = final_table[predictors]
y = final_table['spread']

# Split the data into train and test sets (80/20). The seed is fixed so the
# hold-out set is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

rf_preds = model.predict(X_test)

# Hold-out error, in points of spread.
random_forest_mae = mean_absolute_error(y_test, rf_preds)
print("Random Forest MAE: ", random_forest_mae)

# 5-fold cross-validation on the full data set; cross_val_score fits fresh
# clones of `model`, so the fit above is not reused.
scores_rf = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')

# Scores are negated MAE (scikit-learn maximises scores), so flip the sign
# to report the mean as a positive error.
print("Cross-Validation Scores:", scores_rf)
print("Mean Score:", scores_rf.mean() * -1)


Random Forest MAE:  10.005129555052456
Cross-Validation Scores: [ -9.71897216  -9.96632555 -10.77850732 -10.42193641 -10.46000119]
Mean Score: 10.269148525223315


In [8]:
from sklearn.linear_model import LinearRegression

# Fixed seed: a reproducible 80/20 split, so the hold-out MAE does not change
# from run to run and can be compared with other models evaluated with the same seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Ordinary least squares baseline.
model_2 = LinearRegression()
model_2.fit(X_train, y_train)

linear_reg_preds = model_2.predict(X_test)

# Hold-out error, in points of spread.
lin_mae = mean_absolute_error(y_test, linear_reg_preds)
print("Linear Regression Mean Absolute Error:", lin_mae)

# 5-fold cross-validation on the full data set (fresh clones of model_2 are fitted).
scores_lin_reg = cross_val_score(model_2, X, y, cv=5, scoring='neg_mean_absolute_error')

# Scores are negated MAE, so flip the sign to report a positive mean error.
print("Cross-Validation scores_lin_reg:", scores_lin_reg)
print("Mean Score:", scores_lin_reg.mean() * -1)


Linear Regression Mean Absolute Error: 9.96481457271782
Cross-Validation scores_lin_reg: [ -9.61486929  -9.77681266 -10.59667857 -10.25349963 -10.30967509]
Mean Score: 10.110307049340033


In [9]:
from sklearn.linear_model import Ridge

# Fixed seed: a reproducible 80/20 split, so the hold-out MAE does not change
# from run to run and can be compared with other models evaluated with the same seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Ridge regression with scikit-learn's default regularisation strength.
# NOTE(review): the features are not standardised, so the L2 penalty weighs
# columns according to their raw scale — confirm this is acceptable.
model_3 = Ridge()
model_3.fit(X_train, y_train)

ridge_reg_preds = model_3.predict(X_test)

# Hold-out error, in points of spread.
ridge_mae = mean_absolute_error(y_test, ridge_reg_preds)
print("Ridge Regression Mean Absolute Error:", ridge_mae)

# 5-fold cross-validation on the full data set (fresh clones of model_3 are fitted).
scores_ridge_reg = cross_val_score(model_3, X, y, cv=5, scoring='neg_mean_absolute_error')

# Scores are negated MAE, so flip the sign to report a positive mean error.
print("Cross-Validation scores_ridge_reg:", scores_ridge_reg)
print("Mean Score:", scores_ridge_reg.mean() * -1)

Ridge Regression Mean Absolute Error: 10.08546185852544
Cross-Validation scores_ridge_reg: [ -9.60496714  -9.77964005 -10.56474821 -10.25316546 -10.30745809]
Mean Score: 10.101995789804594


In [10]:
from sklearn import tree

# Fixed seed: a reproducible 80/20 split, so the hold-out MAE does not change
# from run to run and can be compared with other models evaluated with the same seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Single decision tree with default (unconstrained) settings. random_state makes
# the tie-breaking between equally good splits deterministic.
model_4 = tree.DecisionTreeRegressor(random_state=42)
model_4.fit(X_train, y_train)

tree_preds = model_4.predict(X_test)

# Hold-out error, in points of spread.
tree_mae = mean_absolute_error(y_test, tree_preds)
print("Decision Tree Mean Absolute Error:", tree_mae)

# 5-fold cross-validation on the full data set (fresh clones of model_4 are fitted).
scores_dt = cross_val_score(model_4, X, y, cv=5, scoring='neg_mean_absolute_error')

# Scores are negated MAE, so flip the sign to report a positive mean error.
print("Cross-Validation scores_dt:", scores_dt)
print("Mean Score:", scores_dt.mean() * -1)

Decision Tree Mean Absolute Error: 14.582723577235774
Cross-Validation scores_dt: [-15.40884146 -13.61463415 -14.72073171 -14.10799268 -15.07138499]
Mean Score: 14.584716997276743


In [11]:
from sklearn import neural_network
from sklearn.model_selection import cross_val_score

# Fixed seed: a reproducible 80/20 split, so the hold-out MAE does not change
# from run to run and can be compared with other models evaluated with the same seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Multi-layer perceptron with default architecture. random_state fixes the
# weight initialisation and batch shuffling so results are reproducible.
# NOTE(review): the inputs are passed unscaled; MLPs are sensitive to feature
# scale, so consider standardising the features before fitting.
model_5 = neural_network.MLPRegressor(random_state=42)
model_5.fit(X_train, y_train)

neural_net_preds = model_5.predict(X_test)

# Hold-out error, in points of spread.
neural_net_mae = mean_absolute_error(y_test, neural_net_preds)
print("Neural Network Mean Absolute Error:", neural_net_mae)

# NOTE(review): leftover from the linear-regression cell — nothing in this cell
# uses `linear_regression`. Kept in case a later cell reads it; delete otherwise.
linear_regression = LinearRegression()

# 5-fold cross-validation on the full data set (fresh clones of model_5 are fitted).
scores_nn = cross_val_score(model_5, X, y, cv=5, scoring='neg_mean_absolute_error')

# Scores are negated MAE, so flip the sign to report a positive mean error.
print("Cross-Validation scores_nn:", scores_nn)
print("Mean Score:", scores_nn.mean() * -1)


Neural Network Mean Absolute Error: 10.292752684304016
Cross-Validation scores_nn: [ -9.8321967   -9.83672655 -10.7106927  -10.26303724 -10.52115106]
Mean Score: 10.232760851709875
