In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error, explained_variance_score, max_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# Load the prepared game-level dataset; the path is relative to this notebook's folder.
df = pd.read_csv(filepath_or_buffer="../../Data/win_loss_completed_df.csv")

In [3]:
# Season value held out from training and used later for matchup predictions.
HOLD_OUT_SEASON = 2022

# Feature columns: everything except the target ("pts") and the team-name columns.
# NOTE: "game_date" is still part of `cols` (and therefore of `pos_col`), so a
# "game_date is null" flag becomes a model feature; TeamMatchUpGuesser builds the
# same flag, so the two must stay in sync.
cols = list(df.drop(columns=["pts", "team1", "team2"]).columns)

# The CSV encodes missing values as the literal string "Null". Replace them in one
# vectorised pass instead of per-cell chained assignment (`df[col][row] = ...`),
# which is slow and does not write back to `df` under pandas Copy-on-Write.
df[cols] = df[cols].replace("Null", np.nan)

# Split the features into "-exp" columns and everything else, preserving order.
exp_col = [col for col in cols if "exp" in col]
pos_col = [col for col in cols if "exp" not in col]

# In the "-exp" columns the marker "R" is mapped to 0
# (presumably "rookie", i.e. zero experience -- TODO confirm against the data source).
df[exp_col] = df[exp_col].replace("R", 0)

# Hold out one season. `hold_out_df` keeps "game_date" because TeamMatchUpGuesser
# drops that column itself before predicting.
is_hold_out = df["game_date"] == HOLD_OUT_SEASON
hold_out_df = df[is_hold_out].copy()
working_df = df[~is_hold_out].drop(columns=["game_date"])

X = working_df.drop(columns=["pts"])
y = working_df["pts"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.2)

# Missing-value indicator for every non-"exp" column, computed once on `df` and
# attached to each split by row label so the flags line up with the right games.
null_flags = df[pos_col].isna().add_suffix(" is null")
X_train = pd.concat([X_train, null_flags.loc[X_train.index]], axis=1)
X_test = pd.concat([X_test, null_flags.loc[X_test.index]], axis=1)

# Mean-impute the numeric features; the imputer is fitted on the training split
# only, so no statistics leak from the test split.
team_name_cols = ["team1", "team2"]
num_imp = SimpleImputer(strategy='mean')
num_imp.fit(X_train.drop(columns=team_name_cols))

X_train_imp = num_imp.transform(X_train.drop(columns=team_name_cols))
X_test_imp = num_imp.transform(X_test.drop(columns=team_name_cols))

# Columns are intentionally left as positional integers (0..n-1): the model is
# fitted on these labels and TeamMatchUpGuesser builds its input the same way.
X_train_imp_df = pd.DataFrame(X_train_imp).astype(float)
X_test_imp_df = pd.DataFrame(X_test_imp).astype(float)


In [23]:
# Single-point "grid": GridSearchCV is used here only for its 5-fold
# cross-validated fit of one fixed hyper-parameter combination.
param_grid = {
    "max_depth": [5],
    "learning_rate" : [.1],
    "num_parallel_tree" : [15]
}
xgb_regres = xgb.XGBRegressor()

gs1 = GridSearchCV(xgb_regres, param_grid, cv=5, scoring='neg_mean_absolute_error')
gs1.fit(X_train_imp_df, y_train)

best_estimator1 = gs1.best_estimator_
y_hat_test1 = best_estimator1.predict(X_test_imp_df)

# The reported value is the *root* mean squared error, so label it as such.
# Taking the square root explicitly avoids the `squared=` keyword, which newer
# scikit-learn releases deprecated and then removed.
rmse_test1 = np.sqrt(mean_squared_error(y_test, y_hat_test1))
print(
    "Test set: root mean squared error", str(rmse_test1),
    "max error", str(max_error(y_test, y_hat_test1)),
    "Explained variance", str(explained_variance_score(y_test, y_hat_test1)),
)

Test set: mean squared error 11.523063236672044 max error 38.13196563720703 Explained variance 0.2816532774455486


In [24]:
# Inspect the hold-out rows (game_date == 2022) that were set aside before the train/test split.
hold_out_df

Unnamed: 0,game_date,pts,team1-1-power-forward,team1-1-point-guard,team1-1-shooting-guard,team1-1-small-forward,team1-1-center,team1-1-power-forward-exp,team1-1-point-guard-exp,team1-1-shooting-guard-exp,...,team2-2-shooting-guard,team2-2-small-forward,team2-2-center,team2-2-power-forward-exp,team2-2-point-guard-exp,team2-2-shooting-guard-exp,team2-2-small-forward-exp,team2-2-center-exp,team2,team1
0,2022,103,68,5,157,73,57,6,12,3,...,151,77,25,1,7,17,1,14,BOS,GSW
1,2022,90,105,63,5,87,71,2,1,5,...,150,13,84,9,17,0,7,17,GSW,BOS
2,2022,104,68,5,157,73,57,6,12,3,...,151,77,25,1,7,17,1,14,BOS,GSW
3,2022,94,105,63,5,87,71,2,1,5,...,150,13,84,9,17,0,7,17,GSW,BOS
4,2022,107,68,5,157,73,57,6,12,3,...,151,77,25,1,7,17,1,14,BOS,GSW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,2022,99,124,21,49,47,66,1,7,2,...,147,43,82,1,8,3,8,6,DAL,UTA
170,2022,130,89,48,35,6,60,1,9,5,...,133,11,22,0,2,2,4,2,MEM,MIN
171,2022,117,82,50,16,50,49,0,6,1,...,163,89,38,1,2,0,5,2,MIN,MEM
172,2022,123,68,5,157,73,57,6,12,3,...,119,62,32,7,1,2,7,10,DEN,GSW


In [25]:
# List every column name of the hold-out frame as a plain Python list.
list(hold_out_df.columns)

['game_date',
 'pts',
 'team1-1-power-forward',
 'team1-1-point-guard',
 'team1-1-shooting-guard',
 'team1-1-small-forward',
 'team1-1-center',
 'team1-1-power-forward-exp',
 'team1-1-point-guard-exp',
 'team1-1-shooting-guard-exp',
 'team1-1-small-forward-exp',
 'team1-1-center-exp',
 'team1-2-power-forward',
 'team1-2-point-guard',
 'team1-2-shooting-guard',
 'team1-2-small-forward',
 'team1-2-center',
 'team1-2-power-forward-exp',
 'team1-2-point-guard-exp',
 'team1-2-shooting-guard-exp',
 'team1-2-small-forward-exp',
 'team1-2-center-exp',
 'team2-1-power-forward',
 'team2-1-point-guard',
 'team2-1-shooting-guard',
 'team2-1-small-forward',
 'team2-1-center',
 'team2-1-power-forward-exp',
 'team2-1-point-guard-exp',
 'team2-1-shooting-guard-exp',
 'team2-1-small-forward-exp',
 'team2-1-center-exp',
 'team2-2-power-forward',
 'team2-2-point-guard',
 'team2-2-shooting-guard',
 'team2-2-small-forward',
 'team2-2-center',
 'team2-2-power-forward-exp',
 'team2-2-point-guard-exp',
 'team

In [26]:
def TeamMatchUpGuesser(estimator, holdout, team1, team2, imputer=None, null_flag_cols=None, n_rows=2):
    """Print actual vs. predicted points for the games between two teams.

    Rows of ``holdout`` are selected when the pair (team1, team2) appears in
    either order in the "team1"/"team2" columns. The feature matrix is built
    the same way as the training data: the "pts", "team1", "team2" and
    "game_date" columns are dropped, a "<col> is null" flag is appended for
    each column in ``null_flag_cols``, and the result is passed through
    ``imputer`` before calling ``estimator.predict``.

    Parameters
    ----------
    estimator : object with a ``predict`` method
        Fitted model used to produce the predictions.
    holdout : pandas.DataFrame
        Frame containing "pts", "team1", "team2", "game_date" and the feature columns.
    team1, team2 : str
        Team codes to match; order does not matter.
    imputer : object with a ``transform`` method, optional
        Defaults to the notebook-level ``num_imp``.
    null_flag_cols : list of str, optional
        Columns that get an "is null" indicator. Defaults to the
        notebook-level ``pos_col``.
    n_rows : int, default 2
        Number of matching rows to print.

    Returns
    -------
    None
        The result table is printed, not returned.

    Raises
    ------
    ValueError
        If ``holdout`` has no game between the two teams.
    """
    # Fall back to the objects created in the preprocessing cell so that
    # existing four-argument calls keep working unchanged.
    if imputer is None:
        imputer = num_imp
    if null_flag_cols is None:
        null_flag_cols = pos_col

    same_order = (holdout["team1"] == team1) & (holdout["team2"] == team2)
    swapped_order = (holdout["team2"] == team1) & (holdout["team1"] == team2)
    matchup_df = holdout[same_order | swapped_order]
    if matchup_df.empty:
        # Fail with a clear message instead of an opaque "0 samples" error
        # from the imputer further down.
        raise ValueError(
            "No hold-out games found between {!r} and {!r}.".format(team1, team2)
        )

    pred_input_df = matchup_df.drop(columns=["pts", "team2", "team1", "game_date"])
    # Whatever is not a model input (date, actual points, team names) is kept
    # for the printed report.
    names_df = matchup_df.drop(columns=pred_input_df.columns.tolist())

    for col in null_flag_cols:
        pred_input_df[col + " is null"] = matchup_df[col].isna()

    # Positional integer column labels, matching how the training frame was built.
    X_imp_df = pd.DataFrame(imputer.transform(pred_input_df)).astype(float)

    names_df["preds"] = list(estimator.predict(X_imp_df))
    print(names_df.head(n_rows))

In [31]:
# Print actual vs. predicted points for the PHX / NOP rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="PHX", team2="NOP")

    game_date  pts team2 team1      preds
92       2022  115   NOP   PHX  96.834679
93       2022  109   PHX   NOP  98.242401


In [32]:
# Print actual vs. predicted points for the DAL / UTA rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="DAL", team2="UTA")

    game_date  pts team2 team1       preds
90       2022   96   DAL   UTA  105.654686
91       2022   98   UTA   DAL  104.502266


In [34]:
# Print actual vs. predicted points for the GSW / DEN rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="GSW", team2="DEN")

    game_date  pts team2 team1       preds
96       2022   98   GSW   DEN  102.212250
97       2022  102   DEN   GSW  110.154427


In [35]:
# Print actual vs. predicted points for the MEM / MIN rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="MEM", team2="MIN")

    game_date  pts team2 team1       preds
88       2022  114   MIN   MEM  110.054733
89       2022  106   MEM   MIN  111.541801


In [37]:
# Print actual vs. predicted points for the PHX / DAL rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="PHX", team2="DAL")

    game_date  pts team2 team1       preds
36       2022  123   PHX   DAL   99.704498
37       2022   90   DAL   PHX  101.441490


In [38]:
# Print actual vs. predicted points for the GSW / MEM rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="GSW", team2="MEM")

    game_date  pts team2 team1       preds
40       2022   96   GSW   MEM  105.595879
41       2022  110   MEM   GSW  115.032928


In [39]:
# Print actual vs. predicted points for the DAL / GSW rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="DAL", team2="GSW")

    game_date  pts team2 team1       preds
16       2022  110   GSW   DAL   99.404427
17       2022  120   DAL   GSW  112.799065


In [40]:
# Print actual vs. predicted points for the BOS / GSW rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="BOS", team2="GSW")

   game_date  pts team2 team1       preds
0       2022  103   BOS   GSW  113.530663
1       2022   90   GSW   BOS  104.345093


In [41]:
# Print actual vs. predicted points for the BOS / MIA rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="BOS", team2="MIA")

    game_date  pts team2 team1       preds
12       2022  100   MIA   BOS  103.773232
13       2022   96   BOS   MIA  108.654068


In [42]:
# Print actual vs. predicted points for the PHI / MIA rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="PHI", team2="MIA")

    game_date  pts team2 team1       preds
46       2022   90   MIA   PHI  105.211060
47       2022   99   PHI   MIA  101.445618


In [43]:
# Print actual vs. predicted points for the BOS / MIL rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="BOS", team2="MIL")

    game_date  pts team2 team1       preds
38       2022  109   MIL   BOS   97.702866
39       2022   81   BOS   MIL  111.522240


In [44]:
# Print actual vs. predicted points for the BOS / BKN rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="BOS", team2="BKN")

     game_date  pts team2 team1       preds
110       2022  116   BKN   BOS   98.102722
111       2022  112   BOS   BKN  107.422798


In [45]:
# Print actual vs. predicted points for the CHI / MIL rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="CHI", team2="MIL")

    game_date  pts team2 team1       preds
98       2022  116   CHI   MIL  100.612846
99       2022  100   MIL   CHI   95.668053


In [46]:
# Print actual vs. predicted points for the ATL / MIA rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="ATL", team2="MIA")

     game_date  pts team2 team1       preds
104       2022   94   MIA   ATL  102.437065
105       2022   97   ATL   MIA  109.888428


In [47]:
# Print actual vs. predicted points for the PHI / TOR rows of the hold-out set.
TeamMatchUpGuesser(estimator=best_estimator1, holdout=hold_out_df, team1="PHI", team2="TOR")

    game_date  pts team2 team1       preds
94       2022  132   TOR   PHI  104.307091
95       2022   97   PHI   TOR  104.712906
