Import Libraries & Data

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import json

import h2o
from h2o.automl import H2OAutoML

# import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import (auc, classification_report, roc_auc_score, accuracy_score,
#                              f1_score, log_loss, roc_curve, confusion_matrix,
#                              precision_score, recall_score, plot_confusion_matrix,
#                              make_scorer)

# Import Data
# X_train = pd.read_csv("../data/pipeline/X_train.csv")
# y_train = pd.read_csv("../data/pipeline/y_train.csv")
# X_test = pd.read_csv("../data/pipeline/X_test.csv")
# y_test = pd.read_csv("../data/pipeline/y_test.csv")

# Load the single training table produced by the data pipeline.
TRAIN_CSV_PATH = "../data/pipeline/train.csv"
train = pd.read_csv(TRAIN_CSV_PATH)

H2O AutoML Training

In [3]:
# Initialize H2O
# h2o.init()
# h2o.demo("glm")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM (build 21.0.1+12-29, mixed mode, sharing)
  Starting server from C:\Users\Peter\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Peter\AppData\Local\Temp\tmp__26aims
  JVM stdout: C:\Users\Peter\AppData\Local\Temp\tmp__26aims\h2o_Peter_started_from_python.out
  JVM stderr: C:\Users\Peter\AppData\Local\Temp\tmp__26aims\h2o_Peter_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,2 days
H2O_cluster_name:,H2O_from_python_Peter_k7gun8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.927 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [None]:
# Start (or connect to) a local H2O cluster
h2o.init()

# Convert the pandas DataFrame to an H2OFrame
train_h2o = h2o.H2OFrame(train)

# Cast the target to a factor (categorical) column so the task is treated as classification
target_col = 'team1_win'
train_h2o[target_col] = train_h2o[target_col].asfactor()

# 80/20 train/test split, seeded so the split is reproducible
train_h2o, test = train_h2o.split_frame(ratios = [0.8], seed = 42)

# Feature columns: every column whose name contains one of these markers
feature_markers = ('diff', 'ratio', 'pythag')
feature_cols = [col for col in train.columns if any(marker in col for marker in feature_markers)]

# Define the AutoML run: at most 10 models, 5-fold CV, log loss as the stopping metric
aml = H2OAutoML(max_models = 10, seed = 42, nfolds = 5, balance_classes=True, stopping_metric='logloss')

# Train on the feature columns against the target.
# `y` must be the factor column prepared above ('team1_win'); the earlier
# 'team_win1' spelling did not match the column used everywhere else in this cell.
aml.train(x = feature_cols, y = target_col, training_frame = train_h2o)

# View leaderboard
lb = aml.leaderboard
print(lb)

# Assign best model
cbb_model = aml.leader

# Shut down the H2O cluster (h2o.shutdown() is the deprecated spelling of this call).
# NOTE(review): `cbb_model` refers to a model held by the cluster, so it presumably
# cannot be used for prediction after this point — save/export it first if needed.
h2o.cluster().shutdown()

Extras

In [None]:
#Assign Model
# model_t = XGBClassifier(n_estimators= 70, max_depth=4, eta = .05, subsample = .9, colsample_bytree = .8)
#Utilize Cross Validation To Further Enhance Model
# cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=1)
# scores = cross_val_score(model_t, X_train, y_train, cv=cv, scoring = 'neg_log_loss', n_jobs = -1, error_score = 'raise')
# scores = abs(scores)
# print('Mean Log Loss: %.3f (%.3f)' % (scores.mean(), scores.std()) )

In [None]:
# Build the model
# NOTE(review): `xgb`, `X`, `Y`, `x` and `y` are not defined in the cells shown here
# (the `import xgboost as xgb` line in the first cell is commented out) — confirm
# where they are expected to come from before running this cell.
xgb_params = dict(
    objective="binary:logistic",
    random_state=42,
    eta=.04,
    max_depth=6,
    min_child_weight=3,
    n_estimators=100,
    gamma=.6,
    reg_lambda=.2,
    subsample=1,
    colsample_bytree=.99,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit the model, monitoring log loss on the evaluation set with early stopping after 5 rounds
xgb_model.fit(
    X,
    Y,
    early_stopping_rounds=5,
    eval_metric='logloss',
    eval_set=[(x, y)],
)

In [None]:
# Refit a smaller model chosen after the cross-validation experiments, trading a
# little performance for less over-fitting.
# NOTE(review): `XGBClassifier`, `X_train`, `y_train`, `X_test`, `y_test` are not
# defined in the cells shown here — confirm their source.
tuned_params = {
    "n_estimators": 40,
    "max_depth": 4,
    "eta": .05,
    "subsample": .8,
    "colsample_bytree": .9,
}
model_t = XGBClassifier(**tuned_params)

# Header for the per-round evaluation log; its column order matches eval_set below
print("                 Test Set                        Train Set")

# Evaluate on both sets every round so over-fitting is visible as the two diverge
model_t.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test), (X_train, y_train)],
)

In [None]:
# Hyperparameter tuning
# Every entry is a single-value list, so this grid currently holds exactly one
# candidate; add more values per key to actually search.
param_grid = {'eta': [.0035],
              'objective':['binary:logistic'],
              'max_depth': [6],
              'min_child_weight': [1],
              'n_estimators': [8],
              'gamma': [.44],
              'reg_lambda' : [.55],
              'subsample': [1],
              'colsample_bytree': [.5]}

# Score candidates by negative log loss on predicted probabilities.
# The built-in 'neg_log_loss' scorer is the same thing as
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), but does not
# depend on the `needs_proba` argument, which scikit-learn deprecated in 1.4 and
# removed in 1.6.
LogLoss = 'neg_log_loss'
optimal_params = GridSearchCV(xgb_model, param_grid, n_jobs = 4, scoring = LogLoss, verbose = 0, cv = 3)

# Run the 3-fold search
optimal_params.fit(X, Y)

In [None]:
# Classification metrics on the hold-out predictions.
# A notebook cell only displays its last expression, so the metrics are gathered
# into one Series; otherwise everything except log loss is computed and thrown away.
# NOTE(review): this expects `test` to carry 'prediction' (hard labels) and 'prob'
# (predicted probability) columns and to support `.values`; no visible cell adds
# those columns — confirm where they are created.
y_true = test['team1_win']
y_hat = test['prediction']

pd.Series(
    {
        'accuracy': accuracy_score(y_true, y_hat),
        'precision': precision_score(y_true, y_hat),
        'recall': recall_score(y_true, y_hat),
        'f1': f1_score(y_true, y_hat),
        'log_loss': log_loss(y_true.values, test['prob'].values, labels=[0,1]),
    },
    name='test_metrics',
)

In [None]:
# Feature importance
# Set the default figure size BEFORE plotting: rcParams only affect figures created
# afterwards, so configuring them after `.plot(...)` leaves this chart at the old size.
sns.set(rc = {'figure.figsize':(14,14)})

# Importances reported by the fitted model, labelled with the training column names
feat_importances = pd.Series(xgb_model.feature_importances_, index= X.columns)

# Horizontal bar chart of (up to) the 54 largest importances
feat_importances.nlargest(54).plot(kind='barh')
plt.title("Feature Importance of XGBoost Model", size = 14);

In [None]:
#Examples
# Season-grouped cross-validation: hold out one season per fold, train on the rest.
FEATURES = [
    #     "WinPercentage",
    #     "MedianScoreDiff",
    #     "ChalkSeed",
    #     "OppWinPercentage",
    #     "OppMedianScoreDiff",
    #     "OppChalkSeed",
    "WinPctDiff",
    "ChalkSeedDiff",
    #     "538rating",
    #     "538ratingOpp",
    "538rating_diff",
]
TARGET = "Win"


X = df_historic_tourney_features[FEATURES]
y = df_historic_tourney_features[TARGET]
groups = df_historic_tourney_features["Season"]
seasons = df_historic_tourney_features["Season"].unique()

# Setup cross-validation: as many folds as there are distinct seasons
gkf = GroupKFold(n_splits=df_historic_tourney_features["Season"].nunique())
cv_results = []  # per-fold accuracy
models = []      # fitted model for each fold

for train_index, test_index in gkf.split(X, y, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Prepare the model
    # NOTE(review): a regressor is fitted here and its raw output is both fed to
    # log_loss and thresholded at 0.5 below — confirm a classifier with
    # predict_proba is not what was intended.
    model = xgb.XGBRegressor(
        eval_metric="logloss",
        n_estimators=1_000,
        learning_rate=0.001,
    )

    # Read the held-out season off the fold itself. GroupKFold does not promise
    # to yield folds in the order of `seasons`, so indexing `seasons` with a
    # running fold counter can print the wrong season next to the scores.
    holdout_season = ", ".join(str(season) for season in groups.iloc[test_index].unique())
    print(f"Holdout Season: {holdout_season}")
    # Train the model, logging the eval metric on the held-out fold every 100 rounds
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    # Predict on the test set; log loss uses the raw scores, accuracy the 0.5 cut
    y_pred = model.predict(X_test)
    score_ll = log_loss(y_test, y_pred)
    y_pred = y_pred > 0.5
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    cv_results.append(accuracy)
    print(f"Season {holdout_season}: {accuracy} {score_ll}")
    models.append(model)
# Print the average accuracy across all folds
print("Average CV Accuracy:", np.mean(cv_results))