Production (Importing, XGBoost Training, Fitting Current Data, Export)

In [42]:
# Import Libraries
import pandas as pd
import numpy as np

import h2o
from h2o.automl import H2OAutoML

# Location of the prepared training set, relative to this notebook
TRAIN_CSV_PATH = "../data/pipeline/train.csv"

# Load the training data and preview the first three rows
train = pd.read_csv(TRAIN_CSV_PATH)
train.head(n=3)

Unnamed: 0,game_id,season,team1_win,team1_score,team2_score,team1_pythag,elite sos_ratio,avg hgt_ratio,threes fg%_ratio,oppstlrate_ratio
0,2013-1388-1292,2013,1,67,54,0.564461,1.569254,1.012386,0.98939,0.982724
1,2019-1113-1385,2019,1,74,65,0.57019,0.927589,1.013223,0.935933,1.223296
2,2008-1338-1331,2008,1,82,63,0.665861,1.987082,0.9839,1.00811,0.8677


H2O Training

In [43]:
# Initialize H2O (connects to an already running local instance if there is one)
h2o.init()

# Convert the pandas DataFrame to an H2OFrame
train_h2o = h2o.H2OFrame(train)

# Encode the target as a factor so AutoML treats the task as classification
train_h2o['team1_win'] = train_h2o['team1_win'].asfactor()

# 80/20 train/test split (no separate validation frame: AutoML uses 5-fold CV).
# The training part gets its own name so the pandas `train` DataFrame is not
# overwritten by an H2OFrame and this cell can be re-run without reloading data.
train_split, test = train_h2o.split_frame(ratios = [0.8], seed = 42)

# Features are selected by position: the last five columns of the frame
feature_cols = train_split.columns[-5:]
target_col = 'team1_win'

# h2o.estimators.xgboost.H2OXGBoostEstimator.available()

# Initiate the H2O AutoML Model
aml = H2OAutoML(max_models = 20, seed = 42, verbosity = "info", nfolds = 5, balance_classes = True, stopping_metric = 'logloss')

# Train Model with Feature and Target Columns
aml.train(x = feature_cols, y = target_col, training_frame = train_split)
print('Training Columns:', feature_cols,'\n','Target Column: "team1_win"')
print('Training Size:', train_split.shape,'\n','Test Size:', test.shape)

Checking whether there is an H2O instance running at http://localhost:54321.

 connected.


0,1
H2O_cluster_uptime:,1 hour 3 mins
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,"7 days, 6 hours and 14 minutes"
H2O_cluster_name:,H2O_from_python_Peter_ba1wjq
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.882 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%

21:37:04.266: Project: AutoML_6_20240320_213704
21:37:04.267: Setting stopping tolerance adaptively based on the training frame: 0.03569153051241249
21:37:04.267: Build control seed: 42
21:37:04.267: training frame: Frame key: AutoML_6_20240320_213704_training_py_62_sid_b553    cols: 10    rows: 785  chunks: 1    size: 50024  checksum: 3105719761633264220
21:37:04.267: validation frame: NULL
21:37:04.267: leaderboard frame: NULL
21:37:04.267: blending frame: NULL
21:37:04.267: response column: team1_win
21:37:04.267: fold column: null
21:37:04.267: weights column: null
21:37:04.267: AutoML: XGBoost is not available; skipping it.
21:37:04.267: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {

In [35]:
# Show the top of the AutoML leaderboard (models ranked by the run's sort metric)
leaderboard = aml.leaderboard
leaderboard.head()

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GLM_1_AutoML_4_20240320_210844,0.760111,0.583501,0.742141,0.306772,0.446402,0.199274


In [25]:
# Predict On Test Data
# Best XGBoost model by logloss, if AutoML trained one. It is kept under its own
# name so it does not shadow the `xgb` module alias that later cells call
# (`xgb.XGBClassifier`, `xgb.XGBRegressor`).
# NOTE(review): the training log reports "XGBoost is not available; skipping it",
# in which case this lookup is expected to yield None - confirm before using it.
best_xgb = aml.get_best_model(algorithm="xgboost", criterion="logloss")

# Score the held-out frame with the leaderboard's top model
aml.leader.predict(test)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


predict,p0,p1
1,0.246822,0.753178
1,0.195632,0.804368
1,0.458614,0.541386
1,0.329869,0.670131
0,0.751234,0.248766
1,0.504744,0.495256
1,0.398822,0.601178
0,0.67546,0.32454
1,0.199387,0.800613
1,0.116601,0.883399


In [27]:
# Compute the leader model's metrics on the held-out frame and display them
leader_perf = aml.leader.model_performance(test_data=test)
leader_perf

Unnamed: 0,0,1,Error,Rate
0,10.0,37.0,0.7872,(37.0/47.0)
1,2.0,44.0,0.0435,(2.0/46.0)
Total,12.0,81.0,0.4194,(39.0/93.0)

metric,threshold,value,idx
max f1,0.2427183,0.6929134,80.0
max f2,0.1888745,0.8395522,83.0
max f0point5,0.6676081,0.7,25.0
max accuracy,0.6676081,0.6774194,25.0
max precision,0.9115203,1.0,0.0
max recall,0.107564,1.0,91.0
max specificity,0.9115203,1.0,0.0
max absolute_mcc,0.6676081,0.3900713,25.0
max min_per_class_accuracy,0.5205543,0.6382979,46.0
max mean_per_class_accuracy,0.6578838,0.6755319,29.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0107527,0.9073872,2.0217391,2.0217391,1.0,0.9115203,1.0,0.9115203,0.0217391,0.0217391,102.173913,102.173913,0.0217391
2,0.0215054,0.899045,2.0217391,2.0217391,1.0,0.9070278,1.0,0.909274,0.0217391,0.0434783,102.173913,102.173913,0.0434783
3,0.0322581,0.8867891,2.0217391,2.0217391,1.0,0.8975245,1.0,0.9053575,0.0217391,0.0652174,102.173913,102.173913,0.0652174
4,0.0430108,0.8737018,2.0217391,2.0217391,1.0,0.8833989,1.0,0.8998679,0.0217391,0.0869565,102.173913,102.173913,0.0869565
5,0.0537634,0.8608238,2.0217391,2.0217391,1.0,0.8691384,1.0,0.893722,0.0217391,0.1086957,102.173913,102.173913,0.1086957
6,0.1075269,0.8170039,1.6173913,1.8195652,0.8,0.8390679,0.9,0.8663949,0.0869565,0.1956522,61.7391304,81.9565217,0.1743756
7,0.1505376,0.801306,1.5163043,1.7329193,0.75,0.8084691,0.8571429,0.8498447,0.0652174,0.2608696,51.6304348,73.2919255,0.2183164
8,0.2043011,0.768955,1.2130435,1.5961098,0.6,0.7889609,0.7894737,0.8338226,0.0652174,0.326087,21.3043478,59.610984,0.2409806
9,0.3010753,0.6610961,1.3478261,1.5163043,0.6666667,0.7013591,0.75,0.7912451,0.1304348,0.4565217,34.7826087,51.6304348,0.3075856
10,0.3978495,0.6014587,0.8985507,1.36604,0.4444444,0.6277057,0.6756757,0.7514652,0.0869565,0.5434783,-10.1449275,36.6039953,0.2881591


In [None]:
# View Feature Importance
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])

# Prefer an XGBoost model; when AutoML trained none (e.g. XGBoost is unavailable
# on this platform) fall back to the best-ranked model that is not a stacked
# ensemble instead of failing with an IndexError on an empty list.
xgboost_ids = [mid for mid in model_ids if "XGBoost" in mid]
# NOTE(review): stacked ensembles are skipped on the assumption that they do not
# expose variable importances - confirm against the H2O documentation.
fallback_ids = [mid for mid in model_ids if "StackedEnsemble" not in mid]
varimp_candidates = xgboost_ids or fallback_ids

if varimp_candidates:
    out = h2o.get_model(varimp_candidates[0])
    out.varimp_plot()
else:
    print("No model with variable importances found on the leaderboard.")

In [None]:
# Assign Best Model
# NOTE(review): the model lives in the H2O cluster's memory, so `cbb_model` can
# presumably no longer score data once the cluster below is shut down. Export it
# first (e.g. with h2o.save_model) if it is needed after this cell - confirm.
cbb_model = aml.leader

# Shutdown H2O (h2o.shutdown() is deprecated in favour of the cluster object)
h2o.cluster().shutdown()

Previous Models

In [None]:
#Assign Model
# model_t = XGBClassifier(n_estimators= 70, max_depth=4, eta = .05, subsample = .9, colsample_bytree = .8)
#Utilize Cross Validation To Further Enhance Model
# cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=1)
# scores = cross_val_score(model_t, X_train, y_train, cv=cv, scoring = 'neg_log_loss', n_jobs = -1, error_score = 'raise')
# scores = abs(scores)
# print('Mean Log Loss: %.3f (%.3f)' % (scores.mean(), scores.std()) )

In [None]:
# Hyperparameters of the gradient-boosted binary classifier
xgb_params = dict(
    objective="binary:logistic",
    random_state=42,
    eta=.04,
    max_depth=6,
    min_child_weight=3,
    n_estimators=100,
    gamma=.6,
    reg_lambda=.2,
    subsample=1,
    colsample_bytree=.99,
)

# Build the (unfitted) model
# NOTE(review): relies on `xgb` being the xgboost module; this notebook never
# imports it, and an earlier cell assigns another object to `xgb`.
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit with early stopping on log loss, monitored on the (x, y) evaluation pair.
# NOTE(review): training uses upper-case `X, Y` while the evaluation set uses
# lower-case `x, y`; none of them is defined in this notebook - confirm which
# data is intended. Recent XGBoost releases also expect `early_stopping_rounds`
# and `eval_metric` on the constructor rather than on fit().
xgb_model.fit(
    X,
    Y,
    early_stopping_rounds=5,
    eval_metric='logloss',
    eval_set=[(x, y)],
)

In [None]:
# Refit with the hyperparameters settled on after the cross-validation tests,
# trading a little performance for less overfitting
model_t = XGBClassifier(
    n_estimators=40,
    max_depth=4,
    eta=.05,
    subsample=.8,
    colsample_bytree=.9,
)

# Header for the per-round evaluation log; column order matches the eval sets below
print("                 Test Set                        Train Set")

# Track the metric on test and train side by side to spot overfitting
monitoring_sets = [(X_test, y_test), (X_train, y_train)]
model_t.fit(X_train, y_train, eval_set=monitoring_sets)

In [None]:
#Hyperparameter Tuning
# Every entry currently lists a single value, so the "grid" holds exactly one
# candidate; add values to a list to actually search over that parameter.
param_grid = {'eta': [.0035],
              'objective':['binary:logistic'],
              'max_depth': [6],
              'min_child_weight': [1],
              'n_estimators': [8],
              'gamma': [.44],
              'reg_lambda' : [.55],
              'subsample': [1],
              'colsample_bytree': [.5]}

# Use scikit-learn's built-in negated log-loss scorer. It is the same scorer the
# previous make_scorer(log_loss, greater_is_better=False, needs_proba=True) call
# built by hand, but it does not depend on the `needs_proba` argument, which
# recent scikit-learn releases deprecate.
LogLoss = 'neg_log_loss'
optimal_params = GridSearchCV(xgb_model, param_grid, n_jobs = 4, scoring = LogLoss, verbose = 0, cv = 3)

# Run the 3-fold search; the fitted GridSearchCV object is the cell's output
optimal_params.fit(X, Y)

In [None]:
#Classification Metrics
# A notebook cell only displays its last expression, so the scores are gathered
# into one Series instead of being computed and silently discarded.
# NOTE(review): assumes `test` is a pandas DataFrame with 'team1_win',
# 'prediction' and 'prob' columns - the H2O cells above bind `test` to an
# H2OFrame, so confirm which object is in scope before running.
actual = test['team1_win']
predicted = test['prediction']

pd.Series(
    {
        'accuracy': accuracy_score(actual, predicted),
        'precision': precision_score(actual, predicted),
        'recall': recall_score(actual, predicted),
        'f1': f1_score(actual, predicted),
        'log_loss': log_loss(actual.values, test['prob'].values, labels=[0,1]),
    },
    name='score',
)

In [None]:
#Feature Importance
# Configure the figure size BEFORE plotting: rc settings only affect figures
# created afterwards, so setting it after the plot left this chart at the
# default size.
sns.set(rc = {'figure.figsize':(14,14)})

# Pair each importance with its feature name
feat_importances = pd.Series(xgb_model.feature_importances_, index= X.columns)

# Horizontal bar chart of the (up to) 54 largest importances
ax = feat_importances.nlargest(54).plot(kind='barh')
ax.set_title("Feature Importance of XGBoost Model", size = 14);

In [None]:
#Examples
# Leave-one-season-out cross-validation of an XGBoost model on tournament features.
FEATURES = [
    #     "WinPercentage",
    #     "MedianScoreDiff",
    #     "ChalkSeed",
    #     "OppWinPercentage",
    #     "OppMedianScoreDiff",
    #     "OppChalkSeed",
    "WinPctDiff",
    "ChalkSeedDiff",
    #     "538rating",
    #     "538ratingOpp",
    "538rating_diff",
]
TARGET = "Win"


X = df_historic_tourney_features[FEATURES]
y = df_historic_tourney_features[TARGET]
groups = df_historic_tourney_features["Season"]
seasons = df_historic_tourney_features["Season"].unique()

# Setup cross-validation: one fold per season, so each fold holds out one season
gkf = GroupKFold(n_splits=df_historic_tourney_features["Season"].nunique())
cv_results = []
models = []

for train_index, test_index in gkf.split(X, y, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Read the held-out season from the fold's own rows. GroupKFold does not
    # yield folds in the order of `seasons`, so indexing `seasons` with a running
    # counter could label a fold with the wrong season.
    holdout_season = groups.iloc[test_index].iloc[0]

    # Prepare the model
    # NOTE(review): a regressor is scored with log loss here; its raw outputs are
    # not guaranteed to be probabilities in [0, 1] - consider a classifier.
    model = xgb.XGBRegressor(
        eval_metric="logloss",
        n_estimators=1_000,
        learning_rate=0.001,
    )
    print(f"Holdout Season: {holdout_season}")
    # Train the model, logging the hold-out metric every 100 rounds
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    # Predict on the test set
    y_pred = model.predict(X_test)
    score_ll = log_loss(y_test, y_pred)
    # Threshold the scores at 0.5 to obtain hard win/loss labels
    y_pred = y_pred > 0.5
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    cv_results.append(accuracy)
    print(f"Season {holdout_season}: {accuracy} {score_ll}")
    models.append(model)
# Print the average accuracy across all folds
print("Average CV Accuracy:", np.mean(cv_results))