In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, log_loss

In [3]:
df = pd.read_csv('../data/replay_data/transformed_replay_data_win_probability/dsk_200k_games.csv')

In [4]:
# data prep
data = df
X = data.drop(columns=['game_id', 'won'])
y = data['won']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [5]:
X_test

Unnamed: 0,turn,on_play,user_hand_1,user_hand_2,user_hand_3,user_hand_4,user_hand_5,user_hand_6,user_hand_7,user_hand_8,...,oppo_non_creatures_12,oppo_non_creatures_13,oppo_non_creatures_14,oppo_non_creatures_15,oppo_non_creatures_16,oppo_non_creatures_17,oppo_non_creatures_18,oppo_non_creatures_19,oppo_non_creatures_20,oppo_life
1367219,3,True,0.563,,,0.557,0.548,,,,...,,,,,,,,,,18.0
1514912,4,True,,0.536,0.520,,,,,,...,,,,,,,,,,18.0
784759,5,True,,0.523,0.498,0.557,,,,,...,,,,,,,,,,21.0
133900,2,False,,0.558,0.538,0.570,0.570,,,,...,,,,,,,,,,20.0
865962,4,True,,0.542,0.562,0.562,,,,,...,,,,,,,,,,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459648,12,False,,,,,,,,,...,,,,,,,,,,9.0
1253982,3,True,0.547,0.548,0.562,0.536,0.548,,,,...,,,,,,,,,,22.0
1299045,7,True,0.565,0.537,0.559,0.538,0.555,,,,...,,,,,,,,,,14.0
478113,9,False,,,,,,,,,...,,,,,,,,,,17.0


In [4]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, log_loss

# Simplified parameter grid
param_grid = {
    'n_estimators': [50, 100],      # Number of trees
    'learning_rate': [0.1],         # Fixed learning rate for simplicity
    'max_depth': [4, 6],            # Test a smaller range of tree depths
    'subsample': [0.8],             # Fixed subsample fraction
    'colsample_bytree': [0.8]       # Fixed feature fraction
}

# Initialize the model
xgb_model = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',  # Use AUC as the evaluation metric
    cv=3,               # 3-fold cross-validation
    verbose=1,          # Show minimal progress during search
    n_jobs=-1           # Use all available CPU cores
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
y_predict_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
auc_score = roc_auc_score(y_test, y_predict_proba)
log_loss_score = log_loss(y_test, y_predict_proba)

print(f"AUC: {auc_score:.4f}")
print(f"Log Loss: {log_loss_score:.4f}")


Fitting 3 folds for each of 4 candidates, totalling 12 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}
AUC: 0.7457
Log Loss: 0.5865


In [9]:
import pickle

# Save the trained model to a file
model_filename = '../models/win_probability/xgb_win_prob_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(best_model, file)

print(f"Model saved to {model_filename}")


Model saved to ../models/win_probability/xgb_win_prob_model.pkl
