In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [3]:
models = []
param_grid = {
    "loss":["log_loss"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "squared_error"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }

In [4]:
window_sizes = [5,10,15,18,20]
for window in window_sizes:
    print('='*60)
    print(f'Training Gradient Boosting Model with window size={window} games')
    window_df = pd.read_excel(f'data/train_{window}.xlsx')
    
    X = window_df.drop(columns=['Target'])
    y = window_df['Target']
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X.drop(columns=['Date']))
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)
    
    grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_gbm_model = grid_search.best_estimator_
    
    y_pred = best_gbm_model.predict(X_test)
    lr_acc = accuracy_score(y_test, y_pred)
    models.append({'Window': window, 'Model': best_gbm_model, 'Scaler': scaler, 'Accuracy': lr_acc})
    
    print(f'Model Accuracy: {lr_acc}')
    print('Report:\n', classification_report(y_test, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
    print('='*60)

Training Gradient Boosting Model with window size=5 games
Model Accuracy: 0.5634408602150538
Report:
               precision    recall  f1-score   support

           0       0.56      0.60      0.58       232
           1       0.57      0.52      0.55       233

    accuracy                           0.56       465
   macro avg       0.56      0.56      0.56       465
weighted avg       0.56      0.56      0.56       465

Confusion Matrix:
 [[140  92]
 [111 122]]
Training Gradient Boosting Model with window size=10 games
Model Accuracy: 0.5954022988505747
Report:
               precision    recall  f1-score   support

           0       0.59      0.65      0.61       217
           1       0.61      0.55      0.57       218

    accuracy                           0.60       435
   macro avg       0.60      0.60      0.59       435
weighted avg       0.60      0.60      0.59       435

Confusion Matrix:
 [[140  77]
 [ 99 119]]
Training Gradient Boosting Model with window size=15 game

In [7]:
gb_models = pd.DataFrame(models).sort_values(by=['Accuracy'], ascending=False).reset_index(drop=True)
gb_models

Unnamed: 0,Window,Model,Scaler,Accuracy
0,20,"([DecisionTreeRegressor(max_depth=3, max_featu...",StandardScaler(),0.597333
1,10,([DecisionTreeRegressor(criterion='friedman_ms...,StandardScaler(),0.595402
2,15,"([DecisionTreeRegressor(max_depth=5, max_featu...",StandardScaler(),0.590123
3,5,"([DecisionTreeRegressor(max_depth=5, max_featu...",StandardScaler(),0.563441
4,18,"([DecisionTreeRegressor(max_depth=3, max_featu...",StandardScaler(),0.534884


In [6]:
import joblib

In [9]:
model = gb_models.iloc[0]['Model']
scaler = gb_models.iloc[0]['Scaler']
print('Best Model by Accuracy')
print(f'Window: {gb_models.iloc[0]["Window"]}')
print(f'Accuracy: {gb_models.iloc[0]["Accuracy"]}')

Best Model by Accuracy
Window: 20
Accuracy: 0.5973333333333334


In [10]:
file = 'TrainedModels/GB_Model.pkl'
scaler_file = 'Scalers/GB.bin'
joblib.dump(model, file)
joblib.dump(scaler, scaler_file, compress=True)

['Scalers/GB.bin']