In [1]:
import os
import numpy as np
import pandas as pd
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import ShuffleSplit, GridSearchCV

pd.set_option("display.max_rows", None, "display.max_columns", None)

## Getting the data ready

In [2]:
def get_datasets(span = 5):
    path = os.path.abspath(f'../../data/{span}span_training_set.csv')
    training_df = pd.read_csv(path)

    path = os.path.abspath(f'../../data/{span}span_testing_set.csv')
    testing_df = pd.read_csv(path)

    train_true, test_true = training_df.pop('Win'), testing_df.pop('Win')

    print(f'{len(training_df)} train examples')
    print(f'{len(testing_df)} test examples')

    return training_df, testing_df, train_true, test_true

## Training the model

In [21]:
def train_model(estimator, param_grid, training_df, train_true, n_splits = 5):
    # Create the grid search object
    grid_search = GridSearchCV(estimator, param_grid, cv=ShuffleSplit(n_splits), verbose=5)

    # Fit the grid search to the data
    grid_search.fit(training_df, train_true)

    # Get the best hyperparameters
    best_params = grid_search.best_params_
    print("Best hyperparameters:", best_params)

    return grid_search.best_estimator_

## Testing the model

In [4]:
def test_model(clf, testing_df, test_true):
    y_pred = clf.predict(testing_df)

    accuracy = accuracy_score(test_true, y_pred)
    print(f"Accuracy: {(accuracy*100):.2f}")

    print("\nClassification Report:")
    print(classification_report(test_true, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(test_true, y_pred))

## Saving the models

In [5]:
def save_model(model, filename):
    path = os.path.abspath(f'../model/mens/{filename}')
    pickle.dump(model, open(path, 'wb'))

## Constant Variables

In [6]:
SPANS = [3, 5, 7]

In [7]:
def train_test_save(estimator, param_grid, filename, spans = [3, 5, 7], n_splits = 5):
    for span in spans:
        print(f'----- Span: {span} -----')
        
        training_df, testing_df, train_true, test_true = get_datasets(span)
        clf = train_model(estimator, param_grid, training_df, train_true, n_splits)
        test_model(clf, testing_df, test_true)
        save_model(clf, f'{span}span_{filename}')

# Logistic Regression

In [8]:
# Define the Logistic Regression model
logreg_model = LogisticRegression()

# Define the hyperparameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l2'],  # Vary the regularization type
    'solver': ['lbfgs', 'newton-cholesky'],
    'max_iter': [int(10e10)]
}

filename = 'logistic_regression_model.pkl'

train_test_save(logreg_model, param_grid, filename)

----- Span: 3 -----
14373 train examples
6160 test examples
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.713 total time=   1.3s
[CV 2/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.671 total time=   1.6s
[CV 3/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.693 total time=   1.1s
[CV 4/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.673 total time=   1.5s
[CV 5/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.688 total time=   1.1s
[CV 1/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.713 total time=   0.0s
[CV 2/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.669 total time=   0.0s
[CV 3/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.693 total time=   0.0s
[CV 4/5] END C=0.001, max

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=1, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.686 total time=  24.8s
[CV 3/5] END C=1, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.686 total time=  22.2s
[CV 4/5] END C=1, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.666 total time=  20.0s
[CV 5/5] END C=1, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.686 total time=  21.5s
[CV 1/5] END C=1, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.714 total time=   0.0s
[CV 2/5] END C=1, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.687 total time=   0.0s
[CV 3/5] END C=1, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.684 total time=   0.0s
[CV 4/5] END C=1, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.669 total time=   0.0s
[CV 5/5] END C=1, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.687 total time=   0.0s
[CV 1/5] END C=10, max_iter=100000000000, penalty=l2, solve

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.687 total time=  26.2s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.684 total time=  27.5s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.666 total time=  25.5s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.688 total time=  26.4s
[CV 1/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.716 total time=   0.0s
[CV 2/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.688 total time=   0.0s
[CV 3/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.681 total time=   0.0s
[CV 4/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.669 total time=   0.0s
[CV 5/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.688 total time=   0.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.717 total time=  26.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.690 total time=  25.7s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.685 total time=  26.2s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.668 total time=  26.1s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.690 total time=  26.0s
[CV 1/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.711 total time=   0.0s
[CV 2/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.683 total time=   0.0s
[CV 3/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.679 total time=   0.0s
[CV 4/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.670 total time=   0.0s
[CV 5/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.684 total time=   0.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.718 total time=  25.5s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.686 total time=  25.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.687 total time=  25.8s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.666 total time=  26.1s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.687 total time=  25.3s
[CV 1/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.708 total time=   0.0s
[CV 2/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.684 total time=   0.0s
[CV 3/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.676 total time=   0.0s
[CV 4/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.668 total time=   0.0s
[CV 5/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.684 total time=   0.0s
Best hyperparameters: {'C': 0.01, 'max_iter': 100000000000, 'penalty': 'l2', 'solver': 'newton-cholesky'}
Accuracy: 68.25

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.48      0.55      2507
           1       0.70      0.82      0.75      3653

    accuracy                

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.666 total time=  19.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.681 total time=  20.2s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.673 total time=  20.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.676 total time=  20.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.687 total time=  19.4s
[CV 1/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.664 total time=   0.0s
[CV 2/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.681 total time=   0.0s
[CV 3/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.669 total time=   0.0s
[CV 4/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.676 total time=   0.0s
[CV 5/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.687 total time=   0.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.665 total time=  19.7s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.679 total time=  19.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.671 total time=  19.7s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.677 total time=  21.1s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.689 total time=  21.9s
[CV 1/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.669 total time=   0.0s
[CV 2/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.679 total time=   0.0s
[CV 3/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.676 total time=   0.0s
[CV 4/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.678 total time=   0.0s
[CV 5/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.686 total time=   0.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.664 total time=  22.6s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.683 total time=  23.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.673 total time=  22.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.682 total time=  21.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.690 total time=  21.2s
[CV 1/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.667 total time=   0.0s
[CV 2/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.674 total time=   0.0s
[CV 3/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.673 total time=   0.0s
[CV 4/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.679 total time=   0.0s
[CV 5/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.683 total time=   0.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best hyperparameters: {'C': 1000, 'max_iter': 100000000000, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 67.75

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.51      0.55      2224
           1       0.71      0.79      0.75      3388

    accuracy                           0.68      5612
   macro avg       0.66      0.65      0.65      5612
weighted avg       0.67      0.68      0.67      5612


Confusion Matrix:
[[1128 1096]
 [ 714 2674]]
----- Span: 7 -----
11907 train examples
5104 test examples
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.680 total time=   0.8s
[CV 2/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.672 total time=   0.8s
[CV 3/5] END C=0.001, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.678 total time=   0.9s
[CV 4/5] END C=0.001, max_iter=100000000000, penalty=l2, solv

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.673 total time=  16.4s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.683 total time=  16.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.683 total time=  18.1s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.694 total time=  19.3s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=10, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.680 total time=  19.3s
[CV 1/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.674 total time=   0.0s
[CV 2/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.682 total time=   0.0s
[CV 3/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.680 total time=   0.0s
[CV 4/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.694 total time=   0.0s
[CV 5/5] END C=10, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.679 total time=   0.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.674 total time=  19.8s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.685 total time=  20.2s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.683 total time=  19.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.695 total time=  18.9s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=100, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.678 total time=  16.3s
[CV 1/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.677 total time=   0.0s
[CV 2/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.678 total time=   0.0s
[CV 3/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.681 total time=   0.0s
[CV 4/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.696 total time=   0.0s
[CV 5/5] END C=100, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.677 total time=   0.0s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.673 total time=  16.5s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.684 total time=  17.6s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.687 total time=  17.7s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.693 total time=  18.4s


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=1000, max_iter=100000000000, penalty=l2, solver=lbfgs;, score=0.677 total time=  19.2s
[CV 1/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.677 total time=   0.0s
[CV 2/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.678 total time=   0.0s
[CV 3/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.679 total time=   0.0s
[CV 4/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.696 total time=   0.0s
[CV 5/5] END C=1000, max_iter=100000000000, penalty=l2, solver=newton-cholesky;, score=0.673 total time=   0.0s
Best hyperparameters: {'C': 0.01, 'max_iter': 100000000000, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 69.02

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.51      0.57      2060
           1       0.71      0.81      0.76      3044

    accuracy                          

# Support Vector Machine

In [8]:
# Define the SVM model
svm_model = SVC()

# Define the hyperparameter grid
# param_grid = {
#     'C': [0.01, 0.1, 1, 10],
#     'kernel': ['linear', 'rbf', 'sigmoid'],
#     'gamma': [0.1, 0.01, 0.001],
#     # 'probability': [True]
# }

param_grid = {
    'C': [0.01],
    'kernel': ['linear'],
    'gamma': [0.1],
    'probability': [True]
}

filename = 'support_vector_machine_model.pkl'

train_test_save(svm_model, param_grid, filename, n_splits=1)

----- Span: 3 -----
14373 train examples
6160 test examples
Fitting 1 folds for each of 1 candidates, totalling 1 fits
[CV 1/1] END C=0.01, gamma=0.1, kernel=linear, probability=True;, score=0.670 total time= 2.6min
Best hyperparameters: {'C': 0.01, 'gamma': 0.1, 'kernel': 'linear', 'probability': True}
Accuracy: 68.02

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.47      0.55      2507
           1       0.69      0.82      0.75      3653

    accuracy                           0.68      6160
   macro avg       0.67      0.65      0.65      6160
weighted avg       0.67      0.68      0.67      6160


Confusion Matrix:
[[1186 1321]
 [ 649 3004]]
----- Span: 5 -----
13093 train examples
5612 test examples
Fitting 1 folds for each of 1 candidates, totalling 1 fits
[CV 1/1] END C=0.01, gamma=0.1, kernel=linear, probability=True;, score=0.669 total time= 1.9min
Best hyperparameters: {'C': 0.01, 'gamma': 0.1, 'kernel': 'linear'

# K-Nearest Neighbors (KNN)

In [23]:
# Define the KNN model
knn = KNeighborsClassifier()

# Define the hyperparameter grid
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 10],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weighting scheme
    'p': [1, 2],  # Distance metric (1 for Manhattan, 2 for Euclidean)
    'leaf_size': [15, 30, 45],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

filename = 'knn_model.pkl'

train_test_save(knn, param_grid, filename, n_splits=1)

----- Span: 3 -----
14373 train examples
6160 test examples
Fitting 1 folds for each of 240 candidates, totalling 240 fits
[CV 1/1] END algorithm=auto, leaf_size=15, n_neighbors=1, p=1, weights=uniform;, score=0.583 total time=   0.4s
[CV 1/1] END algorithm=auto, leaf_size=15, n_neighbors=1, p=1, weights=distance;, score=0.583 total time=   0.4s
[CV 1/1] END algorithm=auto, leaf_size=15, n_neighbors=1, p=2, weights=uniform;, score=0.552 total time=   0.0s
[CV 1/1] END algorithm=auto, leaf_size=15, n_neighbors=1, p=2, weights=distance;, score=0.552 total time=   0.0s
[CV 1/1] END algorithm=auto, leaf_size=15, n_neighbors=3, p=1, weights=uniform;, score=0.601 total time=   0.4s
[CV 1/1] END algorithm=auto, leaf_size=15, n_neighbors=3, p=1, weights=distance;, score=0.601 total time=   0.4s
[CV 1/1] END algorithm=auto, leaf_size=15, n_neighbors=3, p=2, weights=uniform;, score=0.581 total time=   0.1s
[CV 1/1] END algorithm=auto, leaf_size=15, n_neighbors=3, p=2, weights=distance;, score=0.

# Random Forests

In [24]:
# Define the Random Forest model
rfc = RandomForestClassifier()

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 500],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],  # Maximum number of features considered for splitting
    'max_depth': [4, 5, 6, 7, 8, None],  # Maximum depth of each tree
    'criterion': ['gini', 'entropy', 'log_loss']  # Split quality criterion
}

filename = 'random_forest.pkl'

train_test_save(rfc, param_grid, filename, n_splits=1)

----- Span: 3 -----
14373 train examples
6160 test examples
Fitting 1 folds for each of 162 candidates, totalling 162 fits
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=500;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=100;, score=0.665 total time=   3.5s
[CV 1/1] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=200;, score=0.663 total time=   7.0s
[CV 1/1] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=500;, score=0.665 total time=  17.7s
[CV 1/1] END criterion=gini, max_depth=4, max_features=log2, n_estimators=100;, score=0.650 total time=   1.9s
[CV 1/1] END criterion=gini, max_depth=4, max_features=log2, n_estimators=200;, score=0.654 total time=   

54 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\utils\_param_vali

Best hyperparameters: {'criterion': 'log_loss', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 100}
Accuracy: 66.74

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.38      0.48      2507
           1       0.67      0.87      0.76      3653

    accuracy                           0.67      6160
   macro avg       0.66      0.62      0.62      6160
weighted avg       0.67      0.67      0.64      6160


Confusion Matrix:
[[ 946 1561]
 [ 488 3165]]
----- Span: 5 -----
13093 train examples
5612 test examples
Fitting 1 folds for each of 162 candidates, totalling 162 fits
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=500;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini,

54 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\utils\_param_vali

Best hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 500}
Accuracy: 67.59

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.45      0.52      2224
           1       0.70      0.82      0.75      3388

    accuracy                           0.68      5612
   macro avg       0.66      0.64      0.64      5612
weighted avg       0.67      0.68      0.66      5612


Confusion Matrix:
[[1002 1222]
 [ 597 2791]]
----- Span: 7 -----
11907 train examples
5104 test examples
Fitting 1 folds for each of 162 candidates, totalling 162 fits
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini, max_depth=4, max_features=auto, n_estimators=500;, score=nan total time=   0.0s
[CV 1/1] END criterion=gini, 

54 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\utils\_param_vali

Best hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 500}
Accuracy: 68.30

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.47      0.55      2060
           1       0.70      0.82      0.76      3044

    accuracy                           0.68      5104
   macro avg       0.67      0.65      0.65      5104
weighted avg       0.68      0.68      0.67      5104


Confusion Matrix:
[[ 976 1084]
 [ 534 2510]]


# Gradient Boosting

In [25]:
# Define the Random Forest model
gbc = GradientBoostingClassifier()

# Define the hyperparameter grid
param_grid = {
    'loss': ['log_loss', 'deviance', 'exponential'],  # Loss function
    'learning_rate': [0.01, 0.05, 0.1],  # Learning rate (eta)
    'max_depth': [3, 5, 8],  # Maximum depth of each tree
    'max_features': ['log2', 'sqrt'],  # Maximum number of features considered for splitting
    'n_estimators': [100, 200, 300]  # Number of trees
}

filename = 'gradient_boosting.pkl'

train_test_save(gbc, param_grid, filename, n_splits=1)

----- Span: 3 -----
14373 train examples
6160 test examples
Fitting 1 folds for each of 162 candidates, totalling 162 fits
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=log2, n_estimators=100;, score=0.619 total time=   2.1s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=log2, n_estimators=200;, score=0.644 total time=   4.4s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=log2, n_estimators=300;, score=0.656 total time=   6.6s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=sqrt, n_estimators=100;, score=0.634 total time=   3.8s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=sqrt, n_estimators=200;, score=0.656 total time=   7.7s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=sqrt, n_estimators=300;, score=0.661 total time=  11.7s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=5, max_features=log2, n_estimators=100;

54 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\utils\_param_vali

Best hyperparameters: {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 200}
Accuracy: 67.45

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.49      0.55      2507
           1       0.70      0.80      0.74      3653

    accuracy                           0.67      6160
   macro avg       0.66      0.65      0.65      6160
weighted avg       0.67      0.67      0.67      6160


Confusion Matrix:
[[1238 1269]
 [ 736 2917]]
----- Span: 5 -----
13093 train examples
5612 test examples
Fitting 1 folds for each of 162 candidates, totalling 162 fits
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=log2, n_estimators=100;, score=0.618 total time=   2.1s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=log2, n_estimators=200;, score=0.662 total time=   4.2s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=l

54 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\utils\_param_vali

Best hyperparameters: {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 300}
Accuracy: 66.39

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.49      0.53      2224
           1       0.70      0.78      0.74      3388

    accuracy                           0.66      5612
   macro avg       0.65      0.63      0.64      5612
weighted avg       0.66      0.66      0.66      5612


Confusion Matrix:
[[1082 1142]
 [ 744 2644]]
----- Span: 7 -----
11907 train examples
5104 test examples
Fitting 1 folds for each of 162 candidates, totalling 162 fits
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=log2, n_estimators=100;, score=0.631 total time=   1.9s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=log2, n_estimators=200;, score=0.676 total time=   3.8s
[CV 1/1] END learning_rate=0.01, loss=log_loss, max_depth=3, max_features=l

54 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\mtayl\anaconda3\envs\mlmb-model-env\Lib\site-packages\sklearn\utils\_param_vali

Best hyperparameters: {'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 100}
Accuracy: 68.01

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.50      0.56      2060
           1       0.70      0.80      0.75      3044

    accuracy                           0.68      5104
   macro avg       0.67      0.65      0.65      5104
weighted avg       0.67      0.68      0.67      5104


Confusion Matrix:
[[1033 1027]
 [ 606 2438]]


# Multilayer Perceptron

In [26]:
# Define the MLP model
mlp_model = MLPClassifier()

# Define the hyperparameter grid
param_grid = {
    'hidden_layer_sizes': [(12,), (23,), (45,), (91,), (181,)],  # Vary the number of neurons in the hidden layer
    'activation': ['relu', 'identity', 'logistic', 'tanh'],  # Vary the activation function
    'solver': ['adam', 'sgd', 'adam'],  # Vary the solver
    'alpha': [0.0001, 0.001],  # Vary the L2 regularization strength
}

filename = 'multilayer_perceptron.pkl'

train_test_save(mlp_model, param_grid, filename, n_splits=1)

----- Span: 3 -----
14373 train examples
6160 test examples
Fitting 1 folds for each of 120 candidates, totalling 120 fits
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(12,), solver=adam;, score=0.660 total time=   1.9s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(12,), solver=sgd;, score=0.698 total time=   3.4s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(12,), solver=adam;, score=0.689 total time=   2.9s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(23,), solver=adam;, score=0.707 total time=   2.1s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(23,), solver=sgd;, score=0.679 total time=   4.3s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(23,), solver=adam;, score=0.706 total time=   2.3s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(45,), solver=adam;, score=0.608 total time=   1.4s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(45,), solv

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
# retrain 7 span
train_test_save(mlp_model, param_grid, filename, n_splits=1, spans=[7])

----- Span: 7 -----
11907 train examples
5104 test examples
Fitting 1 folds for each of 120 candidates, totalling 120 fits
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(12,), solver=adam;, score=0.667 total time=   2.5s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(12,), solver=sgd;, score=0.594 total time=   0.6s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(12,), solver=adam;, score=0.663 total time=   2.9s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(23,), solver=adam;, score=0.688 total time=   1.6s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(23,), solver=sgd;, score=0.682 total time=   3.2s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(23,), solver=adam;, score=0.676 total time=   1.3s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(45,), solver=adam;, score=0.664 total time=   1.1s
[CV 1/1] END activation=relu, alpha=0.0001, hidden_layer_sizes=(45,), solv

# Server testing

In [74]:
import joblib

mens_model_dir, womens_model_dir = os.path.join("../model/mens/"), os.path.join("../model/womens/")
mens_filenames, womens_filenames = [filename for filename in os.listdir(mens_model_dir)], [filename for filename in os.listdir(womens_model_dir)]
mens_models, womens_models = {filename.split('.pkl')[0]: joblib.load(f'{mens_model_dir}{filename}') for filename in mens_filenames}, {filename.split('.pkl')[0]: joblib.load(f'{womens_model_dir}{filename}') for filename in womens_filenames}
print(mens_filenames)

payload = [{
    'model': '3span_knn_model',
    'isWomens': False,
    'team1': [
          27.33333333333333,
          26.393939393939394,
          26.58692363323644,
          58.0,
          56.90909090909091,
          56.277424356201664,
          0.4713333333333332,
          0.4665757575757576,
          0.4721426885365508,
          10.333333333333334,
          7.090909090909091,
          10.582568880636243,
          23.666666666666668,
          20.03030303030303,
          22.62090914323926,
          0.4343333333333333,
          0.3542424242424242,
          0.4643407699386589,
          10.666666666666666,
          14.787878787878787,
          11.464159086113796,
          14.333333333333334,
          19.51515151515152,
          16.123824953334406,
          0.757,
          0.7663333333333334,
          0.7228634019719903,
          6.666666666666667,
          8.575757575757576,
          6.452909055864438,
          29.0,
          30.939393939393938,
          30.71948037599213,
          16.666666666666668,
          12.424242424242424,
          16.448066715849563,
          7.333333333333333,
          6.151515151515151,
          6.840419095475227,
          1.6666666666666667,
          1.6666666666666667,
          2.218273723265156,
          9.666666666666666,
          9.424242424242424,
          9.69409596431069,
          17.666666666666668,
          15.969696969696969,
          17.0706839885097,
          113.7,
          111.4060606060606,
          113.3470722494647,
          97.56666666666666,
          102.3939393939394,
          95.66693762349897,
          66.53333333333335,
          66.7,
          66.28398357145488,
          0.2516666666666666,
          0.3579393939393939,
          0.2917529405199457,
          0.409,
          0.3535151515151515,
          0.4035162461462896,
          0.585,
          0.5660000000000001,
          0.5887726908712647,
          51.7,
          53.23030303030303,
          53.63034644359723,
          60.03333333333333,
          46.83030303030304,
          61.26416795952245,
          11.0,
          9.190909090909091,
          10.294132525194437,
          4.966666666666668,
          4.578787878787878,
          6.817281067278236,
          0.5613333333333334,
          0.5293030303030303,
          0.5664465686886107,
          12.933333333333332,
          12.412121212121212,
          13.108024106733502,
          26.066666666666663,
          29.1,
          24.82206351116765,
          0.186,
          0.272030303030303,
          0.2067422204576433
        ],
    'team2': [
          25.666666666666668,
          28.6875,
          24.59661942813545,
          55.333333333333336,
          58.625,
          54.18251581070945,
          0.4630000000000001,
          0.4889375,
          0.4534812509915791,
          7.666666666666667,
          8.4375,
          6.992306689731777,
          16.0,
          20.5625,
          16.076059906743467,
          0.4733333333333333,
          0.406125,
          0.4282937919711694,
          15.0,
          17.84375,
          16.7495296751149,
          20.666666666666668,
          24.78125,
          23.68396619334817,
          0.7303333333333333,
          0.7280625000000001,
          0.7151780618028716,
          9.0,
          11.09375,
          8.591873774304986,
          31.33333333333333,
          37.65625,
          31.69479167787358,
          20.0,
          18.40625,
          19.331843585707247,
          5.666666666666667,
          5.75,
          6.041198720224202,
          2.6666666666666665,
          3.75,
          2.86770398914814,
          10.666666666666666,
          10.84375,
          10.759370770305395,
          17.0,
          14.28125,
          16.808736738283187,
          110.06666666666666,
          118.73125,
          108.0258174452465,
          100.63333333333333,
          99.56875,
          98.5662613492459,
          67.3,
          69.8875,
          67.62328939689323,
          0.3793333333333333,
          0.42921875,
          0.4423435214064084,
          0.2903333333333334,
          0.35075,
          0.2971660725474357,
          0.5686666666666667,
          0.5944375,
          0.5581987138292752,
          53.13333333333333,
          58.196875,
          51.86511530568824,
          78.7333333333333,
          63.640625,
          79.51162182725966,
          8.433333333333335,
          8.103125,
          8.951511404290795,
          6.666666666666665,
          9.65,
          6.993118633283302,
          0.5316666666666666,
          0.560875,
          0.5177487884066068,
          14.033333333333337,
          13.38125,
          14.102322203852236,
          33.7,
          36.621875,
          31.172773850290103,
          0.276,
          0.30946875,
          0.3132329415972344
        ],
    'isNeutral': False
}]

results = []
for matchup in payload:
    input = np.array([matchup['team2'] + matchup['team1'] + [int(matchup['isNeutral'])]])
    models = mens_models if not matchup['isWomens'] else womens_models
    model = models[matchup['model']]
    print(model.predict(input))
    print(model.predict_proba(input))

['3span_gradient_boosting.pkl', '3span_knn_model.pkl', '3span_logistic_regression_model.pkl', '3span_multilayer_perceptron.pkl', '3span_random_forest.pkl', '3span_support_vector_machine_model.pkl', '5span_gradient_boosting.pkl', '5span_knn_model.pkl', '5span_logistic_regression_model.pkl', '5span_multilayer_perceptron.pkl', '5span_random_forest.pkl', '5span_support_vector_machine_model.pkl', '7span_gradient_boosting.pkl', '7span_knn_model.pkl', '7span_logistic_regression_model.pkl', '7span_multilayer_perceptron.pkl', '7span_random_forest.pkl', '7span_support_vector_machine_model.pkl']
[1]
[[0.14285714 0.85714286]]


