In [1]:
from team_functions import dataPrep, getTeamRollingSeason, mapTeamID

In [2]:
import pandas as pd
import os

In [3]:
teams_df = pd.DataFrame([])
current_df = pd.DataFrame([])
for team_file in os.listdir('data/TeamsPrep'):
    team_id = team_file[:3]
    
    for year in [2020, 2021, 2022, 2023]:
        team_df = pd.read_excel(f'data/TeamsPrep/{team_file}', str(year))

        team_df['TeamID'] = mapTeamID(team_id)
        team_df['OppID'] = team_df['OppID'].apply(lambda x: mapTeamID(x))
        team_df['Season'] = year

        teams_df = pd.concat([teams_df, team_df])
        
    team_df = pd.read_excel(f'data/CurrentSeason/{team_file}')
    
    roll_df = getTeamRollingSeason(team_df)
    prep_df = dataPrep(roll_df, 'Games')
    
    prep_df['TeamID'] = mapTeamID(team_id)
    prep_df['OppID'] = prep_df['OppID'].apply(lambda x: mapTeamID(x))
    prep_df['Season'] = 2024
    
    current_df = pd.concat([current_df, prep_df])
    
teams_df = teams_df.drop(columns=['Date']).reset_index(drop=True)
current_df = current_df.drop(columns=['Date']).reset_index(drop=True)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X = teams_df.drop(columns=['Target'])
y = teams_df['Target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

X_test = current_df.drop(columns=['Target'])
y_test = current_df['Target']

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.model_selection import RandomizedSearchCV

In [9]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [11]:
rf_model = RandomForestClassifier(random_state=2024)
rf_random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid, 
                                      n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [12]:
best_params = rf_random_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 10, 'bootstrap': False}


In [14]:
best_rf_model = RandomForestClassifier(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)

In [15]:
y_val_pred = best_rf_model.predict(X_val)
print('Report:\n', classification_report(y_val, y_val_pred))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_val_pred))

Report:
               precision    recall  f1-score   support

           0       0.60      0.61      0.60       914
           1       0.60      0.60      0.60       910

    accuracy                           0.60      1824
   macro avg       0.60      0.60      0.60      1824
weighted avg       0.60      0.60      0.60      1824

Confusion Matrix:
 [[553 361]
 [365 545]]


In [16]:
y_test_pred = best_rf_model.predict(X_test)
print('Report:\n', classification_report(y_test, y_test_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_test_pred))

Report:
               precision    recall  f1-score   support

           0       0.67      0.64      0.65       686
           1       0.65      0.68      0.67       686

    accuracy                           0.66      1372
   macro avg       0.66      0.66      0.66      1372
weighted avg       0.66      0.66      0.66      1372

Confusion Matrix:
 [[439 247]
 [219 467]]
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   1.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   1.0s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=150; total time=   4.5s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   1.3s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END bootstrap=True, max_de

[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   1.5s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   1.6s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=150; total time=   4.8s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.8s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.2s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.7s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   1.3s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   2.0s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, 

[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   1.6s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   1.5s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   2.4s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.4s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   2.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.9s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=4,

[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   1.3s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   1.7s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   2.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   3.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.4s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=150; total time=   4.4s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   2.0s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4,