# Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Importing Helper Functions
from helper_functions import drop_extraneous_col, save_df
# Recursive Feature Elimination with Cross-Validation
from sklearn.feature_selection import RFECV
# Time Series Split and GridSearchCV, where GridSearchCV is for hyperparameter tuning
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_validate, RandomizedSearchCV
# Pipeline
from sklearn.pipeline import Pipeline
# Standard Scalar
from sklearn.preprocessing import StandardScaler
# Confusion Matrix
from sklearn.metrics import confusion_matrix, make_scorer, recall_score, precision_score, accuracy_score, f1_score, roc_auc_score
# Logistic Regression, Ridge Classifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# XGBoost
from xgboost import XGBClassifier
# Support Vector Machine (SVM)
from sklearn.svm import LinearSVC

# Using the Cumulative Averages DataFrame

In [23]:
cumulative_df = pd.read_csv('/Users/siddmittal/Documents/School/3B/MSCI-446-Project/csvs/cumulative_averages.csv')
prev_game_df = pd.read_csv('/Users/siddmittal/Documents/School/3B/MSCI-446-Project/csvs/prev_game_df.csv')

drop_extraneous_col(cumulative_df)
drop_extraneous_col(prev_game_df)

training_df = pd.concat([cumulative_df, prev_game_df], axis=1)

In [24]:
training_df = training_df.loc[:,~training_df.columns.duplicated()]

In [26]:
training_df

Unnamed: 0,team0,team1,winner,season,date,team0_encoded,team1_encoded,restDays_team0,restDays_team1,mp_cumulative_team0,...,orb%_prev_game_team0,drb%_prev_game_team0,trb%_prev_game_team0,ast%_prev_game_team0,stl%_prev_game_team0,blk%_prev_game_team0,tov%_prev_game_team0,ortg_prev_game_team0,drtg_prev_game_team0,ft/fga_prev_game_team0
0,CLE,MIL,CLE,2018,2017-10-20,6,10,2.0,1.0,240.000000,...,19.6,82.0,52.1,50.0,3.0,7.1,15.3,102.7,99.7,0.253
1,LAL,PHO,LAL,2018,2017-10-20,25,24,0.0,1.0,240.000000,...,22.2,70.2,46.8,56.8,7.6,9.5,15.8,86.9,102.0,0.154
2,GSW,NOP,GSW,2018,2017-10-20,21,30,2.0,1.0,240.000000,...,15.4,77.8,48.8,79.1,4.9,16.1,16.0,118.6,119.6,0.238
3,ORL,BRK,BRK,2018,2017-10-20,15,4,1.0,1.0,240.000000,...,25.0,78.0,53.2,51.2,7.6,12.7,12.0,110.3,103.6,0.244
4,BOS,PHI,BOS,2018,2017-10-20,2,5,1.0,1.0,240.000000,...,22.0,84.2,48.9,59.0,12.3,3.6,10.7,102.2,110.4,0.121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8350,MIL,BOS,BOS,2024,2024-03-20,10,2,2.0,1.0,241.119403,...,23.8,78.1,47.3,68.6,11.3,3.7,9.1,143.4,132.2,0.149
8351,PHI,PHO,PHO,2024,2024-03-20,5,24,1.0,2.0,240.735294,...,15.6,78.0,48.4,62.2,7.3,21.4,11.7,102.8,95.4,0.129
8352,MIA,CLE,MIA,2024,2024-03-20,11,6,1.0,1.0,240.384615,...,22.0,84.4,51.6,65.7,8.4,6.1,13.6,95.4,102.8,0.101
8353,LAC,POR,LAC,2024,2024-03-20,22,17,2.0,1.0,240.373134,...,22.2,75.0,47.1,60.7,11.1,10.0,14.7,102.8,121.6,0.351


# Splitting Dataframe into Train and Test

In [6]:
undesired_columns = ['team0', 'team1', 'winner', 'season', 'date', 'team1_winner']
# We decided to train from the 2018 season to the 2023 season
training_seasons = [2018,2019,2020,2021,2022,2023]
# Splitting the dataframe into train and test
X_train = training_df[training_df['season'].isin(training_seasons)].drop(undesired_columns, axis=1)
X_test = training_df[training_df['season'] == 2024].drop(undesired_columns, axis=1)
y_train = training_df[training_df['season'].isin(training_seasons)]['team1_winner']
y_test = training_df[training_df['season'] == 2024]['team1_winner']


In [7]:
# Double checking the shapes of the training and testing dataframes

print(f'Observations in X_train: {X_train.shape[0]}')
print(f'Observations in y_train: {y_train.shape[0]}')

print(f'Observations in X_test: {X_test.shape[0]}')
print(f'Observations in y_test: {y_test.shape[0]}')


Observations in X_train: 7348
Observations in y_train: 7348
Observations in X_test: 1007
Observations in y_test: 1007


# Scaling Features

In [8]:
std_scalar = StandardScaler()
X_train = std_scalar.fit_transform(X_train)
X_test = std_scalar.fit_transform(X_test) 

# Defining the Type of Cross Validation

In [9]:
# Define the type of cross validation
tscv = TimeSeriesSplit()

# Creating a Results DataFrame to Store Training and Validation Scores

In [10]:
results_df = pd.DataFrame(columns=['Model', 'Training Accuracy', 'Validation Accuracy',
                             'Training Precision', 'Validation Precision',
                             'Training Recall ', 'Validation Recall',
                             'Training F1', 'Validation F1',
                             'Training ROC_AUC', 'Validation ROC_AUC'])

# Logistic Regression

In [12]:
# Define the Logistic Regresion Model
model_lr = LogisticRegression(solver='saga', max_iter=5000, random_state=42)
# Define the Recursive Feature Elimination Cross Validation
model_rfecv_lr = RFECV(estimator = model_lr, cv=tscv, min_features_to_select=30, scoring='accuracy')

# Fitting the rfecv model to the data
model_rfecv_lr.fit(X_train, y_train)
# Transforming the the training dataset to only have the selected features
X_train_selected = model_rfecv_lr.transform(X_train)
# Define a grid of hyperparameters
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

grid_search_lr = GridSearchCV(estimator=model_lr, param_grid=param_grid, cv=tscv, scoring=scoring, refit='accuracy', verbose=1, return_train_score=True)
grid_search_lr.fit(X_train_selected,y_train)

cv_results_lr = grid_search_lr.cv_results_

new_row_data = ['Logistic Regression']
for scorer in scoring:
    best_validation_score = cv_results_lr[f'mean_test_{scorer}'].max()
    i = list(cv_results_lr[f'mean_test_{scorer}']).index(best_validation_score)
    train_score = list(cv_results_lr[f'mean_train_{scorer}'])[i]
    new_row_data.extend([train_score,best_validation_score])

num_features_lr = model_rfecv_lr.n_features_
best_params_lr = grid_search_lr.best_params_

new_row_series = pd.Series(new_row_data, index=results_df.columns)
results_df = pd.concat([results_df, pd.DataFrame(new_row_series).T], axis=0, ignore_index=True)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


# Random Forest Classifier

In [13]:
# Define the RandomForestClassifier Model
model_rfc = RandomForestClassifier(random_state=42)
# Define the Recursive Feature Elimination Cross Validation
model_rfecv_rfc = RFECV(estimator = model_rfc, cv=tscv, min_features_to_select=30, scoring='accuracy')
# Fitting the rfecv model to the data
model_rfecv_rfc.fit(X_train, y_train)
# Transforming the the training dataset to only have the selected features
X_train_selected = model_rfecv_rfc.transform(X_train)
# Define a grid of hyperparameters
param_grid = {
    'n_estimators': [100, 200, 500],       # Number of trees in the forest.
    'max_depth': [None, 10, 20, 30],        # Maximum depth of the tree.
    'min_samples_split': [2, 5, 10],        # Minimum number of samples required to split an internal node.
    'min_samples_leaf': [1, 2, 4], 
}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

random_search_rfc = RandomizedSearchCV(estimator=model_rfc, n_iter=100, param_distributions=param_grid, cv=tscv, scoring=scoring, refit='accuracy', verbose=1, return_train_score=True)
random_search_rfc.fit(X_train_selected,y_train)

cv_results_rfc = random_search_rfc.cv_results_

new_row_data = ['Random Forest Classifier']
for scorer in scoring:
    best_validation_score = cv_results_rfc[f'mean_test_{scorer}'].max()
    i = list(cv_results_rfc[f'mean_test_{scorer}']).index(best_validation_score)
    train_score = list(cv_results_rfc[f'mean_train_{scorer}'])[i]
    new_row_data.extend([train_score,best_validation_score])

num_features_rfc = model_rfecv_rfc.n_features_
best_params_rfc = random_search_rfc.best_params_

new_row_series = pd.Series(new_row_data, index=results_df.columns)
results_df = pd.concat([results_df, pd.DataFrame(new_row_series).T], axis=0, ignore_index=True)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


# XGBoost Classifier

In [14]:
# Define the XGB Classifier Model
model_xgb = XGBClassifier(objective='binary:logistic')
# Define the Recursive Feature Elimination Cross Validation
model_rfecv_xgb = RFECV(estimator = model_xgb, cv=tscv, min_features_to_select=30, scoring='accuracy')
# Fitting the rfecv model to the data
model_rfecv_xgb.fit(X_train, y_train)
# Transforming the the training dataset to only have the selected features
X_train_selected = model_rfecv_xgb.transform(X_train)
# Define a grid of hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'colsample_bytree': [0.3, 0.5, 0.7, 0.9, 1.0],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

random_search_xgb = RandomizedSearchCV(estimator=model_xgb, n_iter=100, param_distributions=param_grid, cv=tscv, scoring=scoring, refit='accuracy', verbose=1, return_train_score=True)
random_search_xgb.fit(X_train_selected,y_train)

cv_results_xgb = random_search_xgb.cv_results_

new_row_data = ['XGB Classifier']
for scorer in scoring:
    best_validation_score = cv_results_xgb[f'mean_test_{scorer}'].max()
    i = list(cv_results_xgb[f'mean_test_{scorer}']).index(best_validation_score)
    train_score = list(cv_results_xgb[f'mean_train_{scorer}'])[i]
    new_row_data.extend([train_score,best_validation_score])

num_features_xgb = model_rfecv_xgb.n_features_
best_params_xgb = random_search_xgb.best_params_

new_row_series = pd.Series(new_row_data, index=results_df.columns)
results_df = pd.concat([results_df, pd.DataFrame(new_row_series).T], axis=0, ignore_index=True)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


# Support Vector Machine

In [15]:
# Define the Support Vector Machine Model
model_svm = LinearSVC(max_iter=5000, random_state=42, dual='auto')
# Define the Recursive Feature Elimination Cross Validation
model_rfecv_svm = RFECV(estimator = model_svm, cv=tscv, min_features_to_select=30, scoring='accuracy')
# Fitting the rfecv model to the data
model_rfecv_svm.fit(X_train, y_train)
# Transforming the the training dataset to only have the selected features
X_train_selected = model_rfecv_svm.transform(X_train)
# Define a grid of hyperparameters
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Penalty
}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

grid_search_svm = GridSearchCV(estimator=model_svm, param_grid=param_grid, cv=tscv, scoring=scoring, refit='accuracy', verbose=1, return_train_score=True)
grid_search_svm.fit(X_train_selected,y_train)

cv_results_svm = grid_search_svm.cv_results_

new_row_data = ['Linear SVC']
for scorer in scoring:
    best_validation_score = cv_results_svm[f'mean_test_{scorer}'].max()
    i = list(cv_results_svm[f'mean_test_{scorer}']).index(best_validation_score)
    train_score = list(cv_results_svm[f'mean_train_{scorer}'])[i]
    new_row_data.extend([train_score,best_validation_score])

num_features_svm = model_rfecv_svm.n_features_
best_params_svm = grid_search_svm.best_params_

new_row_series = pd.Series(new_row_data, index=results_df.columns)
results_df = pd.concat([results_df, pd.DataFrame(new_row_series).T], axis=0, ignore_index=True)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




In [20]:
results_df

Unnamed: 0,Model,Training Accuracy,Validation Accuracy,Training Precision,Validation Precision,Training Recall,Validation Recall,Training F1,Validation F1,Training ROC_AUC,Validation ROC_AUC
0,Logistic Regression,0.653676,0.633007,0.671728,0.6576,0.901078,0.89119,0.736857,0.719659,0.628283,0.616199
1,Random Forest Classifier,0.921446,0.625817,0.999511,0.641243,0.982894,0.79238,0.949626,0.705055,0.933047,0.603987
2,XGB Classifier,0.843363,0.621242,0.99139,0.638448,0.933242,0.906406,0.790881,0.72448,0.929741,0.596454
3,Linear SVC,0.654781,0.628922,0.674252,0.656741,0.806736,0.740571,0.729382,0.691225,0.637309,0.613532


In [21]:
print(best_params_lr)
print(best_params_rfc)
print(best_params_xgb)
print(best_params_svm)

{'C': 0.1, 'penalty': 'l1'}
{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 10}
{'subsample': 0.7, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.5}
{'C': 0.1, 'penalty': 'l1'}


In [22]:
print(num_features_lr)
print(num_features_rfc)
print(num_features_xgb)
print(num_features_svm)

49
49
116
81


In [19]:
save_df(results_df, "cominbation_df_results.csv")