# Libraries

In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Importing Helper Functions
from helper_functions import drop_extraneous_col


# Recursive Feature Elimination with Cross-Validation
from sklearn.feature_selection import RFECV
# TimeSeriesSplit for time-ordered CV folds, GridSearchCV for hyperparameter tuning,
# and cross_validate for scoring a model across those folds.
# A blocked time series split is not imported here (we would have to implement it on our own).
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_validate
# Pipeline
from sklearn.pipeline import Pipeline
# Standard Scaler
from sklearn.preprocessing import StandardScaler
# Confusion Matrix
from sklearn.metrics import confusion_matrix

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# XGBoost
from xgboost import XGBClassifier

# Ridge Classifier
from sklearn.linear_model import RidgeClassifier

# Support Vector Machine (SVM)
from sklearn.svm import SVC

# Using the Previous Game DataFrame

In [87]:
# Load the previous-game feature table from the CSV on disk.
training_df  = pd.read_csv('csvs/prev_game_df.csv')
# The helper's return value is discarded, so it presumably drops columns in place --
# TODO confirm against helper_functions.drop_extraneous_col.
drop_extraneous_col(training_df)
# Bare last expression so the notebook renders the frame.
training_df

Unnamed: 0,team0,team1,winner,season,date,team0_encoded,team1_encoded,mp_prev_game_team0,fg_prev_game_team0,fga_prev_game_team0,...,orb%_prev_game_team1,drb%_prev_game_team1,trb%_prev_game_team1,ast%_prev_game_team1,stl%_prev_game_team1,blk%_prev_game_team1,tov%_prev_game_team1,ortg_prev_game_team1,drtg_prev_game_team1,team1_winner
0,SAC,DAL,SAC,2018,2017-10-20,23,27,240.0,42.0,88.0,...,20.0,69.6,45.1,71.1,5.1,5.3,13.6,112.6,118.7,0
1,POR,IND,POR,2018,2017-10-20,17,7,240.0,44.0,90.0,...,30.4,75.0,52.2,54.7,10.6,14.1,10.8,123.6,115.7,0
2,ORL,BRK,BRK,2018,2017-10-20,15,4,240.0,43.0,90.0,...,25.0,69.6,47.8,48.9,6.2,2.9,15.6,115.7,123.6,1
3,BOS,PHI,BOS,2018,2017-10-20,2,5,240.0,39.0,91.0,...,23.4,69.8,48.0,58.1,5.8,8.0,14.4,110.3,115.1,0
4,DET,WAS,WAS,2018,2017-10-20,8,14,240.0,41.0,96.0,...,30.2,76.6,52.0,50.0,7.7,17.2,7.3,115.1,110.3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8350,PHI,PHO,PHO,2024,2024-03-20,5,24,240.0,37.0,85.0,...,21.9,76.2,52.7,70.6,7.2,9.4,11.4,132.2,143.4,1
8351,UTA,OKC,OKC,2024,2024-03-20,18,16,240.0,39.0,84.0,...,21.6,84.1,55.6,55.6,7.2,21.6,10.3,121.3,115.1,1
8352,IND,DET,IND,2024,2024-03-20,7,8,240.0,42.0,91.0,...,17.4,70.0,44.8,58.3,6.5,6.5,11.7,101.3,128.2,0
8353,MIL,BOS,BOS,2024,2024-03-20,10,2,240.0,51.0,94.0,...,30.0,82.6,55.2,69.8,8.6,14.5,8.2,128.2,101.3,1


# Splitting Dataframe into Train and Test

In [88]:
# We train on the 2018 through 2023 seasons and hold out the 2024 season for testing.
training_seasons = [2018, 2019, 2020, 2021, 2022, 2023]
# Team names, season, date and the outcome columns are kept out of the feature matrix.
undesired_columns = ['team0', 'team1', 'winner', 'season', 'date', 'team1_winner']

# Build each row mask once and reuse it for both the features and the labels.
in_training_seasons = training_df['season'].isin(training_seasons)
in_test_season = training_df['season'] == 2024

train_rows = training_df[in_training_seasons]
test_rows = training_df[in_test_season]

# Features: everything except the undesired columns. Labels: the team1_winner column.
X_train = train_rows.drop(undesired_columns, axis=1)
y_train = train_rows['team1_winner']
X_test = test_rows.drop(undesired_columns, axis=1)
y_test = test_rows['team1_winner']


In [89]:
# Double checking the shapes of the training and testing dataframes.
# In each pair the first line is the feature matrix (X) and the second is the label vector (y).

print(f'Observations in the training set: {X_train.shape[0]}')
print(f'Observations in the training set: {y_train.shape[0]}')

print(f'Observations in the testing set: {X_test.shape[0]}')
print(f'Observations in the testing set: {y_test.shape[0]}')

# Enforce the invariants instead of relying on someone reading the printout:
# features and labels must line up row-for-row, and both splits must share the same columns.
assert X_train.shape[0] == y_train.shape[0], 'X_train and y_train have different row counts'
assert X_test.shape[0] == y_test.shape[0], 'X_test and y_test have different row counts'
assert X_train.shape[1] == X_test.shape[1], 'X_train and X_test have different feature counts'


Observations in the training set: 7348
Observations in the training set: 7348
Observations in the testing set: 1007
Observations in the testing set: 1007


# Scaling Features

In [90]:
# Standardise the features with scikit-learn's StandardScaler.
std_scalar = StandardScaler()
# Learn the per-feature mean and standard deviation from the training seasons only.
X_train = std_scalar.fit_transform(X_train)
# Apply the *training* statistics to the held-out season. Calling fit_transform here
# would re-fit the scaler on the test set, so train and test would be scaled differently
# and information about the test distribution would leak into the evaluation.
X_test = std_scalar.transform(X_test)

# Training Models using the Previous Game DataFrame

In [94]:
# Time-ordered folds, shared by the feature-selection step and the outer evaluation.
tscv = TimeSeriesSplit()

# Recursive feature elimination around a default logistic regression,
# never going below 25 features.
model_lr = LogisticRegression()
model_rfecv = RFECV(
    estimator=model_lr,
    min_features_to_select=25,
    cv=tscv,
)

# Score the whole selector + model on precision, keeping train scores for comparison.
cv_results = cross_validate(
    model_rfecv,
    X_train,
    y_train,
    scoring='precision',
    cv=tscv,
    return_train_score=True,
)

In [95]:
# Show the dict returned by cross_validate: per-fold fit/score times plus
# test and train scores for the requested metric.
cv_results

{'fit_time': array([1.37064314, 1.72504783, 2.00561571, 2.95733523, 3.22076511]),
 'score_time': array([0.00334787, 0.0009861 , 0.00087237, 0.00087976, 0.00085974]),
 'test_score': array([0.63023256, 0.57489879, 0.55897436, 0.58002039, 0.59848485]),
 'train_score': array([0.64594895, 0.62531646, 0.61675824, 0.60610807, 0.60256141])}

# Feature Selection

In [None]:
# This can be used for all the machine learning algorithms below
tscv = TimeSeriesSplit()
# The original placeholders (`estimator=`, `min_features_to_select=`) were a SyntaxError.
# They are filled with the same settings as the logistic-regression run above so the cell
# executes. TODO: swap in the estimator under study and tune the minimum feature count.
# RFECV needs an estimator that exposes coef_ or feature_importances_ after fitting.
model_rfecv = RFECV(estimator=LogisticRegression(), cv=tscv, min_features_to_select=25)

# Logistic Regression

In [None]:
# Can apply L1 or L2 Regularization.
# 'l2' is used so the cell runs (the bare `penalty=` was a SyntaxError);
# switching to 'l1' also requires a solver that supports it, e.g. 'liblinear' or 'saga'.
model_lr = LogisticRegression(penalty='l2')

# Evaluate on the scaled training seasons with the time-ordered splitter defined earlier.
# The original call referenced undefined names (model, X, y, cv).
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(model_lr, X_train, y_train, cv=tscv, scoring=scoring, return_train_score=True)

# Random Forest Classifier

In [None]:
# The bare `n_estimators=` / `max_depth=` placeholders were a SyntaxError. They are filled
# with scikit-learn's defaults (100 trees, unlimited depth) as a starting point to tune.
# random_state is fixed so repeated runs give the same forest.
model_rfc = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)

# Understand features using feature_importances_
# (only available after fitting; cross_validate fits clones, not model_rfc itself)


# Evaluate on the scaled training seasons with the time-ordered splitter defined earlier.
# The original call referenced undefined names (model, X, y, cv).
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(model_rfc, X_train, y_train, cv=tscv, scoring=scoring, return_train_score=True)

# XGBoost Classifier

In [None]:
# Gradient-boosted trees with a binary logistic objective: 100 trees of depth 3, learning rate 0.1.
model_xgb = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, objective='binary:logistic')

# Understand features using feature_importances_
# (only available after fitting; cross_validate fits clones, not model_xgb itself)


# Evaluate on the scaled training seasons with the time-ordered splitter defined earlier.
# The original call referenced undefined names (model, X, y, cv).
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(model_xgb, X_train, y_train, cv=tscv, scoring=scoring, return_train_score=True)

# Ridge Classifier

In [None]:
# Ridge applies L2 shrinkage to the coefficients. It does not drive them to exactly zero,
# so this is regularisation rather than true embedded feature selection.
model_rc = RidgeClassifier()


# Evaluate on the scaled training seasons with the time-ordered splitter defined earlier.
# The original call referenced undefined names (model, X, y, cv).
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(model_rc, X_train, y_train, cv=tscv, scoring=scoring, return_train_score=True)

# Support Vector Machine

In [None]:
# The bare `kernel=` placeholder was a SyntaxError. 'rbf' is scikit-learn's default kernel
# and is used as a starting point; try 'linear', 'poly' or 'sigmoid' when tuning.
model_svm = SVC(kernel='rbf')



# Evaluate on the scaled training seasons with the time-ordered splitter defined earlier.
# The original call referenced undefined names (model, X, y, cv).
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(model_svm, X_train, y_train, cv=tscv, scoring=scoring, return_train_score=True)