In [1]:
import pandas as pd
import numpy as np
from math import floor
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, TimeSeriesSplit

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score

# Data Analysis/Preprocessing

In [113]:
raw_data = pd.read_csv('df_final.csv')
raw_data = raw_data.drop('Unnamed: 0',axis = 1)
raw_data.head()

Unnamed: 0,SeasonID,TeamID,Conference,PlayoffRankLag1,WinPCTLag1,DiffPointsPGLag1,PlayoffRankLag2,WinPCTLag2,DiffPointsPGLag2,PlayoffRankLag3,WinPCTLag3,DiffPointsPGLag3,PlayoffRankLag4,WinPCTLag4,DiffPointsPGLag4,isPlayoff,nyt_score,twitter_score,reddit_score
0,2008,1610612739,East,4.0,0.549,-0.4,2.0,0.61,3.8,3.0,0.61,2.2,9.0,0.512,0.8,1,0.038036,0.024716,0.141607
1,2008,1610612747,West,1.0,0.695,7.3,7.0,0.512,-0.1,7.0,0.549,2.5,11.0,0.415,-3.0,1,0.235119,0.070399,0.082954
2,2008,1610612743,West,8.0,0.61,3.7,6.0,0.549,1.6,4.0,0.537,0.2,7.0,0.598,2.0,1,0.098203,-0.037116,-0.009162
3,2008,1610612738,East,1.0,0.805,10.3,15.0,0.293,-3.4,11.0,0.402,-1.5,4.0,0.549,0.9,1,-0.014583,0.095421,0.066737
4,2008,1610612759,West,3.0,0.683,4.8,3.0,0.707,8.4,1.0,0.768,6.8,2.0,0.72,7.8,1,0.086667,0.097475,0.097209


In [114]:
train_data = raw_data.iloc[:-30]
test_data = raw_data.iloc[-30:]

In [115]:
# Split into X and y train
X_train = train_data.drop(['SeasonID','TeamID','isPlayoff'],axis = 1)
y_train = train_data['isPlayoff']

X_test = test_data.drop(['SeasonID','TeamID','isPlayoff'],axis = 1)
y_test = test_data['isPlayoff']
X_test = X_test.reset_index().drop('index',axis = 1)
y_test = y_test.reset_index().drop('index',axis = 1)

In [116]:
len(X_train), len(y_train), len(X_test), len(y_test)

(390, 390, 30, 30)

## One Hot Encode

In [117]:
# Train data
ohe = OneHotEncoder()
encoded_data = ohe.fit_transform(X_train[['Conference']]).toarray()
columns = ohe.get_feature_names(X_train[['Conference']].columns)
train_encoded = pd.DataFrame(encoded_data, columns=columns)

X_train = pd.concat([X_train.drop('Conference',axis=1),train_encoded],axis = 1)

In [118]:
# Test data
encoded_data = ohe.transform(X_test[['Conference']]).toarray()
columns = ohe.get_feature_names(X_test[['Conference']].columns)
test_encoded = pd.DataFrame(encoded_data, columns=columns)

X_test = pd.concat([X_test.drop('Conference',axis=1),test_encoded],axis = 1)

# Building Models

In [121]:
# Creating time series split
num_obs = len(train_data)
num_per_season = 30
n_splits = int(num_obs/num_per_season - 1)

time_series_split = TimeSeriesSplit(n_splits = n_splits)

## Logistic Regression

In [119]:
# Hyperparameter tuning
logreg_param = {'penalty': ['l1','l2'], 'C': np.arange(0.0001,0.04,0.01), 
                'max_iter': np.arange(2000,3000,100), 'solver': ['liblinear','saga']}

# Scoring metric
scoring = 'f1'

# Build the model
logreg = LogisticRegression()
logreg_gridsearch = GridSearchCV(logreg, logreg_param, cv = time_series_split, scoring = scoring)
logreg_gridsearch.fit(X_train, y_train)

# Find the best hyperparameters
print('Best hyperparameters:', logreg_gridsearch.best_params_)
print('Best score:', logreg_gridsearch.best_score_)

Best hyperparameters: {'C': 0.0301, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'saga'}
Best score: 0.7573259619096905


In [120]:
# Get the other metrics for the best model

# Since only 16 teams can make it into the playoffs, we take the top 16 teams using the predict probability 
# function, there must also be 8 teams from each region

# Get probabilities of making it to the Playoffs from the best hyperparameters
logreg_final = LogisticRegression(C = 0.0301, max_iter = 2000, penalty = 'l2', solver = 'saga')
logreg_final.fit(X_train, y_train)
logreg_y_probs = logreg_final.predict_proba(X_train)[:,1]

# Combine the probabilities with the initial full dataframe
logreg_combined = pd.concat([train_data,pd.DataFrame(logreg_y_probs,columns = ['prob'])],axis = 1)

# Input class labels for each team in each season based on the ranking of probabilties. The top 8 teams for each
# season and each region will be labelled as 1 while the others will be labelled as 0

logreg_classone = logreg_combined.sort_values('prob',ascending=False).groupby(['SeasonID','Conference']).head(8)
logreg_combined['predicted_class'] = 0
logreg_combined.loc[logreg_classone.index, 'predicted_class'] = 1
logreg_y_pred = logreg_combined['predicted_class']

metrics = ['f1-score','Accuracy','Recall','Precision','ROC-AUC']

logreg_f1, logreg_accuracy = f1_score(y_train,logreg_y_pred), accuracy_score(y_train,logreg_y_pred)
logreg_recall, logreg_precision = recall_score(y_train,logreg_y_pred), precision_score(y_train,logreg_y_pred)
logreg_auc = roc_auc_score(y_train,logreg_y_pred)

logreg_metrics = pd.DataFrame([logreg_f1, logreg_accuracy, logreg_recall, logreg_precision, logreg_auc], 
                              index = metrics)
logreg_metrics

Unnamed: 0,0
f1-score,0.75
Accuracy,0.733333
Recall,0.75
Precision,0.75
ROC-AUC,0.732143


In [126]:
logreg_combined[logreg_combined['SeasonID'] == 2020].sort_values(['Conference','prob'],ascending = False)

Unnamed: 0,SeasonID,TeamID,Conference,PlayoffRankLag1,WinPCTLag1,DiffPointsPGLag1,PlayoffRankLag2,WinPCTLag2,DiffPointsPGLag2,PlayoffRankLag3,...,DiffPointsPGLag3,PlayoffRankLag4,WinPCTLag4,DiffPointsPGLag4,isPlayoff,nyt_score,twitter_score,reddit_score,prob,predicted_class
372,2020,1610612747,West,1.0,0.732,5.8,10.0,0.451,-1.7,11.0,...,-1.5,14.0,0.317,-6.9,1,0.137206,0.083646,0.131489,0.901324,1
366,2020,1610612746,West,2.0,0.681,6.4,8.0,0.585,0.9,10.0,...,0.0,4.0,0.622,4.3,1,0.159375,0.086295,0.116918,0.87857,1
364,2020,1610612743,West,3.0,0.63,2.1,2.0,0.659,4.0,9.0,...,1.5,9.0,0.488,0.5,1,0.009276,0.000588,0.135856,0.792968,1
368,2020,1610612742,West,7.0,0.573,4.9,14.0,0.402,-1.3,13.0,...,-3.0,11.0,0.402,-2.9,1,0.0,0.074769,0.120921,0.721675,1
386,2020,1610612760,West,5.0,0.611,2.0,6.0,0.598,3.4,4.0,...,3.4,6.0,0.573,0.8,0,0.0,0.051112,0.113104,0.673538,1
388,2020,1610612745,West,4.0,0.611,3.0,4.0,0.646,4.8,1.0,...,8.5,3.0,0.671,5.8,0,0.103472,0.075954,0.119333,0.638875,1
361,2020,1610612762,West,6.0,0.611,2.5,5.0,0.61,5.3,5.0,...,4.3,5.0,0.622,3.9,1,-0.3,0.096141,0.122093,0.61821,1
362,2020,1610612756,West,10.0,0.466,0.2,15.0,0.232,-9.3,15.0,...,-9.4,15.0,0.293,-5.6,1,0.086793,0.083189,0.133175,0.609756,1
371,2020,1610612757,West,8.0,0.473,-1.1,3.0,0.646,4.2,3.0,...,2.6,8.0,0.5,-0.5,1,0.0,0.083254,0.117715,0.529929,0
374,2020,1610612763,West,9.0,0.466,-1.1,12.0,0.402,-2.6,14.0,...,-6.2,7.0,0.524,0.5,1,0.0,0.086295,0.130669,0.508414,0


## Support Vector Machine

In [122]:
# Hyperparameter tuning
svc_param = {'C': np.arange(0.0001,0.1,0.04), 'gamma': np.arange(0.0001,0.1,0.04), 
             'kernel': ['linear', 'rbf']}

# Build the model
svc = SVC()
svc_gridsearch = GridSearchCV(svc, svc_param, cv = time_series_split, scoring = scoring)
svc_gridsearch.fit(X_train, y_train)

# Find the best hyperparameters
print('Best hyperparameters:', svc_gridsearch.best_params_)
print('Best score:', svc_gridsearch.best_score_)

Best hyperparameters: {'C': 0.040100000000000004, 'gamma': 0.0001, 'kernel': 'linear'}
Best score: 0.756519313492748


In [123]:
# Get the other metrics for the best model

# Since only 16 teams can make it into the playoffs, we take the top 16 teams using the predict probability 
# function, there must also be 8 teams from each region

# Get probabilities of making it to the Playoffs from the best hyperparameters
svc_final = SVC(C = 0.040100000000000004, gamma = 0.0001, kernel = 'linear', probability = True)
svc_final.fit(X_train, y_train)
svc_y_probs = svc_final.predict_proba(X_train)[:,1]

# Combine the probabilities with the initial full dataframe
svc_combined = pd.concat([train_data,pd.DataFrame(svc_y_probs,columns = ['prob'])],axis = 1)

# Input class labels for each team in each season based on the ranking of probabilties. The top 8 teams for each
# season and each region will be labelled as 1 while the others will be labelled as 0

svc_classone = svc_combined.sort_values('prob',ascending=False).groupby(['SeasonID','Conference']).head(8)
svc_combined['predicted_class'] = 0
svc_combined.loc[svc_classone.index, 'predicted_class'] = 1
svc_y_pred = svc_combined['predicted_class']

svc_f1, svc_accuracy = f1_score(y_train,svc_y_pred), accuracy_score(y_train,svc_y_pred)
svc_recall, svc_precision = recall_score(y_train,svc_y_pred), precision_score(y_train,svc_y_pred)
svc_auc = roc_auc_score(y_train,svc_y_pred)

svc_metrics = pd.DataFrame([svc_f1, svc_accuracy, svc_recall, svc_precision, svc_auc], index = metrics)
svc_metrics

Unnamed: 0,0
f1-score,0.764423
Accuracy,0.748718
Recall,0.764423
Precision,0.764423
ROC-AUC,0.747596


## AdaBoost

In [124]:
# Hyperparameter tuning
adaboost_param = {'n_estimators': np.arange(50,200,50), 'learning_rate': np.arange(0.001,0.1,0.04)}

# Build the model
adaboost = AdaBoostClassifier()
adaboost_gridsearch = GridSearchCV(adaboost, adaboost_param, cv = time_series_split, scoring = scoring)
adaboost_gridsearch.fit(X_train, y_train)

# Find the best hyperparameters
print('Best hyperparameters:', adaboost_gridsearch.best_params_)
print('Best score:', adaboost_gridsearch.best_score_)

Best hyperparameters: {'learning_rate': 0.001, 'n_estimators': 150}
Best score: 0.7192195854911372


In [125]:
# Get the other metrics for the best model

# Since only 16 teams can make it into the playoffs, we take the top 16 teams using the predict probability 
# function, there must also be 8 teams from each region

# Get probabilities of making it to the Playoffs from the best hyperparameters
adaboost_final = AdaBoostClassifier(learning_rate = 0.081, n_estimators = 150)
adaboost_final.fit(X_train, y_train)
adaboost_y_probs = adaboost_final.predict_proba(X_train)[:,1]

# Combine the probabilities with the initial full dataframe
adaboost_combined = pd.concat([train_data,pd.DataFrame(adaboost_y_probs,columns = ['prob'])],axis = 1)

# Input class labels for each team in each season based on the ranking of probabilties. The top 8 teams for each
# season and each region will be labelled as 1 while the others will be labelled as 0

adaboost_classone = adaboost_combined.sort_values('prob',
                                                  ascending=False).groupby(['SeasonID','Conference']).head(8)
adaboost_combined['predicted_class'] = 0
adaboost_combined.loc[adaboost_classone.index, 'predicted_class'] = 1
adaboost_y_pred = adaboost_combined['predicted_class']

adaboost_f1, adaboost_accuracy = f1_score(y_train,adaboost_y_pred), accuracy_score(y_train,adaboost_y_pred)
adaboost_recall = recall_score(y_train,adaboost_y_pred)
adaboost_precision = precision_score(y_train,adaboost_y_pred)
adaboost_auc = roc_auc_score(y_train,adaboost_y_pred)

adaboost_metrics = pd.DataFrame([adaboost_f1, adaboost_accuracy, adaboost_recall, adaboost_precision, 
                                 adaboost_auc], index = metrics)
adaboost_metrics

Unnamed: 0,0
f1-score,0.8125
Accuracy,0.8
Recall,0.8125
Precision,0.8125
ROC-AUC,0.799107


## Gradient Boosting

In [None]:
# Hyperparameter tuning
gradientboost_param = {'learning_rate': np.arange(0.001,0.1,0.04), 'n_estimators': np.arange(100,2000,200), 
                       'max_depth': np.arange(2,16,2)}

# Build the model
gradientboost = GradientBoostingClassifier()
gradientboost_gridsearch = RandomizedSearchCV(gradientboost, gradientboost_param, n_iter = 100, 
                                              cv = time_series_split, random_state=42, scoring = scoring)
gradientboost_gridsearch.fit(X_train, y_train)

# Find the best hyperparameters
print('Best hyperparameters:', gradientboost_gridsearch.best_params_)
print('Best score:', gradientboost_gridsearch.best_score_)

In [None]:
# Get the other metrics for the best model

# Since only 16 teams can make it into the playoffs, we take the top 16 teams using the predict probability 
# function, there must also be 8 teams from each region

# Get probabilities of making it to the Playoffs from the best hyperparameters
gradientboost_final = GradientBoostingClassifier(learning_rate = 0.001, max_depth = 2, n_estimators = 500)
gradientboost_final.fit(X_train, y_train)
gradientboost_y_probs = gradientboost_final.predict_proba(X_train)[:,1]

# Combine the probabilities with the initial full dataframe
gradientboost_combined = pd.concat([train_data,pd.DataFrame(gradientboost_y_probs,columns = ['prob'])],axis = 1)

# Input class labels for each team in each season based on the ranking of probabilties. The top 8 teams for each
# season and each region will be labelled as 1 while the others will be labelled as 0

gradientboost_classone = gradientboost_combined.sort_values('prob',
                                                  ascending=False).groupby(['SeasonID','Conference']).head(8)
gradientboost_combined['predicted_class'] = 0
gradientboost_combined.loc[gradientboost_classone.index, 'predicted_class'] = 1
gradientboost_y_pred = gradientboost_combined['predicted_class']

gradientboost_f1 = f1_score(y_train,gradientboost_y_pred)
gradientboost_accuracy = accuracy_score(y_train,gradientboost_y_pred)
gradientboost_recall = recall_score(y_train,gradientboost_y_pred)
gradientboost_precision = precision_score(y_train,gradientboost_y_pred)
gradientboost_auc = roc_auc_score(y_train,gradientboost_y_pred)

gradientboost_metrics = pd.DataFrame([gradientboost_f1, gradientboost_accuracy, gradientboost_recall, 
                                      gradientboost_precision, gradientboost_auc], index = metrics)
gradientboost_metrics

## Random Forest

In [None]:
# Hyperparameter tuning
randomforest_param = {'n_estimators': np.arange(100,2000,200), 'criterion':['gini', 'entropy'],
                      'max_depth': np.arange(2,12,2),'min_samples_split': [2, 5, 10]}

# Build the model
randomforest = RandomForestClassifier()
randomforest_gridsearch = RandomizedSearchCV(randomforest, randomforest_param, n_iter = 100, 
                                              cv = time_series_split, random_state=42, scoring = scoring)
randomforest_gridsearch.fit(X_train, y_train)

# Find the best hyperparameters
print('Best hyperparameters:', randomforest_gridsearch.best_params_)
print('Best score:', randomforest_gridsearch.best_score_)

In [None]:
# Get the other metrics for the best model

# Since only 16 teams can make it into the playoffs, we take the top 16 teams using the predict probability 
# function, there must also be 8 teams from each region

# Get probabilities of making it to the Playoffs from the best hyperparameters
randomforest_final = RandomForestClassifier(n_estimators = 1700, max_depth = 8, criterion = 'entropy')
randomforest_final.fit(X_train, y_train)
randomforest_y_probs = randomforest_final.predict_proba(X_train)[:,1]

# Combine the probabilities with the initial full dataframe
randomforest_combined = pd.concat([train_data,pd.DataFrame(randomforest_y_probs,columns = ['prob'])],axis = 1)

# Input class labels for each team in each season based on the ranking of probabilties. The top 8 teams for each
# season and each region will be labelled as 1 while the others will be labelled as 0

randomforest_classone = randomforest_combined.sort_values('prob',
                                                  ascending=False).groupby(['SeasonID','Conference']).head(8)
randomforest_combined['predicted_class'] = 0
randomforest_combined.loc[randomforest_classone.index, 'predicted_class'] = 1
randomforest_y_pred = randomforest_combined['predicted_class']

randomforest_f1 = f1_score(y_train,randomforest_y_pred)
randomforest_accuracy = accuracy_score(y_train,randomforest_y_pred)
randomforest_recall = recall_score(y_train,randomforest_y_pred)
randomforest_precision = precision_score(y_train,randomforest_y_pred)
randomforest_auc = roc_auc_score(y_train,randomforest_y_pred)

randomforest_metrics = pd.DataFrame([randomforest_f1, randomforest_accuracy, randomforest_recall, 
                                      randomforest_precision, randomforest_auc], index = metrics)
randomforest_metrics

## Summary of Models' Performance

# Using the Chosen Model on the Test Set