In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
# import ensemble methods
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
# import base estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [22]:
# Load the games dataset from CSV; the first CSV column is used as the DataFrame index.
print("Loading dataset...")
dataset_path = "../../04_datasets/nfl_dataset_v3.csv"
dataset = pd.read_csv(dataset_path, index_col=0)
print("...Done.", end="\n\n")  # same output as print("...Done.") followed by print()

Loading dataset...
...Done.



In [23]:
# Quick overview of the dataset: row count, first rows, summary statistics
# and the share of missing values per column.
n_rows = dataset.shape[0]
print(f"Number of rows : {n_rows}", end="\n\n")

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
# include='all' also summarises non-numeric columns (unique / top / freq rows)
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
missing_counts = dataset.isnull().sum()  # number of null cells per column
display(100*missing_counts/n_rows)

Number of rows : 1289

Display of dataset: 


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away,...,capacity,people,attendance_info,month,dayofmonth,dayofweek,hour,minute,win_streak,lose_streak
0,2017,9,Bengals,Jaguars,7,23,400951753,16,1,0,...,67858,89.0,1,11,5,6,18,0,0.0,1.0
4,2017,4,Bills,Falcons,23,17,400951685,-6,0,1,...,75000,95.0,1,10,1,6,17,0,2.0,0.0
8,2017,4,Saints,Dolphins,20,0,400950241,-20,0,1,...,86000,98.0,1,10,1,6,13,30,0.0,2.0
12,2017,4,Bears,Packers,14,35,400951678,21,1,0,...,81041,97.0,1,9,29,4,0,25,0.0,1.0
16,2017,16,Colts,Ravens,16,23,400951596,7,1,0,...,70745,100.0,1,12,23,5,21,30,0.0,6.0



Basics statistics: 


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away,...,capacity,people,attendance_info,month,dayofmonth,dayofweek,hour,minute,win_streak,lose_streak
count,1289.0,1289.0,1289,1289,1289.0,1289.0,1289.0,1289.0,1289.0,1289.0,...,1289.0,1140.0,1289.0,1289.0,1289.0,1289.0,1289.0,1289.0,1275.0,1275.0
unique,,,32,32,,,,,,,...,,,,,,,,,,
top,,,Cowboys,Jaguars,,,,,,,...,,,,,,,,,,
freq,,,41,41,,,,,,,...,,,,,,,,,,
mean,2019.027153,9.17533,,,22.494182,23.759503,401134100.0,1.265322,0.539178,0.460822,...,70211.602017,88.013158,0.884407,10.254461,16.082234,5.098526,15.049651,9.09232,1.032941,1.349804
std,1.422984,5.062066,,,10.139945,10.328135,133883.2,14.768326,0.498656,0.498656,...,9601.706381,24.11428,0.319861,2.123613,8.765195,1.920486,7.274286,11.26295,1.603718,2.210647
min,2017.0,1.0,,,0.0,0.0,400950200.0,-49.0,0.0,0.0,...,27000.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2018.0,5.0,,,16.0,17.0,401030800.0,-7.0,0.0,0.0,...,65828.0,91.0,1.0,10.0,9.0,6.0,17.0,0.0,0.0,0.0
50%,2019.0,9.0,,,23.0,24.0,401128000.0,2.0,1.0,0.0,...,68740.0,97.0,1.0,11.0,16.0,6.0,18.0,0.0,0.0,1.0
75%,2020.0,14.0,,,30.0,31.0,401220300.0,10.0,1.0,1.0,...,73000.0,100.0,1.0,12.0,24.0,6.0,20.0,20.0,1.5,2.0



Percentage of missing values: 


season                    0.000000
week                      0.000000
awayteam                  0.000000
hometeam                  0.000000
awayscore                 0.000000
homescore                 0.000000
idgame                    0.000000
score_abs                 0.000000
winner_home               0.000000
winner_away               0.000000
winner_team               0.000000
concat                    0.000000
home_coach                0.000000
away_coach                0.000000
location_x                0.000000
weather_type              0.000000
temperature               0.000000
humidity                  0.000000
wind                      0.000000
game_date                 0.000000
team                      0.000000
delta_match               1.163693
pass_yds_MA_5             6.206362
pass_td_MA_5              6.206362
rush_yds_MA_5             6.206362
rush_td_MA_5              6.206362
rec_yds_MA_5              6.206362
rec_td_MA_5               6.206362
fumbles_MA_5        

In [32]:
# Split the dataset into the label Y and the feature matrix X.
print("Separating labels from features...")
target_variable = "winner_home"

# Columns removed from the features (in addition to the target itself).
# NOTE(review): several of these (scores, winner_* columns) look like direct
# functions of the game result, i.e. dropped to avoid target leakage — confirm.
features_drop_list = {
    'team', 'winner_team', 'home_coach', 'away_coach', 'week', 'month',
    'delta_match', 'season', 'winner_away', 'concat', 'idgame', 'awayscore',
    'homescore', 'score_abs', 'location_x', 'game_date', 'date', 'location_y',
    'attendance', 'capacity', 'attendance_info', 'hour', 'minute',
}

Y = dataset[target_variable]
# Chained, non-mutating drops: first the target, then the excluded columns.
X = dataset.drop(target_variable, axis=1).drop(features_drop_list, axis=1)

print("...Done.", end="\n\n")

print('Y : ')
print(Y.head(), end="\n\n")
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0     1
4     0
8     0
12    1
16    1
Name: winner_home, dtype: int64

X :
   awayteam  hometeam weather_type  temperature  humidity  wind  \
0   Bengals   Jaguars       cloudy           78        74    12   
4     Bills   Falcons       cloudy           68        70     5   
8    Saints  Dolphins       cloudy           63        64    13   
12    Bears   Packers       cloudy           65        62     5   
16    Colts    Ravens       cloudy           63        78    14   

    pass_yds_MA_5  pass_td_MA_5  rush_yds_MA_5  rush_td_MA_5  ...  \
0           241.8           2.2           78.0           0.2  ...   
4             NaN           NaN            NaN           NaN  ...   
8             NaN           NaN            NaN           NaN  ...   
12            NaN           NaN            NaN           NaN  ...   
16          163.8           0.8          106.8           0.4  ...   

    kicks_return_yds_MA_5  kicks_return_td_MA_5  punt_r

In [33]:
# Automatically detect names of numeric/categorical columns from their dtypes.
# A column is treated as numeric when its dtype name contains 'float' or 'int';
# every other column is treated as categorical.
numeric_features = []
categorical_features = []
# `Series.iteritems()` was removed in pandas 2.0; `.items()` yields the same
# (column name, dtype) pairs and works on both older and newer pandas versions.
for column_name, column_dtype in X.dtypes.items():
    dtype_name = str(column_dtype)
    if ('float' in dtype_name) or ('int' in dtype_name):
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['temperature', 'humidity', 'wind', 'pass_yds_MA_5', 'pass_td_MA_5', 'rush_yds_MA_5', 'rush_td_MA_5', 'rec_yds_MA_5', 'rec_td_MA_5', 'fumbles_MA_5', 'fumbles_rec_MA_5', 'defense_sacks_MA_5', 'defense_td_MA_5', 'interceptions_MA_5', 'interceptions_td_MA_5', 'kicks_return_yds_MA_5', 'kicks_return_td_MA_5', 'punt_return_yds_MA_5', 'kicking_pct_MA_5', 'people', 'dayofmonth', 'dayofweek', 'win_streak', 'lose_streak']
Found categorical features  ['awayteam', 'hometeam', 'weather_type', 'stade']


In [34]:
# Hold out 20% of the rows as a test set.
print("Dividing into train and test sets...")
# stratify=Y keeps the class proportions of the target identical in both splits;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)
print("...Done.", end="\n\n")

Dividing into train and test sets...
...Done.



In [35]:
# Preprocessing for categorical features: impute, then one-hot encode.
# NOTE(review): the next cell reassigns `categorical_transformer`, so when the
# notebook is run top to bottom this pipeline is replaced before being used.
categorical_steps = [
    # missing values are replaced by the most frequent value of the column
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # the first level of each feature is dropped to avoid perfectly collinear dummies
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [36]:
# Create pipeline for numeric features.
# NOTE(review): `numeric_transformer` is required by the ColumnTransformer cell
# below but was not defined in any visible cell, so a fresh "Restart & Run All"
# failed with NameError. The definition here (median imputation followed by
# standardisation) is an assumed reconstruction — confirm it matches the
# transformer originally used to produce the reported scores.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # fill NaNs (e.g. the *_MA_5 columns) with the column median
    ('scaler', StandardScaler())                    # zero mean / unit variance
])

# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(drop='first') # no missing values in categorical data, so we only need the OHE

In [37]:
# Build the preprocessor: each entry is (name, transformer, columns it applies to).
column_treatments = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_treatments)

In [38]:
# ---- Train set: fit the preprocessor and the label encoder ----
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
# X_train is no longer a pandas DataFrame after the transform, so it is
# inspected with positional slicing instead of .head()
print(X_train[:5])
print()

print("Encoding labels...")
print(Y_train.head())
encoder = LabelEncoder()
encoder.fit(Y_train)
Y_train = encoder.transform(Y_train)
print("...Done")
print(Y_train[:5])

# ---- Test set: transform only, never fit ----
# The test set validates decisions made on the training set, so only
# transformations whose parameters were learned on the training set may be
# applied. Fitting again here would leak test information and bias the results.
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[:5, :])  # positional slicing: X_test is not a DataFrame anymore
print()

print("Encoding labels...")
print(Y_test.head())  # Y_test is still a pandas Series at this point
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[:5])

Performing preprocessings on train set...
      awayteam   hometeam weather_type  temperature  humidity  wind  \
2449  Panthers      Colts        sunny           45        57     3   
3953  Dolphins  Cardinals       cloudy           66        34    11   
1056   Bengals     Ravens       cloudy           48        58     8   
4685      Jets      Colts       cloudy           44        54     2   
4633  Panthers     Giants       cloudy           57        51     0   

      pass_yds_MA_5  pass_td_MA_5  rush_yds_MA_5  rush_td_MA_5  ...  \
2449          202.4           0.8          126.6           1.6  ...   
3953          266.2           2.2          157.0           1.6  ...   
1056          228.4           1.4           95.6           0.8  ...   
4685          246.8           2.2          133.4           1.4  ...   
4633          290.6           0.8           95.8           1.0  ...   

      kicks_return_yds_MA_5  kicks_return_td_MA_5  punt_return_yds_MA_5  \
2449                   27.6  

XGBoost Option 1

In [29]:
# Small grid search over an XGBoost classifier with default settings.
# No `scoring` is passed, so GridSearchCV falls back to the estimator's own
# score method for both model selection and `gridsearch.score`.
print("Grid search...")
xgboost = XGBClassifier()

# Candidate hyperparameter values
params = {
    'max_depth': [2, 4, 6],         # same role as in scikit-learn trees
    'min_child_weight': [1, 2, 3],  # roughly comparable to min_samples_leaf / min_samples_split
    'n_estimators': [2, 4, 6, 8],   # number of boosting rounds
}
print(params)

gridsearch = GridSearchCV(estimator=xgboost, param_grid=params, cv=3)  # 3-fold cross-validation
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)
print()

train_accuracy = gridsearch.score(X_train, Y_train)
test_accuracy = gridsearch.score(X_test, Y_test)
print("Accuracy on training set : ", train_accuracy)
print("Accuracy on test set : ", test_accuracy)

Grid search...
{'max_depth': [2, 4, 6], 'min_child_weight': [1, 2, 3], 'n_estimators': [2, 4, 6, 8]}
...Done.
Best hyperparameters :  {'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 8}
Best validation accuracy :  0.6295087802562885

Accuracy on training set :  0.8322017458777885
Accuracy on test set :  0.7558139534883721


XGBoost Option 2

In [39]:
# Larger grid search over an XGBoost classifier, selected on ROC AUC.
print("Grid search...")
xgboost = XGBClassifier(
    objective='binary:logistic',
    n_jobs=4,         # scikit-learn API name for the legacy `nthread` alias
    random_state=42   # scikit-learn API name for the legacy `seed` alias
)

# Grid of values to be tested
params = {
    'max_depth': range(2, 10, 1), # exactly the same role as in scikit-learn
    'min_child_weight': [1, 2, 3], # effect is more or less similar to min_samples_leaf and min_samples_split
    'n_estimators': range(60, 220, 40), # exactly the same role as in scikit-learn
    'learning_rate': [0.1, 0.01, 0.05]
}

print(params)

gridsearch = GridSearchCV(xgboost,
    param_grid = params,
    scoring = "roc_auc",
    n_jobs = -1,
    cv = 10, # cv : the number of folds to be used for CV
    verbose=True
    )

gridsearch.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
# Because scoring="roc_auc", `best_score_` and `gridsearch.score(...)` are ROC AUC
# values, not accuracies; the labels below say so, which means these numbers are
# not directly comparable with the accuracies printed for Option 1.
print("Best validation ROC AUC : ", gridsearch.best_score_)
print()
print("ROC AUC on training set : ", gridsearch.score(X_train, Y_train))
print("ROC AUC on test set : ", gridsearch.score(X_test, Y_test))

Grid search...
{'max_depth': range(2, 10), 'min_child_weight': [1, 2, 3], 'n_estimators': range(60, 220, 40), 'learning_rate': [0.1, 0.01, 0.05]}
Fitting 10 folds for each of 288 candidates, totalling 2880 fits
...Done.
Best hyperparameters :  {'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 180}
Best validation accuracy :  0.67948941351202

Accuracy on training set :  0.9767663763725862
Accuracy on test set :  0.68732241097878
