In [90]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
# import ensemble methods
from xgboost import XGBClassifier
# import base estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay

import warnings
# Silence DeprecationWarning messages for the rest of the notebook session
warnings.simplefilter("ignore", DeprecationWarning)


In [137]:
# Load the dataset from CSV; the first CSV column is used as the row index
print("Loading dataset...")
dataset = pd.read_csv(
    "../../04_datasets/nfl_dataset_vf.csv",
    index_col=0,
)
print("...Done.", end="\n\n")  # message followed by one empty line

Loading dataset...
...Done.



In [138]:
# Quick overview of the data: row count, first rows, summary statistics
# and the share of missing values per column.
n_rows = dataset.shape[0]
print(f"Number of rows : {n_rows}")
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
# include='all' so that non-numeric columns are summarised too
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
missing_counts = dataset.isnull().sum()
display(100 * missing_counts / n_rows)

Number of rows : 1289

Display of dataset: 


Unnamed: 0,season,week,awayteam,hometeam,idgame,winner_home,home_coach,away_coach,weather_type,temperature,...,punt_return_yds_MA_5_home,kicking_pts_MA_5_home,delta_day_away,delta_day_home,stade,location,people,month,dayofmonth,dayofweek
0,2017,13,Vikings,Falcons,400951677,0,Dan Quinn,Mike Zimmer,,,...,10.4,9.4,10.0,7.0,Mercedes-Benz Stadium,"Atlanta, GA",95.0,12,3,6
1,2017,9,Falcons,Panthers,400951749,1,Ron Rivera,Dan Quinn,cloudy,61.0,...,22.4,7.4,7.0,7.0,Bank of America Stadium,"Charlotte, NC",100.0,11,5,6
2,2017,9,Bengals,Jaguars,400951753,1,Doug Marrone,Marvin Lewis,,78.0,...,0.4,8.4,7.0,14.0,TIAA Bank Field,"Jacksonville, FL",89.0,11,5,6
3,2017,4,Bears,Packers,400951678,1,Mike McCarthy,John Fox,cloudy,65.0,...,,,4.0,4.0,Lambeau Field,"Green Bay, WI",97.0,9,29,4
4,2017,9,Colts,Texans,400951751,0,Bill O'Brien,Chuck Pagano,cloudy,84.0,...,31.4,9.0,7.0,6.0,NRG Stadium,"Houston, TX",100.0,11,5,6



Basics statistics: 


Unnamed: 0,season,week,awayteam,hometeam,idgame,winner_home,home_coach,away_coach,weather_type,temperature,...,punt_return_yds_MA_5_home,kicking_pts_MA_5_home,delta_day_away,delta_day_home,stade,location,people,month,dayofmonth,dayofweek
count,1289.0,1289.0,1289,1289,1289.0,1289.0,1289,1289,1043,1179.0,...,1175.0,1175.0,1245.0,1239.0,1289,1289,1140.0,1289.0,1289.0,1289.0
unique,,,32,32,,,63,63,7,,...,,,,,37,35,,,,
top,,,Seahawks,Titans,,,Sean McDermott,Kyle Shanahan,cloudy,,...,,,,,MetLife Stadium,"East Rutherford, NJ",,,,
freq,,,41,41,,,41,41,522,,...,,,,,81,81,,,,
mean,2019.027153,9.17533,,,401134100.0,0.539178,,,,56.229008,...,14.44817,6.968,19.679518,19.59322,,,88.013158,10.254461,16.082234,5.098526
std,1.422984,5.062066,,,133883.2,0.498656,,,,22.435574,...,8.710801,1.811904,53.993547,54.142582,,,24.11428,2.123613,8.765195,1.920486
min,2017.0,1.0,,,400950200.0,0.0,,,,1.0,...,-1.8,1.6,3.0,3.0,,,1.0,1.0,1.0,0.0
25%,2018.0,5.0,,,401030800.0,0.0,,,,44.0,...,8.2,5.8,6.0,6.0,,,91.0,10.0,9.0,6.0
50%,2019.0,9.0,,,401128000.0,1.0,,,,61.0,...,12.6,7.0,7.0,7.0,,,97.0,11.0,16.0,6.0
75%,2020.0,14.0,,,401220300.0,1.0,,,,73.0,...,18.8,8.2,7.0,7.0,,,100.0,12.0,24.0,6.0



Percentage of missing values: 


season                         0.000000
week                           0.000000
awayteam                       0.000000
hometeam                       0.000000
idgame                         0.000000
winner_home                    0.000000
home_coach                     0.000000
away_coach                     0.000000
weather_type                  19.084562
temperature                    8.533747
humidity                       7.447634
wind                          13.343677
streak_away                    2.249806
pass_yds_MA_5_away             8.456168
pass_td_MA_5_away              8.456168
rush_yds_MA_5_away             8.456168
rush_td_MA_5_away              8.456168
rec_yds_MA_5_away              8.456168
rec_td_MA_5_away               8.456168
fumbles_MA_5_away              8.456168
fumbles_rec_MA_5_away          8.456168
defense_sacks_MA_5_away        8.456168
defense_td_MA_5_away           8.456168
interceptions_MA_5_away        8.456168
interceptions_td_MA_5_away     8.456168


In [139]:
# Split the dataset into the label column (Y) and the feature columns (X)
print("Separating labels from features...")
target_variable = "winner_home"

# Columns removed from the features in addition to the target itself
features_drop_list = {'idgame', 'location'}

# Single, non-mutating drop: removes the target and the excluded columns in one go
X = dataset.drop(columns=[target_variable, *sorted(features_drop_list)])
Y = dataset[target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    0
1    1
2    1
3    1
4    0
Name: winner_home, dtype: int64

X :
   season  week awayteam  hometeam     home_coach    away_coach weather_type  \
0    2017    13  Vikings   Falcons      Dan Quinn   Mike Zimmer          NaN   
1    2017     9  Falcons  Panthers     Ron Rivera     Dan Quinn       cloudy   
2    2017     9  Bengals   Jaguars   Doug Marrone  Marvin Lewis          NaN   
3    2017     4    Bears   Packers  Mike McCarthy      John Fox       cloudy   
4    2017     9    Colts    Texans   Bill O'Brien  Chuck Pagano       cloudy   

   temperature  humidity  wind  ...  kicks_return_td_MA_5_home  \
0          NaN       NaN   NaN  ...                        0.0   
1         61.0      86.0   4.0  ...                        0.0   
2         78.0      74.0  12.0  ...                        0.0   
3         65.0      62.0   5.0  ...                        NaN   
4         84.0      62.0  10.0  ...                        0.0   



In [140]:
# Automatically detect names of numeric/categorical columns.
# A column is treated as numeric when its dtype name contains 'float' or 'int';
# every other column is treated as categorical.
numeric_features = []
categorical_features = []
# Series.iteritems() was removed in pandas 2.0; .items() yields the same
# (column name, dtype) pairs and exists in older pandas versions as well.
for column_name, column_dtype in X.dtypes.items():
    dtype_name = str(column_dtype)
    if ('float' in dtype_name) or ('int' in dtype_name):
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['season', 'week', 'temperature', 'humidity', 'wind', 'streak_away', 'pass_yds_MA_5_away', 'pass_td_MA_5_away', 'rush_yds_MA_5_away', 'rush_td_MA_5_away', 'rec_yds_MA_5_away', 'rec_td_MA_5_away', 'fumbles_MA_5_away', 'fumbles_rec_MA_5_away', 'defense_sacks_MA_5_away', 'defense_td_MA_5_away', 'interceptions_MA_5_away', 'interceptions_td_MA_5_away', 'kicks_return_yds_MA_5_away', 'kicks_return_td_MA_5_away', 'punt_return_yds_MA_5_away', 'kicking_pts_MA_5_away', 'streak_home', 'pass_yds_MA_5_home', 'pass_td_MA_5_home', 'rush_yds_MA_5_home', 'rush_td_MA_5_home', 'rec_yds_MA_5_home', 'rec_td_MA_5_home', 'fumbles_MA_5_home', 'fumbles_rec_MA_5_home', 'defense_sacks_MA_5_home', 'defense_td_MA_5_home', 'interceptions_MA_5_home', 'interceptions_td_MA_5_home', 'kicks_return_yds_MA_5_home', 'kicks_return_td_MA_5_home', 'punt_return_yds_MA_5_home', 'kicking_pts_MA_5_home', 'delta_day_away', 'delta_day_home', 'people', 'month', 'dayofmonth', 'dayofweek']
Found categorical feat

In [141]:
# Hold out 20% of the rows as a test set.
# stratify=Y keeps the class proportions of the target identical in both splits;
# random_state fixes the shuffle so the split is reproducible.
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)
print("...Done.", end="\n\n")  # message followed by one empty line

Dividing into train and test sets...
...Done.



In [142]:
# Preprocessing for numeric columns: fill gaps, then standardise
numeric_transformer = Pipeline(
    steps=[
        # replace missing values with the column mean
        ("imputer", SimpleImputer(strategy="mean")),
        # standardise each column
        ("scaler", StandardScaler()),
    ]
)

In [143]:
# Preprocessing for categorical columns: fill gaps, then one-hot encode
categorical_transformer = Pipeline(
    steps=[
        # replace missing values with the most frequent category of the column
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # one-hot encode; the first category of each column is dropped to avoid
        # redundant (perfectly correlated) indicator columns.
        # NOTE(review): with handle_unknown='ignore' a category unseen during fit
        # is encoded as all zeros, which looks the same as the dropped first
        # category - confirm this is acceptable.
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

In [144]:
# Combine both preprocessing pipelines: each one is applied to its own list of columns
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [145]:
# NOTE: this cell rebinds X_train / X_test / Y_train / Y_test to their transformed
# versions, so it cannot be re-run on its own (the .head() calls below need the
# original pandas objects) - re-run the train/test split cell first.

# --- Train set: fit the preprocessor and transform the features ---
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done preprocessing train set.')
print(X_train)
print('...Done.')
# Positional slicing: after the transform X_train is no longer a pandas DataFrame
# (presumably a numpy array or a scipy sparse matrix, depending on the encoder output)
print(X_train[:5])
print()

# --- Train labels ---
print("Encoding labels...")
print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[:5])

# --- Test set: transform only ---
print("Performing preprocessings on test set...")
print(X_test.head())
# Only transform() here, never fit: every preprocessing parameter must come from
# the training set. Fitting on the test set would leak test information into the
# preprocessing and bias all the results.
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[:5, :])  # positional slicing, X_test is no longer a pandas DataFrame
print()

# --- Test labels: reuse the encoder fitted on the training labels ---
print("Encoding labels...")
print(Y_test[:5])
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[:5])

Performing preprocessings on train set...
      season  week    awayteam  hometeam      home_coach      away_coach  \
642     2019    13    Patriots    Texans    Bill O'Brien  Bill Belichick   
1008    2020     3  Buccaneers   Broncos      Vic Fangio    Bruce Arians   
280     2018    10     Falcons    Browns  Gregg Williams       Dan Quinn   
1200    2021    13      Giants  Dolphins    Brian Flores       Joe Judge   
1183    2021     6       Bills    Titans     Mike Vrabel  Sean McDermott   

     weather_type  temperature  humidity  wind  ...  \
642           NaN          6.0      32.0   4.0  ...   
1008       cloudy         55.0      39.0   7.0  ...   
280         sunny         36.0      56.0   9.0  ...   
1200        sunny         82.0      54.0   9.0  ...   
1183          NaN         65.0      43.0   0.0  ...   

      kicks_return_td_MA_5_home  punt_return_yds_MA_5_home  \
642                         0.0                       11.6   
1008                        0.0               



XGBoost

In [110]:
# Hyperparameter search for an XGBoost classifier, using 10-fold cross-validation
# and ROC AUC as the selection metric.
# NOTE(review): X_train was already imputed/scaled/encoded by a preprocessor fitted
# on the whole training set, so the validation part of every fold has influenced
# the preprocessing statistics. Putting the preprocessor and the model in a single
# Pipeline passed to GridSearchCV would avoid this.
print("Grid search...")
model = XGBClassifier(
    objective= 'binary:logistic',
    nthread=-1,
    seed=42
)

# Grid of values to be tested
params = {
    'max_depth': range(2, 10, 1), # exactly the same role as in scikit-learn
    'n_estimators': range(60, 220, 40), # exactly the same role as in scikit-learn
    'learning_rate': [0.1, 0.01, 0.05],
    'min_child_weight': [1, 2, 3]
}

print(params)

gridsearch = GridSearchCV(model,
    param_grid = params, 
    scoring = "roc_auc", 
    n_jobs = -1, 
    cv = 10, # cv : the number of folds to be used for CV
    verbose=True
    ) 

gridsearch.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
# With scoring="roc_auc", best_score_ and gridsearch.score() report ROC AUC,
# not accuracy, so the labels below name the metric that is actually printed.
print("Best validation ROC AUC : ", gridsearch.best_score_)
print()
print("ROC AUC on training set : ", gridsearch.score(X_train, Y_train))
print("ROC AUC on test set : ", gridsearch.score(X_test, Y_test))

Grid search...
{'max_depth': range(2, 10), 'n_estimators': range(60, 220, 40), 'learning_rate': [0.1, 0.01, 0.05], 'min_child_weight': [1, 2, 3]}
Fitting 10 folds for each of 288 candidates, totalling 2880 fits
...Done.
Best hyperparameters :  {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 3, 'n_estimators': 140}
Best validation accuracy :  0.6154548476789168

Accuracy on training set :  0.8750056796667929
Accuracy on test set :  0.6648328396106644


In [146]:

# Train the final model with the best hyperparameters found by the grid search
# and save it to disk.
params = gridsearch.best_params_

# The tuned values are unpacked from the search result instead of being copied by
# hand, so this cell stays in sync if the grid or the data changes.
model = XGBClassifier(
    objective= 'binary:logistic',
    nthread=-1,
    seed=42,
    **params
    )

model.fit(X_train, Y_train)
model.save_model('model.bst')