In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio


In [63]:
# Import dataset
print("Loading dataset...")
dataset = pd.read_csv("../../04_datasets/nfl_dataset_v2.csv", index_col=0)
print("...Done.")
print()

Loading dataset...
...Done.



In [64]:
# Basic stats
print("Number of rows : {}".format(dataset.shape[0]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows : 5057

Display of dataset: 


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away,...,line,over_under,attendance_info,month,dayofmonth,dayofweek,hour,minute,win_streak,lose_streak
0,2017,9,Bengals,Jaguars,7,23,400951753,16,1,0,...,,,1,11,5,6,18,0,0.0,1.0
1,2017,9,Bengals,Jaguars,7,23,400951753,16,1,0,...,,,1,11,5,6,18,0,2.0,0.0
2,2017,9,Bengals,Jaguars,7,23,400951753,16,1,0,...,,,1,11,5,6,18,0,0.0,1.0
3,2017,9,Bengals,Jaguars,7,23,400951753,16,1,0,...,,,1,11,5,6,18,0,2.0,0.0
4,2017,4,Bills,Falcons,23,17,400951685,-6,0,1,...,,,1,10,1,6,17,0,2.0,0.0



Basics statistics: 


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away,...,line,over_under,attendance_info,month,dayofmonth,dayofweek,hour,minute,win_streak,lose_streak
count,5057.0,5057.0,5057,5057,5057.0,5057.0,5057.0,5057.0,5057.0,5057.0,...,2096,1880,5057.0,5057.0,5057.0,5057.0,5057.0,5057.0,5030.0,5030.0
unique,,,32,32,,,,,,,...,341,42,,,,,,,,
top,,,Bears,Dolphins,,,,,,,...,Line: MIN -3.0,Over/Under: 44.5,,,,,,,,
freq,,,164,164,,,,,,,...,20,100,,,,,,,,
mean,2019.048843,9.183706,,,22.503065,23.787423,401136200.0,1.284358,0.539055,0.460945,...,,,0.882144,10.255092,16.015029,5.101839,15.06407,9.050821,1.158648,1.265805
std,1.420697,5.071645,,,10.173439,10.331228,133667.7,14.797239,0.498522,0.498522,...,,,0.32247,2.131224,8.749897,1.918151,7.263699,11.231472,1.753524,2.145605
min,2017.0,1.0,,,0.0,0.0,400950200.0,-49.0,0.0,0.0,...,,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2018.0,5.0,,,16.0,17.0,401030800.0,-7.0,0.0,0.0,...,,,1.0,10.0,9.0,6.0,17.0,0.0,0.0,0.0
50%,2019.0,9.0,,,23.0,24.0,401128000.0,2.0,1.0,0.0,...,,,1.0,11.0,16.0,6.0,18.0,0.0,0.5,0.5
75%,2020.0,14.0,,,30.0,31.0,401220300.0,10.0,1.0,1.0,...,,,1.0,12.0,24.0,6.0,20.0,20.0,2.0,2.0



Percentage of missing values: 


season         0.000000
week           0.000000
awayteam       0.000000
hometeam       0.000000
awayscore      0.000000
                 ...   
dayofweek      0.000000
hour           0.000000
minute         0.000000
win_streak     0.533913
lose_streak    0.533913
Length: 140, dtype: float64

In [34]:
dataset.location_y.unique()

array(['Jacksonville, FL', 'Atlanta, GA', 'London', 'Green Bay, WI',
       'Baltimore, MD', 'Chicago, IL', 'Cincinnati, OH', 'Nashville, TN',
       'Kansas City, MO', 'Foxboro, MA', 'New Orleans, LA',
       'East Rutherford, NJ', 'Landover, MD', 'Charlotte, NC',
       'Santa Clara, CA', 'Cleveland, OH', 'Arlington, TX',
       'Minneapolis, MN', 'Tampa, FL', 'Orchard Park, NY', 'Denver, CO',
       'Carson, CA', 'Glendale, AZ', 'Houston, TX', 'Seattle, WA',
       'Philadelphia, PA', 'Indianapolis, IN', 'Detroit, MI',
       'Miami Gardens, FL', 'Pittsburgh, PA', 'Oakland, CA',
       'Los Angeles, CA', 'Mexico City', 'Las Vegas, NV', 'Inglewood, CA'],
      dtype=object)

In [65]:
# Separate target variable Y from features X
print("Separating labels from features...")
column_list = dataset.columns.to_list()
features_drop_list = {'delta_match','season','winner_away','concat','idgame','awayscore','homescore','score_abs','winner_home','location_x','game_date','date','stade','location_y','attendance','capacity','people','line','over_under','attendance_info','hour','minute' }
features_list = [ele for ele in column_list if ele not in features_drop_list]
target_variable = "winner_home"

X = dataset.loc[:,features_list]
Y = dataset.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    1
1    1
2    1
3    1
4    0
Name: winner_home, dtype: int64

X :
   week awayteam hometeam winner_team    home_coach      away_coach  \
0     9  Bengals  Jaguars     Jaguars  Doug Marrone    Marvin Lewis   
1     9  Bengals  Jaguars     Jaguars  Doug Marrone    Marvin Lewis   
2     9  Bengals  Jaguars     Jaguars  Doug Marrone    Marvin Lewis   
3     9  Bengals  Jaguars     Jaguars  Doug Marrone    Marvin Lewis   
4     4    Bills  Falcons       Bills     Dan Quinn  Sean McDermott   

  weather_type  temperature  humidity  wind  ... punting_yds_MA_5  \
0       cloudy           78        74    12  ...            224.2   
1       cloudy           78        74    12  ...            224.2   
2       cloudy           78        74    12  ...            216.4   
3       cloudy           78        74    12  ...            216.4   
4       cloudy           68        70     5  ...              NaN   

   punting_avg_MA_5  punting_tb_MA_5

In [56]:
features_list

['week',
 'awayteam',
 'hometeam',
 'winner_team',
 'home_coach',
 'away_coach',
 'weather_type',
 'temperature',
 'humidity',
 'wind',
 'team',
 'pass_completion',
 'pass_yds',
 'pass_avg',
 'pass_td',
 'pass_int',
 'sacks',
 'qbr',
 'rtg',
 'rush_car',
 'rush_yds',
 'rush_avg',
 'rush_td',
 'rush_long',
 'receptions',
 'rec_yds',
 'rec_avg',
 'rec_td',
 'rec_long',
 'rec_tgs',
 'fumbles',
 'fumbles_lost',
 'fumbles_rec',
 'defense_tot',
 'defense_solo',
 'defense_sacks',
 'defense_tfl',
 'defense_pd',
 'defense_qb_hits',
 'defense_td',
 'interceptions',
 'interceptions_yds',
 'interceptions_td',
 'kicks_return_no',
 'kicks_return_yds',
 'kicks_return_avg',
 'kicks_return_long',
 'kicks_return_td',
 'punt_return_no',
 'punt_return_yds',
 'punt_return_avg',
 'punt_return_long',
 'punt_return_td',
 'kicking_pct',
 'kicking_long',
 'kicking_pts',
 'punting_no',
 'punting_yds',
 'punting_avg',
 'punting_tb',
 'punting_in_20',
 'punting_long',
 'pass_completion_MA_5',
 'pass_yds_MA_5',
 'p

In [66]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['week', 'temperature', 'humidity', 'wind', 'pass_completion', 'pass_yds', 'pass_avg', 'pass_td', 'pass_int', 'sacks', 'qbr', 'rtg', 'rush_car', 'rush_yds', 'rush_avg', 'rush_td', 'rush_long', 'receptions', 'rec_yds', 'rec_avg', 'rec_td', 'rec_long', 'rec_tgs', 'fumbles', 'fumbles_lost', 'fumbles_rec', 'defense_tot', 'defense_solo', 'defense_sacks', 'defense_tfl', 'defense_pd', 'defense_qb_hits', 'defense_td', 'interceptions', 'interceptions_yds', 'interceptions_td', 'kicks_return_no', 'kicks_return_yds', 'kicks_return_avg', 'kicks_return_long', 'kicks_return_td', 'punt_return_no', 'punt_return_yds', 'punt_return_avg', 'punt_return_long', 'punt_return_td', 'kicking_pct', 'kicking_long', 'kicking_pts', 'punting_no', 'punting_yds', 'punting_avg', 'punting_tb', 'punting_in_20', 'punting_long', 'pass_completion_MA_5', 'pass_yds_MA_5', 'pass_avg_MA_5', 'pass_td_MA_5', 'pass_int_MA_5', 'sacks_MA_5', 'qbr_MA_5', 'rtg_MA_5', 'rush_car_MA_5', 'rush_yds_MA_5', 'rush_avg_M

In [67]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [68]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])

In [69]:
# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(drop='first') # no missing values in categorical data, so we only need the OHE

In [70]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [71]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[0:5])

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_test[0:5])
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

Performing preprocessings on train set...
      week awayteam  hometeam winner_team      home_coach      away_coach  \
452     15     Rams  Seahawks        Rams    Pete Carroll      Sean McVay   
838     12    Bills    Chiefs       Bills       Andy Reid  Sean McDermott   
2754     4  Bengals  Steelers    Steelers     Mike Tomlin      Zac Taylor   
3774     6  Broncos  Patriots     Broncos  Bill Belichick      Vic Fangio   
1479    13    Bears    Giants      Giants     Pat Shurmur       Matt Nagy   

     weather_type  temperature  humidity  wind  ... punting_yds_MA_5  \
452        cloudy           47        88    15  ...            217.2   
838        cloudy           62        36     9  ...            196.4   
2754       cloudy           76        72     1  ...            228.6   
3774        sunny           58        45     7  ...            188.4   
1479       cloudy           52        92     3  ...            184.0   

      punting_avg_MA_5  punting_tb_MA_5  punting_in_20_MA_5  \

Unnamed: 0,week,awayteam,hometeam,winner_team,home_coach,away_coach,weather_type,temperature,humidity,wind,...,punting_yds_MA_5,punting_avg_MA_5,punting_tb_MA_5,punting_in_20_MA_5,punting_long_MA_5,month,dayofmonth,dayofweek,win_streak,lose_streak
0,9,Bengals,Jaguars,Jaguars,Doug Marrone,Marvin Lewis,cloudy,78,74,12,...,224.2,48.64,0.4,1.6,56.0,11,5,6,0.0,1.0
1,9,Bengals,Jaguars,Jaguars,Doug Marrone,Marvin Lewis,cloudy,78,74,12,...,224.2,48.64,0.4,1.6,56.0,11,5,6,2.0,0.0
2,9,Bengals,Jaguars,Jaguars,Doug Marrone,Marvin Lewis,cloudy,78,74,12,...,216.4,45.14,0.0,2.4,55.4,11,5,6,0.0,1.0
3,9,Bengals,Jaguars,Jaguars,Doug Marrone,Marvin Lewis,cloudy,78,74,12,...,216.4,45.14,0.0,2.4,55.4,11,5,6,2.0,0.0
4,4,Bills,Falcons,Bills,Dan Quinn,Sean McDermott,cloudy,68,70,5,...,,,,,,10,1,6,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5052,15,Cardinals,Lions,Lions,Dan Campbell,Kliff Kingsbury,indoor,68,70,5,...,198.4,50.34,0.6,0.8,61.0,12,19,6,1.0,0.0
5053,18,Chargers,Raiders,Raiders,Rich Bisaccia,Brandon Staley,cloudy,59,20,10,...,80.6,29.52,0.2,0.4,35.2,1,10,0,0.0,1.0
5054,18,Chargers,Raiders,Raiders,Rich Bisaccia,Brandon Staley,cloudy,59,20,10,...,80.6,29.52,0.2,0.4,35.2,1,10,0,4.0,0.0
5055,18,Chargers,Raiders,Raiders,Rich Bisaccia,Brandon Staley,cloudy,59,20,10,...,143.6,47.86,0.4,1.2,56.6,1,10,0,0.0,1.0


In [72]:
# Train model
print("Train model...")
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [73]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = classifier.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = classifier.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

Predictions on training set...
...Done.
[0 1 1 ... 0 1 0]

Probabilities on training set...
...Done.
[[0.75001572 0.24998428]
 [0.32250695 0.67749305]
 [0.33402584 0.66597416]
 ...
 [0.79885386 0.20114614]
 [0.48726299 0.51273701]
 [0.87334119 0.12665881]]



In [74]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = classifier.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = classifier.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

Predictions on test set...
...Done.
[0 1 0 ... 1 0 1]

Probabilities on test set...
...Done.
[[0.53487925 0.46512075]
 [0.11770391 0.88229609]
 [0.83651088 0.16348912]
 ...
 [0.02171296 0.97828704]
 [0.68476591 0.31523409]
 [0.28114338 0.71885662]]



In [75]:
# Print scores
print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

accuracy on training set :  0.74427804464538
accuracy on test set :  0.6798418972332015

f1-score on training set :  0.768364474020988
f1-score on test set :  0.7093301435406698

