In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio


In [16]:
# Import dataset
print("Loading dataset...")
dataset = pd.read_csv("../../04_datasets/nfl_dataset.csv", index_col=0)
print("...Done.")
print()

Loading dataset...
...Done.



In [17]:
# Basic stats
print("Number of rows : {}".format(dataset.shape[0]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows : 1289

Display of dataset: 


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away,winner_team,concat,home_coach,away_coach,location,weather_type,temperature,humidity,wind
0,2017,13,Vikings,Falcons,14,9,400951677,-5,0,1,Vikings,2017_13_Falcons_Vikings,Dan Quinn,Mike Zimmer,Home,cloudy,68,70,5
1,2017,9,Falcons,Panthers,17,20,400951749,3,1,0,Panthers,2017_09_Panthers_Falcons,Ron Rivera,Dan Quinn,Home,cloudy,61,86,4
2,2017,9,Bengals,Jaguars,7,23,400951753,16,1,0,Jaguars,2017_09_Jaguars_Bengals,Doug Marrone,Marvin Lewis,Home,cloudy,78,74,12
3,2017,4,Bears,Packers,14,35,400951678,21,1,0,Packers,2017_04_Packers_Bears,Mike McCarthy,John Fox,Home,cloudy,65,62,5
4,2017,9,Colts,Texans,20,14,400951751,-6,0,1,Colts,2017_09_Texans_Colts,Bill O'Brien,Chuck Pagano,Home,cloudy,84,62,10



Basics statistics: 


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away,winner_team,concat,home_coach,away_coach,location,weather_type,temperature,humidity,wind
count,1289.0,1289.0,1289,1289,1289.0,1289.0,1289.0,1289.0,1289.0,1289.0,1289,1289,1289,1289,1289,1289,1289.0,1289.0,1289.0
unique,,,32,32,,,,,,,33,1289,63,63,2,7,,,
top,,,Seahawks,Titans,,,,,,,Chiefs,2017_13_Falcons_Vikings,Sean McDermott,Kyle Shanahan,Home,cloudy,,,
freq,,,41,41,,,,,,,60,1,41,41,1270,768,,,
mean,2019.027153,9.17533,,,22.494182,23.759503,401134100.0,1.265322,0.539178,0.460822,,,,,,,57.233514,60.391001,8.159038
std,1.422984,5.062066,,,10.139945,10.328135,133883.2,14.768326,0.498656,0.498656,,,,,,,21.706911,19.085923,20.159696
min,2017.0,1.0,,,0.0,0.0,400950200.0,-49.0,0.0,0.0,,,,,,,1.0,0.0,0.0
25%,2018.0,5.0,,,16.0,17.0,401030800.0,-7.0,0.0,0.0,,,,,,,45.0,49.0,5.0
50%,2019.0,9.0,,,23.0,24.0,401128000.0,2.0,1.0,0.0,,,,,,,63.0,63.0,6.0
75%,2020.0,14.0,,,30.0,31.0,401220300.0,10.0,1.0,1.0,,,,,,,72.0,71.0,10.0



Percentage of missing values: 


season          0.0
week            0.0
awayteam        0.0
hometeam        0.0
awayscore       0.0
homescore       0.0
idgame          0.0
score_abs       0.0
winner_home     0.0
winner_away     0.0
winner_team     0.0
concat          0.0
home_coach      0.0
away_coach      0.0
location        0.0
weather_type    0.0
temperature     0.0
humidity        0.0
wind            0.0
dtype: float64

In [18]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ["week", "awayteam", "hometeam", "weather_type", "temperature", "humidity", "wind"]
target_variable = "winner_home"

X = dataset.loc[:,features_list]
Y = dataset.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    0
1    1
2    1
3    1
4    0
Name: winner_home, dtype: int64

X :
   week awayteam  hometeam weather_type  temperature  humidity  wind
0    13  Vikings   Falcons       cloudy           68        70     5
1     9  Falcons  Panthers       cloudy           61        86     4
2     9  Bengals   Jaguars       cloudy           78        74    12
3     4    Bears   Packers       cloudy           65        62     5
4     9    Colts    Texans       cloudy           84        62    10


In [19]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['week', 'temperature', 'humidity', 'wind']
Found categorical features  ['awayteam', 'hometeam', 'weather_type']


In [20]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [21]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])

In [22]:
# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(drop='first') # no missing values in categorical data, so we only need the OHE

In [23]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [24]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[0:5])

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_test[0:5])
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

Performing preprocessings on train set...
      week  awayteam hometeam weather_type  temperature  humidity  wind
261      6  Steelers  Bengals       cloudy           51        93     6
771     11   Cowboys  Vikings       indoor           39        73    14
1016     1     Bears    Lions       indoor           68        70     5
2        9   Bengals  Jaguars       cloudy           78        74    12
422      2    Giants  Cowboys       cloudy           68        70     5
...Done.
  (0, 0)	-0.6536769903073458
  (0, 1)	-0.30582766750928675
  (0, 2)	1.732481256938419
  (0, 3)	-0.10136310666713878
  (0, 30)	1.0
  (0, 36)	1.0
  (1, 0)	0.34703014692487755
  (1, 1)	-0.8668918493941208
  (1, 2)	0.6927273770632043
  (1, 3)	0.2335435681085358
  (1, 14)	1.0
  (1, 64)	1.0
  (1, 68)	1.0
  (2, 0)	-1.6543841275395692
  (2, 1)	0.48901325682756147
  (2, 2)	0.5367642950819221
  (2, 3)	-0.14322644101409812
  (2, 4)	1.0
  (2, 52)	1.0
  (2, 68)	1.0
  (3, 0)	-0.053252707968011806
  (3, 1)	0.9565667417315898
 

In [25]:
# Train model
print("Train model...")
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


In [26]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = classifier.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = classifier.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

Predictions on training set...
...Done.
[0 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 1 0 0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0
 1 1 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1
 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1 1
 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 0
 1 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1
 0 1 1 0 1 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1
 1 0 0 0 0 1 0 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0
 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1
 1 0 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0
 0 1 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 0 1 0 1 0 0 1 1 0
 1 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 0

In [27]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = classifier.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = classifier.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

Predictions on test set...
...Done.
[1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 0 0 0 1 0 1 0
 1 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 1
 0 0 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0
 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 1 0 1 1 0
 0 0 1 1 1 0 0 1 1 1 1 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1
 1 0 1 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0
 1 1 1 1 0 1 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 0 0 1 1 0 1 1
 0 1 0 0 0 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 1 0 0 0 1
 0 1 0 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 0
 0 1 1 0 0 1 0 1 0 1 1 0 0 1 0 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 1 0 1 0 1 0 0
 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 0 1]

Probabilities on test set...
...Done.
[[0.35263318 0.64736682]
 [0.8050944  0.1949056 ]
 [0.3501639  0.6498361 ]
 [0.49794188 0.50205812]
 [0.28064972 0.71935028]
 [0.22409455 0

In [28]:
# Print scores
print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

accuracy on training set :  0.6829268292682927
accuracy on test set :  0.6124031007751938

f1-score on training set :  0.7217898832684825
f1-score on test set :  0.6478873239436621

