In [15]:
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
# import ensemble methods
from xgboost import XGBClassifier
# import base estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay

warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings


In [16]:
# Show the kernel's working directory: the relative "../../04_datasets/..." paths used by the loading cells resolve against it.
os.getcwd()

'c:\\Users\\sylva\\Desktop\\nflpredictor\\nfl\\03_modelisation\\03_model3_XGBoost'

In [17]:
# Load the modelling dataset once; the first CSV column is used as the index.
# `df` is kept as an independent copy so both names defined by this cell stay
# available, without parsing the same file a second time.
dataset = pd.read_csv("../../04_datasets/nfl_dataset_vf.csv", index_col=0)
df = dataset.copy()


In [18]:
# Split the dataset into the label column (Y) and the feature columns (X)
print("Separating labels from features...")
target_variable = "winner_home"

# Columns listed here are removed from the features in addition to the target
features_drop_list = {'idgame', 'location'}

Y = dataset[target_variable]
X = dataset.drop(columns=target_variable).drop(columns=features_drop_list)

# Automatically detect names of numeric/categorical columns.
# A column is numeric when its dtype name contains 'float' or 'int'; every
# other dtype is treated as categorical.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
numeric_features = []
categorical_features = []
for column_name, column_dtype in X.dtypes.items():
    dtype_name = str(column_dtype)
    if ('float' in dtype_name) or ('int' in dtype_name):
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

print("Dividing into train and test sets...")
# Hold out 20% of the rows for testing with a fixed seed; stratify=Y is passed
# so the split is stratified on the label (recommended for classification).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)

# Numeric columns: replace missing values with the column mean, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the most frequent value,
# then one-hot encode. drop='first' removes the first level of each feature to
# avoid perfectly correlated dummy columns.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessings on train set: fit the preprocessor on the training features and transform them.
# NOTE(review): X_train / X_test / Y_train / Y_test are rebound to their transformed versions below,
# so this block cannot be re-run on its own (the .head() calls need the original pandas objects).
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done preprocessing train set.')
print(X_train)
print('...Done.')
print(X_train[0:5]) # positional slicing: X_train is no longer a pandas DataFrame (the printed output shows a SciPy sparse matrix), so .head() is unavailable
print()
# Label encoding: fit the encoder on the training labels only
print("Encoding labels...")
print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[0:5])

# Preprocessings on test set: reuse the preprocessor fitted on the training set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again! The test set is used for validating decisions
# made on the training set, so only transformations whose parameters were learned from the training set may be applied.
# Fitting on the test set would leak test information and bias every result.
print('...Done.')
print(X_test[0:5,:]) # positional slicing for the same reason as X_train above: X_test is no longer a pandas DataFrame
print()
# Label encoding: apply the encoder fitted on the training labels
print("Encoding labels...")
print(Y_test[0:5])
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

# Gradient-boosted tree classifier for the binary target.
# `n_jobs` / `random_state` are the scikit-learn-style parameter names documented
# for XGBClassifier; they replace the legacy `nthread` / `seed` spellings and
# carry the same values (-1 and 42).
model = XGBClassifier(
    objective='binary:logistic',
    n_jobs=-1,
    random_state=42,
    learning_rate=0.1,
    max_depth=2,
    min_child_weight=3,
    n_estimators=140,
)

# Fit on the preprocessed training features and the encoded training labels
model.fit(X_train, Y_train)


Separating labels from features...
Found numeric features  ['season', 'week', 'temperature', 'humidity', 'wind', 'streak_away', 'pass_yds_MA_5_away', 'pass_td_MA_5_away', 'rush_yds_MA_5_away', 'rush_td_MA_5_away', 'rec_yds_MA_5_away', 'rec_td_MA_5_away', 'fumbles_MA_5_away', 'fumbles_rec_MA_5_away', 'defense_sacks_MA_5_away', 'defense_td_MA_5_away', 'interceptions_MA_5_away', 'interceptions_td_MA_5_away', 'kicks_return_yds_MA_5_away', 'kicks_return_td_MA_5_away', 'punt_return_yds_MA_5_away', 'kicking_pts_MA_5_away', 'streak_home', 'pass_yds_MA_5_home', 'pass_td_MA_5_home', 'rush_yds_MA_5_home', 'rush_td_MA_5_home', 'rec_yds_MA_5_home', 'rec_td_MA_5_home', 'fumbles_MA_5_home', 'fumbles_rec_MA_5_home', 'defense_sacks_MA_5_home', 'defense_td_MA_5_home', 'interceptions_MA_5_home', 'interceptions_td_MA_5_home', 'kicks_return_yds_MA_5_home', 'kicks_return_td_MA_5_home', 'punt_return_yds_MA_5_home', 'kicking_pts_MA_5_home', 'delta_day_away', 'delta_day_home', 'people', 'month', 'dayofmonth', 



...Done.
  (0, 0)	-0.03834271189808886
  (0, 1)	-0.040338299176398446
  (0, 2)	-0.14802548401889187
  (0, 3)	-0.2942177847786325
  (0, 4)	-0.1280495365176966
  (0, 5)	0.36213626027378715
  (0, 6)	0.9857671482193667
  (0, 7)	0.7260803077114376
  (0, 8)	1.1384131354925013
  (0, 9)	0.6910193124378573
  (0, 10)	0.9926243600872926
  (0, 11)	0.7301760159383414
  (0, 12)	0.7330262317943376
  (0, 13)	0.8598579840932894
  (0, 14)	-1.2214824725508155
  (0, 15)	0.47165885524093315
  (0, 16)	-0.9483283743945345
  (0, 17)	1.0646908737555119
  (0, 18)	-1.001976875381956
  (0, 19)	-0.2775290677669772
  (0, 20)	0.3017315167618678
  (0, 21)	0.5606452304572039
  (0, 22)	0.669352969789346
  (0, 23)	0.1818883204821579
  (0, 24)	0.02988162338811878
  :	:
  (4, 25)	-0.8437097267277661
  (4, 26)	-0.6823471570166089
  (4, 27)	0.13581299130911145
  (4, 28)	0.029261772840733696
  (4, 29)	-0.4278811309999175
  (4, 30)	-0.13462031333341043
  (4, 31)	1.9403545909241002
  (4, 32)	-0.8121077937860411
  (4, 33)	-0.97

Preprocessing with awayteam and hometeam

XGBoost: predictions for the 2022 schedule

In [14]:
# Load the per-team table (first CSV column is the index) and display it.
# NOTE(review): df_ma is not referenced by any later cell visible in this notebook;
# confirm whether it was meant to be joined onto the games to predict.
df_ma = pd.read_csv(
    "../../04_datasets/nfl_team_topredict.csv",
    index_col=0,
)
df_ma

Unnamed: 0.1,index,Unnamed: 0,idgame,team,pass_completion,pass_yds,pass_avg,pass_td,pass_int,sacks,...,punt_return_td_MA_5,kicking_pct_MA_5,kicking_long_MA_5,kicking_pts_MA_5,punting_no_MA_5,punting_yds_MA_5,punting_avg_MA_5,punting_tb_MA_5,punting_in_20_MA_5,punting_long_MA_5
80,2022-01-09 21:25:00+00:00,2538,401326599,49ers,1.71875,340.0,33.9,2.0,2.0,0.115385,...,0.0,93.4,38.0,8.0,4.2,178.2,50.58,0.2,1.0,58.4
161,2022-01-09 18:00:00+00:00,2527,401326593,Bears,0.6875,325.0,6.8,1.0,2.0,0.118644,...,0.2,90.0,40.6,7.6,3.2,150.2,46.68,0.6,0.6,57.8
241,2022-01-09 18:00:00+00:00,2517,401326588,Bengals,0.517241,136.0,4.7,1.0,0.0,0.121212,...,0.0,73.4,35.2,6.8,3.4,159.4,27.26,0.2,0.8,33.8
322,2022-01-09 21:25:00+00:00,2515,401326587,Bills,1.533333,254.0,20.3,2.0,0.0,0.0,...,0.0,80.0,25.2,7.4,3.6,142.2,23.84,0.2,1.4,31.8
402,2022-01-08 21:30:00+00:00,2536,401326598,Broncos,1.5,178.0,22.8,0.0,0.0,0.2,...,0.0,83.4,50.6,6.4,4.0,188.8,46.68,0.2,1.6,57.4
482,2022-01-09 18:00:00+00:00,2518,401326588,Browns,0.708333,176.0,7.3,2.0,1.0,0.2,...,0.0,30.0,16.0,3.4,5.0,207.6,40.24,0.4,1.2,48.4
563,2022-01-09 21:25:00+00:00,2532,401326596,Buccaneers,0.783784,326.0,8.8,3.0,0.0,0.5,...,0.0,73.4,30.2,8.4,4.4,178.4,40.24,0.0,1.4,47.6
643,2022-01-09 21:25:00+00:00,2534,401326597,Cardinals,0.717949,240.0,6.2,1.0,0.0,0.09434,...,0.0,86.6,40.8,9.2,2.2,99.6,45.32,0.0,0.4,48.8
723,2022-01-10 01:20:00+00:00,2540,401326600,Chargers,0.53125,383.0,6.0,3.0,1.0,0.107143,...,0.0,70.0,30.6,8.4,2.0,79.2,31.86,0.0,0.4,36.6
804,2022-01-08 21:30:00+00:00,2537,401326598,Chiefs,0.613636,270.0,6.1,2.0,0.0,0.066667,...,0.0,95.0,43.2,9.4,1.8,76.8,43.2,0.4,0.4,46.8


In [21]:
# Load the 2022 schedule file (first CSV column is the index) and display it
df_games = pd.read_csv(
    "../../04_datasets/nfl2022_total.csv",
    index_col=0,
)
df_games

Unnamed: 0,season,week,date,awayteam,hometeam,stadium,location
0,2022,5,2022-10-06,Colts,Broncos,Empower Field at Mile High,"Denver, CO"
1,2022,5,2022-10-09,Giants,Packers,Tottenham Hotspur Stadium,London
2,2022,5,2022-10-09,Steelers,Bills,Highmark Stadium,"Orchard Park, NY"
3,2022,5,2022-10-09,Chargers,Browns,FirstEnergy Stadium,"Cleveland, OH"
4,2022,5,2022-10-09,Bears,Vikings,U.S. Bank Stadium,"Minneapolis, MN"
...,...,...,...,...,...,...,...
267,2022,18,2023-01-08,Browns,Steelers,Acrisure Stadium,"Pittsburgh, PA"
268,2022,18,2023-01-08,Cardinals,49ers,Levi's Stadium,"Santa Clara, CA"
269,2022,18,2023-01-08,Rams,Seahawks,Lumen Field,"Seattle, WA"
270,2022,18,2023-01-08,Cowboys,Washington,FedExField,"Landover, MD"


In [38]:
df_games.columns

Index(['season', 'week', 'date', 'awayteam', 'hometeam', 'stadium',
       'location'],
      dtype='object')

In [23]:
dataset.columns

Index(['season', 'week', 'awayteam', 'hometeam', 'idgame', 'winner_home',
       'home_coach', 'away_coach', 'weather_type', 'temperature', 'humidity',
       'wind', 'streak_away', 'pass_yds_MA_5_away', 'pass_td_MA_5_away',
       'rush_yds_MA_5_away', 'rush_td_MA_5_away', 'rec_yds_MA_5_away',
       'rec_td_MA_5_away', 'fumbles_MA_5_away', 'fumbles_rec_MA_5_away',
       'defense_sacks_MA_5_away', 'defense_td_MA_5_away',
       'interceptions_MA_5_away', 'interceptions_td_MA_5_away',
       'kicks_return_yds_MA_5_away', 'kicks_return_td_MA_5_away',
       'punt_return_yds_MA_5_away', 'kicking_pts_MA_5_away', 'streak_home',
       'pass_yds_MA_5_home', 'pass_td_MA_5_home', 'rush_yds_MA_5_home',
       'rush_td_MA_5_home', 'rec_yds_MA_5_home', 'rec_td_MA_5_home',
       'fumbles_MA_5_home', 'fumbles_rec_MA_5_home', 'defense_sacks_MA_5_home',
       'defense_td_MA_5_home', 'interceptions_MA_5_home',
       'interceptions_td_MA_5_home', 'kicks_return_yds_MA_5_home',
       'kicks_return_

In [39]:
# Empty frame (no rows) carrying exactly the columns of the training dataset
df_games_topredict = pd.DataFrame(columns=list(dataset.columns))

In [40]:
df_games_topredict.columns

Index(['season', 'week', 'awayteam', 'hometeam', 'idgame', 'winner_home',
       'home_coach', 'away_coach', 'weather_type', 'temperature', 'humidity',
       'wind', 'streak_away', 'pass_yds_MA_5_away', 'pass_td_MA_5_away',
       'rush_yds_MA_5_away', 'rush_td_MA_5_away', 'rec_yds_MA_5_away',
       'rec_td_MA_5_away', 'fumbles_MA_5_away', 'fumbles_rec_MA_5_away',
       'defense_sacks_MA_5_away', 'defense_td_MA_5_away',
       'interceptions_MA_5_away', 'interceptions_td_MA_5_away',
       'kicks_return_yds_MA_5_away', 'kicks_return_td_MA_5_away',
       'punt_return_yds_MA_5_away', 'kicking_pts_MA_5_away', 'streak_home',
       'pass_yds_MA_5_home', 'pass_td_MA_5_home', 'rush_yds_MA_5_home',
       'rush_td_MA_5_home', 'rec_yds_MA_5_home', 'rec_td_MA_5_home',
       'fumbles_MA_5_home', 'fumbles_rec_MA_5_home', 'defense_sacks_MA_5_home',
       'defense_td_MA_5_home', 'interceptions_MA_5_home',
       'interceptions_td_MA_5_home', 'kicks_return_yds_MA_5_home',
       'kicks_return_

In [41]:
# Build the frame to score: the schedule rows plus every training-dataset column
# the schedule lacks, filled with NaN. Column order is the schedule's columns
# first, then the missing dataset columns in dataset order.
# The frame is derived from df_games alone (not from the previous value of
# df_games_topredict), so re-running the cell cannot stack duplicate rows, and
# the schedule columns keep their original dtypes.
missing_columns = [col for col in dataset.columns if col not in df_games.columns]
df_games_topredict = df_games.reindex(columns=[*df_games.columns, *missing_columns])

In [42]:
df_games_topredict.columns

Index(['season', 'week', 'date', 'awayteam', 'hometeam', 'stadium', 'location',
       'idgame', 'winner_home', 'home_coach', 'away_coach', 'weather_type',
       'temperature', 'humidity', 'wind', 'streak_away', 'pass_yds_MA_5_away',
       'pass_td_MA_5_away', 'rush_yds_MA_5_away', 'rush_td_MA_5_away',
       'rec_yds_MA_5_away', 'rec_td_MA_5_away', 'fumbles_MA_5_away',
       'fumbles_rec_MA_5_away', 'defense_sacks_MA_5_away',
       'defense_td_MA_5_away', 'interceptions_MA_5_away',
       'interceptions_td_MA_5_away', 'kicks_return_yds_MA_5_away',
       'kicks_return_td_MA_5_away', 'punt_return_yds_MA_5_away',
       'kicking_pts_MA_5_away', 'streak_home', 'pass_yds_MA_5_home',
       'pass_td_MA_5_home', 'rush_yds_MA_5_home', 'rush_td_MA_5_home',
       'rec_yds_MA_5_home', 'rec_td_MA_5_home', 'fumbles_MA_5_home',
       'fumbles_rec_MA_5_home', 'defense_sacks_MA_5_home',
       'defense_td_MA_5_home', 'interceptions_MA_5_home',
       'interceptions_td_MA_5_home', 'kicks_return

In [43]:
df_games_topredict

Unnamed: 0,season,week,date,awayteam,hometeam,stadium,location,idgame,winner_home,home_coach,...,kicks_return_td_MA_5_home,punt_return_yds_MA_5_home,kicking_pts_MA_5_home,delta_day_away,delta_day_home,stade,people,month,dayofmonth,dayofweek
0,2022,5,2022-10-06,Colts,Broncos,Empower Field at Mile High,"Denver, CO",,,,...,,,,,,,,,,
1,2022,5,2022-10-09,Giants,Packers,Tottenham Hotspur Stadium,London,,,,...,,,,,,,,,,
2,2022,5,2022-10-09,Steelers,Bills,Highmark Stadium,"Orchard Park, NY",,,,...,,,,,,,,,,
3,2022,5,2022-10-09,Chargers,Browns,FirstEnergy Stadium,"Cleveland, OH",,,,...,,,,,,,,,,
4,2022,5,2022-10-09,Bears,Vikings,U.S. Bank Stadium,"Minneapolis, MN",,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,2022,18,2023-01-08,Browns,Steelers,Acrisure Stadium,"Pittsburgh, PA",,,,...,,,,,,,,,,
268,2022,18,2023-01-08,Cardinals,49ers,Levi's Stadium,"Santa Clara, CA",,,,...,,,,,,,,,,
269,2022,18,2023-01-08,Rams,Seahawks,Lumen Field,"Seattle, WA",,,,...,,,,,,,,,,
270,2022,18,2023-01-08,Cowboys,Washington,FedExField,"Landover, MD",,,,...,,,,,,,,,,


In [44]:

# Apply the preprocessor fitted on the training data to the games to predict, then predict the encoded label of each game.
# NOTE(review): the displayed df_games_topredict holds NaN in every column that did not come from df_games, so most model
# inputs here are imputed training statistics; this would explain the near-identical probabilities shown further down. Confirm.
# NOTE(review): df_games_topredict is rebound from a DataFrame to the transformed matrix (later cells rely on that),
# so this cell cannot be run twice in a row.
df_games_topredict = preprocessor.transform(df_games_topredict)
Y_pred = model.predict(df_games_topredict)

In [45]:
# Wrap the predicted labels in a one-column DataFrame named "winner"
Y_pred = pd.DataFrame(data=Y_pred, columns=["winner"])

In [46]:
Y_pred

Unnamed: 0,winner
0,0
1,0
2,0
3,0
4,0
...,...
267,0
268,0
269,0
270,0


In [47]:
# Class probabilities for every game to predict, rounded to 4 decimals
Y_pred_proba = np.round(model.predict_proba(df_games_topredict), decimals=4)

In [48]:
Y_pred_proba

array([[0.597 , 0.403 ],
       [0.508 , 0.492 ],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],
       [0.5864, 0.4136],
       [0.6059, 0.3941],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],
       [0.7391, 0.2609],
       [0.5926, 0.4074],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],
       [0.4665, 0.5335],
       [0.5836, 0.4164],
       [0.597 , 0.403 ],
       [0.6134, 0.3866],
       [0.597 , 0.403 ],
       [0.5041, 0.4959],
       [0.4555, 0.5445],
       [0.597 , 0.403 ],
       [0.4757, 0.5243],
       [0.6159, 0.3841],
       [0.5188, 0.4812],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],
       [0.5836, 0.4164],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],
       [0.5836, 0.4164],
       [0.5634, 0.4366],
       [0.597 , 0.403 ],
       [0.6059, 0.3941],
       [0.6159, 0.3841],
       [0.5188, 0.4812],
       [0.7391, 0.2609],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],
       [0.597 , 0.403 ],


In [49]:
# Wrap the probability array in a DataFrame with one named column per class.
# NOTE(review): the column names assume the first probability column is an away win (winner_home == 0)
# and the second a home win (winner_home == 1); confirm against model.classes_ / encoder.classes_.
Y_pred_proba = pd.DataFrame(Y_pred_proba, columns=["proba_away", "proba_home"])

In [50]:
Y_pred_proba

Unnamed: 0,proba_away,proba_home
0,0.5970,0.4030
1,0.5080,0.4920
2,0.5970,0.4030
3,0.5970,0.4030
4,0.5970,0.4030
...,...,...
267,0.5188,0.4812
268,0.5970,0.4030
269,0.6688,0.3312
270,0.5970,0.4030


In [51]:
# Put the predicted label next to the schedule columns (column-wise concat, aligned on the index)
res = pd.concat(objs=[df_games, Y_pred], axis="columns")

In [52]:
# Assemble the final results table: schedule columns, predicted label and both
# class probabilities, side by side.
# It is built from the three source frames rather than from `res` itself, so
# re-running the cell does not append duplicate probability columns.
# Assumes the three frames share the same 0..n-1 index (concat aligns on index) — TODO confirm.
res = pd.concat([df_games, Y_pred, Y_pred_proba], axis=1)

In [60]:
# Count how many games received each predicted label.
# NOTE(review): the recorded output is heavily skewed towards label 0 (253 vs 19); check whether this reflects
# the model or the NaN-filled feature columns of the frame that was scored.
res["winner"].value_counts()

0    253
1     19
Name: winner, dtype: int64

In [56]:
# Show the working directory again; the relative output path used when saving the results resolves against it.
os.getcwd()

'c:\\Users\\sylva\\Desktop\\nflpredictor\\nfl\\03_modelisation\\03_model3_XGBoost'

In [144]:
# Persist the results table (index included) as CSV.
# NOTE(review): the path is relative to the working directory and to_csv does not create the "05_results" folder;
# presumably it already exists next to this notebook — confirm.
res.to_csv("05_results/results_games_2022_xgb.csv")