In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
# import ensemble methods
from xgboost import XGBClassifier
# import base estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings


In [2]:
os.getcwd()

'c:\\Users\\sylva\\Desktop\\nflpredictor\\nfl\\03_modelisation\\03_model3_XGBoost'

# Import dataset final

In [4]:
# df = pd.read_csv("../../04_datasets/nfl_dataset_vf.csv", index_col=0)
dataset = pd.read_csv("../../04_datasets/nfl_dataset_vf.csv", index_col=0)


# preprocessing and modelisation

In [5]:
# Separate target variable Y from features X
print("Separating labels from features...")
target_variable = "winner_home"

X = dataset.drop(target_variable, axis = 1)
features_drop_list = {'idgame', 'location'}
X.drop(features_drop_list, axis=1, inplace=True)

Y = dataset.loc[:,target_variable]

# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

# print('Found numeric features ', numeric_features)
# print('Found categorical features ', categorical_features)

# print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify = Y)


# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
#categorical_transformer = OneHotEncoder(drop='first') # no missing values in categorical data, so we only need the OHE

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')) # first column will be dropped to avoid creating correlations between features
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings on train set
# print("Performing preprocessings on train set...")
# print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
# print('...Done preprocessing train set.')
# print(X_train)
# print('...Done.')
# print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
# print()
# Label encoding
# print("Encoding labels...")
# print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
# print("...Done")
# print(Y_train[0:5])

# Preprocessings on test set
# print("Performing preprocessings on test set...")
# print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
# print('...Done.')
# print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
# print()
# Label encoding
# print("Encoding labels...")
# print(Y_test[0:5])
Y_test = encoder.transform(Y_test)
# print("...Done")
# print(Y_test[0:5])

model = XGBClassifier(
    objective= 'binary:logistic',
    nthread=-1,
    seed=42,
    learning_rate= 0.1, 
    max_depth= 2,
    min_child_weight= 3,
    n_estimators= 140
    )

model.fit(X_train, Y_train)


Separating labels from features...




import of calculated features

In [6]:
df_ma = pd.read_csv("../../04_datasets/nfl_team_topredict.csv", index_col=0)

import of future games

In [7]:
df_games = pd.read_csv("../../04_datasets/nfl2022_total.csv", index_col=0)
df_games

Unnamed: 0,season,week,date,awayteam,hometeam,stadium,location
0,2022,5,2022-10-06,Colts,Broncos,Empower Field at Mile High,"Denver, CO"
1,2022,5,2022-10-09,Giants,Packers,Tottenham Hotspur Stadium,London
2,2022,5,2022-10-09,Steelers,Bills,Highmark Stadium,"Orchard Park, NY"
3,2022,5,2022-10-09,Chargers,Browns,FirstEnergy Stadium,"Cleveland, OH"
4,2022,5,2022-10-09,Bears,Vikings,U.S. Bank Stadium,"Minneapolis, MN"
...,...,...,...,...,...,...,...
267,2022,18,2023-01-08,Browns,Steelers,Acrisure Stadium,"Pittsburgh, PA"
268,2022,18,2023-01-08,Cardinals,49ers,Levi's Stadium,"Santa Clara, CA"
269,2022,18,2023-01-08,Rams,Seahawks,Lumen Field,"Seattle, WA"
270,2022,18,2023-01-08,Cowboys,Washington,FedExField,"Landover, MD"


# Create df to predict

In [8]:
df_games_topredict = pd.DataFrame(columns = [col for col in dataset.columns])
df_games_topredict = pd.concat([df_games, df_games_topredict])

In [9]:
df_games_topredict.columns

Index(['season', 'week', 'date', 'awayteam', 'hometeam', 'stadium', 'location',
       'idgame', 'winner_home', 'home_coach', 'away_coach', 'weather_type',
       'temperature', 'humidity', 'wind', 'streak_away', 'pass_yds_MA_5_away',
       'pass_td_MA_5_away', 'rush_yds_MA_5_away', 'rush_td_MA_5_away',
       'rec_yds_MA_5_away', 'rec_td_MA_5_away', 'fumbles_MA_5_away',
       'fumbles_rec_MA_5_away', 'defense_sacks_MA_5_away',
       'defense_td_MA_5_away', 'interceptions_MA_5_away',
       'interceptions_td_MA_5_away', 'kicks_return_yds_MA_5_away',
       'kicks_return_td_MA_5_away', 'punt_return_yds_MA_5_away',
       'kicking_pts_MA_5_away', 'streak_home', 'pass_yds_MA_5_home',
       'pass_td_MA_5_home', 'rush_yds_MA_5_home', 'rush_td_MA_5_home',
       'rec_yds_MA_5_home', 'rec_td_MA_5_home', 'fumbles_MA_5_home',
       'fumbles_rec_MA_5_home', 'defense_sacks_MA_5_home',
       'defense_td_MA_5_home', 'interceptions_MA_5_home',
       'interceptions_td_MA_5_home', 'kicks_return

Filter week 1

In [9]:
df_week = df_games_topredict[df_games_topredict["week"] == 1]

List of columns

In [99]:
features_ma_home = [col for col in df_week.columns if "_MA_5_home" in col]
features_ma_away = [col for col in df_week.columns if "_MA_5_away" in col]
features_ma_topredict = [col.split("_home")[0] for col in df_week.columns if "_MA_5_home" in col]


In [60]:
print(len(features_ma_home))
print(len(features_ma_away))
print(len(features_ma_topredict))

16
16


In [121]:
df_comp = pd.concat([pd.DataFrame(features_ma_home), 
                    pd.DataFrame(features_ma_away),
                    pd.DataFrame(features_ma_topredict)],
                    axis =1)
df_comp                

Unnamed: 0,0,0.1,0.2
0,pass_yds_MA_5_home,pass_yds_MA_5_away,pass_yds_MA_5
1,pass_td_MA_5_home,pass_td_MA_5_away,pass_td_MA_5
2,rush_yds_MA_5_home,rush_yds_MA_5_away,rush_yds_MA_5
3,rush_td_MA_5_home,rush_td_MA_5_away,rush_td_MA_5
4,rec_yds_MA_5_home,rec_yds_MA_5_away,rec_yds_MA_5
5,rec_td_MA_5_home,rec_td_MA_5_away,rec_td_MA_5
6,fumbles_MA_5_home,fumbles_MA_5_away,fumbles_MA_5
7,fumbles_rec_MA_5_home,fumbles_rec_MA_5_away,fumbles_rec_MA_5
8,defense_sacks_MA_5_home,defense_sacks_MA_5_away,defense_sacks_MA_5
9,defense_td_MA_5_home,defense_td_MA_5_away,defense_td_MA_5


fill in the moving averages of the home team. 

In [77]:
for team in df_week["hometeam"].unique():
    df_week.loc[df_week["hometeam"] == team, features_ma_home] = df_ma.loc[df_ma["team"]==team, features_ma_topredict].values
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_week.loc[df_week["hometeam"] == team, features_ma_home] = df_ma.loc[df_ma["team"]==team, features_ma_topredict].values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_week.loc[df_week["awayteam"] == team, features_ma_away] = df_ma.loc[df_ma["team"]==team, features_ma_topredict].values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_week.loc[df_week["hometeam"] == team, features_ma_home] = df_ma.loc[df_ma["team"]==team, features_ma_topredict].values
A value is

fill in the moving averages of the away team. 

In [82]:
for team in df_week["awayteam"].unique():
    df_week.loc[df_week["awayteam"] == team, features_ma_away] = df_ma.loc[df_ma["team"]==team, features_ma_topredict].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_week.loc[df_week["awayteam"] == team, features_ma_away] = df_ma.loc[df_ma["team"]==team, features_ma_topredict].values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_week.loc[df_week["awayteam"] == team, features_ma_away] = df_ma.loc[df_ma["team"]==team, features_ma_topredict].values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_week.loc[df_week["awayteam"] == team, features_ma_away] = df_ma.loc[df_ma["team"]==team, features_ma_topredict].values
A value is

concat features_ma_away + features_ma_home

In [100]:
features_ma_away_home = features_ma_away + features_ma_home

['pass_yds_MA_5_away',
 'pass_td_MA_5_away',
 'rush_yds_MA_5_away',
 'rush_td_MA_5_away',
 'rec_yds_MA_5_away',
 'rec_td_MA_5_away',
 'fumbles_MA_5_away',
 'fumbles_rec_MA_5_away',
 'defense_sacks_MA_5_away',
 'defense_td_MA_5_away',
 'interceptions_MA_5_away',
 'interceptions_td_MA_5_away',
 'kicks_return_yds_MA_5_away',
 'kicks_return_td_MA_5_away',
 'punt_return_yds_MA_5_away',
 'kicking_pts_MA_5_away',
 'pass_yds_MA_5_home',
 'pass_td_MA_5_home',
 'rush_yds_MA_5_home',
 'rush_td_MA_5_home',
 'rec_yds_MA_5_home',
 'rec_td_MA_5_home',
 'fumbles_MA_5_home',
 'fumbles_rec_MA_5_home',
 'defense_sacks_MA_5_home',
 'defense_td_MA_5_home',
 'interceptions_MA_5_home',
 'interceptions_td_MA_5_home',
 'kicks_return_yds_MA_5_home',
 'kicks_return_td_MA_5_home',
 'punt_return_yds_MA_5_home',
 'kicking_pts_MA_5_home']

# Fill in the data for week 1

In [102]:
df_games_topredict.loc[df_games_topredict["week"] == 1, features_ma_away_home] = df_week.loc[:, features_ma_away_home].values

In [None]:
df_games_topredict = preprocessor.transform(df_games_topredict)

Y_pred = model.predict(df_games_topredict)
Y_pred = pd.DataFrame(Y_pred, columns = ["winner"] )

Y_pred_proba = model.predict_proba(df_games_topredict).round(4)
Y_pred_proba = pd.DataFrame(Y_pred_proba, columns=["proba_away", "proba_home"])

In [None]:
res = pd.concat([df_games,Y_pred,Y_pred_proba], axis =1)

In [122]:
res[res["week"]==1]

Unnamed: 0,season,week,date,awayteam,hometeam,stadium,location,winner,proba_away,proba_home
81,2022,1,2022-09-08,Bills,Rams,SoFi Stadium,"Inglewood, CA",0,0.5555,0.4445
82,2022,1,2022-09-11,Saints,Falcons,Mercedes-Benz Stadium,"Atlanta, GA",0,0.7982,0.2018
108,2022,1,2022-09-11,49ers,Bears,Soldier Field,"Chicago, IL",0,0.6197,0.3803
109,2022,1,2022-09-11,Steelers,Bengals,Paycor Stadium,"Cincinnati, OH",1,0.4791,0.5209
110,2022,1,2022-09-11,Eagles,Lions,Ford Field,"Detroit, MI",0,0.615,0.385
111,2022,1,2022-09-11,Patriots,Dolphins,Hard Rock Stadium,"Miami Gardens, FL",1,0.4633,0.5367
112,2022,1,2022-09-11,Ravens,Jets,MetLife Stadium,"East Rutherford, NJ",0,0.6757,0.3243
113,2022,1,2022-09-11,Jaguars,Washington,FedExField,"Landover, MD",1,0.4462,0.5538
114,2022,1,2022-09-11,Browns,Panthers,Bank of America Stadium,"Charlotte, NC",0,0.6045,0.3955
115,2022,1,2022-09-11,Colts,Texans,NRG Stadium,"Houston, TX",0,0.5243,0.4757


In [None]:
res.to_csv("../05_results/results_games_2022_xgb.csv")

## predicting the winner of the 2021 matches

In [9]:
df_2021 = dataset[dataset["season"]==2021]
df_other_year = dataset[dataset["season"]!=2021]

In [None]:
# Separate target variable Y from features X
print("Separating labels from features...")
target_variable = "winner_home"

X = df_other_year.drop(target_variable, axis = 1)
features_drop_list = {'idgame', 'location'}
X.drop(features_drop_list, axis=1, inplace=True)

Y = df_other_year.loc[:,target_variable]

# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

# print('Found numeric features ', numeric_features)
# print('Found categorical features ', categorical_features)

# print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify = Y)


# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
#categorical_transformer = OneHotEncoder(drop='first') # no missing values in categorical data, so we only need the OHE

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')) # first column will be dropped to avoid creating correlations between features
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings on train set
# print("Performing preprocessings on train set...")
# print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
# print('...Done preprocessing train set.')
# print(X_train)
# print('...Done.')
# print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
# print()
# Label encoding
# print("Encoding labels...")
# print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
# print("...Done")
# print(Y_train[0:5])

# Preprocessings on test set
# print("Performing preprocessings on test set...")
# print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
# print('...Done.')
# print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
# print()
# Label encoding
# print("Encoding labels...")
# print(Y_test[0:5])
Y_test = encoder.transform(Y_test)
# print("...Done")
# print(Y_test[0:5])

model = XGBClassifier(
    objective= 'binary:logistic',
    nthread=-1,
    seed=42,
    learning_rate= 0.1, 
    max_depth= 2,
    min_child_weight= 3,
    n_estimators= 140
    )

model.fit(X_train, Y_train)

In [10]:
df_2021 = df_2021.reset_index()

In [16]:
X_2021 = df_2021.drop(target_variable, axis = 1)
features_drop_list = {'idgame', 'location'}
X_2021.drop(features_drop_list, axis=1, inplace=True)

Y_2021 = df_2021.loc[:,target_variable]


In [17]:
X_2021 = preprocessor.transform(X_2021)
Y_pred_2021 = pd.DataFrame(model.predict(X_2021), columns = ["pred_winner"])

In [18]:
res_2021 = pd.concat([df_2021,Y_pred_2021],axis=1)

In [19]:
accuracy_score(Y_2021, Y_pred_2021)

0.7638376383763837

In [14]:
res_2021

Unnamed: 0,index,season,week,awayteam,hometeam,idgame,winner_home,home_coach,away_coach,weather_type,...,kicking_pts_MA_5_home,delta_day_away,delta_day_home,stade,location,people,month,dayofmonth,dayofweek,pred_winner
0,1018,2021,15,Chiefs,Chargers,401326537,0,Brandon Staley,Andy Reid,cloudy,...,8.4,4.0,4.0,SoFi Stadium,"Inglewood, CA",98.0,12,17,4,0
1,1019,2021,1,49ers,Lions,401326317,0,Dan Campbell,Kyle Shanahan,indoor,...,4.2,251.0,251.0,Ford Field,"Detroit, MI",92.0,9,12,6,0
2,1020,2021,18,Bengals,Browns,401326588,1,Kevin Stefanski,Zac Taylor,rain,...,3.6,7.0,5.0,FirstEnergy Stadium,"Cleveland, OH",100.0,1,9,6,1
3,1021,2021,18,Seahawks,Cardinals,401326597,0,Kliff Kingsbury,Pete Carroll,sunny,...,8.6,7.0,7.0,State Farm Stadium,"Glendale, AZ",98.0,1,9,6,0
4,1022,2021,18,Patriots,Dolphins,401326592,1,Brian Flores,Bill Belichick,cloudy,...,7.0,7.0,7.0,Hard Rock Stadium,"Miami Gardens, FL",100.0,1,9,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,1284,2021,12,Raiders,Cowboys,401326495,0,Mike McCarthy,Rich Bisaccia,,...,6.6,4.0,4.0,AT&T Stadium,"Arlington, TX",93.0,11,25,3,1
267,1285,2021,12,Bears,Lions,401326494,0,Dan Campbell,Matt Nagy,indoor,...,5.8,3.0,3.0,Ford Field,"Detroit, MI",87.0,11,25,3,0
268,1286,2021,11,Giants,Buccaneers,401326493,1,Bruce Arians,Joe Judge,cloudy,...,6.2,15.0,8.0,Raymond James Stadium,"Tampa, FL",100.0,11,23,1,1
269,1287,2021,2,Raiders,Steelers,401326356,0,Mike Tomlin,Jon Gruden,sunny,...,6.2,5.0,7.0,Acrisure Stadium,"Pittsburgh, PA",93.0,9,19,6,0


In [83]:
res_2021.to_csv("../05_results/prediction_xgb_games_2021.csv")

# ----------------- end of prediction 2021 -----------------