In [115]:
import pandas as pd
import os


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn import metrics
import matplotlib.pyplot as plt


import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)

In [116]:
os.getcwd()

'c:\\Users\\sylva\\Desktop\\nflpredictor\\nfl\\03_modelisation'

In [117]:
df = pd.read_csv("../04_datasets/nfl_dataset_vf.csv", index_col=0)
df.head()

Unnamed: 0,season,week,awayteam,hometeam,idgame,winner_home,home_coach,away_coach,weather_type,temperature,...,punt_return_yds_MA_5_home,kicking_pts_MA_5_home,delta_day_away,delta_day_home,stade,location,people,month,dayofmonth,dayofweek
0,2017,13,Vikings,Falcons,400951677,0,Dan Quinn,Mike Zimmer,,,...,10.4,9.4,10.0,7.0,Mercedes-Benz Stadium,"Atlanta, GA",95.0,12,3,6
1,2017,9,Falcons,Panthers,400951749,1,Ron Rivera,Dan Quinn,cloudy,61.0,...,22.4,7.4,7.0,7.0,Bank of America Stadium,"Charlotte, NC",100.0,11,5,6
2,2017,9,Bengals,Jaguars,400951753,1,Doug Marrone,Marvin Lewis,,78.0,...,0.4,8.4,7.0,14.0,TIAA Bank Field,"Jacksonville, FL",89.0,11,5,6
3,2017,4,Bears,Packers,400951678,1,Mike McCarthy,John Fox,cloudy,65.0,...,,,4.0,4.0,Lambeau Field,"Green Bay, WI",97.0,9,29,4
4,2017,9,Colts,Texans,400951751,0,Bill O'Brien,Chuck Pagano,cloudy,84.0,...,31.4,9.0,7.0,6.0,NRG Stadium,"Houston, TX",100.0,11,5,6


Preprocessing with awayteam and hometeam

In [118]:
Y = df.loc[:,"winner_home"]
X = df.loc[:,['awayteam','hometeam']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify = Y)

# select automatically the numerical and categorical columns
num_col = X.select_dtypes([np.number]).columns
cat_col = X.select_dtypes("object").columns

# Create pipeline for numeric features
numeric_features = X.select_dtypes([np.number]).columns 
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values in Age will be replaced by columns' mean
    ('scaler', StandardScaler())])

# Create pipeline for categorical features
categorical_features = X.select_dtypes("object").columns 
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTranformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


Logistic Regression

In [119]:
model = LogisticRegression()
model.fit(X_train, Y_train)

Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

print("Accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("Accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))

scores = cross_val_score(model, X_train, Y_train, cv=5)

avg = scores.mean()
std = scores.std()

print(avg)
print(std)

Accuracy on training set :  0.6741028128031038
Accuracy on test set :  0.6317829457364341
0.6314478682988602
0.013063476018031307


In [121]:
df_games = pd.read_csv("../04_datasets/nfl2022_total.csv", index_col=0)
df_games

Unnamed: 0,season,week,date,awayteam,hometeam,stadium,location
0,2022.0,5.0,2022-10-06,Colts,Broncos,Empower Field at Mile High,"Denver, CO"
1,2022.0,5.0,2022-10-09,Giants,Packers,Tottenham Hotspur Stadium,London
2,2022.0,5.0,2022-10-09,Steelers,Bills,Highmark Stadium,"Orchard Park, NY"
3,2022.0,5.0,2022-10-09,Chargers,Browns,FirstEnergy Stadium,"Cleveland, OH"
4,2022.0,5.0,2022-10-09,Bears,Vikings,U.S. Bank Stadium,"Minneapolis, MN"
...,...,...,...,...,...,...,...
270,2022.0,18.0,2023-01-08,Cowboys,Commanders,FedExField,"Landover, MD"
=======,,,,,,,
270,2022.0,18.0,2023-01-08,Cowboys,Washington,FedExField,"Landover, MD"
>>>>>>> c0c2a3d4d9f977d559db69db84226e348bed2e83,,,,,,,


In [122]:
df_games = df_games.reset_index()

In [123]:
df_games = df_games.drop(columns='index', axis=1)

In [124]:
df_games

Unnamed: 0,season,week,date,awayteam,hometeam,stadium,location
0,2022.0,5.0,2022-10-06,Colts,Broncos,Empower Field at Mile High,"Denver, CO"
1,2022.0,5.0,2022-10-09,Giants,Packers,Tottenham Hotspur Stadium,London
2,2022.0,5.0,2022-10-09,Steelers,Bills,Highmark Stadium,"Orchard Park, NY"
3,2022.0,5.0,2022-10-09,Chargers,Browns,FirstEnergy Stadium,"Cleveland, OH"
4,2022.0,5.0,2022-10-09,Bears,Vikings,U.S. Bank Stadium,"Minneapolis, MN"
...,...,...,...,...,...,...,...
335,2022.0,18.0,2023-01-08,Cowboys,Commanders,FedExField,"Landover, MD"
336,,,,,,,
337,2022.0,18.0,2023-01-08,Cowboys,Washington,FedExField,"Landover, MD"
338,,,,,,,


In [12]:
df["awayteam"].unique()

array(['Vikings', 'Falcons', 'Bengals', 'Bears', 'Colts', 'Steelers',
       'Eagles', 'Rams', 'Cardinals', 'Washington', 'Chiefs', 'Panthers',
       'Giants', 'Browns', 'Raiders', 'Lions', 'Broncos', 'Texans',
       'Buccaneers', 'Saints', 'Bills', '49ers', 'Patriots', 'Ravens',
       'Seahawks', 'Cowboys', 'Titans', 'Jets', 'Packers', 'Jaguars',
       'Dolphins', 'Chargers'], dtype=object)

In [114]:
X_topredict = df_games.loc[:,["awayteam", "hometeam"]]
X_topredict = preprocessor.transform(X_topredict)
Y_pred = model.predict(X_topredict)

ValueError: Found unknown categories ['Commanders'] in column 0 during transform

In [80]:
Y_pred = pd.DataFrame(Y_pred, columns = ["winner"] )

In [81]:
Y_pred

Unnamed: 0,winner
0,1
1,0
2,1
3,0
4,0
5,0
6,0
7,1
8,1
9,0


In [82]:
Y_pred_proba = model.predict_proba(X_topredict).round(4)

In [83]:
Y_pred_proba = pd.DataFrame(Y_pred_proba, columns=["proba_away", "proba_home"])

In [84]:
Y_pred_proba

Unnamed: 0,proba_away,proba_home
0,0.4529,0.5471
1,0.8172,0.1828
2,0.439,0.561
3,0.7035,0.2965
4,0.6325,0.3675
5,0.5615,0.4385
6,0.7734,0.2266
7,0.3293,0.6707
8,0.48,0.52
9,0.5476,0.4524


In [85]:
res = pd.concat([df_games,Y_pred], axis =1)

In [86]:
res =pd.concat([res,Y_pred_proba], axis = 1)

In [87]:
res

Unnamed: 0,awayteam,hometeam,winner,proba_away,proba_home
0,Bills,Rams,1,0.4529,0.5471
1,Saints,Falcons,0,0.8172,0.1828
2,49ers,Bears,1,0.439,0.561
3,Steelers,Bengals,0,0.7035,0.2965
4,Eagles,Lions,0,0.6325,0.3675
5,Patriots,Dolphins,0,0.5615,0.4385
6,Ravens,Jets,0,0.7734,0.2266
7,Jaguars,Washington,1,0.3293,0.6707
8,Browns,Panthers,1,0.48,0.52
9,Colts,Texans,0,0.5476,0.4524


In [89]:
os.getcwd()

'c:\\Users\\sylva\\Desktop\\nflpredictor\\nfl\\03_modelisation'

In [90]:
res.to_csv("05_results/results_games_week1.csv")