In [100]:
import pandas as pd

In [101]:
df = pd.read_csv("../../04_datasets/nfl_dataset_v3.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,...,capacity,people,attendance_info,month,dayofmonth,dayofweek,hour,minute,win_streak,lose_streak
0,0,2017,9,Bengals,Jaguars,7,23,400951753,16,1,...,67858,89.0,1,11,5,6,18,0,0.0,1.0
1,4,2017,4,Bills,Falcons,23,17,400951685,-6,0,...,75000,95.0,1,10,1,6,17,0,2.0,0.0
2,8,2017,4,Saints,Dolphins,20,0,400950241,-20,0,...,86000,98.0,1,10,1,6,13,30,0.0,2.0
3,12,2017,4,Bears,Packers,14,35,400951678,21,1,...,81041,97.0,1,9,29,4,0,25,0.0,1.0
4,16,2017,16,Colts,Ravens,16,23,400951596,7,1,...,70745,100.0,1,12,23,5,21,30,0.0,6.0


#### Basic preprocessing

- Columns to be removed because they reveal the winner: 'awayscore', 'homescore', 'score_abs', 'winner_team', 'winner_away'
- Columns to be removed because they are in double: all the columns of the metrics without the 'MA' at the end (moving average)

In [102]:
df = df.drop(columns=['Unnamed: 0', 'capacity', 'concat', 'game_date', 'delta_match', 'date','awayscore', 'homescore', 'score_abs', 'winner_team', 'winner_away', 'attendance_info', 'location_x', 'location_y', 'hour', 'minute', 'team'])

- Removing the duplicates in the idgame

In [103]:
# cheking the amount of games in the dataframe
len(df.idgame.unique())

1289

In [104]:
# identifying the double
df.duplicated('idgame')

0       False
1       False
2       False
3       False
4       False
        ...  
1284    False
1285    False
1286    False
1287    False
1288    False
Length: 1289, dtype: bool

In [105]:
print(df.columns.tolist())

['season', 'week', 'awayteam', 'hometeam', 'idgame', 'winner_home', 'home_coach', 'away_coach', 'weather_type', 'temperature', 'humidity', 'wind', 'pass_yds_MA_5', 'pass_td_MA_5', 'rush_yds_MA_5', 'rush_td_MA_5', 'rec_yds_MA_5', 'rec_td_MA_5', 'fumbles_MA_5', 'fumbles_rec_MA_5', 'defense_sacks_MA_5', 'defense_td_MA_5', 'interceptions_MA_5', 'interceptions_td_MA_5', 'kicks_return_yds_MA_5', 'kicks_return_td_MA_5', 'punt_return_yds_MA_5', 'kicking_pct_MA_5', 'stade', 'attendance', 'people', 'month', 'dayofmonth', 'dayofweek', 'win_streak', 'lose_streak']


- Integrating a new column 'Playing_at_home' to distinguish the opponent from the localisation of the game

In [106]:
# creating a new column
df['playing_at_home'] = 1

# renaming the awayteam column
df = df.rename(columns = {'awayteam':'opponent'})

##### Basic analysis

In [107]:
print("Number of rows : {}".format(df.shape[0]))
print()

print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

Number of rows : 1289

Basics statistics: 


Unnamed: 0,season,week,opponent,hometeam,idgame,winner_home,home_coach,away_coach,weather_type,temperature,...,kicking_pct_MA_5,stade,attendance,people,month,dayofmonth,dayofweek,win_streak,lose_streak,playing_at_home
count,1289.0,1289.0,1289,1289,1289.0,1289.0,1289,1289,1289,1289.0,...,1209.0,1289,1140.0,1140.0,1289.0,1289.0,1289.0,1275.0,1275.0,1289.0
unique,,,32,32,,,63,63,7,,...,,37,,,,,,,,
top,,,Cowboys,Jaguars,,,John Harbaugh,Ron Rivera,cloudy,,...,,MetLife Stadium,,,,,,,,
freq,,,41,41,,,41,41,768,,...,,81,,,,,,,,
mean,2019.027153,9.17533,,,401134100.0,0.539178,,,,57.233514,...,72.989264,,61886.508772,88.013158,10.254461,16.082234,5.098526,1.032941,1.349804,1.0
std,1.422984,5.062066,,,133883.2,0.498656,,,,21.706911,...,17.842267,,19059.984436,24.11428,2.123613,8.765195,1.920486,1.603718,2.210647,0.0
min,2017.0,1.0,,,400950200.0,0.0,,,,1.0,...,6.66,,748.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
25%,2018.0,5.0,,,401030800.0,0.0,,,,45.0,...,60.0,,60546.75,91.0,10.0,9.0,6.0,0.0,0.0,1.0
50%,2019.0,9.0,,,401128000.0,1.0,,,,63.0,...,75.0,,66880.5,97.0,11.0,16.0,6.0,0.0,1.0,1.0
75%,2020.0,14.0,,,401220300.0,1.0,,,,72.0,...,85.0,,71865.25,100.0,12.0,24.0,6.0,1.5,2.0,1.0



Percentage of missing values: 


season                    0.000000
week                      0.000000
opponent                  0.000000
hometeam                  0.000000
idgame                    0.000000
winner_home               0.000000
home_coach                0.000000
away_coach                0.000000
weather_type              0.000000
temperature               0.000000
humidity                  0.000000
wind                      0.000000
pass_yds_MA_5             6.206362
pass_td_MA_5              6.206362
rush_yds_MA_5             6.206362
rush_td_MA_5              6.206362
rec_yds_MA_5              6.206362
rec_td_MA_5               6.206362
fumbles_MA_5              6.206362
fumbles_rec_MA_5          6.206362
defense_sacks_MA_5        6.206362
defense_td_MA_5           6.206362
interceptions_MA_5        6.206362
interceptions_td_MA_5     6.206362
kicks_return_yds_MA_5     6.206362
kicks_return_td_MA_5      6.206362
punt_return_yds_MA_5      6.206362
kicking_pct_MA_5          6.206362
stade               

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   season                 1289 non-null   int64  
 1   week                   1289 non-null   int64  
 2   opponent               1289 non-null   object 
 3   hometeam               1289 non-null   object 
 4   idgame                 1289 non-null   int64  
 5   winner_home            1289 non-null   int64  
 6   home_coach             1289 non-null   object 
 7   away_coach             1289 non-null   object 
 8   weather_type           1289 non-null   object 
 9   temperature            1289 non-null   int64  
 10  humidity               1289 non-null   int64  
 11  wind                   1289 non-null   int64  
 12  pass_yds_MA_5          1209 non-null   float64
 13  pass_td_MA_5           1209 non-null   float64
 14  rush_yds_MA_5          1209 non-null   float64
 15  rush

#### Test with a logistic regression

In [109]:
df_rl = df

In [110]:
df_rl = df_rl.drop(columns=['idgame'])

In [111]:
df_rl.columns

Index(['season', 'week', 'opponent', 'hometeam', 'winner_home', 'home_coach',
       'away_coach', 'weather_type', 'temperature', 'humidity', 'wind',
       'pass_yds_MA_5', 'pass_td_MA_5', 'rush_yds_MA_5', 'rush_td_MA_5',
       'rec_yds_MA_5', 'rec_td_MA_5', 'fumbles_MA_5', 'fumbles_rec_MA_5',
       'defense_sacks_MA_5', 'defense_td_MA_5', 'interceptions_MA_5',
       'interceptions_td_MA_5', 'kicks_return_yds_MA_5',
       'kicks_return_td_MA_5', 'punt_return_yds_MA_5', 'kicking_pct_MA_5',
       'stade', 'attendance', 'people', 'month', 'dayofmonth', 'dayofweek',
       'win_streak', 'lose_streak', 'playing_at_home'],
      dtype='object')

In [112]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

In [113]:
# defining columns that will not be taken into account in the model because they are redundant
#redundant_columns = ['idgame', 'minute', 'hour', 'attendance', 'people', 'win']

In [114]:
# defining a list of columns to be excluded from X
#to_be_excluded = ['winner_home', 'idgame', 'minute', 'hour', 'attendance', 'people', 'win']

In [116]:
Y = df.loc[:,"winner_home"]
X = df.loc[:,[c for c in df.columns if c!='winner_home']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=0, stratify=Y)

# select automatically the numerical and categorical columns
num_col = X.select_dtypes([np.number]).columns
cat_col = X.select_dtypes("object").columns

# Create pipeline for numeric features
numeric_features = X.select_dtypes([np.number]).columns 
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values in Age will be replaced by columns' mean
    ('scaler', StandardScaler())])

# Create pipeline for categorical features
categorical_features = X.select_dtypes("object").columns 
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTranformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

model = LogisticRegression()
model.fit(X_train, Y_train)

Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

print("Accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("Accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))

Accuracy on training set :  0.7224137931034482
Accuracy on test set :  0.5736434108527132


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Test with a random forest

In [117]:
!pip install xgboost
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor 


[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: C:\Users\alice\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-win_amd64.whl (125.4 MB)
     -------------------------------------- 125.4/125.4 MB 9.3 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1


In [None]:
regressor_ada = AdaBoostRegressor()
regressor_ada.fit(X_train, Y_train)

print("score Adaboost default train {}".format(regressor_ada.score(X_train, y_train)))
print("\n")
print("score Adaboost default test {}".format(regressor_ada.score(X_test, y_test)))