## Setup

This is a copy of the exploratory notebook. With some changes made to incorporate the steps & transformation identified in the model building stage.

In [79]:
# Basic Setup

import pandas as pd
import numpy as np 
import sklearn 
import os

"""
This data is from the 2020-2021 and 2021-2022 seasons.
The data was scraped partway through the 2021-2022 season, so we won't have the complete match history for the season.

"""

matches = pd.read_csv('../data/raw/matches.csv')

matches.index = matches[matches.columns[0]]
matches.head()

Unnamed: 0_level_0,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


# Cleaning & Transformation 

Acting on the initial notes, we shall be doing the following:

1. Dropping columns: 1. notes 2. comp. 3. match report 4. attendance. 
2. Run one hot encoding on the categorical columns 

In [80]:
#  Dropping uninformative columns 

matches.drop(['comp', 'match report', 'attendance', 'notes', 'Unnamed: 0','round'], axis = 1, inplace = True)



In [81]:
len(matches.index)

1389

In [82]:
# Handling datetime 

import datetime
from datetime import date, time 

matches['date'] = pd.to_datetime(matches.date).dt.date
matches['time'] = pd.to_datetime(matches.time).dt.time
matches['datetime'] = matches.apply(lambda r : pd.datetime.combine(r['date'],r['time']),1)


  matches['datetime'] = matches.apply(lambda r : pd.datetime.combine(r['date'],r['time']),1)


In [83]:
# Handling Categoricals 

matches['venue']  =   pd.Categorical(matches.venue, ordered = False)
matches['opp_code'] = pd.Categorical(matches.opponent, ordered = False)
matches['day'] =      pd.Categorical(matches.day, ordered = False)
matches['hour'] =     matches.datetime.apply(lambda x: x.hour)
matches['formation'] = pd.Categorical(matches.formation, ordered = False)
matches['result'] =  pd.Categorical(matches.result, ordered = False)

matches.venue = matches.venue.cat.codes
matches.opp_code = matches.opp_code.cat.codes
matches.day = matches.day.cat.codes
matches.formation = matches.formation.cat.codes
matches.result = matches.result.cat.codes



# matches['venue'] = matches.venue.cat.codes
# matches['opponent'] 
# Droppping uninformative columns 

matches.drop(['referee', 'captain'], axis = 1, inplace = True)

# Separating all categorical columns 

matches.hour = matches.hour.astype('int8')
matches_cats = matches.select_dtypes(include = 'int8').drop('result', axis = 1)
matches_cats[['datetime', 'team']] = matches[['datetime', 'team']]

# Creating a copy of matches at this stage 

matches_cat_transforms = matches

In [84]:
matches_backup = matches

group = matches.groupby('team')
group = group.get_group('Manchester City')
# group = group.sort_values('date')
# rolling_stats = group(cols).rolling(3, closed = 'left').mean()
# group[new_cols] = rolling_stats
# group = group.dropna(subset = new_cols)


In [85]:
# Adding Rolling Statistics 

def get_rolling_avg(col, new_cols, group):
    
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    
    return group

    
cols = ["gf","ga","sh","sot","dist","fk","pk","pkatt", "xg", "xga"]
new_cols = [f"{c}_rolling" for c in cols ]
    

In [86]:
matches_rolling  = matches.groupby('team').apply(lambda x: get_rolling_avg(cols, new_cols,x))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.index.shape[0])

# Separating out the rolling columns

matches_rolling_stats = matches_rolling[new_cols]
matches_rolling_stats[['datetime','team']] = matches_rolling[['datetime', 'team']]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches_rolling_stats[['datetime','team']] = matches_rolling[['datetime', 'team']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches_rolling_stats[['datetime','team']] = matches_rolling[['datetime', 'team']]


In [87]:
matches_predictors = pd.merge(matches_cats,matches_rolling_stats, how = 'inner', on = ['datetime','team'])
matches_predictors.index = matches_predictors[['datetime','team']]

In [88]:
matches_target = matches[['datetime','team','result']]
matches_target['target'] = [1 if i  == 2 else 0 for i in matches_target.result.tolist()]
matches_target.drop('result', axis = 1, inplace = True)
matches_target.index = matches_target[['datetime','team']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches_target['target'] = [1 if i  == 2 else 0 for i in matches_target.result.tolist()]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches_target.drop('result', axis = 1, inplace = True)


In [89]:
matches_utils = matches[['season', 'datetime', 'team','opponent']]
matches_utils.index = matches_utils[['datetime','team']]

In [90]:
matches_interim_v2 = pd.merge(matches_predictors, matches_target, how = 'inner', on = ['datetime','team']).merge(matches_utils, how = 'inner', 
                            on = ['datetime','team']) 

# Storing the prepared dataset. 

import pickle

# matches_interim.to_pickle('../data/interim/matches_interim.pkl')
# matches_interim.to_csv('../data/interim/matches_interim.csv')

In [91]:
matches_rolling_stats.drop(['datetime','team'], axis = 1).corr()

Unnamed: 0,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,xg_rolling,xga_rolling
gf_rolling,1.0,-0.183986,0.435036,0.654136,-0.164936,0.082976,0.300601,0.302124,0.697853,-0.258521
ga_rolling,-0.183986,1.0,-0.266994,-0.216136,0.094314,-0.0616,0.004624,-0.005909,-0.269928,0.693549
sh_rolling,0.435036,-0.266994,1.0,0.716523,-0.1615,0.206714,0.006313,0.06145,0.748949,-0.389796
sot_rolling,0.654136,-0.216136,0.716523,1.0,-0.208302,0.158849,0.024717,0.055918,0.714087,-0.299476
dist_rolling,-0.164936,0.094314,-0.1615,-0.208302,1.0,0.167196,0.201553,0.194781,-0.300088,0.144595
fk_rolling,0.082976,-0.0616,0.206714,0.158849,0.167196,1.0,-0.012491,0.005171,0.118537,-0.146962
pk_rolling,0.300601,0.004624,0.006313,0.024717,0.201553,-0.012491,1.0,0.901911,0.285066,-0.063036
pkatt_rolling,0.302124,-0.005909,0.06145,0.055918,0.194781,0.005171,0.901911,1.0,0.34778,-0.086713
xg_rolling,0.697853,-0.269928,0.748949,0.714087,-0.300088,0.118537,0.285066,0.34778,1.0,-0.370183
xga_rolling,-0.258521,0.693549,-0.389796,-0.299476,0.144595,-0.146962,-0.063036,-0.086713,-0.370183,1.0


* High Correlation between pk_rolling  & pkaa_rolling
* High Correlation between sh_rolling  & sot rolling 

Possible cause of covariance. One solution is to take a ratio of pk_rolling/pkaa_rolling & sh_rolling/sot_rolling and have them added as predictors as opposed to the 2 columns.  
Since we are dealing with a high bias situation, further removal of 2 columns seems risky. To train 2 different models. One where this transformation is utilized & the other where it isn't.



In [92]:
matches_interim_v3 = matches_interim_v2.copy()
matches_interim_v3['pkr'] = matches_interim_v3['pk_rolling']/matches_interim_v3['pkatt_rolling']
matches_interim_v3['skr'] = matches_interim_v3['sot_rolling']/matches_interim_v3['sh_rolling']

matches_interim_v3.drop(['pk_rolling','pkatt_rolling','sot_rolling','sh_rolling'], axis = 1, inplace = True)
matches_interim_v3.fillna(0, inplace=True)


In [98]:
matches_rolling_stats.drop(['datetime','team'], axis = 1).describe()

Unnamed: 0,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,xg_rolling,xga_rolling
count,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0
mean,1.336624,1.37636,12.172868,4.06201,17.021817,0.458365,0.118451,0.145533,1.306859,1.338345
std,0.792509,0.774578,3.387388,1.533963,1.805009,0.385343,0.202503,0.225441,0.512448,0.502923
min,0.0,0.0,3.333333,0.666667,12.233333,0.0,0.0,0.0,0.266667,0.233333
25%,0.666667,0.666667,9.666667,3.0,15.8,0.333333,0.0,0.0,0.933333,0.966667
50%,1.333333,1.333333,12.0,4.0,16.933333,0.333333,0.0,0.0,1.233333,1.3
75%,1.666667,2.0,14.333333,5.0,18.133333,0.666667,0.333333,0.333333,1.6,1.666667
max,5.666667,4.666667,26.666667,11.666667,25.066667,2.333333,1.666667,1.666667,3.566667,3.466667


The columns sh_rolling, sot_rolling, & dist_rolling are on a different scalee hence are candidates for feature scaling. 
They are roughly normally distributed but do have some outliers hence standardization is the method we'll be using for feature scaling. 



## Standardizing Features 

In [149]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

v3_scaled =  std_scaler.fit_transform(matches_rolling_stats[['dist_rolling', 'sh_rolling','sot_rolling']])
matches_rolling_stats_copy = matches_rolling_stats.copy()
matches_rolling_stats_copy[['dist_rolling', 'sh_rolling','sot_rolling']] = v3_scaled
matches_rolling_stats_copy.drop(['datetime','team'], axis = 1).describe()

Unnamed: 0,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,xg_rolling,xga_rolling
count,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0,1317.0
mean,1.336624,1.37636,1.99621e-16,1.456694e-16,3.02129e-16,0.458365,0.118451,0.145533,1.306859,1.338345
std,0.792509,0.774578,1.00038,1.00038,1.00038,0.385343,0.202503,0.225441,0.512448,0.502923
min,0.0,0.0,-2.610534,-2.214286,-2.653895,0.0,0.0,0.0,0.266667,0.233333
25%,0.666667,0.666667,-0.7401433,-0.6925938,-0.677161,0.333333,0.0,0.0,0.933333,0.966667
50%,1.333333,1.333333,-0.0510521,-0.04043982,-0.04903995,0.333333,0.0,0.0,1.233333,1.3
75%,1.666667,2.0,0.6380391,0.6117142,0.6160293,0.666667,0.333333,0.333333,1.6,1.666667
max,5.666667,4.666667,4.280379,4.959407,4.458652,2.333333,1.666667,1.666667,3.566667,3.466667


In [154]:

# Creating scaled versions of the datasets

matches_interim_v2_scaled = matches_interim_v2.copy()
matches_interim_v2_scaled[['dist_rolling', 'sh_rolling','sot_rolling']] = matches_rolling_stats_copy[['dist_rolling', 'sh_rolling','sot_rolling']]
matches_interim_v3_scaled = matches_interim_v3.copy() 
matches_interim_v3_scaled['dist_rolling'] = matches_rolling_stats_copy['dist_rolling']



In [155]:
# Exporting datsets to data folder 

import pickle 

matches_interim_v2.to_pickle('../data/interim/matches_interim_v2.pkl')
matches_interim_v2_scaled.to_pickle('../data/interim/matches_interim_v2_scaled.pkl')
matches_interim_v3.to_pickle('../data/interim/matches_interim_v3.pkl')
matches_interim_v3_scaled.to_pickle('../data/interim/matches_interim_v3_scaled.pkl')



### Next Steps 

* Train a LR model for each of these datasets.
* Optimize the hyperparameters of the model using RandomSearchCV.
* Test out the performance of each of these models against the test set.
* After choosing the best version of the model. Report the general performance using the test main set. 
* Iterate, if required. 
