# Importing final dataframe




In [1]:
# standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import ast
import datetime

# Imports nessecary for the Neural Network 
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.engine.sequential import Sequential
from tensorflow.python.keras.layers import Dense
from keras.callbacks import History 

# Import useful packages from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

pd.set_option('notebook_repr_html', True)

## Through Google Colab

In [23]:
df_injury = pd.read_csv('https://docs.google.com/spreadsheets/d/1TwWSYiSJviNgr16MH7TLhf53hKHkPhKD1gn261zhJOU/export?format=csv&gid=2070040166')
df_injury

Unnamed: 0,name,club,injuries
0,Ederson,Manchester City,"[('20/21', 'Virus Infection', 'Dec 27, 2020', ..."
1,Stefan Ortega,Manchester City,"[('21/22', 'Corona virus', 'Dec 17, 2021', 'De..."
2,Scott Carson,Manchester City,"[('20/21', 'Virus Infection', 'Jan 6, 2021', '..."
3,Rúben Dias,Manchester City,"[('21/22', 'Knock', 'Mar 3, 2022', 'Apr 11, 20..."
4,Aymeric Laporte,Manchester City,"[('22/23', 'Knee Surgery', 'Jun 30, 2022', 'Oc..."
...,...,...,...
2652,Álex Collado,Elche CF,"[('20/21', 'Muscular problems', 'Apr 24, 2021'..."
2653,Josan,Elche CF,"[('18/19', 'Tear in the abductor muscle', 'Sep..."
2654,Lucas Boyé,Elche CF,"[('20/21', 'Knock', 'Oct 14, 2020', 'Oct 28, 2..."
2655,Ezequiel Ponce,Elche CF,"[('21/22', 'Torn Meniscus', 'Sep 30, 2021', 'D..."


In [24]:
df_outfield = pd.read_csv('https://docs.google.com/spreadsheets/d/17TDSYYY4TEAEHPxTPP87Pn4WKlI2UGChtnRmozz887s/export?format=csv&gid=1112887342', sep=';')
df_outfield

Unnamed: 0.1,Unnamed: 0,player,nationality,position,team,age,birth_year,games,games_starts,minutes,...,touches_live_ball,dribbles,miscontrols,dispossessed,passes_received,fouls,fouled,ball_recoveries,aerials_won,"aerials_lost,"
0,0,Brenden Aaronson,us USA,"MF,FW",Leeds United,22-034,2000,14.0,14.0,1189.0,...,614.0,54.0,32.0,41.0,420.0,8.0,28.0,68.0,6.0,17.0
1,1,Yunis Abdelhamid,ma MAR,DF,Reims,35-058,1987,15.0,15.0,1350.0,...,811.0,12.0,9.0,12.0,442.0,19.0,9.0,96.0,29.0,"16.0,"
2,2,Himad Abdelli,fr FRA,"MF,FW",Angers,23-008,1999,7.0,2.0,231.0,...,180.0,13.0,9.0,4.0,135.0,3.0,6.0,17.0,2.0,4.0
3,3,Salis Abdul Samed,gh GHA,MF,Lens,22-244,2000,15.0,15.0,1349.0,...,1028.0,16.0,19.0,21.0,783.0,37.0,22.0,98.0,9.0,"7.0,"
4,4,Laurent Abergel,fr FRA,MF,Lorient,29-297,1993,10.0,10.0,807.0,...,475.0,8.0,10.0,6.0,317.0,7.0,9.0,51.0,2.0,"2.0,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2378,2378,Martín Zubimendi,es ESP,MF,Real Sociedad,23-296,1999,14.0,13.0,1151.0,...,771.0,7.0,9.0,7.0,529.0,22.0,11.0,71.0,33.0,"11.0,"
2379,2379,Szymon Żurkowski,pl POL,MF,Fiorentina,25-061,1997,2.0,0.0,32.0,...,25.0,1.0,1.0,1.0,18.0,1.0,0.0,2.0,1.0,"1.0,"
2380,2380,Martin Ødegaard,no NOR,MF,Arsenal,23-343,1998,13.0,13.0,1074.0,...,690.0,22.0,16.0,19.0,489.0,12.0,8.0,65.0,5.0,"9.0,"
2381,2381,Milan Đurić,ba BIH,FW,Hellas Verona,32-187,1990,11.0,2.0,283.0,...,133.0,1.0,5.0,0.0,112.0,6.0,7.0,4.0,47.0,"16.0,"


## Through Local Host

In [None]:
# Get the current working directory
cwd = os.getcwd()
# Get the parent directory
parent_dir = os.path.dirname(cwd)

# Construct the path to your file
file_path = os.path.join(parent_dir, 'raw_data', 'transfermarkt_data_2022.csv')
# Now you can read the file
df_injury = pd.read_csv(file_path, index_col=0)
# Display the first few rows
df_injury

In [None]:
# Construct the path to your file
file_path = os.path.join(parent_dir, 'raw_data', 'fbref_data_2022.csv')
# Now you can read the file
df_outfield = pd.read_csv(file_path, index_col=0)
# Display the first few rows
df_outfield

# Data Cleansing

In this section, we will outline the steps we took to clean the data obtained from Transfermarkt / FBRef and make it usable to train our model.

## Removing Duplicates

In [3]:
df_outfield = df_outfield.drop_duplicates(subset='player')
df_injury = df_injury.drop_duplicates(subset='name')
print(df_outfield['player'].value_counts())
print(df_injury['name'].value_counts())

Brenden Aaronson     1
Jamal Musiala        1
Álvaro Negredo       1
Reiss Nelson         1
Ilija Nestorovski    1
                    ..
Ryan Fraser          1
Davide Frattesi      1
Fred                 1
Ryan Fredericks      1
Filip Đuričić        1
Name: player, Length: 2330, dtype: int64
Ederson                  1
Pedro                    1
Ciro Immobile            1
Pierluigi Gollini        1
Pietro Terracciano       1
                        ..
Amar Abdirahman Ahmed    1
Levi Lumeka              1
Wilson Odobert           1
Rony Lopes               1
Roger Martí              1
Name: name, Length: 2654, dtype: int64


### Merging dataframes

We must now merge the Transfermarkt dataframe (containing the injury data) with the FBRef dataframe (containing in-game statistics).

In [4]:
df_outfield = df_outfield.rename(columns={"player": "name", "team": "club"})
df_injury = pd.merge(df_injury, df_outfield, how='inner', on='name')
df_injury.head()

Unnamed: 0.1,name,club_x,injuries,Unnamed: 0,nationality,position,club_y,age,birth_year,games,...,touches_live_ball,dribbles,miscontrols,dispossessed,passes_received,fouls,fouled,ball_recoveries,aerials_won,"aerials_lost,"
0,Ederson,Manchester City,"[('20/21', 'Virus Infection', 'Dec 27, 2020', ...",689,br BRA,GK,Manchester City,29-100,1993,14.0,...,547.0,0.0,0.0,0.0,345.0,0.0,2.0,19.0,3.0,"0.0,"
1,Rúben Dias,Manchester City,"[('21/22', 'Knock', 'Mar 3, 2022', 'Apr 11, 20...",615,pt POR,DF,Manchester City,25-195,1997,12.0,...,972.0,3.0,0.0,0.0,762.0,10.0,3.0,41.0,26.0,"13.0,"
2,Aymeric Laporte,Manchester City,"[('22/23', 'Knee Surgery', 'Jun 30, 2022', 'Oc...",1232,es ESP,DF,Manchester City,28-182,1994,4.0,...,279.0,2.0,1.0,0.0,216.0,0.0,2.0,11.0,11.0,"4.0,"
3,John Stones,Manchester City,"[('22/23', 'Hamstring Injury', 'Sep 26, 2022',...",2101,eng ENG,DF,Manchester City,28-181,1994,9.0,...,686.0,5.0,3.0,1.0,541.0,3.0,3.0,37.0,10.0,"6.0,"
4,Nathan Aké,Manchester City,"[('20/21', 'Hamstring Injury', 'Dec 27, 2020',...",37,nl NED,DF,Manchester City,27-280,1995,8.0,...,648.0,0.0,3.0,0.0,538.0,2.0,4.0,24.0,13.0,"10.0,"


In [None]:
# from google.colab import files

# df_injury.to_csv('df.csv')
# files.download('df.csv')

### Unpacking arrays

There are certain columns containing arrays of tuples (historical data) that cannot be used as features.

Therefore, we must unpack the values within these arrays to make usable features for the model.

In [5]:
# INJURIES COLUMN
df_injury['injuries'] = df_injury['injuries'].apply(lambda arr: ast.literal_eval(arr)) # only run this once to transform into array (currently held as string)
df_injury['n_injuries'] = df_injury['injuries'].apply(lambda arr: len(arr)) # total number of injuries suffered
df_injury['n_severe_injuries'] = df_injury['injuries'].apply(lambda arr: sum([1 for x in arr if int(x[4]) > 60])) # total number of severe injuries suffered (defined as 60 days missed or greater)

### Target column

In the cell below we create our target variable column.

In [6]:
from datetime import datetime

df_injury['currently_injured'] = df_injury['injuries'].apply(lambda arr: sum([1 for x in arr if x[3] == '-' or datetime.strptime(x[3], "%b %d, %Y") > datetime.now()])) # currently injured players

### Datatype conversions, non-usable features and dummy variables

In [7]:
# players who have not stepped foot onto the field, will have 0s for 90% of columns
# we will remove these rows from the dataset (total of 24, 1 injured)
df_injury = df_injury[df_injury['minutes_90s'] != 0]

In [8]:
# df_injury['height'] = df_injury['height'].apply(lambda x: int(str(x).replace(",","")))

#Drop non usable features (categorical/ non numerical)
colums_to_drop = ["name", "club_x", "club_y", "Unnamed: 0", "birth_year", "nationality", "injuries"]
df_injury.drop(columns = colums_to_drop, inplace = True)

# convert age to float64 column
df_injury['age'] = df_injury['age'].apply(lambda row : float(row[0:2])+(float(row[3:6])/365))

# alternate position column
df_injury['alt_position'] = df_injury['position'].apply(lambda row: row.split(',')[1] if ',' in row else '') # might be null instead of ''
df_injury['position'] = df_injury['position'].apply(lambda row: row[0:2])

#dummy variables for role data
df_injury = pd.get_dummies(df_injury, columns=['position', 'alt_position'])
df_injury['position_DF'] = df_injury['position_DF'] + df_injury['alt_position_DF']
df_injury['position_MF'] = df_injury['position_MF'] + df_injury['alt_position_MF']
df_injury['position_FW'] = df_injury['position_FW'] + df_injury['alt_position_FW']

df_injury = df_injury.rename(columns={'aerials_lost,': 'aerials_lost'})
df_injury['aerials_lost'] = df_injury['aerials_lost'].apply(lambda row: float(row.replace(',', '')))
df_injury.head()

#Drop alternate columns
colums_to_drop = ["alt_position_", "alt_position_DF", "alt_position_MF", "alt_position_FW"]
df_injury.drop(columns = colums_to_drop, inplace = True)
df_injury['currently_injured'] = df_injury['currently_injured'].replace(2, 1)


# NEW ADDITION SINCE SUNDAY
#Make columns on per 90 minutes basis
season_tot_cols = list(df_injury.columns[7:43])
season_tot_cols.extend(['cards_yellow', 'cards_red'])

for col in season_tot_cols:
    df_injury[col] = np.where(df_injury['minutes_90s'] != 0, df_injury[col] / df_injury['minutes_90s'], 0)

# engineered?
df_injury['minutes_per_appearance'] = df_injury['minutes'] / df_injury['games']
df_injury['game_starts_percent'] = df_injury['games_starts'] / df_injury['games']

#Drop seasonal cumulative columns
colums_to_drop = ["games", "games_starts", "minutes", "minutes_90s"]
df_injury.drop(columns = colums_to_drop, inplace = True)

In [9]:
#final data set columns
df_injury.dtypes

age                            float64
cards_yellow                   float64
cards_red                      float64
goals                          float64
shots                          float64
shots_free_kicks               float64
passes_completed               float64
passes                         float64
passes_total_distance          float64
passes_progressive_distance    float64
passes_short                   float64
passes_medium                  float64
passes_long                    float64
passes_live                    float64
passes_dead                    float64
passes_free_kicks              float64
through_balls                  float64
passes_switches                float64
crosses                        float64
corner_kicks                   float64
tackles                        float64
tackles_won                    float64
dribble_tackles                float64
dribbled_past                  float64
blocks                         float64
interceptions            

# Splitting data into train and test sets

In [10]:
#create test and train sets
properties = list(df_injury.columns.values)
properties.remove('currently_injured')
X = df_injury[properties]
y = df_injury['currently_injured']


isf = IsolationForest(n_jobs=-1, random_state=1)
isf.fit(X, y)
preds = isf.predict(X)

X['outlier'] = preds
X = X.drop(X[X['outlier'] == -1].index)
X = X.drop('outlier', axis=1)

y = pd.DataFrame(y)
y['outlier'] = preds
y = y.drop(y[y['outlier'] == -1].index)
y = y.drop('outlier', axis=1)
y = y['currently_injured'].squeeze()

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=1)
X_val, X_train, y_val, y_train = train_test_split(X_train, y_train, test_size=0.8, random_state=1)

## Fixing Class Imbalance

In [11]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', n_jobs=-1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Model Engineering Techniques

## Feature Engineering

### Algorithmic Method using F-score

In [12]:
# Create a list of F-values for the existing features
feature_F_scores, _ = f_classif(X_train, y_train)

# Iterate through each combination of features
for f1_index, f1 in enumerate(X_train.columns):
  for f2_index, f2 in enumerate(X_train.columns[f1_index + 1:]):
    
    # Multiply the two features to create a new feature
    new_feature = X_train[[f1]].multiply(X_train[f2], axis=0)
    
    # Evaluate F-value of new feature
    F_Score_new, p_value_new = f_classif(new_feature, y_train)
    
    # Evaluate the relative improvement of the new feature
    
    F_score_improvement = F_Score_new[0] / max(feature_F_scores[[f1_index, f2_index]])
    # Print out features that is sufficiently improved 
    
    if F_score_improvement >= 1.5 and F_Score_new[0] >= 250 and p_value_new < 0.05:
        '''
        Note that F_score_improvement >= 1.5 and F_Score_new[0] >= 250 is
        relatively arbitrary, and that other values could be used.
        '''

        print(f'{f1} * {f2} has an F-score of {F_Score_new[0]:.2f}')
        print(f'\tBetter by a factor of {F_score_improvement:.2f} over features in isolation')
        print(f'\tThe result is significant (p = {p_value_new})')
        print('')


age * position_MF has an F-score of 303.11
	Better by a factor of 4.60 over features in isolation
	The result is significant (p = [1.0577829e-63])

passes_completed * position_MF has an F-score of 340.00
	Better by a factor of 23.71 over features in isolation
	The result is significant (p = [1.03320706e-70])

passes * position_MF has an F-score of 350.28
	Better by a factor of 37.52 over features in isolation
	The result is significant (p = [1.19699239e-72])

passes_total_distance * position_MF has an F-score of 317.11
	Better by a factor of 35.90 over features in isolation
	The result is significant (p = [2.25138019e-66])

passes_progressive_distance * position_MF has an F-score of 311.07
	Better by a factor of 8.51 over features in isolation
	The result is significant (p = [3.1796545e-65])

passes_short * position_MF has an F-score of 353.02
	Better by a factor of 19.05 over features in isolation
	The result is significant (p = [3.6716064e-73])

passes_medium * position_MF has an F-s

### Using Correlation

In [13]:
# Iterate through each combination of features
for f1_index, f1 in enumerate(X_train.columns):
  for f2_index, f2 in enumerate(X_train.columns[f1_index + 1:]):
    # Multiply the two features to create a new feature
    new_feature = X_train[[f1]].multiply(X_train[f2], axis=0)
    
    new_feature_ser = new_feature.iloc[:,0]
    corr_val = new_feature_ser.corr(y_train)

    # Evaluate F-value of new feature
    F_Score_new, p_value_new = f_classif(new_feature, y_train)
    if corr_val > 0.2 and p_value_new < 0.05:
        print(f'{f1} + {f2} has a correlation of {corr_val:.2f} with the target feature')
        print(f'\tThe result is significant (p = {p_value_new})')


age + n_injuries has a correlation of 0.26 with the target feature
	The result is significant (p = [4.80667343e-36])
passes + n_injuries has a correlation of 0.22 with the target feature
	The result is significant (p = [3.59737881e-25])
passes_medium + n_injuries has a correlation of 0.22 with the target feature
	The result is significant (p = [2.65835646e-25])
passes_live + n_injuries has a correlation of 0.21 with the target feature
	The result is significant (p = [1.5808591e-22])
passes_dead + n_injuries has a correlation of 0.21 with the target feature
	The result is significant (p = [7.50529796e-23])
tackles_won + n_injuries has a correlation of 0.21 with the target feature
	The result is significant (p = [1.84853627e-22])
dribbled_past + n_injuries has a correlation of 0.23 with the target feature
	The result is significant (p = [7.63445773e-29])
clearances + n_injuries has a correlation of 0.21 with the target feature
	The result is significant (p = [6.25086589e-24])
touches + n

### Final selection

In [14]:
def new_feature_combos(X):
    X_new = X.copy()
    
    # Correlation of 0.3
    X_new['n_injuries * age'] = X_new.n_injuries * X_new.age

    # Correlation of 0.28
    X_new['n_injuries * touches'] = X_new.n_injuries * X_new.touches
    
    # Correlation of 0.29
    X_new['n_injuries * miscontrols'] = X_new.n_injuries * X_new.miscontrols

    return X_new

X_train = new_feature_combos(X_train)
X_test = new_feature_combos(X_test)
X_val = new_feature_combos(X_val)

## Feature Selection

In [15]:
from sklearn.feature_selection import SelectKBest, f_classif

selectk = SelectKBest(k=15)
X_new_train = selectk.fit(X_train, y_train)
X_new_train.transform(X_train)
new_cols = X_new_train.get_feature_names_out()

X_train = X_train[new_cols]
X_test = X_test[new_cols]
X_val = X_val[new_cols]

# select_features = SelectKBest(f_classif, k=15)
# select_features = select_features.fit(X_train_scaled, y_train)

# # Get mask of columns that have good features
# feature_mask = select_features.get_support()
# X_train_fe.columns[feature_mask]

# # select features
# X_train_scaled_fs = X_train_scaled.iloc[:, feature_mask]
# X_test_scaled_fs = X_test_scaled.iloc[:, feature_mask]
# X_val_scaled_fs = X_val_scaled.iloc[:, feature_mask]

## Feature Scaling 

In [16]:
from sklearn.preprocessing import StandardScaler

technique_name = 'Scaling'

def standardize_data(X_train, X_out_of_sample, X_val):
    """standardizes all of the data in X_train and X_out_of_sample. The mean and
    standard deviation of each feature (i.e., each column) from the X_train
    data is used to standardize both the X_train and X_out_of sample."""

    X_train_standardized = X_train.copy()
    X_out_of_sample_standardized = X_out_of_sample.copy()
    X_val_standardized = X_val.copy() 

    scaler = StandardScaler()
    
    '''Use scaler to standardize your data. You'll need to fit scaler with your
    training data (use the fit method) and standardize your training and 
    out-of-sample data (use the transform method)'''

    scaler.fit(X_train) 
    X_train_standardized = scaler.transform(X_train_standardized)
    X_out_of_sample_standardized = scaler.transform(X_out_of_sample_standardized)
    X_val_standardized = scaler.transform(X_val_standardized)

    return X_train_standardized, X_out_of_sample_standardized, X_val_standardized, scaler

# Make new data that is scaled
X_train, X_test, X_val, scaler = standardize_data(X_train, X_test, X_val)