# Imports

### Libraries & Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Useful Imports from sklearn
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

### Data

In [2]:
import os
cwd = os.getcwd()
df_injury=pd.read_csv(os.path.join(cwd,'final_data.csv'), index_col=0)
df_injury.head()

Unnamed: 0,age,games,games_starts,minutes,cards_yellow,cards_red,minutes_90s,goals,shots,shots_free_kicks,...,ball_recoveries,aerials_won,aerials_lost,n_injuries,n_severe_injuries,currently_injured,position_DF,position_FW,position_GK,position_MF
0,29.273973,14.0,14.0,1260.0,0.0,0.0,14.0,0.0,0.0,0.0,...,19.0,3.0,0.0,6,0,0,0,0,1,0
1,25.534247,12.0,9.0,873.0,0.0,0.0,9.7,0.0,5.0,0.0,...,41.0,26.0,13.0,4,0,0,1,0,0,0
2,28.49863,4.0,3.0,286.0,0.0,0.0,3.2,0.0,7.0,0.0,...,11.0,11.0,4.0,14,4,0,1,0,0,0
3,28.49589,9.0,8.0,731.0,1.0,0.0,8.1,0.0,5.0,0.0,...,37.0,10.0,6.0,15,1,0,1,0,0,0
4,27.767123,8.0,7.0,571.0,0.0,0.0,6.3,0.0,3.0,0.0,...,24.0,13.0,10.0,10,1,0,1,0,0,0


# Splitting data into train, test and validation sets

In [3]:
from sklearn.ensemble import IsolationForest

#create test and train sets
properties = list(df_injury.columns.values)
properties.remove('currently_injured')
X = df_injury[properties]
y = df_injury['currently_injured']


isf = IsolationForest(n_jobs=-1, random_state=1)
isf.fit(X, y)
preds = isf.predict(X)

X['outlier'] = preds
X = X.drop(X[X['outlier'] == -1].index)
X = X.drop('outlier', axis=1)

y = pd.DataFrame(y)
y['outlier'] = preds
y = y.drop(y[y['outlier'] == -1].index)
y = y.drop('outlier', axis=1)
y = y['currently_injured'].squeeze()

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=1)
X_val, X_train, y_val, y_train = train_test_split(X_train, y_train, test_size=0.8, random_state=1)

### Fix Class Imbalance

In [4]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', n_jobs=-1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Model Engineering Techniques

### Feature Engineering

#### F-score method

In [5]:
# Create a list of F-values for the existing features
feature_F_scores, _ = f_classif(X_train, y_train)

# Iterate through each combination of features
for f1_index, f1 in enumerate(X_train.columns):
  for f2_index, f2 in enumerate(X_train.columns[f1_index + 1:]):
    
    # Multiply the two features to create a new feature
    new_feature = X_train[[f1]].multiply(X_train[f2], axis=0)
    # Evaluate F-value of new feature
    F_Score_new, p_value_new = f_classif(new_feature, y_train)
    # Evaluate the relative improvement of the new feature
    F_score_improvement = F_Score_new[0] / max(feature_F_scores[[f1_index, f2_index]])
    # Print out features that is sufficiently improved 
    if F_score_improvement >= 1.5 and F_Score_new[0] >= 250 and p_value_new < 0.05:
        '''Note that F_score_improvement >= 1.5 and F_Score_new[0] >= 75 is
         relatively arbitrary, and that other values could be used.'''
        print(f'{f1} * {f2} has an F-score of {F_Score_new[0]:.2f}')
        print(f'\tBetter by a factor of {F_score_improvement:.2f} over features in isolation')
        print(f'\tThe result is significant (p = {p_value_new})')


age * games has an F-score of 286.77
	Better by a factor of 14541.24 over features in isolation
	The result is significant (p = [2.65445942e-60])
age * position_MF has an F-score of 269.03
	Better by a factor of 4.21 over features in isolation
	The result is significant (p = [6.71013689e-57])


#### Correlation

In [6]:
# Iterate through each combination of features
for f1_index, f1 in enumerate(X_train.columns):
  for f2_index, f2 in enumerate(X_train.columns[f1_index + 1:]):
    # Multiply the two features to create a new feature
    new_feature = X_train[[f1]].multiply(X_train[f2], axis=0)
    
    new_feature_ser = new_feature.iloc[:,0]
    corr_val = new_feature_ser.corr(y_train)

    # Evaluate F-value of new feature
    F_Score_new, p_value_new = f_classif(new_feature, y_train)
    if corr_val > 0.2 and p_value_new < 0.05:
        print(f'{f1} * {f2} has a correlation of {corr_val:.2f} with the target feature')
        print(f'\tThe result is significant (p = {p_value_new})')


age * n_injuries has a correlation of 0.24 with the target feature
	The result is significant (p = [1.16488646e-28])


#### Final selection

In [7]:
def new_feature_combos(X):
    X_new = X.copy()
    
    X_new['age * n_injuries'] = X_new.n_injuries * X_new.age
    X_new['age * games'] = X_new.n_injuries * X_new.age
    X_new['age * position_MF'] = X_new.n_injuries * X_new.touches

    return X_new

X_train = new_feature_combos(X_train)
X_test = new_feature_combos(X_test)
X_val = new_feature_combos(X_val)

### Feature Selection

In [8]:
from sklearn.feature_selection import SelectKBest, f_classif

selectk = SelectKBest(k=15)
X_new_train = selectk.fit(X_train, y_train)
X_new_train.transform(X_train)
selected_cols = X_new_train.get_feature_names_out()

X_train = X_train[selected_cols]
X_test = X_test[selected_cols]
X_val = X_val[selected_cols]

print(selected_cols)

['games' 'games_starts' 'minutes' 'minutes_90s' 'passes_completed'
 'passes' 'passes_total_distance' 'passes_short' 'passes_live' 'touches'
 'touches_live_ball' 'passes_received' 'ball_recoveries' 'n_injuries'
 'position_MF']


### Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

technique_name = 'Scaling'

def standardize_data(X_train, X_out_of_sample, X_val):
    """standardizes all of the data in X_train and X_out_of_sample. The mean and
    standard deviation of each feature (i.e., each column) from the X_train
    data is used to standardize both the X_train and X_out_of sample."""

    X_train_standardized = X_train.copy()
    X_out_of_sample_standardized = X_out_of_sample.copy()
    X_val_standardized = X_val.copy() 

    scaler = StandardScaler()

    scaler.fit(X_train) 
    X_train_standardized = scaler.transform(X_train_standardized)
    X_out_of_sample_standardized = scaler.transform(X_out_of_sample_standardized)
    X_val_standardized = scaler.transform(X_val_standardized)

    return X_train_standardized, X_out_of_sample_standardized, X_val_standardized, scaler

# Make new data that is scaled
X_train, X_test, X_val, scaler = standardize_data(X_train, X_test, X_val)

# Download Final Data

In [14]:
# Feature data stored in np arrays so must convert back to df.
pd.DataFrame(data=X_train).to_csv('X_train.csv')
pd.DataFrame(data=X_test).to_csv('X_test.csv')
pd.DataFrame(data=X_val).to_csv('X_val.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')
y_val.to_csv('y_val.csv')