<center><h1><font size=6> Initial Data Processing and Feature Engineering </font></h1></center>

### Load libraries and setup notebook configuration

In [1]:
# import packages
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor



# set pandas display configuration
pd.set_option("display.precision", 2) # display floats to 2 decimal places
pd.set_option("display.max.columns", None) # display all columns so we can view the whole dataset
pd.set_option('display.float_format', '{:.2f}'.format) # fixed-point formatting with 2 decimals, which also disables scientific notation
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning) # silence pandas SettingWithCopyWarning


# set directories
# NOTE(review): os.chdir('..') is relative to the current directory, so re-running this cell without restarting the kernel moves up one more level each time
os.chdir('..') # change current working directory to the parent directory to help access files/directories at a higher level
DATAPATH = Path(r'data') # data directory, given relative to the new working directory


# import from source directory
from src import constants

### Load training data

In [70]:
# read the processed training features and targets from DATAPATH/processed
X_train_full = pd.read_csv(DATAPATH / "processed" / "X_train_full.csv")
y_train_full = pd.read_csv(DATAPATH / "processed" / "y_train_full.csv")

In [71]:
# take a copy of the loaded features so `train` is a separate object from X_train_full
train = X_train_full.copy()

### Dealing with missing values

In [72]:
# define function to summarise missing values
def missing_values_summary(df):
    """Summarise how many values are missing in each column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        'Total Missing Values' (number of nulls in the column) and
        'Missing Percentage' (that number as a percentage of the row count).
    """
    n_rows = df.shape[0]
    null_counts = df.isnull().sum()
    null_share = (null_counts / n_rows) * 100

    return pd.DataFrame(
        {
            'Total Missing Values': null_counts,
            'Missing Percentage': null_share,
        }
    )

# summarise missing values per column of the training frame, before any of the transformers below are applied
summary = missing_values_summary(train)
print(summary)

                                    Total Missing Values  Missing Percentage
unique_match_id                                        0                0.00
season                                                 0                0.00
date                                                   0                0.00
day_of_week                                            0                0.00
round                                                  0                0.00
day                                                    0                0.00
team                                                   0                0.00
promoted                                               0                0.00
opponent                                               0                0.00
promoted_opponent                                      0                0.00
home                                                   0                0.00
days_since_last_game                                 434                2.43

#### Define a set of custom transformers inherited from scikit-learn base classes to transform the data

For some columns, we only have a small number of irregular missing values. Here, it makes sense to simply remove the rows that contain these missing values from the data.

In [73]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# define custom transformer to remove any rows that have NA values in the chosen columns
class DropNaRowsTransformer(BaseEstimator, TransformerMixin):
    """Transformer that drops every row with a missing value in any of ``columns``.

    Parameters
    ----------
    columns : list of str
        Column labels handed to ``DataFrame.dropna(subset=...)``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer is returned as is."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the rows that have an NA in ``self.columns``."""
        return X.dropna(subset=self.columns)
    
# columns in which a missing value causes the whole row to be dropped
columns_to_remove_nas = [
    # match identifiers and fixture details
    'unique_match_id',
    'season',
    'date',
    'day_of_week',
    'round',
    'day',
    'team',
    'home',
    'promoted',
    'opponent',
    'promoted_opponent',
    # league positions
    'pl_position',
    'pl_position_opponent',
    # running league totals for the team
    'pl_total_points',
    'pl_total_gf',
    'pl_total_ga',
    'pl_total_goal_diff',
    # running league totals for the opponent
    'pl_total_points_opponent',
    'pl_total_gf_opponent',
    'pl_total_ga_opponent',
    'pl_total_goal_diff_opponent',
]

For other columns, it makes sense to impute the missing values with the average of the column. Examples of this are the days since last game and the number of games in the last 21 days features (for both the team and the opponent). From the EDA, we saw that the missing values for these features tend to all be in the first game week due to the lack of data. It therefore makes sense to just impute them with the feature's overall average in the training data. This means all teams/opponents will have the same value in the first gameweek, which makes sense intuitively.

In [74]:
# define a custom transformer to impute missing values with the mean of each column
class MeanImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in ``columns`` with the column means computed in ``fit``.

    Parameters
    ----------
    columns : list of str
        Columns to impute.

    Notes
    -----
    ``means`` maps each column name to ``{'mean': <value>}`` once ``fit`` has run.
    """

    def __init__(self, columns):
        self.columns = columns
        self.means = {}

    def fit(self, X, y=None):
        """Record the mean of every configured column of ``X``."""
        for name in self.columns:
            column_mean = X[name].mean()
            self.means[name] = {'mean': column_mean}
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose NAs in the configured columns are replaced by the stored means."""
        filled = X.copy()
        for name in self.columns:
            fill_value = self.means[name]['mean']
            filled[name] = filled[name].fillna(value=fill_value)
        return filled
    
    
# columns whose missing values are replaced by the column mean
columns_to_impute_with_mean = [
    'days_since_last_game',
    'games_played_last_21_days',
    'days_since_last_game_opponent',
    'games_played_last_21_days_opponent',
]

For some variables, whether there is a missing value or not is highly linked to the team's promotion status. This is because promoted teams don't appear in the previous season's data, so we can't calculate some metrics. For these cases, it makes sense to impute the missing values based on promotion status: promoted teams receive the 10th percentile of the feature's historic values, while non-promoted teams receive the median (50th percentile).

In [75]:
# Define custom transformer for imputing missing values based on the promotion status - promoted sides get the 10th percentile of the column, all others the 50th percentile
class PromotedImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with a percentile chosen by promotion status.

    Parameters
    ----------
    columns : list of str
        Team columns; the 'promoted' column decides which percentile is used.
    columns_opponent : list of str
        Opponent columns; the 'promoted_opponent' column decides which percentile is used.

    Notes
    -----
    After ``fit``, ``percentiles`` and ``percentiles_opponent`` map each column
    name to ``{'10th': ..., '50th': ...}`` computed over the whole column.
    """

    def __init__(self, columns, columns_opponent):
        self.columns = columns
        self.columns_opponent = columns_opponent
        self.percentiles = {}
        self.percentiles_opponent = {}

    def fit(self, X, y=None):
        """Store the 10th and 50th percentile of every configured column of ``X``."""
        for names, store in (
            (self.columns, self.percentiles),
            (self.columns_opponent, self.percentiles_opponent),
        ):
            for name in names:
                series = X[name]
                store[name] = {'10th': series.quantile(0.1), '50th': series.quantile(0.5)}
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NAs filled according to the promotion flag of each row."""
        filled = X.copy()
        groups = (
            ('promoted', self.columns, self.percentiles),
            ('promoted_opponent', self.columns_opponent, self.percentiles_opponent),
        )
        for flag, names, store in groups:
            for name in names:
                # rows with flag == 1 receive the 10th percentile, rows with flag == 0 the 50th
                for flag_value, key in ((1, '10th'), (0, '50th')):
                    rows = filled[flag] == flag_value
                    filled.loc[rows, name] = filled.loc[rows, name].fillna(value=store[name][key])
        return filled
    
    
# columns imputed by promotion status: first the team's features, then the matching opponent features
columns_to_impute_based_on_promotion_status = [
    'prev_season_points',
    'prev_season_gf',
    'prev_season_ga',
    'prev_season_goal_diff',
    'points_pl_form',
    'gf_pl_form',
    'ga_pl_form',
]
columns_to_impute_based_on_promotion_status_oppontent = [
    'prev_season_points_opponent',
    'prev_season_gf_opponent',
    'prev_season_ga_opponent',
    'prev_season_goal_diff_opponent',
    'points_pl_form_opponent',
    'gf_pl_form_opponent',
    'ga_pl_form_opponent',
]

For some other columns, we might want to impute the missing values in a more sophisticated way. For example, the teams' and opponents' H2H record is found to have a high correlation with the target variable (see EDA) and so could be an important determinant in our model. However, where two teams haven't played each other before, we won't have this metric. I want to impute the missing values here using a decision tree regressor trained on important determinants of game results, namely the venue of the match and the team's and opponent's points achieved in the previous season. This will impute the head-to-head records based on a measure of how good the team and the opponent are, as well as the venue identifier.

In [80]:
from sklearn.impute import KNNImputer

# define a custom transformer to impute missing values with a decision tree that is fitted, per target column, on the rows where that column is observed
class DecisionTreeClassifierImputer(BaseEstimator, TransformerMixin):
    """Impute missing values in ``target_columns`` with one decision tree per column.

    Despite the class name (kept so existing pipelines keep working), every model
    is a ``DecisionTreeRegressor``. For each target column a tree is fitted on the
    rows where the target is observed, using ``reference_columns`` as features;
    ``transform`` then predicts the target for the rows where it is missing.

    Parameters
    ----------
    target_columns : list of str
        Columns whose missing values are imputed.
    reference_columns : list of str
        Feature columns the trees are trained on and predict from.
    random_state : int, RandomState instance or None, default=None
        Forwarded to every ``DecisionTreeRegressor``. The default ``None`` keeps
        the regressor's own default; pass an int for reproducible trees.

    Notes
    -----
    After ``fit``, ``decision_tree_classifiers`` maps each target column to its
    fitted regressor.
    """

    def __init__(self, target_columns, reference_columns, random_state=None):
        self.target_columns = target_columns
        self.reference_columns = reference_columns
        self.random_state = random_state
        self.decision_tree_classifiers = {}

    def fit(self, X, y=None):
        """Fit a separate regressor for every target column.

        Raises
        ------
        ValueError
            If a target column has no observed values to train on.
        """
        # start from a clean slate so a re-fit never keeps models from an earlier fit
        self.decision_tree_classifiers = {}

        for target_column in self.target_columns:
            # training rows are the ones where the target column is not NA
            observed = X[target_column].notna()
            if not observed.any():
                raise ValueError(
                    f"Cannot fit an imputer for '{target_column}': the column has no observed values."
                )

            decision_tree = DecisionTreeRegressor(random_state=self.random_state)
            decision_tree.fit(
                X.loc[observed, self.reference_columns],
                X.loc[observed, target_column],
            )
            self.decision_tree_classifiers[target_column] = decision_tree

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing target values replaced by tree predictions."""
        X_transformed = X.copy()

        for target_column in self.target_columns:
            missing = X[target_column].isna()

            # nothing to impute: predicting on an empty frame raises, so skip the column
            if not missing.any():
                continue

            decision_tree = self.decision_tree_classifiers[target_column]
            predictions = decision_tree.predict(X.loc[missing, self.reference_columns])

            # write back through the boolean mask (positional) rather than index labels,
            # so frames with repeated index labels are handled correctly
            X_transformed.loc[missing, target_column] = predictions

        return X_transformed

# head-to-head columns to impute with the decision tree imputer
columns_to_impute_based_on_decision_tree = [
    'last_h2h',
    'last_h2h_form',
    'last_h2h_venue',
    'last_h2h_venue_form',
]
# features the decision trees are trained on
reference_columns_for_decision_tree = [
    'home',
    'prev_season_points',
    'prev_season_points_opponent',
]

#### Transform data using transformation pipeline based on pre-defined custom transformers

In [81]:
# assemble the preprocessing pipeline; the steps run in the order listed
pipe = Pipeline([
    # 1. drop rows that have NAs in the columns listed in columns_to_remove_nas
    (
        "remove_rows_with_nas",
        DropNaRowsTransformer(columns=columns_to_remove_nas),
    ),
    # 2. fill the listed columns with their column mean
    (
        "impute_with_means_of_columns",
        MeanImputer(columns=columns_to_impute_with_mean),
    ),
    # 3. fill team/opponent columns with a percentile chosen by promotion status
    (
        "impute_with_mean_of_columns_by_promotion_status",
        PromotedImputer(
            columns=columns_to_impute_based_on_promotion_status,
            columns_opponent=columns_to_impute_based_on_promotion_status_oppontent,
        ),
    ),
    # 4. fill the head-to-head columns with decision tree predictions
    (
        "impute_based_on_decision_tree_algorithm",
        DecisionTreeClassifierImputer(
            target_columns=columns_to_impute_based_on_decision_tree,
            reference_columns=reference_columns_for_decision_tree,
        ),
    ),
])

In [82]:
# fit every pipeline step on the training frame and apply it, then re-run the missing value summary on the result
transformed_train = pipe.fit_transform(train)
summary = missing_values_summary(transformed_train)
print(summary)

                                    Total Missing Values  Missing Percentage
unique_match_id                                        0                0.00
season                                                 0                0.00
date                                                   0                0.00
day_of_week                                            0                0.00
round                                                  0                0.00
day                                                    0                0.00
team                                                   0                0.00
promoted                                               0                0.00
opponent                                               0                0.00
promoted_opponent                                      0                0.00
home                                                   0                0.00
days_since_last_game                                   0                0.00

In [21]:
 X_transformed = X.copy()
        
# NOTE(review): exploratory cell that depends on hidden kernel state -- `X`, `target_columns` and
# `reference_columns` are only assigned in a cell further down this notebook, and `imputers` is not
# assigned in any visible cell, so this cell fails on Restart & Run All.
# Apply KNN imputation for each target column
for target_column in target_columns:
    imputer = imputers[target_column]

    # Separate the target column and the reference columns
    X_reference = X.loc[:, reference_columns]
    X_target = X.loc[:, target_column]

    # Find the indices of missing values in the target column
    missing_indices = np.isnan(X_target)

    # Impute the missing values in the target column using KNN
    # NOTE(review): the imputer is applied to the reference columns only, so the result has one column
    # per reference column (see the (17879, 3) shape printed below) and no column for the target
    imputed_values = imputer.transform(X_reference)
    #X_transformed.loc[missing_indices, target_column] = imputed_values[missing_indices, reference_columns.index(target_column)]
imputed_values.shape

(17879, 3)

In [22]:
# NOTE(review): exploratory cell that depends on hidden kernel state -- `X_target`, `imputer` and
# `X_reference` must already exist in the kernel; nothing above this cell defines `imputer`.
# plain assignment: both names refer to the same Series object, no copy is made
X_target_transformed = X_target

# Fit the KNNImputer on the reference columns
imputer.fit(X_reference)

# Find the indices of missing values in the target column
missing_indices = np.isnan(X_target_transformed)

# Impute the missing values in the target column using KNN
# NOTE(review): fitted and applied on the reference columns only, so the array shown below
# contains the reference columns, not the target column
imputed_values = imputer.transform(X_reference)
#X_target_transformed[missing_indices] = imputed_values[missing_indices, target_columns.index(target_column)]
imputed_values

array([[ 1., 41., 35.],
       [ 0., 58., 50.],
       [ 0., 68., 54.],
       ...,
       [ 0., 44., 53.],
       [ 0., 61., 60.],
       [ 1., 59., 55.]])

In [12]:
# NOTE(review): leftover check of the number of NAs remaining in the scratch Series; relies on `X_target_transformed` from another cell
X_target_transformed.isna().sum()

0

In [24]:
from sklearn.impute import KNNImputer

# NOTE(review): exploratory KNN imputation experiment; it is not part of the pipeline above.

# set inputs
X = transformed_train
target_columns = ['last_h2h', 'last_h2h_form', 'last_h2h_venue', 'last_h2h_venue_form']
reference_columns = ['home', 'prev_season_points', 'prev_season_points_opponent']
n_neighbors = 5
imputer = KNNImputer(n_neighbors=n_neighbors)


target_column = target_columns[0]

# Separate the target column and the reference columns
X_reference = X.loc[:, reference_columns]
X_target = X.loc[:, target_column]
# copy, so X_target keeps the un-imputed values and the two Series can be compared afterwards
X_target_transformed = X_target.copy()

# Fit the KNNImputer on the reference columns *plus* the target column. Fitted on the reference
# columns alone, the imputer never sees the target and its output only contains the reference
# columns, so indexing that output would copy a reference feature into the target.
knn_columns = reference_columns + [target_column]
imputed_values = imputer.fit_transform(X.loc[:, knn_columns])

# Find the positions of missing values in the target column
missing_indices = X_target.isna()

# Impute the missing values in the target column using KNN; the target is the last column of knn_columns
X_target_transformed[missing_indices] = imputed_values[missing_indices.to_numpy(), -1]

In [26]:
# NOTE(review): leftover debugging display of the scratch target Series
X_target

0       1.00
1       1.00
2       1.00
3       1.00
4       0.00
        ... 
17884   1.00
17885   0.00
17886   3.00
17887   0.00
17888   0.00
Name: last_h2h, Length: 17879, dtype: float64

In [27]:
# NOTE(review): leftover debugging display; where this name was bound with `X_target_transformed = X_target` it is the same object as `X_target`
X_target_transformed

0       1.00
1       1.00
2       1.00
3       1.00
4       0.00
        ... 
17884   1.00
17885   0.00
17886   3.00
17887   0.00
17888   0.00
Name: last_h2h, Length: 17879, dtype: float64

In [55]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Define the columns for each preprocessing step
# NOTE(review): these assignments rebind names that are also defined earlier in the notebook;
# the earlier columns_to_remove_nas and columns_to_impute_with_mean lists also contain the opponent columns
columns_to_remove_nas = [
    'unique_match_id',
    'season',
    'date',
    'day_of_week',
    'round',
    'day',
    'team',
    'home',
    'promoted',
    'opponent',
    'promoted_opponent',
    'pl_total_points',
    'pl_total_gf',
    'pl_total_ga',
    'pl_total_goal_diff',
    'pl_position',
    'pl_position_opponent',
]
columns_to_impute_with_mean = [
    'days_since_last_game',
    'games_played_last_21_days',
]
columns_to_impute_based_on_promotion_status = [
    'prev_season_points',
    'prev_season_gf',
    'prev_season_ga',
    'prev_season_goal_diff',
    'points_pl_form',
    'gf_pl_form',
    'ga_pl_form',
]
columns_to_impute_based_on_pl_position = [
    'last_h2h',
    'last_h2h_form',
    'last_h2h_venue',
    'last_h2h_venue_form',
]


# define custom transformer, inherited from the sklearn.base BaseEstimator and TransformerMixin classes
# NOTE(review): a class with this name is also defined earlier in the notebook; this definition replaces it when the cell runs
class DropNaRowsTransformer(BaseEstimator, TransformerMixin):
    """Transformer that drops every row with a missing value in any of ``columns``.

    Parameters
    ----------
    columns : list of str
        Column labels handed to ``DataFrame.dropna(subset=...)``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer is returned as is."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the rows that have an NA in ``self.columns``."""
        return X.dropna(subset=self.columns)


# define a custom transformer to impute missing values with the mean of each column
# NOTE(review): a class with this name is also defined earlier in the notebook; this definition replaces it when the cell runs
class MeanImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in ``columns`` with the column means computed in ``fit``.

    Parameters
    ----------
    columns : list of str
        Columns to impute.

    Notes
    -----
    ``means`` maps each column name to ``{'mean': <value>}`` once ``fit`` has run.
    """

    def __init__(self, columns):
        self.columns = columns
        self.means = {}

    def fit(self, X, y=None):
        """Record the mean of every configured column of ``X``."""
        for name in self.columns:
            column_mean = X[name].mean()
            self.means[name] = {'mean': column_mean}
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose NAs in the configured columns are replaced by the stored means."""
        filled = X.copy()
        for name in self.columns:
            fill_value = self.means[name]['mean']
            filled[name] = filled[name].fillna(value=fill_value)
        return filled
    
    
# Define custom transformer for imputing missing values based on the promotion status - promoted teams get the 10th percentile of the column, all others the 50th percentile
# NOTE(review): a class with this name is also defined earlier in the notebook with an extra `columns_opponent` parameter; this definition replaces it when the cell runs
class PromotedImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with a percentile chosen by the 'promoted' flag.

    Parameters
    ----------
    columns : list of str
        Columns to impute.

    Notes
    -----
    After ``fit``, ``percentiles`` maps each column name to
    ``{'10th': ..., '50th': ...}`` computed over the whole column.
    """

    def __init__(self, columns):
        self.columns = columns
        self.percentiles = {}

    def fit(self, X, y=None):
        """Store the 10th and 50th percentile of every configured column of ``X``."""
        for name in self.columns:
            series = X[name]
            self.percentiles[name] = {'10th': series.quantile(0.1), '50th': series.quantile(0.5)}
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NAs filled according to the 'promoted' flag of each row."""
        filled = X.copy()
        for name in self.columns:
            # rows with promoted == 1 receive the 10th percentile, rows with promoted == 0 the 50th
            for flag_value, key in ((1, '10th'), (0, '50th')):
                rows = filled['promoted'] == flag_value
                filled.loc[rows, name] = filled.loc[rows, name].fillna(value=self.percentiles[name][key])
        return filled
    
# define a custom transformer to impute missing values based on the team and opponents premier league position
class PlPositionImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with the mean observed for the same pair of league positions.

    ``fit`` stores, for every ``(pl_position, pl_position_opponent)`` pair in the
    fitting data, the mean of each column in ``columns``. ``transform`` fills the
    missing values of those columns with the stored mean for the row's pair.
    A value stays missing when its pair was not seen during ``fit`` or when the
    stored mean for that pair is itself NaN.

    Parameters
    ----------
    columns : list of str
        Columns to impute.

    Notes
    -----
    After ``fit``, ``position_means`` is a DataFrame indexed by
    ``(pl_position, pl_position_opponent)`` with one column per imputed column.
    """

    # columns that identify the group a row belongs to
    _group_keys = ['pl_position', 'pl_position_opponent']

    def __init__(self, columns):
        self.columns = columns
        self.position_means = pd.DataFrame()

    def fit(self, X, y=None):
        """Compute the per-pair means of every configured column."""
        # rebuild the lookup table from scratch so repeated fits do not pile up duplicate columns
        self.position_means = X.groupby(self._group_keys)[list(self.columns)].mean()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NAs in the configured columns replaced by the group means."""
        X_transformed = X.copy()

        # one row of group means per row of X (left join keeps X's rows and their order);
        # pairs that were not seen in fit simply produce NaN instead of raising a KeyError
        group_means = X_transformed[self._group_keys].join(self.position_means, on=self._group_keys)

        # iterate over the configured columns (self.columns), not a global name
        for column in self.columns:
            missing = X_transformed[column].isna().to_numpy()
            if not missing.any():
                continue
            # positional write-back through the boolean mask, independent of index labels
            X_transformed.loc[missing, column] = group_means[column].to_numpy()[missing]

        return X_transformed


# NOTE(review): `pipe` and `transformed_train` are also assigned earlier in the notebook; running this cell rebinds both
# assemble the preprocessing pipeline; the steps run in the order listed
pipe = Pipeline([
    # 1. drop rows that have NAs in the columns listed in columns_to_remove_nas
    (
        "remove_rows_with_nas",
        DropNaRowsTransformer(columns=columns_to_remove_nas),
    ),
    # 2. fill the listed columns with their column mean
    (
        "impute_with_means_of_columns",
        MeanImputer(columns=columns_to_impute_with_mean),
    ),
    # 3. fill the listed columns with a percentile chosen by promotion status
    (
        "impute_with_mean_of_columns_by_promotion_status",
        PromotedImputer(columns=columns_to_impute_based_on_promotion_status),
    ),
    # 4. fill the head-to-head columns with the mean for the same pair of league positions
    (
        "impute_with_mean_of_columns_by_pl_position",
        PlPositionImputer(columns=columns_to_impute_based_on_pl_position),
    ),
])

# fit every step on the training frame and apply it
transformed_train = pipe.fit_transform(train)

In [56]:
# summarise the missing values that remain after the pipeline has been applied
summary = missing_values_summary(transformed_train)
print(summary)

                           Total Missing Values  Missing Percentage
unique_match_id                               0                0.00
season                                        0                0.00
date                                          0                0.00
day_of_week                                   0                0.00
round                                         0                0.00
day                                           0                0.00
team                                          0                0.00
home                                          0                0.00
promoted                                      0                0.00
opponent                                      0                0.00
promoted_opponent                             0                0.00
days_since_last_game                          0                0.00
games_played_last_21_days                     0                0.00
pl_total_points                               0 