<center><h1><font size=6> Initial Data Processing and Feature Engineering </font></h1></center>

### Load libraries and setup notebook configuration

In [1]:
# import packages
import pandas as pd 
import numpy as np
import os
from pathlib import Path
import warnings



# set pandas configurations
pd.set_option("display.precision", 2) # display to 2 decimal places
pd.set_option("display.max_columns", None) # display all columns so we can view the whole dataset (canonical option key)
pd.set_option('display.float_format', '{:.2f}'.format) # Disable scientific notation for pandas: floats render with 2 fixed decimals
# Disable setting with copy warnings; the lookup is guarded so this cell still runs
# if the installed pandas does not expose this warning class
if hasattr(pd.errors, "SettingWithCopyWarning"):
    warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)


# set directories
# NOTE: this is a relative move, so re-running this cell without restarting the kernel
# climbs one more level each time; run it exactly once per kernel session
os.chdir('..') # change current working directory to the parent directory to help access files/directories at a higher level
DATAPATH = Path(r'data') # set data path (relative to the working directory set on the line above)


# import from source directory
from src import constants

### Load training data

In [31]:
# read the processed training features and targets from <DATAPATH>/processed
X_train_full = pd.read_csv(DATAPATH / "processed" / "X_train_full.csv")
y_train_full = pd.read_csv(DATAPATH / "processed" / "y_train_full.csv")

In [32]:
# work on an independent copy so that X_train_full keeps the data exactly as loaded
train = X_train_full.copy()

### Dealing with missing values

In [33]:
# define function to summarise missing values
def missing_values_summary(df):
    """Report how many values are missing in each column of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        'Total Missing Values' (count of nulls) and
        'Missing Percentage' (that count as a percentage of the row count).
    """
    n_rows = len(df.index)
    null_counts = df.isna().sum()
    null_share = null_counts.div(n_rows).mul(100)  # count -> percentage of rows

    return pd.DataFrame(
        {'Total Missing Values': null_counts, 'Missing Percentage': null_share}
    )

# summarise missing values (count and percentage per column) for the training data frame
summary = missing_values_summary(train)
print(summary)

                           Total Missing Values  Missing Percentage
unique_match_id                               0                0.00
season                                        0                0.00
date                                          0                0.00
day_of_week                                   0                0.00
round                                         0                0.00
day                                           0                0.00
team                                          0                0.00
home                                          0                0.00
promoted                                      0                0.00
opponent                                      0                0.00
promoted_opponent                             0                0.00
days_since_last_game                        434                2.43
games_played_last_21_days                   465                2.60
pl_total_points                               4 

In [55]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Column groups, one per preprocessing step of the imputation pipeline

# rows with a missing value in any of these columns are dropped
columns_to_remove_nas = [
    'unique_match_id',
    'season',
    'date',
    'day_of_week',
    'round',
    'day',
    'team',
    'home',
    'promoted',
    'opponent',
    'promoted_opponent',
    'pl_total_points',
    'pl_total_gf',
    'pl_total_ga',
    'pl_total_goal_diff',
    'pl_position',
    'pl_position_opponent',
]

# missing values are replaced by the column mean
columns_to_impute_with_mean = [
    'days_since_last_game',
    'games_played_last_21_days',
]

# missing values are replaced by a percentile chosen from the `promoted` flag
columns_to_impute_based_on_promotion_status = [
    'prev_season_points',
    'prev_season_gf',
    'prev_season_ga',
    'prev_season_goal_diff',
    'points_pl_form',
    'gf_pl_form',
    'ga_pl_form',
]

# missing values are replaced by the mean for the (team position, opponent position) pair
columns_to_impute_based_on_pl_position = [
    'last_h2h',
    'last_h2h_form',
    'last_h2h_venue',
    'last_h2h_venue_form',
]


# define custom transformer, inherited from the sklearn.base BaseEstimator and TransformerMixin classes
class DropNaRowsTransformer(BaseEstimator, TransformerMixin):
    """Transformer that removes rows with missing values in selected columns.

    Parameters
    ----------
    columns : list of str
        Columns to check; a row is dropped when any of them is null.

    Notes
    -----
    Only ``X`` is filtered. ``y`` is ignored, so a target passed alongside
    is not reduced to the surviving rows.
    """

    def __init__(self, columns):
        # stored as-is; used as the `subset` argument in `transform`
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the rows that have nulls in `columns`; `X` is not modified."""
        return X.dropna(subset=self.columns)


# define a custom transformer to impute missing values with the mean of each column
class MeanImputer(BaseEstimator, TransformerMixin):
    """Fill nulls in selected columns with the column mean seen during `fit`.

    Parameters
    ----------
    columns : list of str
        Columns to impute.

    Attributes
    ----------
    means : dict
        Maps each fitted column name to ``{'mean': <column mean>}``.
    """

    def __init__(self, columns):
        self.columns = columns
        self.means = {}

    def fit(self, X, y=None):
        """Record the mean of every configured column of `X`; returns the transformer itself."""
        for name in self.columns:
            self.means[name] = dict(mean=X[name].mean())
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` in which nulls of the configured columns are replaced by the stored means."""
        X_transformed = X.copy()
        for name in self.columns:
            stored_mean = self.means[name]['mean']
            X_transformed[name] = X_transformed[name].fillna(value=stored_mean)
        return X_transformed
    
    
# Define custom transformer for imputing missing values based on the promotion status:
# if the team is promoted, replace NA with the 10th percentile of the column, otherwise with the 50th percentile
class PromotedImputer(BaseEstimator, TransformerMixin):
    """Fill nulls with a percentile chosen by the row's `promoted` flag.

    Rows with ``promoted == 1`` receive the column's 10th percentile, rows with
    ``promoted == 0`` receive the 50th percentile. Rows with any other value in
    `promoted` are left untouched.

    Parameters
    ----------
    columns : list of str
        Columns to impute.

    Attributes
    ----------
    percentiles : dict
        Maps each fitted column name to ``{'10th': ..., '50th': ...}``.

    Notes
    -----
    NOTE(review): the same percentiles are applied to every listed column,
    whatever direction counts as "good" for that feature - confirm this is
    intended for all of them.
    """

    def __init__(self, columns):
        self.columns = columns
        self.percentiles = {}

    def fit(self, X, y=None):
        """Record the 10th and 50th percentile of every configured column of `X`."""
        for name in self.columns:
            values = X[name]
            self.percentiles[name] = {
                '10th': values.quantile(0.1),
                '50th': values.quantile(0.5),
            }
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` with nulls filled according to the `promoted` flag."""
        X_transformed = X.copy()
        # (value of `promoted`, key of the percentile used as fill value), in application order
        fill_rules = ((1, '10th'), (0, '50th'))
        for name in self.columns:
            for flag, percentile_key in fill_rules:
                rows = X_transformed['promoted'] == flag
                fill_value = self.percentiles[name][percentile_key]
                X_transformed.loc[rows, name] = X_transformed.loc[rows, name].fillna(value=fill_value)
        return X_transformed
    
# define a custom transformer to impute missing values based on the team and opponents premier league position
class PlPositionImputer(BaseEstimator, TransformerMixin):
    """Fill nulls with the mean of rows sharing the same league-position pairing.

    During `fit` the mean of every configured column is computed for each
    (`pl_position`, `pl_position_opponent`) pair, together with the overall
    column mean. During `transform` a null is replaced by the mean of its
    pair; when that pair was not present in `fit`, or its mean is undefined
    because every value in the group was null, the overall column mean is
    used instead.

    Parameters
    ----------
    columns : list of str
        Columns to impute. `X` must also contain `pl_position` and
        `pl_position_opponent`.

    Attributes
    ----------
    position_means : pandas.DataFrame
        One row per position pair (as index), one column per imputed column.
    overall_means : pandas.Series
        Overall mean of every imputed column, used as the fallback fill value.
    """

    # columns that together identify a (team position, opponent position) group
    _GROUP_KEYS = ['pl_position', 'pl_position_opponent']

    def __init__(self, columns):
        self.columns = columns
        self.position_means = pd.DataFrame()
        self.overall_means = pd.Series(dtype=float)

    def fit(self, X, y=None):
        """Learn the per-pair and overall means from `X`; returns the transformer itself."""
        columns = list(self.columns)
        # Both tables are rebuilt from scratch on every call, so fitting the same
        # instance again never stacks duplicate columns onto earlier results.
        self.position_means = X.groupby(self._GROUP_KEYS)[columns].mean()
        self.overall_means = X[columns].mean()
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` with nulls of the configured columns imputed."""
        X_transformed = X.copy()

        # One left merge looks up the group mean for every row at once; the left
        # row order is kept and the right-hand keys are unique, so the result
        # lines up with X_transformed position by position.
        lookup = X_transformed[self._GROUP_KEYS].merge(
            self.position_means.reset_index(),
            on=self._GROUP_KEYS,
            how='left',
        )

        for column in self.columns:
            # pairs without a usable group mean fall back to the overall column mean
            fill_values = lookup[column].fillna(self.overall_means[column]).to_numpy()
            missing = X_transformed[column].isna().to_numpy()
            if missing.any():
                # positional boolean mask, so the result does not depend on X's index labels
                X_transformed.loc[missing, column] = fill_values[missing]
        return X_transformed


# Chain the cleaning steps; each step receives the output of the previous one.
# NOTE(review): the first step removes rows from the features only - any target
# frame used together with `transformed_train` has to be aligned to its index.
pipe = Pipeline([
    (
        "remove_rows_with_nas",
        DropNaRowsTransformer(columns=columns_to_remove_nas),
    ),
    (
        "impute_with_means_of_columns",
        MeanImputer(columns=columns_to_impute_with_mean),
    ),
    (
        "impute_with_mean_of_columns_by_promotion_status",
        PromotedImputer(columns=columns_to_impute_based_on_promotion_status),
    ),
    (
        "impute_with_mean_of_columns_by_pl_position",
        PlPositionImputer(columns=columns_to_impute_based_on_pl_position),
    ),
])

# learn the fill values from `train` and apply them to it in a single pass
transformed_train = pipe.fit_transform(train)

In [56]:
# re-run the missing-value summary on the pipeline output to see what is still missing
summary = missing_values_summary(transformed_train)
print(summary)

                           Total Missing Values  Missing Percentage
unique_match_id                               0                0.00
season                                        0                0.00
date                                          0                0.00
day_of_week                                   0                0.00
round                                         0                0.00
day                                           0                0.00
team                                          0                0.00
home                                          0                0.00
promoted                                      0                0.00
opponent                                      0                0.00
promoted_opponent                             0                0.00
days_since_last_game                          0                0.00
games_played_last_21_days                     0                0.00
pl_total_points                               0 