In [1]:
'''
# Data Preprocessing

## Project: E-commerce Product Delivery Prediction

The preprocessing pipeline is designed to be consistent, reusable, and
safe from data leakage.

### Preprocessing Steps

- Encode categorical features using One-Hot Encoding
- Scale numerical features using StandardScaler
- Apply all transformations using a unified preprocessing pipeline
- Handle missing values using appropriate imputation strategies
- Separate the target variable from input features
- Perform a stratified train-test split to preserve class distribution

These steps ensure that the dataset is transformed into a model-ready format
and that the same preprocessing logic can be consistently applied during
model training and evaluation.
'''

# Library Imports

from typing import Tuple, List
from pathlib import Path
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

import sys
from pathlib import Path

# Add parent directory to path to import src module
sys.path.insert(0, str(Path.cwd().parent))

from src.config import DATA_FILE, TARGET_COL, TEST_SIZE, RANDOM_STATE


def data_load(data_path: Path = DATA_FILE) -> pd.DataFrame:
    '''
    Read the dataset CSV into a DataFrame and tidy its column labels.

    Parameters
    ----------
    data_path : Path
        Location of the CSV file to read. Defaults to DATA_FILE from the
        project configuration.

    Returns
    -------
    pd.DataFrame
        The loaded data, with every column label converted to a string and
        stripped of leading/trailing whitespace.
    '''
    frame = pd.read_csv(data_path)
    # Stray whitespace around header names would make name-based lookups
    # (e.g. the target column) miss, so labels are normalized up front.
    frame.columns = list(map(lambda label: str(label).strip(), frame.columns))
    return frame

def infer_feature_columns_from_df(df: pd.DataFrame) -> tuple[list[str], list[str]]:
    '''
    Partition the columns of a DataFrame into numerical and categorical
    groups based on their dtypes.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    tuple[list[str], list[str]]
        Numerical column names first, categorical column names second.
        Columns whose dtype matches neither selector appear in neither list.
    '''
    numeric_kinds = ['int64', 'float64', 'number']
    categorical_kinds = ['object', 'category', 'bool']

    numeric_part = df.select_dtypes(include=numeric_kinds)
    categorical_part = df.select_dtypes(include=categorical_kinds)

    return numeric_part.columns.tolist(), categorical_part.columns.tolist()


def build_preprocessor(df_or_X: pd.DataFrame):
    '''
    Build an unfitted ColumnTransformer for imputation, scaling and encoding.

    The input frame is only inspected to decide which columns are numerical
    and which are categorical; it is never modified, so no defensive copy
    is taken.

    Parameters
    ----------
    df_or_X : pd.DataFrame
        Either:
        - the full cleaned DataFrame (including TARGET_COL)
        - OR a features-only DataFrame (excluding the target column)

    Returns
    -------
    ColumnTransformer
        A transformer that applies numeric and categorical preprocessing
        pipelines to their respective feature columns. A pipeline is only
        registered when at least one column of its kind was found, and
        columns belonging to neither group are dropped (remainder = 'drop').
    '''

    # Exclude the target (when present) so it is never treated as a feature.
    # `drop` returns a new frame, so the caller's DataFrame is left untouched.
    if TARGET_COL in df_or_X.columns:
        X = df_or_X.drop(columns = [TARGET_COL])
    else:
        X = df_or_X

    # Infer feature columns from the dtypes of the remaining columns
    num_columns, cat_columns = infer_feature_columns_from_df(X)

    # Numeric pipeline: fill missing values with the column mean, then standardize
    numeric_transformer = Pipeline(
        steps = [
            ('impute', SimpleImputer(strategy = 'mean')),
            ('scale', StandardScaler())
        ]
    )

    # Categorical pipeline: fill missing values with the most frequent value,
    # then one-hot encode. handle_unknown = 'ignore' keeps transform() from
    # failing on categories that were not seen during fit.
    categorical_transformer = Pipeline(
        steps = [
            ('impute', SimpleImputer(strategy = 'most_frequent')),
            ('Encoder', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
        ]
    )

    # Register only the pipelines that actually have columns to work on
    transformers = []
    if num_columns:
        transformers.append(('num', numeric_transformer, num_columns))
    if cat_columns:
        transformers.append(('cat', categorical_transformer, cat_columns))

    preprocessor = ColumnTransformer(transformers = transformers, remainder = 'drop', verbose_feature_names_out = False)
    return preprocessor


def split_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    '''
    Split a DataFrame into stratified train and test partitions.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the feature columns and the TARGET_COL column.
        Column labels are stripped of surrounding whitespace on an internal
        copy before the target is looked up.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)

    Raises
    ------
    ValueError
        If TARGET_COL is not among the (stripped) column labels.
    '''
    # Work on a private copy so the caller's column labels stay as they were.
    df = df.copy()
    df.columns = list(map(lambda label: str(label).strip(), df.columns))

    target_missing = TARGET_COL not in df.columns
    if target_missing:
        raise ValueError(f"Target column '{TARGET_COL}' not found in DataFrame column: {df.columns.tolist()}")

    target = df[TARGET_COL]
    features = df.drop(columns=[TARGET_COL])

    # Stratifying on the target keeps the class proportions of both partitions
    # aligned with those of the full dataset.
    split_parts = train_test_split(
        features,
        target,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=target,
    )
    return tuple(split_parts)

# Quick module-level print for verification that this cell/module ran to the end.
print('Data Preprocessing module loaded.')

Data Preprocessing module loaded.
