In [1]:
# ================================
# IMPORT REQUIRED LIBRARIES
# ================================

import numpy as np
import pandas as pd

# sklearn global output configuration (scoped pandas output)
from sklearn import config_context

# sklearn base classes for building custom transformers
from sklearn.base import BaseEstimator, TransformerMixin

# sklearn preprocessing tools
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# sklearn pipeline tools
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Feature groups consumed by the logistic-regression preprocessing pipeline.
# NOTE(review): base_model_numeric, base_model_categorical and columns_missing
# are not defined in the visible cells; they must already exist in the kernel.
numeric_features = base_model_numeric          # numeric model inputs
categorical_features = base_model_categorical  # categorical model inputs

# Columns in which a negative value is a placeholder for "missing".
negative_missing_cols = columns_missing

# Skewed columns that are passed through log1p.
log_features = [
    'proposed_credit_limit',
    'intended_balcon_amount',
    'current_address_months_count',
    'prev_address_months_count',
    'device_distinct_emails_8w',
]

# Per-column missing flags that get merged into one combined indicator.
missing_indicator_cols = [
    'intended_balcon_amount_missing',
    'prev_address_months_count_missing',
    'bank_months_count_missing',
]

# Columns removed at the end of preprocessing (multicollinearity and
# insignificant predictors). The individual missing flags are dropped too,
# in the same order as they are listed above.
drop_features = [
    'payment_type_AC',
    *missing_indicator_cols,
    'intended_balcon_amount',
    'prev_address_months_count',
    'housing_status_BG',
    'payment_type_AE',
]

In [None]:
class NegativeToNaNTransformer(BaseEstimator, TransformerMixin):
    """Replace negative entries with NaN in a chosen set of columns.

    Parameters
    ----------
    columns : iterable of str
        Columns in which a negative value stands for "missing". Names that
        are not present in the input frame are skipped silently.

    Notes
    -----
    The input must be a pandas DataFrame: ``transform`` relies on
    ``X.columns`` and column assignment.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with negatives in the target columns set to NaN."""
        # work on a copy so the caller's frame is left untouched
        frame = X.copy()

        targets = [name for name in self.columns if name in frame.columns]
        for name in targets:
            values = frame[name]
            frame[name] = values.mask(values < 0, np.nan)

        return frame

In [None]:
class LogTransformer(BaseEstimator, TransformerMixin):
    """Apply ``np.log1p`` to a chosen set of columns.

    Parameters
    ----------
    columns : iterable of str
        Columns to transform. Names that are not present in the input frame
        are skipped silently.

    Notes
    -----
    The input must be a pandas DataFrame (``transform`` reads ``X.columns``).
    ``np.log1p`` yields ``-inf`` at -1 and NaN below -1, so the selected
    columns should only hold values greater than -1.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with log1p applied to the target columns."""
        frame = X.copy()

        targets = [name for name in self.columns if name in frame.columns]
        for name in targets:
            frame[name] = np.log1p(frame[name])

        return frame

In [None]:
class MissingIndicatorCombiner(BaseEstimator, TransformerMixin):
    """Add an ``is_incomplete`` column summarising several missing flags.

    Parameters
    ----------
    indicator_columns : list of str
        Indicator columns to combine. Unlike the other transformers in this
        notebook, every listed column must exist in the input; pandas raises
        ``KeyError`` otherwise.

    Notes
    -----
    ``is_incomplete`` is the row-wise maximum of the indicator columns.
    Presumably the indicators are 0/1 flags, in which case this equals a
    logical OR -- confirm against the data.
    """

    def __init__(self, indicator_columns):
        self.indicator_columns = indicator_columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the combined ``is_incomplete`` column."""
        frame = X.copy()

        flags = frame[self.indicator_columns]
        frame['is_incomplete'] = flags.max(axis=1)

        return frame

In [None]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    """Remove a chosen set of columns from a DataFrame.

    Parameters
    ----------
    columns : iterable of str
        Columns to remove. Names that are not present in the input frame are
        ignored, so a misspelled name does not raise.

    Notes
    -----
    The input must be a pandas DataFrame (``transform`` reads ``X.columns``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the configured columns."""
        # only ask pandas to drop what actually exists
        to_remove = [name for name in self.columns if name in X.columns]

        return X.copy().drop(columns=to_remove)

In [None]:
# Numeric branch: sentinel negatives -> NaN -> median imputation -> log1p -> scaling.
numeric_pipeline = Pipeline([

    # step 1: convert negative values to NaN
    ("negative_to_nan", NegativeToNaNTransformer(negative_missing_cols)),

    # step 2: impute missing using median
    # SimpleImputer returns a NumPy array by default, but the next step looks
    # columns up by name via `X.columns`. Binding this step to pandas output
    # keeps the column labels (and the row index) so the log step can run.
    ("imputer", SimpleImputer(strategy="median").set_output(transform="pandas")),

    # step 3: log transform skewed variables (requires a DataFrame input)
    ("log_transform", LogTransformer(log_features)),

    # step 4: scale features for logistic regression
    ("scaler", StandardScaler())
])

In [None]:
# Categorical branch: one-hot encode every categorical feature.
categorical_pipeline = Pipeline([

    # Pipeline steps must be (name, estimator) pairs; a bare estimator
    # cannot be unpacked into a name and a transformer when the pipeline is fit.
    (
        "onehot",
        OneHotEncoder(
            drop="first",             # prevents dummy variable trap
            handle_unknown="ignore",  # unseen categories encode as all zeros instead of raising
            sparse_output=False       # dense array instead of a SciPy sparse matrix
        )
    )
])

In [None]:
# Route numeric and categorical columns through their own pipelines and
# concatenate the results side by side.
# NOTE(review): with the default remainder="drop", any column that is not
# listed in numeric_features or categorical_features is discarded here --
# including `is_incomplete` unless numeric_features contains it. Confirm.
preprocessor = ColumnTransformer(
    [
        ("numeric", numeric_pipeline, numeric_features),

        ("categorical", categorical_pipeline, categorical_features)
    ],
    # Keep plain output names ('payment_type_AC' rather than
    # 'categorical__payment_type_AC') so that they match the unprefixed
    # names listed in drop_features.
    verbose_feature_names_out=False
)

In [None]:
# End-to-end preprocessing for the logistic regression model.
# NOTE(review): the final name-based drop only works if `preprocessor` hands
# over a DataFrame; also confirm that `is_incomplete` is selected by
# `preprocessor`, otherwise the combined flag never reaches the model.
logistic_steps = [
    # merge the individual missing flags into `is_incomplete`
    ("combine_missing", MissingIndicatorCombiner(missing_indicator_cols)),
    # numeric + categorical transformations
    ("column_processing", preprocessor),
    # remove collinear / insignificant features
    ("feature_drop", FeatureDropper(drop_features)),
]

logistic_preprocessor = Pipeline(steps=logistic_steps)

In [None]:
# scikit-learn transformers return NumPy arrays by default, but the custom
# name-based steps (LogTransformer, FeatureDropper) read `X.columns`.
# Running inside this context makes every scikit-learn step hand over
# DataFrames. Any later call to `logistic_preprocessor.transform` needs the
# same context.
with config_context(transform_output="pandas"):
    # FIT ONLY ON TRAIN DATA (CRITICAL FOR NO LEAKAGE)
    X_train_processed = logistic_preprocessor.fit_transform(X_train)

    # APPLY SAME TRANSFORM TO TEST DATA
    X_test_processed = logistic_preprocessor.transform(X_test)

# train and test must expose exactly the same features, in the same order
assert list(X_train_processed.columns) == list(X_test_processed.columns), \
    "train/test feature mismatch after preprocessing"