# 04. Data Preprocessing

In this notebook, we perform data preprocessing for different machine learning models.
The steps include:
- Loading the feature-engineered datasets
- Imputing missing values based on the training set
- Applying model-specific preprocessing:
  - Logistic Regression: scaling and one-hot encoding
  - Random Forest: use imputed features directly
  - XGBoost: same as Random Forest
  - LightGBM: convert categorical columns to 'category' dtype
- Saving the preprocessed datasets for each model


In [20]:
## 1. Import libraries
import importlib
import sys
from pathlib import Path

import pandas as pd

# Add project root directory to sys.path so the local `scripts` package is importable.
# `Path` has to be imported before this line: with the import placed after it,
# a fresh kernel (Restart & Run All) fails here with NameError.
sys.path.append(str(Path('.').resolve().parent))  # or adjust as needed

# Local package import must come after the sys.path tweak above.
import scripts.preprocessing

# Reload the module so the names imported below come from its current source.
importlib.reload(scripts.preprocessing)

from scripts.preprocessing import (
    impute_missing_values,
    preprocess_for_logistic,
    select_low_vif_features,
    preprocess_for_tree_models,
    preprocess_for_lightgbm
)

In [21]:
## 2. Define file paths
# Directory the preprocessed outputs are written to; the feature-engineered
# inputs live in the same directory.
OUTPUT_DIR = Path('../data/processed/')

# Feature-engineered train / validation / test splits.
TRAIN_PATH = OUTPUT_DIR / 'train_fe.csv'
VAL_PATH = OUTPUT_DIR / 'val_fe.csv'
TEST_PATH = OUTPUT_DIR / 'test_fe.csv'

In [22]:
## 3. Load feature-engineered datasets
# Read the three splits in train / val / test order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

In [23]:
## 4. Inspect the loaded datasets
# df_train / df_val / df_test were already read from disk in step 3. This cell
# used to repeat the same three pd.read_csv calls, which only cost extra I/O.
# Show the (rows, columns) shape of each split instead as a quick sanity check.
{'train': df_train.shape, 'val': df_val.shape, 'test': df_test.shape}

In [24]:
## 5. Separate features and target
TARGET = 'DRK_YN'

# Labels: the target column of each split.
y_train, y_val, y_test = (
    split[TARGET] for split in (df_train, df_val, df_test)
)

# Features: every other column of each split.
X_train, X_val, X_test = (
    split.drop(columns=[TARGET]) for split in (df_train, df_val, df_test)
)

In [25]:
## 6. Impute missing values
# The helper receives all three feature splits and returns their imputed
# counterparts in the same train / val / test order.
(X_train_imp, X_val_imp, X_test_imp) = impute_missing_values(
    X_train,
    X_val,
    X_test,
)

In [26]:
## 7. Identify categorical columns
# Keep 'sex' only when the training features actually contain it; otherwise the
# list is empty.
categorical_cols = [col for col in ('sex',) if col in X_train.columns]

In [27]:
## 8. Preprocessing per model
# Every model starts from the same imputed splits.

# Logistic Regression (also receives the categorical column list)
X_train_lr, X_val_lr, X_test_lr = preprocess_for_logistic(
    X_train_imp,
    X_val_imp,
    X_test_imp,
    categorical_cols,
)

# Random Forest
X_train_rf, X_val_rf, X_test_rf = preprocess_for_tree_models(
    X_train_imp,
    X_val_imp,
    X_test_imp,
)

# XGBoost (same helper and arguments as Random Forest, called separately)
X_train_xgb, X_val_xgb, X_test_xgb = preprocess_for_tree_models(
    X_train_imp,
    X_val_imp,
    X_test_imp,
)

# LightGBM (also receives the categorical column list)
X_train_lgb, X_val_lgb, X_test_lgb = preprocess_for_lightgbm(
    X_train_imp,
    X_val_imp,
    X_test_imp,
    categorical_cols,
)

In [28]:
## 9. Save preprocessed datasets
# (object, filename) pairs, listed in the order the files are written.
# The target series are identical for every model; they are written once per
# model suffix so each model has its own complete set of files.
# NOTE(review): CSV does not store pandas dtypes, so a 'category' dtype applied
# for LightGBM presumably has to be re-applied after reloading - confirm downstream.
datasets_to_save = [
    # Logistic Regression datasets
    (X_train_lr, 'X_train_lr.csv'),
    (X_val_lr, 'X_val_lr.csv'),
    (X_test_lr, 'X_test_lr.csv'),
    (y_train, 'y_train_lr.csv'),
    (y_val, 'y_val_lr.csv'),
    (y_test, 'y_test_lr.csv'),
    # Random Forest datasets
    (X_train_rf, 'X_train_rf.csv'),
    (X_val_rf, 'X_val_rf.csv'),
    (X_test_rf, 'X_test_rf.csv'),
    (y_train, 'y_train_rf.csv'),
    (y_val, 'y_val_rf.csv'),
    (y_test, 'y_test_rf.csv'),
    # XGBoost datasets
    (X_train_xgb, 'X_train_xgb.csv'),
    (X_val_xgb, 'X_val_xgb.csv'),
    (X_test_xgb, 'X_test_xgb.csv'),
    (y_train, 'y_train_xgb.csv'),
    (y_val, 'y_val_xgb.csv'),
    (y_test, 'y_test_xgb.csv'),
    # LightGBM datasets
    (X_train_lgb, 'X_train_lgb.csv'),
    (X_val_lgb, 'X_val_lgb.csv'),
    (X_test_lgb, 'X_test_lgb.csv'),
    (y_train, 'y_train_lgb.csv'),
    (y_val, 'y_val_lgb.csv'),
    (y_test, 'y_test_lgb.csv'),
]

for dataset, filename in datasets_to_save:
    # index=False keeps the row index out of the written file.
    dataset.to_csv(OUTPUT_DIR / filename, index=False)

print("All preprocessed datasets saved successfully.")

All preprocessed datasets saved successfully.


In [19]:
# Apply VIF-based feature selection
# Only the training split is passed to the selector; its surviving columns
# define the feature set for the other splits.
# NOTE(review): the recorded output shows a warning on `1. / (1. - r_squared_i)`,
# which suggests at least one feature is perfectly collinear - TODO confirm.
# NOTE(review): the *_vif frames are not written to disk in the visible notebook.
X_train_lr_vif = select_low_vif_features(X_train_lr)

# Make sure val/test have same columns
X_val_lr_vif, X_test_lr_vif = (
    split[X_train_lr_vif.columns] for split in (X_val_lr, X_test_lr)
)


  vif = 1. / (1. - r_squared_i)
