# 03. Feature Engineering

In this notebook:
- We load the previously split datasets
- Construct domain-specific features such as BMI, Pulse Pressure, and Vision Average
- Apply the same feature transformations consistently across training, validation, and test sets
- Save the updated datasets for further preprocessing

In [1]:
# 03_feature_engineering.ipynb

# ====================================================
# 03. Feature Engineering
# ----------------------------------------------------
# Objective:
# - Load the split training, validation, and test datasets
# - Construct new domain-specific features
# - Ensure feature consistency across all splits
# - Save updated datasets
# ====================================================

## 1. Import necessary libraries
import pandas as pd
from pathlib import Path

## 2. Define relative file paths
TRAIN_DATA_PATH = Path('../data/processed/train.csv')
VAL_DATA_PATH = Path('../data/processed/val.csv')
TEST_DATA_PATH = Path('../data/processed/test.csv')

OUTPUT_TRAIN_PATH = Path('../data/processed/train_fe.csv')
OUTPUT_VAL_PATH = Path('../data/processed/val_fe.csv')
OUTPUT_TEST_PATH = Path('../data/processed/test_fe.csv')

## 3. Load the split datasets
df_train = pd.read_csv(TRAIN_DATA_PATH)
df_val = pd.read_csv(VAL_DATA_PATH)
df_test = pd.read_csv(TEST_DATA_PATH)

print("Training set shape:", df_train.shape)
print("Validation set shape:", df_val.shape)
print("Test set shape:", df_test.shape)

## 4. Define feature engineering function
def create_features(df):
    """
    Create new features based on domain knowledge.

    Features created:
    - BMI (Body Mass Index)
    - Pulse Pressure (SBP - DBP)
    - Mean Arterial Pressure (DBP + 1/3 Pulse Pressure)
    - Vision Average (average of left and right sight)
    - Hearing Average (average of left and right hearing)
    - AST/ALT Ratio (liver function marker)
    """
    df_new = df.copy()
    
    # BMI: weight (kg) / (height (m))^2
    df_new['BMI'] = df_new['weight'] / ((df_new['height'] / 100) ** 2)
    
    # Pulse Pressure: Systolic BP - Diastolic BP
    df_new['pulse_pressure'] = df_new['SBP'] - df_new['DBP']
    
    # Mean Arterial Pressure: DBP + 1/3 * Pulse Pressure
    df_new['mean_arterial_pressure'] = df_new['DBP'] + (df_new['pulse_pressure'] / 3)
    
    # Vision Average: average of sight_left and sight_right
    df_new['vision_avg'] = (df_new['sight_left'] + df_new['sight_right']) / 2
    
    # Hearing Average: average of hear_left and hear_right
    df_new['hearing_avg'] = (df_new['hear_left'] + df_new['hear_right']) / 2
    
    # AST/ALT Ratio: Liver function indicator
    df_new['AST_ALT_ratio'] = df_new['SGOT_AST'] / df_new['SGOT_ALT']
    
    return df_new

## 5. Apply feature engineering
df_train_fe = create_features(df_train)
df_val_fe = create_features(df_val)
df_test_fe = create_features(df_test)

print("\nFeature engineering completed.")

## 6. Verify new features
print("\nNew features added (train set preview):")
df_train_fe[['BMI', 'pulse_pressure', 'mean_arterial_pressure', 'vision_avg', 'hearing_avg', 'AST_ALT_ratio']].head()

## 7. Save updated datasets with engineered features
df_train_fe.to_csv(OUTPUT_TRAIN_PATH, index=False)
df_val_fe.to_csv(OUTPUT_VAL_PATH, index=False)
df_test_fe.to_csv(OUTPUT_TEST_PATH, index=False)

print(f"\nFeature-engineered datasets saved successfully to 'data/processed/'.")


Training set shape: (693942, 24)
Validation set shape: (148702, 24)
Test set shape: (148702, 24)

Feature engineering completed.

New features added (train set preview):

Feature-engineered datasets saved successfully to 'data/processed/'.
