In [1]:
# scripts/feature_engineer.py
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def engineer_features(input_path, output_path):
    """Build a model-ready feature table from a cleaned CSV and save it.

    Steps:
      1. Read ``input_path`` with ``pd.read_csv``.
      2. Derive two categorical columns, ``age_group`` and ``bmi_category``,
         by binning ``age`` and ``bmi``.
      3. Standard-scale the numerical columns and one-hot encode the
         categorical ones (first level of each dropped).
      4. Append the untouched ``diabetes`` target column.
      5. Write the result to ``output_path`` (no index column) and print a
         confirmation line.

    Args:
        input_path: Path of the CSV to read. It must contain the columns
            ``age``, ``bmi``, ``HbA1c_level``, ``blood_glucose_level``,
            ``gender``, ``smoking_history`` and ``diabetes``.
        output_path: Path of the CSV to write.

    Returns:
        None. The processed table is written to ``output_path``.

    Notes:
        Ages below 18 fall outside the age bins and get a missing
        ``age_group`` (NaN), as do negative BMI values for ``bmi_category``.
    """
    df = pd.read_csv(input_path)

    # ----------------------------
    # 1. Feature Engineering
    # ----------------------------
    # Bins are left-closed / right-open (right=False) so that each value
    # lands in the bin its label names: age 30 -> '30-44' (not '18-29'),
    # age 18 -> '18-29' (not NaN). The last edge is +inf because the top
    # label is open-ended ('60+').
    df['age_group'] = pd.cut(
        df['age'],
        bins=[18, 30, 45, 60, float('inf')],
        labels=['18-29', '30-44', '45-59', '60+'],
        right=False,
    )

    # BMI categories (WHO standards): <18.5 underweight, 18.5-24.9 normal,
    # 25-29.9 overweight, >=30 obese. right=False puts the boundary values
    # 18.5 / 25 / 30 in the upper category as the standard requires, and the
    # +inf edge keeps very high BMI values in 'Obese' instead of NaN.
    df['bmi_category'] = pd.cut(
        df['bmi'],
        bins=[0, 18.5, 25, 30, float('inf')],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
        right=False,
    )

    # ----------------------------
    # 2. Encode Categorical Features
    # ----------------------------
    categorical_cols = ['gender', 'smoking_history', 'age_group', 'bmi_category']
    numerical_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

    # sparse_threshold=0 forces a dense ndarray result. With the default
    # threshold the transformer may return a scipy sparse matrix (depending
    # on how sparse the one-hot block is), which cannot be wrapped in a
    # DataFrame with named columns below.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ],
        sparse_threshold=0,
    )

    # Fit-transform and convert to DataFrame. Reusing df.index keeps the
    # rows aligned with the source frame when the target is attached.
    processed_data = preprocessor.fit_transform(df)
    feature_names = preprocessor.get_feature_names_out()
    df_processed = pd.DataFrame(
        processed_data, columns=feature_names, index=df.index
    )

    # Retain the target variable (unscaled, unencoded)
    df_processed['diabetes'] = df['diabetes']

    # Save
    df_processed.to_csv(output_path, index=False)
    print(f"Processed data saved to {output_path}")

if __name__ == "__main__":
    # Script entry point: run the feature-engineering step on the cleaned
    # dataset and write the result next to it.
    source_csv = "cleaned_diabetes_prediction_dataset.csv"
    target_csv = "featured_cleaned_diabetes_prediction_dataset.csv"
    engineer_features(input_path=source_csv, output_path=target_csv)

Processed data saved to featured_cleaned_diabetes_prediction_dataset.csv
