# Advanced Feature Engineering

This notebook applies `AdvancedFeatureBuilder` to the cleaned dataset to add interaction, ratio, derived, and categorical features, compares the result with `BasicFeatureBuilder`, and saves the engineered dataset.

## What we will do:
1. Load cleaned data
2. Create advanced features (interactions, ratios, derived, categorical)
3. Compare with basic features
4. Save processed data


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import sys
sys.path.append('../../scripts')
from feature_builders import BasicFeatureBuilder, AdvancedFeatureBuilder


## 1. Load Data


In [2]:
# Read the cleaned CSV from disk and report its dimensions
cleaned_path = '../../data/cleaned/domain_cleaned.csv'
df = pd.read_csv(cleaned_path)
print(f"Data shape: {df.shape}")


Data shape: (1161, 81)


## 2. Advanced Feature Engineering


In [3]:
# Pull out the target column, then keep every other column as a feature
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Report both shapes in a single call; output is one line per shape
print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    sep="\n",
)


Features shape: (1161, 80)
Target shape: (1161,)


In [4]:
# Fit the advanced builder on the features and target, then transform the
# same feature frame.
# NOTE(review): fit() receives the full dataset including y; if the builder
# learns anything from the target, confirm this happens before any
# train/test split downstream.
advanced_builder = AdvancedFeatureBuilder()
advanced_builder.fit(X, y)
X_advanced = advanced_builder.transform(X)

# Column counts before and after, plus the number of columns gained
print(
    f"Original features: {X.shape[1]}",
    f"After advanced engineering: {X_advanced.shape[1]}",
    f"New features added: {X_advanced.shape[1] - X.shape[1]}",
    sep="\n",
)


Original features: 80
After advanced engineering: 104
New features added: 24


In [5]:
# Show new features by category
# Columns present after the transform that were not in the original feature frame.
new_features = [col for col in X_advanced.columns if col not in X.columns]
print(f"Total new features: {len(new_features)}")

# Group features by type, based purely on their column names:
#   - interaction: name contains '_x_'
#   - ratio:       name contains '_to_' or 'AvgRoomSize'
#   - derived:     name is one of a fixed list
interaction_features = [f for f in new_features if '_x_' in f]
ratio_features = [f for f in new_features if '_to_' in f or 'AvgRoomSize' in f]
derived_features = [f for f in new_features if f in ['TotalBathrooms', 'EffectiveAge', 'TotalPorchSF']]

# Everything not matched above falls into the last bucket. Build the exclusion
# set once instead of re-concatenating the three lists for every element.
# NOTE(review): this is a catch-all, not a dtype check; the recorded output
# shows names such as 'TotalSF' and 'HouseAge' landing here. Confirm whether
# those belong under derived features instead.
_already_grouped = set(interaction_features + ratio_features + derived_features)
categorical_features = [f for f in new_features if f not in _already_grouped]

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print(f"\nInteraction features ({len(interaction_features)}): {interaction_features}")
print(f"Ratio features ({len(ratio_features)}): {ratio_features}")
print(f"Derived features ({len(derived_features)}): {derived_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")


Total new features: 24
\nInteraction features (4): ['Quality_x_Area', 'Quality_x_TotalSF', 'Bath_x_Area', 'Garage_x_Quality']
Ratio features (5): ['GrLivArea_to_LotArea', '1stFlr_to_GrLivArea', 'GarageArea_to_GrLivArea', 'BsmtArea_to_GrLivArea', 'AvgRoomSize']
Derived features (3): ['TotalBathrooms', 'EffectiveAge', 'TotalPorchSF']
Categorical features (12): ['TotalSF', 'HouseAge', 'AgeCategory', 'IsNew', 'Has2ndFloor', 'HasBasement', 'HasFireplace', 'HasPool', 'HasMasVnrArea', 'HasPorch', 'QualityCategory', 'NeighborhoodCategory']


## 3. Compare with Basic Features


In [6]:
# Run the basic builder on the same inputs so the two feature sets can be counted side by side
basic_builder = BasicFeatureBuilder()
basic_builder.fit(X, y)
X_basic = basic_builder.transform(X)

# Columns each builder introduced on top of the original feature columns,
# selected with a boolean mask over the transformed column index
basic_new_features = X_basic.columns[~X_basic.columns.isin(X.columns)].tolist()
advanced_new_features = X_advanced.columns[~X_advanced.columns.isin(X.columns)].tolist()

# One line per count, followed by the gap between the two builders
print(
    f"Basic features added: {len(basic_new_features)}",
    f"Advanced features added: {len(advanced_new_features)}",
    f"Difference: {len(advanced_new_features) - len(basic_new_features)} additional features",
    sep="\n",
)


Basic features added: 2
Advanced features added: 24
Difference: 22 additional features


## 4. Save Data


In [7]:
# Re-attach the target to a copy of the engineered features (aligned on index)
df_advanced = X_advanced.assign(SalePrice=y)

# Write the combined frame to the processed folder, omitting the index column
df_advanced.to_csv('../../data/processed/df_advanced_features.csv', index=False)
print(f"Saved advanced features dataset: {df_advanced.shape}")


Saved advanced features dataset: (1161, 105)
