# Advanced Feature Engineering

High-impact features to improve within_10pct accuracy based on 03_basic_models results.

## Table of Contents
1. Advanced Location Features
2. Nonlinear Quality-Size Features  
3. Advanced Remodeling Features
4. Categorical Feature Enhancement
5. Feature Selection & Correlation Analysis
6. Preprocessing for New Features
7. Create Enhanced Datasets


In [10]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')


# Load original data with all features
df = pd.read_pickle('../../data/processed/df_engineered.pkl')
print(f"Data shape: {df.shape}")
print(f"Neighborhood categories: {df['NeighborhoodCategory'].value_counts()}")


Data shape: (1408, 105)
Neighborhood categories: NeighborhoodCategory
Middle     706
Budget     406
Premium    296
Name: count, dtype: int64


## 1. Advanced Location Features


In [11]:
# Neighborhood statistics with sample size handling
neighborhood_stats = df.groupby('Neighborhood').agg({
    'SalePrice': ['count', 'mean', 'std', 'median']
}).round(0)
neighborhood_stats.columns = ['count', 'mean', 'std', 'median']

# Handle small sample neighborhoods (< 10 samples)
small_neighborhoods = neighborhood_stats[neighborhood_stats['count'] < 10].index
category_stats = df.groupby('NeighborhoodCategory')['SalePrice'].agg(['mean', 'std']).round(0)

print(f"Small sample neighborhoods ({len(small_neighborhoods)}): {list(small_neighborhoods)}")

# Create location features
for neighborhood in df['Neighborhood'].unique():
    if neighborhood in small_neighborhoods:
        # Use category stats for small neighborhoods
        category = df[df['Neighborhood'] == neighborhood]['NeighborhoodCategory'].iloc[0]
        mean_price = category_stats.loc[category, 'mean']
        std_price = category_stats.loc[category, 'std']
    else:
        # Use neighborhood-specific stats
        mean_price = neighborhood_stats.loc[neighborhood, 'mean']
        std_price = neighborhood_stats.loc[neighborhood, 'std']
    
    mask = df['Neighborhood'] == neighborhood
    df.loc[mask, 'Neighborhood_Mean'] = mean_price
    df.loc[mask, 'Neighborhood_Std'] = std_price

# Relative value features
df['Price_vs_Neighborhood'] = df['SalePrice'] / df['Neighborhood_Mean']
df['Neighborhood_ZScore'] = (df['SalePrice'] - df['Neighborhood_Mean']) / (df['Neighborhood_Std'] + 1)

# Category premium features
category_means = df.groupby('NeighborhoodCategory')['SalePrice'].mean()
df['Category_Premium'] = df['NeighborhoodCategory'].astype(str).map(category_means)
df['Price_vs_Category'] = df['SalePrice'] / df['Category_Premium']

print("✓ Location features created")


Small sample neighborhoods (2): ['Blueste', 'NPkVill']
✓ Location features created


## 2. Nonlinear Quality-Size Features


In [12]:
# Convert categorical columns to numeric
qual_numeric = pd.to_numeric(df['OverallQual'], errors='coerce')
total_sf_numeric = pd.to_numeric(df['TotalSF'], errors='coerce')
grliv_numeric = pd.to_numeric(df['GrLivArea'], errors='coerce')
lot_numeric = pd.to_numeric(df['LotArea'], errors='coerce')

# Nonlinear quality-size relationships
df['Quality_TotalSF_Squared'] = qual_numeric * (total_sf_numeric ** 2)
df['Quality_Efficiency'] = (qual_numeric ** 2) / np.log1p(total_sf_numeric)

# Premium size effect (luxury homes have exponential value)
size_threshold = total_sf_numeric.quantile(0.8)
df['Premium_Size_Effect'] = np.where(
    total_sf_numeric > size_threshold,
    qual_numeric * np.log1p(total_sf_numeric) * 1.5,
    qual_numeric * total_sf_numeric
)

# Quality interaction with area efficiency
df['Quality_Area_Efficiency'] = qual_numeric * (grliv_numeric / lot_numeric)

print("✓ Nonlinear quality features created")


✓ Nonlinear quality features created


## 3. Advanced Remodeling Features


In [13]:
# Convert columns to numeric
yr_sold = pd.to_numeric(df['YrSold'], errors='coerce')
year_remod = pd.to_numeric(df['YearRemodAdd'], errors='coerce')
year_built = pd.to_numeric(df['YearBuilt'], errors='coerce')
house_age = pd.to_numeric(df['HouseAge'], errors='coerce')

# Remodeling impact analysis
df['Years_Since_Remodel'] = yr_sold - year_remod
df['Remodel_Impact'] = np.where(
    year_remod > year_built,
    (year_remod - year_built) / (house_age + 1),
    0
)

# Recent remodeling premium
df['Recent_Remodel_Bonus'] = np.where(df['Years_Since_Remodel'] < 5, 1.2, 1.0)
df['Remodel_Quality_Boost'] = df['Remodel_Impact'] * qual_numeric * df['Recent_Remodel_Bonus']

# Age vs remodeling interaction
df['Age_Remodel_Balance'] = qual_numeric / (house_age + 1) * (1 + df['Remodel_Impact'])

print("✓ Remodeling features created")


✓ Remodeling features created


## 4. Categorical Feature Enhancement


In [14]:
# Convert discrete numeric features to categorical
print("Converting discrete features to categorical...")

# FullBath categories
df['FullBath_Cat'] = pd.cut(
    pd.to_numeric(df['FullBath'], errors='coerce'),
    bins=[-0.5, 0.5, 1.5, 2.5, float('inf')],
    labels=['0_Bath', '1_Bath', '2_Bath', '3+_Bath']
).astype(str)

# Fireplaces categories  
df['Fireplaces_Cat'] = pd.cut(
    pd.to_numeric(df['Fireplaces'], errors='coerce'),
    bins=[-0.5, 0.5, 1.5, float('inf')], 
    labels=['No_Fireplace', '1_Fireplace', '2+_Fireplace']
).astype(str)

# TotRmsAbvGrd categories (room count)
df['Rooms_Cat'] = pd.cut(
    pd.to_numeric(df['TotRmsAbvGrd'], errors='coerce'),
    bins=[0, 5, 7, 9, float('inf')],
    labels=['Small_5-', 'Medium_6-7', 'Large_8-9', 'XLarge_10+']
).astype(str)

print(f"FullBath categories: {df['FullBath_Cat'].value_counts().to_dict()}")
print(f"Fireplaces categories: {df['Fireplaces_Cat'].value_counts().to_dict()}")
print(f"Rooms categories: {df['Rooms_Cat'].value_counts().to_dict()}")
print("✓ Categorical features created")


Converting discrete features to categorical...
FullBath categories: {'2_Bath': 743, '1_Bath': 636, '3+_Bath': 22, '0_Bath': 7}
Fireplaces categories: {'No_Fireplace': 678, '1_Fireplace': 633, '2+_Fireplace': 97}
Rooms categories: {'Medium_6-7': 715, 'Small_5-': 379, 'Large_8-9': 252, 'XLarge_10+': 62}
✓ Categorical features created


## 5. Feature Selection & Correlation Analysis


In [15]:
# New features correlation with SalePrice
new_features = [
    'Price_vs_Neighborhood', 'Neighborhood_ZScore', 'Price_vs_Category',
    'Quality_TotalSF_Squared', 'Quality_Efficiency', 'Premium_Size_Effect', 'Quality_Area_Efficiency',
    'Remodel_Impact', 'Recent_Remodel_Bonus', 'Remodel_Quality_Boost', 'Age_Remodel_Balance'
]

correlations = df[new_features + ['SalePrice']].corr()['SalePrice'].drop('SalePrice').sort_values(ascending=False)
print("New features correlation with SalePrice:")
for feature, corr in correlations.items():
    print(f"{feature:25s}: {corr:.3f}")

# Select top performing new features
top_new_features = correlations[correlations.abs() > 0.3].index.tolist()
print(f"\nSelected new features (|corr| > 0.3): {len(top_new_features)}")
print(top_new_features)


New features correlation with SalePrice:
Quality_TotalSF_Squared  : 0.887
Quality_Efficiency       : 0.801
Price_vs_Category        : 0.727
Price_vs_Neighborhood    : 0.635
Neighborhood_ZScore      : 0.629
Age_Remodel_Balance      : 0.610
Recent_Remodel_Bonus     : 0.401
Quality_Area_Efficiency  : 0.260
Remodel_Quality_Boost    : -0.045
Remodel_Impact           : -0.160
Premium_Size_Effect      : -0.217

Selected new features (|corr| > 0.3): 7
['Quality_TotalSF_Squared', 'Quality_Efficiency', 'Price_vs_Category', 'Price_vs_Neighborhood', 'Neighborhood_ZScore', 'Age_Remodel_Balance', 'Recent_Remodel_Bonus']


## 6. Preprocessing for New Features


In [16]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

# Define feature groups
categorical_features = ['FullBath_Cat', 'Fireplaces_Cat', 'Rooms_Cat']
new_numeric_features = [f for f in top_new_features if f not in categorical_features]

print(f"Categorical features: {categorical_features}")
print(f"New numeric features: {new_numeric_features}")

# Simple preprocessing
df_processed = df.copy()

# 1. One-hot encode categorical features for linear models
onehot_columns = []
for cat_feature in categorical_features:
    if cat_feature in df_processed.columns:
        # One-hot encoding
        dummies = pd.get_dummies(df_processed[cat_feature], prefix=cat_feature, drop_first=True)
        df_processed = pd.concat([df_processed, dummies], axis=1)
        onehot_columns.extend(dummies.columns.tolist())

# 2. Label encode categorical features for random forest models
from sklearn.preprocessing import LabelEncoder
df_rf_processed = df_processed.copy()
for col in categorical_features:
    if col in df_rf_processed.columns:
        le = LabelEncoder()
        df_rf_processed[col] = le.fit_transform(df_rf_processed[col].astype(str))

# 3. Handle infinite values in new numeric features
for feature in new_numeric_features:
    if feature in df_processed.columns:
        # Replace inf with very large finite values
        df_processed[feature] = df_processed[feature].replace([np.inf, -np.inf], [1e10, -1e10])
        # Fill any remaining NaN with median
        df_processed[feature] = df_processed[feature].fillna(df_processed[feature].median())


# preprocessing summary
print(f"Feature counts: Categorical={len(categorical_features)} | Numeric={len(new_numeric_features)} | One-hot={len(onehot_columns)}")
print("✓ Preprocessing completed (One-hot for Linear/Ridge, Label encoding for RF)")


Categorical features: ['FullBath_Cat', 'Fireplaces_Cat', 'Rooms_Cat']
New numeric features: ['Quality_TotalSF_Squared', 'Quality_Efficiency', 'Price_vs_Category', 'Price_vs_Neighborhood', 'Neighborhood_ZScore', 'Age_Remodel_Balance', 'Recent_Remodel_Bonus']
Feature counts: Categorical=3 | Numeric=7 | One-hot=8
✓ Preprocessing completed (One-hot for Linear/Ridge, Label encoding for RF)


## 7. Create Enhanced Datasets


In [17]:
# Create datasets for different model types
original_features = [
    'Quality_x_TotalSF', 'Garage_x_Quality', 'Bath_x_Area', 'GrLivArea', 'GarageArea',
    'YearBuilt', 'EffectiveAge', 'MasVnrArea', 'HouseAge', 'GarageCars', 'TotalSF', 
    'Quality_x_Area', '1stFlrSF', 'TotalBathrooms', 'TotalBsmtSF', 'AvgRoomSize', 
    'YearRemodAdd', 'LotArea', 'TotalPorchSF', 'BsmtFinSF1', 'OpenPorchSF'
]

# Linear/Ridge model features (one-hot encoded categorical + numeric)
linear_features = original_features + new_numeric_features + onehot_columns

# Random Forest features (original categorical + numeric)
rf_features = original_features + new_numeric_features + categorical_features

# Create datasets with appropriate preprocessing
X_linear = df_processed[linear_features].copy()
X_ridge = X_linear.copy()  # Ridge uses same features as Linear  
X_rf = df_rf_processed[rf_features].copy()  # Use label-encoded version for RF

# Prepare target variable (both original and log-transformed)
y = df_processed['SalePrice'].copy()
y_log = np.log1p(y)

print(f"Linear/Ridge model features: {len(linear_features)}")
print(f"Random Forest features: {len(rf_features)}")
print(f"Linear dataset shape: {X_linear.shape}")
print(f"Ridge dataset shape: {X_ridge.shape}")
print(f"Random Forest dataset shape: {X_rf.shape}")

# Save enhanced datasets to dedicated directory
X_linear.to_pickle('../../data/enhanced/df_enhanced_linear.pkl')
X_ridge.to_pickle('../../data/enhanced/df_enhanced_ridge.pkl')
X_rf.to_pickle('../../data/enhanced/df_enhanced_rf.pkl')
y.to_pickle('../../data/enhanced/y_enhanced.pkl')
y_log.to_pickle('../../data/enhanced/y_enhanced_log.pkl')

print("✓ Enhanced datasets saved for all three model types")


Linear/Ridge model features: 36
Random Forest features: 31
Linear dataset shape: (1408, 36)
Ridge dataset shape: (1408, 36)
Random Forest dataset shape: (1408, 31)
✓ Enhanced datasets saved for all three model types
