# Feature Engineering

This notebook creates new features from existing ones to improve model performance.

## What we will do:
1. Load cleaned data
2. Create interaction features
3. Create ratio features
4. Create derived features
5. Create categorical features
6. Validate and save engineered features


In [63]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set up plotting
plt.style.use('default')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)


## 1. Load Cleaned Data


In [64]:
# Load cleaned dataset
df = pd.read_pickle('../../data/processed/train_cleaned.pkl')

print(f"Dataset shape: {df.shape}")
print(f"Missing values: {df.isnull().sum().sum()}")
print("\nDataset info:")
print(df.info())


Dataset shape: (1408, 81)
Missing values: 0

Dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 1408 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Id             1408 non-null   int64   
 1   MSSubClass     1408 non-null   category
 2   MSZoning       1408 non-null   category
 3   LotFrontage    1408 non-null   float64 
 4   LotArea        1408 non-null   int64   
 5   Street         1408 non-null   category
 6   Alley          1408 non-null   category
 7   LotShape       1408 non-null   category
 8   LandContour    1408 non-null   category
 9   Utilities      1408 non-null   category
 10  LotConfig      1408 non-null   category
 11  LandSlope      1408 non-null   category
 12  Neighborhood   1408 non-null   category
 13  Condition1     1408 non-null   category
 14  Condition2     1408 non-null   category
 15  BldgType       1408 non-null   category
 16  HouseStyle     1408 non-

## 2. Create Interaction Features


In [65]:
# Create interaction features
print("Creating interaction features...")

# Quality x Area interactions
df['Quality_x_Area'] = df['OverallQual'].astype(float) * df['GrLivArea']
df['Quality_x_TotalSF'] = df['OverallQual'].astype(float) * (df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'])

# Bathroom interactions
df['Bath_x_Area'] = (df['FullBath'] + 0.5 * df['HalfBath']) * df['GrLivArea']

# Garage interactions  
df['Garage_x_Quality'] = df['GarageCars'] * df['OverallQual'].astype(float)

print("✓ Interaction features created")
print(f"New features: Quality_x_Area, Quality_x_TotalSF, Bath_x_Area, Garage_x_Quality")


Creating interaction features...
✓ Interaction features created
New features: Quality_x_Area, Quality_x_TotalSF, Bath_x_Area, Garage_x_Quality


## 3. Create Ratio Features


In [66]:
# Create ratio features
print("Creating ratio features...")

# Living area ratios
df['GrLivArea_to_LotArea'] = df['GrLivArea'] / df['LotArea']
df['1stFlr_to_GrLivArea'] = df['1stFlrSF'] / (df['GrLivArea'] + 1)  # +1 to avoid division by zero

# Garage ratio
df['GarageArea_to_GrLivArea'] = df['GarageArea'] / (df['GrLivArea'] + 1)

# Basement ratio
df['BsmtArea_to_GrLivArea'] = df['TotalBsmtSF'] / (df['GrLivArea'] + 1)

# Room size
df['AvgRoomSize'] = df['GrLivArea'] / (df['TotRmsAbvGrd'] + 1)

# Replace any infinite values with 0
ratio_cols = ['GrLivArea_to_LotArea', '1stFlr_to_GrLivArea', 'GarageArea_to_GrLivArea', 
              'BsmtArea_to_GrLivArea', 'AvgRoomSize']

for col in ratio_cols:
    df[col] = df[col].replace([np.inf, -np.inf], 0)
    
print("✓ Ratio features created")
print(f"New features: {', '.join(ratio_cols)}")


Creating ratio features...
✓ Ratio features created
New features: GrLivArea_to_LotArea, 1stFlr_to_GrLivArea, GarageArea_to_GrLivArea, BsmtArea_to_GrLivArea, AvgRoomSize


## 4. Create Derived Features


In [67]:
# Create derived features
print("Creating derived features...")

# Total square footage
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

# Total bathrooms
df['TotalBathrooms'] = (
    df['FullBath'] + 0.5 * df['HalfBath'] + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
)

# Effective age (based on remodeling)
df['EffectiveAge'] = df['YrSold'] - df[['YearBuilt', 'YearRemodAdd']].max(axis=1)

# Porch area
porch_cols = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
df['TotalPorchSF'] = df[porch_cols].sum(axis=1)

# House age (numeric)
current_year = df['YrSold'].max()
df['HouseAge'] = current_year - df['YearBuilt']
print(f"HouseAge range: {df['HouseAge'].min()} to {df['HouseAge'].max()}")
print(f"Negative HouseAge count: {(df['HouseAge'] < 0).sum()}")
df['HouseAge'] = df['HouseAge'].clip(lower=0)

# Collect derived features
derived_features = ['TotalSF', 'TotalBathrooms', 'EffectiveAge', 'TotalPorchSF', 'HouseAge']

print("✓ Derived features created")
print(f"New features: {', '.join(derived_features)}")

# Show basic statistics for numeric derived features
numeric_derived = ['TotalSF', 'TotalBathrooms', 'EffectiveAge', 'TotalPorchSF', 'HouseAge']
print("\nDerived features statistics:")
print(df[numeric_derived].describe())


Creating derived features...
HouseAge range: 0 to 138
Negative HouseAge count: 0
✓ Derived features created
New features: TotalSF, TotalBathrooms, EffectiveAge, TotalPorchSF, HouseAge

Derived features statistics:
           TotalSF  TotalBathrooms  EffectiveAge  TotalPorchSF     HouseAge
count  1408.000000     1408.000000   1408.000000   1408.000000  1408.000000
mean   2521.140625        2.187145     22.970170    175.991477    38.698153
std     717.257897        0.765708     20.614283    149.971904    30.183385
min     720.000000        1.000000      0.000000      0.000000     0.000000
25%    1998.750000        2.000000      4.000000     44.750000     9.750000
50%    2454.500000        2.000000     14.000000    163.000000    37.000000
75%    2972.250000        2.500000     41.000000    260.250000    56.000000
max    5271.000000        6.000000     60.000000   1027.000000   138.000000


## 5. Create Categorical Features


In [68]:
# Create categorical features
print("Creating categorical features...")

# Age categories from numeric HouseAge
df['AgeCategory'] = pd.cut(
    df['HouseAge'],
    bins=[0, 10, 30, 50, 200],
    labels=['New', 'Modern', 'Mature', 'Old'],
    include_lowest=True
).astype('category')

# Binary/categorical flags
df['IsNew'] = (df['YearBuilt'] == df['YrSold']).astype('category')
df['Has2ndFloor'] = (df['2ndFlrSF'] > 0).astype('category')
df['HasBasement'] = (df['TotalBsmtSF'] > 0).astype('category')
df['HasFireplace'] = (df['Fireplaces'] > 0).astype('category')
df['HasPool'] = (df['PoolArea'] > 0).astype('category')
df['HasMasVnrArea'] = (df['MasVnrArea'] > 0).astype('category')
df['HasPorch'] = (df['TotalPorchSF'] > 0).astype('category')

# Quality categories
df['QualityCategory'] = pd.cut(
    df['OverallQual'].astype(float),
    bins=[0, 4, 7, 10],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
).astype('category')

# Neighborhood category by median SalePrice quartiles
neighborhood_median = df.groupby('Neighborhood')['SalePrice'].median()
q1, q3 = neighborhood_median.quantile([0.25, 0.75])
neighborhood_label = pd.cut(
    neighborhood_median,
    bins=[-np.inf, q1, q3, np.inf],
    labels=['Budget', 'Middle', 'Premium'],
    include_lowest=True
)
df['NeighborhoodCategory'] = df['Neighborhood'].map(neighborhood_label).astype('category')

categorical_features = [
    'AgeCategory', 'IsNew', 'Has2ndFloor', 'HasBasement', 'QualityCategory',
    'HasFireplace', 'HasPool', 'HasMasVnrArea', 'HasPorch', 'NeighborhoodCategory'
]

print("✓ Categorical features created")
print(f"New features: {', '.join(categorical_features)}")
print(f"Total categorical features created: {len(categorical_features)}")

# Show value counts for new categorical features
for col in categorical_features:
    print(f"\n{col} distribution:")
    print(df[col].value_counts())


Creating categorical features...
✓ Categorical features created
New features: AgeCategory, IsNew, Has2ndFloor, HasBasement, QualityCategory, HasFireplace, HasPool, HasMasVnrArea, HasPorch, NeighborhoodCategory
Total categorical features created: 10

AgeCategory distribution:
AgeCategory
Old       469
New       376
Mature    349
Modern    214
Name: count, dtype: int64

IsNew distribution:
IsNew
False    1347
True       61
Name: count, dtype: int64

Has2ndFloor distribution:
Has2ndFloor
False    800
True     608
Name: count, dtype: int64

HasBasement distribution:
HasBasement
True     1372
False      36
Name: count, dtype: int64

QualityCategory distribution:
QualityCategory
Medium    1066
High       209
Low        133
Name: count, dtype: int64

HasFireplace distribution:
HasFireplace
True     730
False    678
Name: count, dtype: int64

HasPool distribution:
HasPool
False    1404
True        4
Name: count, dtype: int64

HasMasVnrArea distribution:
HasMasVnrArea
False    839
True     569


  neighborhood_median = df.groupby('Neighborhood')['SalePrice'].median()


## 6. Validate and Save Engineered Features


In [69]:
# Validate engineered features
print("=== FEATURE ENGINEERING SUMMARY ===")
print(f"Original dataset shape: {df.shape}")

# List all new features created
all_new_features = (['Quality_x_Area', 'Quality_x_TotalSF', 'Bath_x_Area', 'Garage_x_Quality'] +
                   ratio_cols + categorical_features + derived_features)

print(f"Total new features created: {len(all_new_features)}")
print("New features list:")
for i, feature in enumerate(all_new_features, 1):
    dtype = df[feature].dtype
    print(f"  {i:2d}. {feature} ({dtype})")

# Check for missing values in new features
print(f"\nMissing values in new features:")
missing_new = df[all_new_features].isnull().sum()
if missing_new.sum() > 0:
    print("⚠️ Missing values found:")
    print(missing_new[missing_new > 0])
    
    # Handle missing values in AgeCategory if they exist
    if 'AgeCategory' in missing_new.index and missing_new['AgeCategory'] > 0:
        print(f"\nFixing {missing_new['AgeCategory']} missing values in AgeCategory...")
        # Fill missing AgeCategory with 'Old' as default
        df['AgeCategory'] = df['AgeCategory'].fillna('Old').astype('category')
        print("✓ Missing values in AgeCategory filled with 'Old'")
        
        # Recheck missing values
        missing_new_after = df[all_new_features].isnull().sum()
        if missing_new_after.sum() == 0:
            print("✓ All missing values resolved")
        else:
            print("⚠️ Some missing values remain:")
            print(missing_new_after[missing_new_after > 0])
else:
    print("✓ No missing values in new features")

# Check for infinite values in numeric new features
print(f"\nInfinite values check:")
numeric_new = df[all_new_features].select_dtypes(include=[np.number]).columns
inf_check = df[numeric_new].replace([np.inf, -np.inf], np.nan).isnull().sum()
if inf_check.sum() > 0:
    print("⚠️ Infinite values found:")
    print(inf_check[inf_check > 0])
else:
    print("✓ No infinite values found")


=== FEATURE ENGINEERING SUMMARY ===
Original dataset shape: (1408, 105)
Total new features created: 24
New features list:
   1. Quality_x_Area (float64)
   2. Quality_x_TotalSF (float64)
   3. Bath_x_Area (float64)
   4. Garage_x_Quality (float64)
   5. GrLivArea_to_LotArea (float64)
   6. 1stFlr_to_GrLivArea (float64)
   7. GarageArea_to_GrLivArea (float64)
   8. BsmtArea_to_GrLivArea (float64)
   9. AvgRoomSize (float64)
  10. AgeCategory (category)
  11. IsNew (category)
  12. Has2ndFloor (category)
  13. HasBasement (category)
  14. QualityCategory (category)
  15. HasFireplace (category)
  16. HasPool (category)
  17. HasMasVnrArea (category)
  18. HasPorch (category)
  19. NeighborhoodCategory (category)
  20. TotalSF (int64)
  21. TotalBathrooms (float64)
  22. EffectiveAge (int64)
  23. TotalPorchSF (int64)
  24. HouseAge (int64)

Missing values in new features:
✓ No missing values in new features

Infinite values check:
✓ No infinite values found


In [70]:
# Save engineered dataset
import os

# Create processed data directory if it doesn't exist
processed_dir = '../../data/processed/'
os.makedirs(processed_dir, exist_ok=True)

# Save engineered dataset
output_file = os.path.join(processed_dir, 'df_engineered.pkl')
df.to_pickle(output_file)

print(f"✓ Engineered dataset saved to: {output_file}")
print(f"Final dataset shape: {df.shape}")
print(f"File size: {os.path.getsize(output_file) / 1024**2:.2f} MB")

# Display first few rows of new features
print(f"\nSample of new features:")
sample_features = all_new_features[:8]  # Show first 8 new features
print(df[sample_features].head())


✓ Engineered dataset saved to: ../../data/processed/df_engineered.pkl
Final dataset shape: (1408, 105)
File size: 0.62 MB

Sample of new features:
   Quality_x_Area  Quality_x_TotalSF  Bath_x_Area  Garage_x_Quality  \
0         11970.0            17962.0       4275.0              14.0   
1          7572.0            15144.0       2524.0              12.0   
2         12502.0            18942.0       4465.0              14.0   
3         12019.0            17311.0       1717.0              21.0   
4         17584.0            26744.0       5495.0              24.0   

   GrLivArea_to_LotArea  1stFlr_to_GrLivArea  GarageArea_to_GrLivArea  \
0              0.202367             0.500292                 0.320281   
1              0.131458             0.999208                 0.364212   
2              0.158756             0.514829                 0.340235   
3              0.179791             0.559371                 0.373690   
4              0.154137             0.520691                 