In [12]:
# 03_feature_engineering.ipynb
# Purpose: Create aggregated and derived features for the Ames Housing dataset.

import pandas as pd
import numpy as np

# Widen the notebook display so wide frames are not truncated.
for option_name, option_value in (
    ("display.max_columns", 120),
    ("display.width", 140),
):
    pd.set_option(option_name, option_value)

# Location of the cleaned dataset read by this notebook.
clean_path = "data/cleaned/ames_cleaned.csv"

# Disable pandas' default NA parsing: with keep_default_na=False and an empty
# na_values list no string is converted to NaN, so a literal 'None' category
# is kept as a regular value.
read_options = {"keep_default_na": False, "na_values": []}
df_clean = pd.read_csv(clean_path, **read_options)

print("Loaded cleaned dataset:", df_clean.shape)
df_clean.head()

Loaded cleaned dataset: (2930, 81)


Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1960-01-01,1960-01-01,Hip,CompShg,BrkFace,Plywood,Stone,112.0,3,3,CBlock,3,4,4,4,639.0,1,0.0,441.0,1080.0,GasA,2,Y,SBrkr,1656,0,0,1656,1,0,1,0,3,1,3,7,8,2,4,Attchd,1960-01-01,Fin,2,528.0,3,3,P,210,62,0,0,0,0,0,0,,0,5,2010-01-01,WD,Normal,215000
1,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961-01-01,1961-01-01,Gable,CompShg,VinylSd,VinylSd,,0.0,3,3,CBlock,3,3,1,3,468.0,2,144.0,270.0,882.0,GasA,3,Y,SBrkr,896,0,0,896,0,0,1,0,2,1,3,5,8,0,0,Attchd,1961-01-01,Unf,1,730.0,3,3,Y,140,0,0,0,120,0,0,0,,0,6,2010-01-01,WD,Normal,105000
2,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958-01-01,1958-01-01,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,3,3,CBlock,3,3,1,5,923.0,1,0.0,406.0,1329.0,GasA,3,Y,SBrkr,1329,0,0,1329,0,0,1,1,3,1,4,6,8,0,0,Attchd,1958-01-01,Unf,1,312.0,3,3,Y,393,36,0,0,0,0,0,0,Gar2,12500,6,2010-01-01,WD,Normal,172000
3,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,7,5,1968-01-01,1968-01-01,Hip,CompShg,BrkFace,BrkFace,,0.0,4,3,CBlock,3,3,1,5,1065.0,1,0.0,1045.0,2110.0,GasA,5,Y,SBrkr,2110,0,0,2110,1,0,2,1,3,1,5,8,8,2,3,Attchd,1968-01-01,Fin,2,522.0,3,3,Y,0,0,0,0,0,0,0,0,,0,4,2010-01-01,WD,Normal,244000
4,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997-01-01,1998-01-01,Gable,CompShg,VinylSd,VinylSd,,0.0,3,3,PConc,4,3,1,6,791.0,1,0.0,137.0,928.0,GasA,4,Y,SBrkr,928,701,0,1629,0,0,2,1,3,1,3,6,8,1,3,Attchd,1997-01-01,Fin,2,482.0,3,3,Y,212,34,0,0,0,0,0,0,,0,3,2010-01-01,WD,Normal,189900


In [13]:
# === Structural & Area-Based Aggregations ===
# Builds five summed features on a fresh copy of the cleaned frame.

print("=== Creating structural and area-based aggregations ===")

# Work on a copy so df_clean itself is never modified.
df_eng = df_clean.copy()

# Bathrooms: full baths count 1, half baths count 0.5, above grade and basement.
df_eng["Total_Bathrooms"] = (
    df_eng["Full Bath"]
    .add(df_eng["Half Bath"].mul(0.5))
    .add(df_eng["Bsmt Full Bath"])
    .add(df_eng["Bsmt Half Bath"].mul(0.5))
)

# Finished basement area: both finished-area columns combined.
df_eng["Total_Bsmt_Finished_SF"] = df_eng["BsmtFin SF 1"].add(df_eng["BsmtFin SF 2"])

# Porch area: all four porch columns combined.
df_eng["Total_Porch_SF"] = (
    df_eng["Open Porch SF"]
    .add(df_eng["Enclosed Porch"])
    .add(df_eng["3Ssn Porch"])
    .add(df_eng["Screen Porch"])
)

# House area: above-ground living area plus the finished basement area from above.
df_eng["Total_House_SF"] = df_eng["Gr Liv Area"].add(df_eng["Total_Bsmt_Finished_SF"])

# Outdoor area: wood deck plus open porch.
df_eng["Outdoor_SF"] = df_eng["Wood Deck SF"].add(df_eng["Open Porch SF"])

structural_cols = [
    "Total_Bathrooms",
    "Total_Bsmt_Finished_SF",
    "Total_Porch_SF",
    "Total_House_SF",
    "Outdoor_SF",
]

print("Done. New columns created:")
df_eng[structural_cols].head()

=== Creating structural and area-based aggregations ===
Done. New columns created:


Unnamed: 0,Total_Bathrooms,Total_Bsmt_Finished_SF,Total_Porch_SF,Total_House_SF,Outdoor_SF
0,2.0,639.0,62,2295.0,272
1,1.0,612.0,120,1508.0,140
2,1.5,923.0,36,2252.0,429
3,3.5,1065.0,0,3175.0,0
4,2.5,791.0,34,2420.0,246


In [14]:
# === Ensure temporal columns are proper datetimes ===
# Converts the four year columns with pd.to_datetime so the .dt accessor can be used on them.

print("=== Parsing temporal columns as datetime ===")

date_cols = ["Year Built", "Year Remod/Add", "Garage Yr Blt", "Yr Sold"]

# Parse each column, then write the parsed series back under the same name.
parsed_dates = {name: pd.to_datetime(df_eng[name]) for name in date_cols}
for name, parsed in parsed_dates.items():
    df_eng[name] = parsed

print(df_eng[date_cols].dtypes)
df_eng[date_cols].head()

=== Parsing temporal columns as datetime ===
Year Built        datetime64[ns]
Year Remod/Add    datetime64[ns]
Garage Yr Blt     datetime64[ns]
Yr Sold           datetime64[ns]
dtype: object


Unnamed: 0,Year Built,Year Remod/Add,Garage Yr Blt,Yr Sold
0,1960-01-01,1960-01-01,1960-01-01,2010-01-01
1,1961-01-01,1961-01-01,1961-01-01,2010-01-01
2,1958-01-01,1958-01-01,1958-01-01,2010-01-01
3,1968-01-01,1968-01-01,1968-01-01,2010-01-01
4,1997-01-01,1998-01-01,1997-01-01,2010-01-01


In [15]:
# === Age-Based Features (in years) ===
# Each age is the sale year minus the relevant build/remodel year.

print("=== Creating age-based features ===")

# Pull the calendar year out of each datetime column.
year_sold        = df_eng["Yr Sold"].dt.year
house_year_built = df_eng["Year Built"].dt.year
house_year_remod = df_eng["Year Remod/Add"].dt.year
garage_year_blt  = df_eng["Garage Yr Blt"].dt.year

df_eng["House_Age"]     = year_sold.sub(house_year_built)
df_eng["Since_Remodel"] = year_sold.sub(house_year_remod)

# A garage year of exactly 1900 is treated as a dummy / fallback value and is
# swapped for the house's build year before the age is computed.
is_dummy_garage_year = garage_year_blt == 1900
garage_year_effective = garage_year_blt.mask(is_dummy_garage_year, house_year_built)
df_eng["Garage_Age"] = year_sold.sub(garage_year_effective)

age_cols = ["House_Age", "Since_Remodel", "Garage_Age"]

print("Done. New age-related features created:")
print(df_eng[age_cols].head())

=== Creating age-based features ===
Done. New age-related features created:
   House_Age  Since_Remodel  Garage_Age
0         50             50          50
1         49             49          49
2         52             52          52
3         42             42          42
4         13             12          13


In [16]:
# === Ratio & Normalized Features ===
# Creates five ratio features, then neutralises division-by-zero artefacts
# in those five columns only.

print("=== Creating ratio-based and normalized features ===")

# Per-room / per-bedroom ratios divide by raw counts, which can be 0.
df_eng["Area_per_Room"] = df_eng["Gr Liv Area"] / df_eng["TotRms AbvGrd"]
df_eng["Area_per_Bedroom"] = df_eng["Gr Liv Area"] / df_eng["Bedroom AbvGr"]
df_eng["Bathrooms_per_Bedroom"] = df_eng["Total_Bathrooms"] / df_eng["Bedroom AbvGr"]

# These two add 1 to the denominator, so a zero area cannot produce a division by zero.
df_eng["Outdoor_to_TotalArea"] = df_eng["Outdoor_SF"] / (df_eng["Total_House_SF"] + 1)
df_eng["Basement_to_Living_Ratio"] = df_eng["Total_Bsmt_Finished_SF"] / (df_eng["Gr Liv Area"] + 1)

ratio_cols = [
    "Area_per_Room",
    "Area_per_Bedroom",
    "Bathrooms_per_Bedroom",
    "Outdoor_to_TotalArea",
    "Basement_to_Living_Ratio",
]

# A zero denominator yields +/-inf (x / 0) or NaN (0 / 0); both become 0.
# The cleanup is restricted to the ratio columns: applying replace/fillna to
# the whole frame would also zero-fill missing values in every unrelated
# column (including NaT in the datetime columns) and hide them downstream.
df_eng[ratio_cols] = (
    df_eng[ratio_cols]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

print("Done. New ratio features created:")
df_eng[ratio_cols].head()

=== Creating ratio-based and normalized features ===
Done. New ratio features created:


Unnamed: 0,Area_per_Room,Area_per_Bedroom,Bathrooms_per_Bedroom,Outdoor_to_TotalArea,Basement_to_Living_Ratio
0,236.571429,552.0,0.666667,0.118467,0.385637
1,179.2,448.0,0.5,0.092777,0.682274
2,221.5,443.0,0.5,0.190413,0.693985
3,263.75,703.333333,1.166667,0.0,0.5045
4,271.5,543.0,0.833333,0.101611,0.485276


In [17]:
# === Interaction Features (multiplicative effects) ===
# Each feature is the element-wise product of two existing columns.

print("=== Creating interaction features ===")

# New feature name -> (left factor, right factor); created in this order.
interaction_pairs = {
    "Quality_x_Area": ("Overall Qual", "Gr Liv Area"),
    "Quality_x_TotalSF": ("Overall Qual", "Total_House_SF"),
    "Condition_x_Age": ("Overall Cond", "House_Age"),
    "Rooms_x_Bathrooms": ("TotRms AbvGrd", "Total_Bathrooms"),
    "Bedrooms_x_Bathrooms": ("Bedroom AbvGr", "Total_Bathrooms"),
}
for feature_name, (left_col, right_col) in interaction_pairs.items():
    df_eng[feature_name] = df_eng[left_col] * df_eng[right_col]

# Basement quality x finished area. If the quality column is absent the
# feature is a constant 0; otherwise values that cannot be parsed as numbers
# are coerced to NaN and then counted as 0 before multiplying.
if "Bsmt Qual" not in df_eng.columns:
    df_eng["Bsmt_Quality_x_FinishedSF"] = 0
else:
    bsmt_qual_score = pd.to_numeric(df_eng["Bsmt Qual"], errors="coerce").fillna(0)
    df_eng["Bsmt_Quality_x_FinishedSF"] = bsmt_qual_score * df_eng["Total_Bsmt_Finished_SF"]

interaction_cols = list(interaction_pairs) + ["Bsmt_Quality_x_FinishedSF"]

print("Done. New interaction features created:")
df_eng[interaction_cols].head()

=== Creating interaction features ===
Done. New interaction features created:


Unnamed: 0,Quality_x_Area,Quality_x_TotalSF,Condition_x_Age,Rooms_x_Bathrooms,Bedrooms_x_Bathrooms,Bsmt_Quality_x_FinishedSF
0,9936,13770.0,250,14.0,6.0,1917.0
1,4480,7540.0,294,5.0,2.0,1836.0
2,7974,13512.0,312,9.0,4.5,2769.0
3,14770,22225.0,210,28.0,10.5,3195.0
4,8145,12100.0,65,15.0,7.5,3164.0


In [18]:
# === Log-Transform Features ===
# Adds a log1p-transformed copy of each listed column; the source columns are kept.

print("=== Applying log-transforms to skewed features ===")

log_cols = [
    "Lot Area",
    "Gr Liv Area",
    "Total_House_SF",
    "Total_Bsmt_Finished_SF",
    "Total_Porch_SF",
    "Outdoor_SF",
    "SalePrice",
]

# Output names: "log_" prefix, with spaces in the source name turned into underscores.
log_feature_names = [f"log_{source.replace(' ', '_')}" for source in log_cols]

for source, target in zip(log_cols, log_feature_names):
    df_eng[target] = np.log1p(df_eng[source])

print("Done. Log-transformed columns created:")
df_eng[log_feature_names].head()

=== Applying log-transforms to skewed features ===
Done. Log-transformed columns created:


Unnamed: 0,log_Lot_Area,log_Gr_Liv_Area,log_Total_House_SF,log_Total_Bsmt_Finished_SF,log_Total_Porch_SF,log_Outdoor_SF,log_SalePrice
0,10.366309,7.412764,7.738924,6.461468,4.143135,5.609472,12.278398
1,9.360741,6.799056,7.319202,6.418365,4.795791,4.94876,11.561725
2,9.565775,7.192934,7.720018,6.828712,3.610918,6.063785,12.055256
3,9.320181,7.654917,8.063378,6.971669,0.0,0.0,12.404928
4,9.534668,7.396335,7.791936,6.674561,3.555348,5.509388,12.154258


In [19]:
# === Categorical Encoding Preparation ===
# Summarises every text-typed column: its number of distinct values and
# whether it is listed as ordinal or treated as nominal.

print("=== Preparing categorical encoding structures ===")

# Columns listed as ordinal (per the variable_dictionary metadata).
ordinal_cols = [
    "Exter Qual", "Exter Cond", "Bsmt Qual", "Bsmt Cond",
    "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
    "Heating QC", "Kitchen Qual", "Functional",
    "Fireplace Qu", "Garage Qual", "Garage Cond",
    "Pool QC", "Fence"
]
ordinal_lookup = set(ordinal_cols)

# Categorical candidates are the object / string dtype columns.
cat_cols = list(df_eng.select_dtypes(include=["object", "string"]).columns)

# Any categorical column that is not in the ordinal list is nominal.
nominal_cols = [name for name in cat_cols if name not in ordinal_lookup]

# Distinct-value count per categorical column.
cat_cardinality = {}
for name in cat_cols:
    cat_cardinality[name] = df_eng[name].nunique()

# One summary row per categorical column, highest cardinality first.
cat_summary = pd.DataFrame({
    "column": cat_cols,
    "cardinality": [cat_cardinality[name] for name in cat_cols],
    "type": ["ordinal" if name in ordinal_lookup else "nominal" for name in cat_cols],
})
cat_summary = cat_summary.sort_values("cardinality", ascending=False)

print("Summary of categorical columns:")
cat_summary.head(20)

=== Preparing categorical encoding structures ===
Summary of categorical columns:


Unnamed: 0,column,cardinality,type
8,Neighborhood,28,nominal
16,Exterior 2nd,17,nominal
15,Exterior 1st,16,nominal
26,Sale Type,10,nominal
9,Condition 1,9,nominal
14,Roof Matl,8,nominal
12,House Style,8,nominal
10,Condition 2,8,nominal
0,MS Zoning,7,nominal
18,Foundation,6,nominal


In [20]:
# === Save engineered dataset ===
# Writes df_eng to CSV (without the index) and reports its final shape.

print("=== Saving engineered dataset ===")

import os

output_path = "data/features/ames_features.csv"

# Create the target folder on first run; exist_ok makes re-runs a no-op.
os.makedirs("data/features", exist_ok=True)

# index=False: the row index is not written as a column.
df_eng.to_csv(path_or_buf=output_path, index=False)

print(f"Engineered dataset saved to: {output_path}")
print("Final shape:", df_eng.shape)

=== Saving engineered dataset ===
Engineered dataset saved to: data/features/ames_features.csv
Final shape: (2930, 107)


In [21]:
# === Generate Feature Engineering Report (Markdown) ===
# Writes a static markdown summary of the engineered features to reports/.

import os

os.makedirs("reports", exist_ok=True)
report_path = "reports/feature_engineering_report.md"

# Static report text. Trailing double spaces are markdown hard line breaks.
report_md = """# Feature Engineering Report — Ames Housing Dataset
Version: v1  
This document summarizes all engineered features produced in `03_feature_engineering.ipynb`.

---

## 1. Overview
This report documents all transformations performed during feature engineering: structural, temporal, ratio, interaction, and log-transformed features.  
Input: `data/cleaned/ames_cleaned.csv`  
Output: `data/features/ames_features.csv`

---

## 2. Structural & Area-Based Features

### Total_Bathrooms
Full Bath + 0.5×Half Bath + Bsmt Full Bath + 0.5×Bsmt Half Bath

### Total_Bsmt_Finished_SF
BsmtFin SF 1 + BsmtFin SF 2

### Total_Porch_SF
Open Porch SF + Enclosed Porch + 3Ssn Porch + Screen Porch

### Total_House_SF
Gr Liv Area + Total_Bsmt_Finished_SF

### Outdoor_SF
Wood Deck SF + Open Porch SF

---

## 3. Age-Based Features
(All temporal fields converted to datetime)

### House_Age
Yr Sold − Year Built

### Since_Remodel
Yr Sold − Year Remod/Add

### Garage_Age
Yr Sold − Garage Yr Blt  
(missing values replaced with Year Built)

---

## 4. Ratio Features
Area_per_Room = Gr Liv Area / TotRms AbvGrd  
Area_per_Bedroom = Gr Liv Area / Bedroom AbvGr  
Bathrooms_per_Bedroom = Total_Bathrooms / Bedroom AbvGr  
Outdoor_to_TotalArea = Outdoor_SF / (Total_House_SF + 1)  
Basement_to_Living_Ratio = Total_Bsmt_Finished_SF / (Gr Liv Area + 1)

---

## 5. Interaction Features
Quality_x_Area = Overall Qual × Gr Liv Area  
Quality_x_TotalSF = Overall Qual × Total_House_SF  
Condition_x_Age = Overall Cond × House_Age  
Rooms_x_Bathrooms = TotRms AbvGrd × Total_Bathrooms  
Bedrooms_x_Bathrooms = Bedroom AbvGr × Total_Bathrooms  
Bsmt_Quality_x_FinishedSF = Bsmt Qual × Total_Bsmt_Finished_SF

---

## 6. Log-Transformed Features
Applied using log1p():

- Lot Area  
- Gr Liv Area  
- Total_House_SF  
- Total_Bsmt_Finished_SF  
- Total_Porch_SF  
- Outdoor_SF  
- SalePrice  

Fixes skewness and improves modeling.

---

## 7. Categorical Encoding Preparation
Ordinal features mapped based on domain meaning.  
Nominal features prepared for one-hot encoding.  
Cardinality recorded for all categorical columns.

---

## 8. Final Output
Final engineered dataset saved to: `data/features/ames_features.csv`

End of Report.
"""

# The report contains non-ASCII characters (—, ×, −). Writing with the platform
# default encoding can raise UnicodeEncodeError (e.g. cp1252 has no U+2212),
# so the encoding is fixed to UTF-8.
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report_md)

print(f"Report saved to: {report_path}")

Report saved to: reports/feature_engineering_report.md
