# Basic Feature Engineering

This notebook implements minimal feature engineering with only essential features.

## What we will do:
1. Load cleaned data
2. Create basic features (TotalSF, HouseAge)
3. Save processed data


In [1]:
# Import libraries
import sys
from pathlib import Path

import numpy as np
import pandas as pd

sys.path.append('../../scripts')
from feature_builders import BasicFeatureBuilder


## 1. Load Data


In [2]:
# Read the cleaned dataset from the data/cleaned folder and report its size
cleaned_csv = '../../data/cleaned/domain_cleaned.csv'
df = pd.read_csv(cleaned_csv)
print(f"Data shape: {df.shape}")


Data shape: (1161, 81)


## 2. Basic Feature Engineering


In [3]:
# Split the frame into the target column and the remaining predictor columns
target_col = 'SalePrice'
y = df[target_col]
X = df.drop(columns=target_col)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")


Features shape: (1161, 80)
Target shape: (1161,)


In [4]:
# Fit the builder on the features and target, then derive the engineered frame
basic_builder = BasicFeatureBuilder()
basic_builder.fit(X, y)
X_basic = basic_builder.transform(X)

# Report how many columns the transform added
n_before = X.shape[1]
n_after = X_basic.shape[1]
print(f"Original features: {n_before}")
print(f"After basic engineering: {n_after}")
print(f"New features added: {n_after - n_before}")


Original features: 80
After basic engineering: 82
New features added: 2


In [5]:
# Identify columns present after engineering that were not in the input frame,
# keeping them in the order they appear in X_basic
original_columns = set(X.columns)
new_features = [name for name in X_basic.columns if name not in original_columns]
print(f"New features: {new_features}")

# Summarise the added columns only when at least one was produced
if len(new_features) > 0:
    print("\nNew feature statistics:")
    print(X_basic[new_features].describe())


New features: ['TotalSF', 'HouseAge']

New feature statistics:
           TotalSF     HouseAge
count  1161.000000  1161.000000
mean   2560.031869    38.761413
std     766.855433    30.177134
min     720.000000     0.000000
25%    2014.000000    10.000000
50%    2486.000000    37.000000
75%    2992.000000    56.000000
max    6872.000000   138.000000


## 3. Save Data


In [6]:
# Combine features and target in a fresh frame so X_basic itself is left
# untouched and this cell can be re-run without side effects on earlier state.
df_basic = X_basic.copy()
df_basic['SalePrice'] = y

# Save to processed folder. DataFrame.to_csv does not create missing parent
# directories (it raises OSError instead), so make sure the folder exists
# before writing; this keeps Restart & Run All working on a fresh checkout.
output_path = Path('../../data/processed/df_basic_features.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_basic.to_csv(output_path, index=False)
print(f"Saved basic features dataset: {df_basic.shape}")


Saved basic features dataset: (1161, 83)
