In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import sys

def find_project_root():
    """Find the housing_price_project directory."""
    current = Path.cwd().resolve()
    
    # Check if we're in notebooks/ - go up one level
    if current.name == 'notebooks':
        parent = current.parent
        if (parent / 'src').exists() and (parent / 'notebooks').exists():
            return parent
    
    # Search up the directory tree for housing_price_project
    path = current
    while path != path.parent:
        if path.name == 'housing_price_project' and (path / 'src').exists():
            return path
        path = path.parent
    
    # If not found, assume current directory is project root
    return current

project_root = find_project_root()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))


from src.features.encoders import (
    LabelEncoder, OrdinalEncoder, OneHotEncoder, 
    TargetEncoder, MultiColumnEncoder
)
from src.features.engineering import (
    AmesFeaturesEngineeer, LogTransformer
)

In [4]:
DATA_PATH = Path("../data/processed/ames_cleaned.csv")
df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df)} rows from {DATA_PATH}")


Loaded 2685 rows from ../data/processed/ames_cleaned.csv


In [5]:
target = "price"
y = df[target].values
X = df.drop(columns=[target])

print(f"Features: {X.shape}")
print(f"Target: {y.shape}")

Features: (2685, 79)
Target: (2685,)


In [6]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {categorical_cols}")

Categorical columns: ['MS.Zoning', 'Street', 'Alley', 'Lot.Shape', 'Land.Contour', 'Utilities', 'Lot.Config', 'Land.Slope', 'Neighborhood', 'Condition.1', 'Condition.2', 'Bldg.Type', 'House.Style', 'Roof.Style', 'Roof.Matl', 'Exterior.1st', 'Exterior.2nd', 'Mas.Vnr.Type', 'Exter.Qual', 'Exter.Cond', 'Foundation', 'Bsmt.Qual', 'Bsmt.Cond', 'Bsmt.Exposure', 'BsmtFin.Type.1', 'BsmtFin.Type.2', 'Heating', 'Heating.QC', 'Central.Air', 'Electrical', 'Kitchen.Qual', 'Functional', 'Fireplace.Qu', 'Garage.Type', 'Garage.Finish', 'Garage.Qual', 'Garage.Cond', 'Paved.Drive', 'Pool.QC', 'Fence', 'Misc.Feature', 'Sale.Type', 'Sale.Condition']


In [7]:
cardinality = {col: X[col].nunique() for col in categorical_cols}
print("\nCardinality:")
for col, card in sorted(cardinality.items(), key=lambda x: -x[1])[:15]:
    print(f"  {col}: {card} unique values")


Cardinality:
  Neighborhood: 28 unique values
  Exterior.1st: 16 unique values
  Exterior.2nd: 16 unique values
  Condition.1: 9 unique values
  Sale.Type: 9 unique values
  Condition.2: 8 unique values
  House.Style: 8 unique values
  Functional: 8 unique values
  MS.Zoning: 7 unique values
  Roof.Matl: 7 unique values
  Roof.Style: 6 unique values
  Foundation: 6 unique values
  BsmtFin.Type.1: 6 unique values
  BsmtFin.Type.2: 6 unique values
  Heating: 6 unique values


In [8]:
ordinal_mappings = {
    # Quality ratings: None < Poor < Fair < Average < Good < Excellent
    'Exter.Qual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Exter.Cond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Bsmt.Qual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Bsmt.Cond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Heating.QC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Kitchen.Qual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fireplace.Qu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Garage.Qual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Garage.Cond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Pool.QC': ['None', 'Fa', 'TA', 'Gd', 'Ex'],
    
    # Basement exposure: None < No < Mn < Av < Gd
    'Bsmt.Exposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
    
    # Basement finish type
    'BsmtFin.Type.1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFin.Type.2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    
    # Garage finish
    'Garage.Finish': ['None', 'Unf', 'RFn', 'Fin'],
    
    # Fence quality
    'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
    
    # Paved drive
    'Paved.Drive': ['N', 'P', 'Y'],
    
    # Lot shape
    'Lot.Shape': ['IR3', 'IR2', 'IR1', 'Reg'],
    
    # Land slope
    'Land.Slope': ['Sev', 'Mod', 'Gtl'],
    
    # Functional (home functionality)
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
}

# 2. TARGET ENCODING: High cardinality (neighborhood, etc.)
target_encode_cols = ['Neighborhood']  # 25+ values

# 3. ONE-HOT: Nominal categories (no natural order)
onehot_cols = [
    'MS.Zoning', 'Street', 'Alley', 'Land.Contour', 'Lot.Config',
    'Condition.1', 'Condition.2', 'Bldg.Type', 'House.Style',
    'Roof.Style', 'Roof.Matl', 'Exterior.1st', 'Exterior.2nd',
    'Mas.Vnr.Type', 'Foundation', 'Heating', 'Electrical',
    'Garage.Type', 'Misc.Feature', 'Sale.Type', 'Sale.Condition',
    'Central.Air', 'Utilities'
]

# Filter to columns that exist
ordinal_cols = {k: v for k, v in ordinal_mappings.items() if k in X.columns}
target_encode_cols = [c for c in target_encode_cols if c in X.columns]
onehot_cols = [c for c in onehot_cols if c in X.columns]

print(f"\nEncoding plan:")
print(f"  Ordinal: {len(ordinal_cols)} columns")
print(f"  Target: {len(target_encode_cols)} columns")
print(f"  One-Hot: {len(onehot_cols)} columns")



Encoding plan:
  Ordinal: 19 columns
  Target: 1 columns
  One-Hot: 23 columns


In [9]:
print("\n" + "="*60)
print("STEP 2: ORDINAL ENCODING")
print("="*60)

X_encoded = X.copy()

for col, order in ordinal_cols.items():
    encoder = OrdinalEncoder(order=order, unknown_value=-1)
    encoder.fit()
    X_encoded[col] = encoder.transform(X[col].fillna('None').astype(str))
    print(f"  {col}: encoded with order {order[:3]}...{order[-1]}")



STEP 2: ORDINAL ENCODING
  Exter.Qual: encoded with order ['Po', 'Fa', 'TA']...Ex
  Exter.Cond: encoded with order ['Po', 'Fa', 'TA']...Ex
  Bsmt.Qual: encoded with order ['None', 'Po', 'Fa']...Ex
  Bsmt.Cond: encoded with order ['None', 'Po', 'Fa']...Ex
  Heating.QC: encoded with order ['Po', 'Fa', 'TA']...Ex
  Kitchen.Qual: encoded with order ['Po', 'Fa', 'TA']...Ex
  Fireplace.Qu: encoded with order ['None', 'Po', 'Fa']...Ex
  Garage.Qual: encoded with order ['None', 'Po', 'Fa']...Ex
  Garage.Cond: encoded with order ['None', 'Po', 'Fa']...Ex
  Pool.QC: encoded with order ['None', 'Fa', 'TA']...Ex
  Bsmt.Exposure: encoded with order ['None', 'No', 'Mn']...Gd
  BsmtFin.Type.1: encoded with order ['None', 'Unf', 'LwQ']...GLQ
  BsmtFin.Type.2: encoded with order ['None', 'Unf', 'LwQ']...GLQ
  Garage.Finish: encoded with order ['None', 'Unf', 'RFn']...Fin
  Fence: encoded with order ['None', 'MnWw', 'GdWo']...GdPrv
  Paved.Drive: encoded with order ['N', 'P', 'Y']...Y
  Lot.Shape: enco

In [10]:
print("\n" + "="*60)
print("STEP 3: TARGET ENCODING")
print("="*60)

# Important: We'll need to be careful about this during CV!
# For now, fit on all training data (we'll handle CV properly later)

for col in target_encode_cols:
    encoder = TargetEncoder(smoothing=10)
    encoder.fit(X[col], y)
    X_encoded[col] = encoder.transform(X[col])
    
    # Show encoding map
    encoding_map = encoder.get_encoding_map()
    print(f"\n  {col} target encoding (smoothing=10):")
    sorted_map = sorted(encoding_map.items(), key=lambda x: x[1])
    print(f"    Lowest: {sorted_map[0][0]} → ${sorted_map[0][1]:,.0f}")
    print(f"    Highest: {sorted_map[-1][0]} → ${sorted_map[-1][1]:,.0f}")



STEP 3: TARGET ENCODING

  Neighborhood target encoding (smoothing=10):
    Lowest: IDOTRR → $110,413
    Highest: NoRidge → $310,817


In [11]:
print("\n" + "="*60)
print("STEP 4: ONE-HOT ENCODING")
print("="*60)

# Track columns before/after
cols_before = len(X_encoded.columns)

for col in onehot_cols:
    if col not in X_encoded.columns:
        continue
    
    encoder = OneHotEncoder(drop_first=True)  # Drop first to avoid multicollinearity
    encoder.fit(X_encoded[col].fillna('None').astype(str))
    
    # Get encoded values
    encoded = encoder.transform(X_encoded[col].fillna('None').astype(str))
    feature_names = encoder.get_feature_names(col)
    
    # Add new columns
    for i, name in enumerate(feature_names):
        X_encoded[name] = encoded[:, i]
    
    # Drop original
    X_encoded = X_encoded.drop(columns=[col])

cols_after = len(X_encoded.columns)
print(f"\nOne-hot encoding: {cols_before} → {cols_after} columns")
print(f"Added {cols_after - cols_before + len(onehot_cols)} columns")



STEP 4: ONE-HOT ENCODING

One-hot encoding: 79 → 181 columns
Added 125 columns


  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[name] = encoded[:, i]
  X_encoded[na

In [12]:
print("\n" + "="*60)
print("STEP 5: FEATURE ENGINEERING")
print("="*60)

engineer = AmesFeaturesEngineeer(reference_year=2010)
X_featured = engineer.fit_transform(X_encoded)


new_features = engineer.get_new_feature_names()
print(f"\nCreated {len(new_features)} new features:")
for feat in new_features:
    print(f"  • {feat}")


STEP 5: FEATURE ENGINEERING

Created 23 new features:
  • House_Age
  • Years_Since_Remodel
  • Was_Remodeled
  • Is_New
  • Total_SF
  • Total_Porch_SF
  • Total_Outdoor_SF
  • Bsmt_Finished_Ratio
  • Above_Grade_Ratio
  • Total_Bathrooms
  • Qual_Cond_Product
  • Qual_Cond_Sum
  • Qual_Per_SF
  • Has_Garage
  • Garage_Area_Per_Car
  • Has_Pool
  • Has_Fireplace
  • Has_2nd_Floor
  • Has_Basement
  • Sold_Spring
  • Sold_Summer
  • Sold_Fall
  • Sold_Winter


In [13]:
print("\n" + "="*60)
print("STEP 6: LOG TRANSFORM")
print("="*60)

# Apply log transform to target (very skewed)
y_log = np.log1p(y)
print(f"Target skewness: {pd.Series(y).skew():.3f} → {pd.Series(y_log).skew():.3f}")

# Identify skewed numeric features
skewed_features = []
numeric_cols = X_featured.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    skewness = X_featured[col].skew()
    if abs(skewness) > 1.0:  # Highly skewed
        skewed_features.append((col, skewness))

skewed_features.sort(key=lambda x: -abs(x[1]))
print(f"\nHighly skewed features (|skew| > 1.0): {len(skewed_features)}")
for col, skew in skewed_features[:10]:
    print(f"  {col}: {skew:.2f}")

# Apply log transform to area-related features
area_features = [col for col in numeric_cols if 'SF' in col or 'Area' in col or col == 'area']
log_transformer = LogTransformer(columns=area_features)
X_final = log_transformer.fit_transform(X_featured)

print(f"\nLog-transformed {len(area_features)} area features")



STEP 6: LOG TRANSFORM
Target skewness: 1.850 → -0.102

Highly skewed features (|skew| > 1.0): 158
  Condition.2_RRAn: 51.82
  Condition.2_RRAe: 51.82
  Mas.Vnr.Type_CBlock: 51.82
  Roof.Matl_Roll: 51.82
  Exterior.1st_ImStucc: 51.82
  Roof.Matl_Membran: 51.82
  Roof.Matl_Metal: 51.82
  Exterior.1st_PreCast: 51.82
  Exterior.2nd_PreCast: 51.82
  Electrical_Mix: 51.82

Log-transformed 20 area features


In [14]:
print("\n" + "="*60)
print("STEP 7: VALIDATION")
print("="*60)

print(f"\nFinal feature matrix: {X_final.shape}")
print(f"Original features: {X.shape[1]}")
print(f"After engineering: {X_final.shape[1]}")

# Check for any remaining issues
print("\nData quality checks:")

# NaN check
nan_counts = X_final.isnull().sum()
cols_with_nan = nan_counts[nan_counts > 0]
if len(cols_with_nan) > 0:
    print(f"  ⚠ Columns with NaN: {len(cols_with_nan)}")
    print(cols_with_nan)
else:
    print("  ✓ No NaN values")

# Infinite values
inf_counts = np.isinf(X_final.select_dtypes(include=[np.number])).sum()
cols_with_inf = inf_counts[inf_counts > 0]
if len(cols_with_inf) > 0:
    print(f"  ⚠ Columns with Inf: {len(cols_with_inf)}")
else:
    print("  ✓ No infinite values")

# Non-numeric columns remaining
non_numeric = X_final.select_dtypes(include=['object']).columns
if len(non_numeric) > 0:
    print(f"  ⚠ Non-numeric columns remaining: {list(non_numeric)}")
else:
    print("  ✓ All columns are numeric")



STEP 7: VALIDATION

Final feature matrix: (2685, 204)
Original features: 79
After engineering: 204

Data quality checks:
  ✓ No NaN values
  ✓ No infinite values
  ✓ All columns are numeric


In [15]:
print("\n" + "="*60)
print("STEP 8: SAVE")
print("="*60)

# Combine features and target
df_final = X_final.copy()
df_final['price'] = y
df_final['log_price'] = y_log

OUTPUT_PATH = Path("../data/features/ames_featured.csv")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(OUTPUT_PATH, index=False)

print(f"\n✓ Featured data saved to: {OUTPUT_PATH}")
print(f"  Shape: {df_final.shape}")
print(f"  Features: {X_final.shape[1]}")



STEP 8: SAVE

✓ Featured data saved to: ../data/features/ames_featured.csv
  Shape: (2685, 206)
  Features: 204


In [17]:
print("\n" + "="*60)
print("FEATURE ENGINEERING SUMMARY")
print("="*60)

print(f"""
Original columns:       {X.shape[1]}
After ordinal encoding: (same count, converted to numeric)
After target encoding:  (same count, numeric representation)
After one-hot encoding: {cols_after} 
After feature eng:      {X_featured.shape[1]}
After log transform:    {X_final.shape[1]}

Key transformations:
- {len(ordinal_cols)} ordinal columns → encoded with meaningful order
- {len(target_encode_cols)} high-cardinality cols → target encoded
- {len(onehot_cols)} nominal cols → one-hot encoded (drop_first=True)
- {len(new_features)} engineered features created
- {len(area_features)} area features → log transformed
- Target (price) → log transformed
""")


FEATURE ENGINEERING SUMMARY

Original columns:       79
After ordinal encoding: (same count, converted to numeric)
After target encoding:  (same count, numeric representation)
After one-hot encoding: 181 
After feature eng:      204
After log transform:    204

Key transformations:
- 19 ordinal columns → encoded with meaningful order
- 1 high-cardinality cols → target encoded
- 23 nominal cols → one-hot encoded (drop_first=True)
- 23 engineered features created
- 20 area features → log transformed
- Target (price) → log transformed

