# Feature Grouping

This notebook focuses on creating grouped features based on existing categorical and numerical features to enhance model performance and enable hypothesis validation.

## What we will do:
1. Adding NeighborhoodGroup feature (Price-based quartiles)
2. Adding AgeGroup feature
3. Adding QualityGroup feature
4. Adding SizeGroup feature
5. Data validation and summary


In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [63]:
# Notebook-wide display settings: never truncate columns, cap printed rows at 100,
# and use matplotlib's default style for any figures.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)
plt.style.use('default')


In [64]:
# Read the frame produced by the preceding feature-engineering notebook
# Note: that notebook has to be run first so that this pickle exists
data_path = '../../data/processed/feature_engineered_df.pkl'
df = pd.read_pickle(data_path)

print(f"Shape: {df.shape}")
# Sanity check that dtypes survived the round trip; 47 'category' columns are expected
category_column_count = df.dtypes.eq('category').sum()
print("\nCount of 'category' dtype columns:", category_column_count)


Shape: (1458, 90)

Count of 'category' dtype columns: 47


## 1. Adding NeighborhoodGroup feature (Price-based quartiles)


In [83]:
# Create NeighborhoodGroup feature based on price quartiles
# NOTE(review): the tiers are derived from SalePrice over every row of df. If this
# feature is later fed to a model that is scored on a hold-out split of these same
# rows, compute the thresholds on the training portion only to avoid target leakage.

# Neighborhoods with fewer houses than this are left ungrouped (None)
MIN_HOUSES_PER_NEIGHBORHOOD = 10

# Average price and number of houses per neighborhood.
# observed=False is spelled out so the result does not depend on the pandas
# version's default when 'Neighborhood' is a categorical column.
neighborhood_prices = (
    df.groupby('Neighborhood', observed=False)['SalePrice']
    .agg(Avg_Price='mean', House_Count='count')
    .round(0)
)

# Quartile thresholds are computed from sufficiently populated neighborhoods only
valid_neighborhoods = neighborhood_prices[
    neighborhood_prices['House_Count'] >= MIN_HOUSES_PER_NEIGHBORHOOD
]
q25 = valid_neighborhoods['Avg_Price'].quantile(0.25)
q75 = valid_neighborhoods['Avg_Price'].quantile(0.75)

print(f"Quartile thresholds: Q1=${q25:,.0f}, Q3=${q75:,.0f}")


def _price_tier(avg_price, house_count):
    """Return 'Premium', 'Middle' or 'Budget' for one neighborhood, or None if it has too few houses."""
    if house_count < MIN_HOUSES_PER_NEIGHBORHOOD:
        return None
    if avg_price > q75:
        return 'Premium'
    if avg_price < q25:
        return 'Budget'
    # Averages equal to a threshold or between the two thresholds
    return 'Middle'


# Neighborhood name -> tier label (None for sparsely populated neighborhoods)
neighborhood_groups = {
    neighborhood: _price_tier(avg_price, house_count)
    for neighborhood, avg_price, house_count in zip(
        neighborhood_prices.index,
        neighborhood_prices['Avg_Price'],
        neighborhood_prices['House_Count'],
    )
}

group_labels = list(neighborhood_groups.values())
print(f"Groups: Premium({group_labels.count('Premium')}), "
      f"Middle({group_labels.count('Middle')}), "
      f"Budget({group_labels.count('Budget')}), "
      f"None({group_labels.count(None)})")


Quartile thresholds: Q1=$132,509, Q3=$218,972
Groups: Premium(6), Middle(11), Budget(6), None(2)


In [66]:
# Translate each row's neighborhood into its price tier and store the result
# as a 'category' column in a single chained step
df['NeighborhoodGroup'] = (
    df['Neighborhood']
    .map(neighborhood_groups)
    .astype('category')
)


In [84]:
# Inspect the NeighborhoodGroup column, including rows left without a group
print("NeighborhoodGroup value counts:")
print(df['NeighborhoodGroup'].value_counts(dropna=False))

is_ungrouped = df['NeighborhoodGroup'].isnull()
null_count = is_ungrouped.sum()
print(f"\nNull values: {null_count} (neighborhoods with < 10 houses)")

if null_count == 0:
    print("✅ No null values found in NeighborhoodGroup feature!")
else:
    # Name the neighborhoods whose rows received no group
    excluded = df.loc[is_ungrouped, 'Neighborhood'].unique()
    print(f"Excluded: {', '.join(excluded)}")

# Lowest and highest neighborhood average price within each group
print("\nPrice ranges by group:")
for group in ('Premium', 'Middle', 'Budget'):
    prices = [
        neighborhood_prices.loc[name, 'Avg_Price']
        for name, label in neighborhood_groups.items()
        if label == group
    ]
    if prices:
        print(f"{group}: ${min(prices):,.0f} - ${max(prices):,.0f}")


NeighborhoodGroup value counts:
NeighborhoodGroup
Middle     830
Budget     339
Premium    278
NaN         11
Name: count, dtype: int64

Null values: 11 (neighborhoods with < 10 houses)
Excluded: NPkVill, Blueste

Price ranges by group:
Premium: $225,380 - $335,295
Middle: $136,793 - $212,565
Budget: $98,576 - $128,225


## 2. Adding AgeGroup feature 


In [68]:
# Look at how HouseAge is distributed before choosing the group boundaries
print("HouseAge distribution:")
print(df['HouseAge'].describe())
print("\nHouseAge percentiles:")
percentiles = [10, 25, 50, 75, 90]
# Compute all requested percentiles in one call, then report them one per line
house_age_percentiles = np.percentile(df['HouseAge'], percentiles)
for p, age in zip(percentiles, house_age_percentiles):
    print(f"{p}th percentile: {age:.1f} years")


HouseAge distribution:
count    1458.000000
mean       36.598080
std        30.240565
min         0.000000
25%         8.000000
50%        35.000000
75%        54.000000
max       136.000000
Name: HouseAge, dtype: float64

HouseAge percentiles:
10th percentile: 1.0 years
25th percentile: 8.0 years
50th percentile: 35.0 years
75th percentile: 54.0 years
90th percentile: 84.0 years


In [69]:
# Create AgeGroup feature based on housing market characteristics
def create_age_groups(house_age):
    """
    Map a house age in years to an age group label.

    Groups (based on housing market characteristics):
    - New: age <= 10 (modern features, warranty period)
    - Recent: 10 < age <= 25 (contemporary design, good condition)
    - Mature: 25 < age <= 50 (established neighborhoods, may need updates)
    - Old: age > 50 (historic charm, potential renovation needs)

    Args:
        house_age: Age of the house in years (scalar number).

    Returns:
        One of 'New', 'Recent', 'Mature', 'Old', or NaN when the age is missing.
    """
    # A missing age fails every comparison below and would otherwise fall
    # through to 'Old'; keep it missing instead of inventing a label.
    if pd.isna(house_age):
        return np.nan
    if house_age <= 10:
        return 'New'
    elif house_age <= 25:
        return 'Recent'
    elif house_age <= 50:
        return 'Mature'
    else:
        return 'Old'

# Apply the function to create AgeGroup and convert to category type.
# The categories are declared up front rather than through reorder_categories(),
# which raises ValueError when one of the groups has no rows in the data.
# The result is still an unordered categorical with categories in logical order.
age_group_dtype = pd.CategoricalDtype(categories=['New', 'Recent', 'Mature', 'Old'])
df['AgeGroup'] = df['HouseAge'].apply(create_age_groups).astype(age_group_dtype)

print("AgeGroup feature created successfully!")


AgeGroup feature created successfully!


In [70]:
# Sanity-check the new AgeGroup column
print("AgeGroup distribution:")
age_group_counts = df['AgeGroup'].value_counts().sort_index()
print(age_group_counts)

# Share of houses in each group, in percent
print("\nAgeGroup percentage distribution:")
age_group_pct = df['AgeGroup'].value_counts(normalize=True).sort_index() * 100
for label, share in age_group_pct.items():
    print(f"{label}: {share:.1f}%")

# Mean SalePrice within each group
print("\nAverage SalePrice per AgeGroup:")
agegroup_price_avg = (
    df.groupby('AgeGroup', observed=False)['SalePrice']
    .mean()
    .sort_index()
)
for label, mean_price in agegroup_price_avg.items():
    print(f"{label}: ${mean_price:,.0f}")



AgeGroup distribution:
AgeGroup
New       432
Recent    158
Mature    429
Old       439
Name: count, dtype: int64

AgeGroup percentage distribution:
New: 29.6%
Recent: 10.8%
Mature: 29.4%
Old: 30.1%

Average SalePrice per AgeGroup:
New: $239,260
Recent: $226,378
Mature: $153,917
Old: $133,580


## 3. Adding QualityGroup feature


In [71]:
# Count how many houses carry each OverallQual rating
print("OverallQual distribution:")
overall_qual_counts = df['OverallQual'].value_counts().sort_index()
print(overall_qual_counts)

# Correlation between the rating and SalePrice, as context for grouping the rating
quality_price_corr = df['OverallQual'].corr(df['SalePrice'])
print(f"\nOverallQual correlation with SalePrice: {quality_price_corr:.3f}")


OverallQual distribution:
OverallQual
1       2
2       3
3      20
4     116
5     397
6     374
7     319
8     168
9      43
10     16
Name: count, dtype: int64

OverallQual correlation with SalePrice: 0.796


In [72]:
# Create QualityGroup feature for analysis
def create_quality_groups(overall_qual):
    """
    Map an OverallQual rating to a quality group label.

    Groups:
    - Basic: rating <= 4 (Below average to fair)
    - Standard: 5-6 (Average to above average)
    - Premium: 7-8 (Good to very good)
    - Luxury: rating > 8 (Excellent to very excellent)

    Args:
        overall_qual: Overall quality rating (scalar number).

    Returns:
        One of 'Basic', 'Standard', 'Premium', 'Luxury', or NaN when the
        rating is missing.
    """
    # A missing rating fails every comparison below and would otherwise fall
    # through to 'Luxury'; keep it missing instead of inventing a label.
    if pd.isna(overall_qual):
        return np.nan
    if overall_qual <= 4:
        return 'Basic'
    elif overall_qual <= 6:
        return 'Standard'
    elif overall_qual <= 8:
        return 'Premium'
    else:
        return 'Luxury'

# Apply the function and convert to category type with logical category order.
# The categories are declared up front rather than through reorder_categories(),
# which raises ValueError when one of the groups has no rows in the data.
# The result is still an unordered categorical.
quality_group_dtype = pd.CategoricalDtype(
    categories=['Basic', 'Standard', 'Premium', 'Luxury']
)
df['QualityGroup'] = (
    df['OverallQual'].apply(create_quality_groups).astype(quality_group_dtype)
)

print("QualityGroup feature created successfully!")


QualityGroup feature created successfully!


In [73]:
# Validate QualityGroup creation
print("QualityGroup distribution:")
print(df['QualityGroup'].value_counts().sort_index())

# Show percentage distribution
print("\nQualityGroup percentage distribution:")
quality_group_pct = df['QualityGroup'].value_counts(normalize=True).sort_index() * 100
for group, pct in quality_group_pct.items():
    print(f"{group}: {pct:.1f}%")

# Show average SalePrice by QualityGroup.
# observed=False is passed explicitly, as in the AgeGroup check: grouping on a
# categorical column without it depends on the pandas version's default.
print("\nAverage SalePrice by QualityGroup:")
quality_price_avg = (
    df.groupby('QualityGroup', observed=False)['SalePrice'].mean().sort_index()
)
for group, avg_price in quality_price_avg.items():
    print(f"{group}: ${avg_price:,.0f}")


QualityGroup distribution:
QualityGroup
Basic       141
Standard    771
Premium     487
Luxury       59
Name: count, dtype: int64

QualityGroup percentage distribution:
Basic: 9.7%
Standard: 52.9%
Premium: 33.4%
Luxury: 4.0%

Average SalePrice by QualityGroup:
Basic: $103,418
Standard: $147,144
Premium: $230,836
Luxury: $395,812


## 4. Adding SizeGroup feature


In [74]:
# Look at how GrLivArea is distributed before choosing the size boundaries
print("GrLivArea distribution:")
print(df['GrLivArea'].describe())

# Selected percentiles, computed in one call and reported one per line
print("\nGrLivArea percentiles:")
percentiles = [25, 50, 75, 90]
living_area_percentiles = np.percentile(df['GrLivArea'], percentiles)
for p, area in zip(percentiles, living_area_percentiles):
    print(f"{p}th percentile: {area:.0f} sq ft")


GrLivArea distribution:
count    1458.000000
mean     1510.465706
std       507.878508
min       334.000000
25%      1128.500000
50%      1461.500000
75%      1776.000000
max      4476.000000
Name: GrLivArea, dtype: float64

GrLivArea percentiles:
25th percentile: 1128 sq ft
50th percentile: 1462 sq ft
75th percentile: 1776 sq ft
90th percentile: 2156 sq ft


In [75]:
# Create SizeGroup feature based on living area
def create_size_groups(gr_liv_area):
    """
    Map an above-ground living area in square feet to a size group label.

    Groups (lower bound inclusive, upper bound exclusive):
    - Compact: area < 1200 sq ft (small homes, condos)
    - Medium: 1200 <= area < 1800 sq ft (typical family homes)
    - Large: 1800 <= area < 2500 sq ft (spacious homes)
    - Mansion: area >= 2500 sq ft (luxury homes)

    Args:
        gr_liv_area: Living area in square feet (scalar number).

    Returns:
        One of 'Compact', 'Medium', 'Large', 'Mansion', or NaN when the
        area is missing.
    """
    # A missing area fails every comparison below and would otherwise fall
    # through to 'Mansion'; keep it missing instead of inventing a label.
    if pd.isna(gr_liv_area):
        return np.nan
    if gr_liv_area < 1200:
        return 'Compact'
    elif gr_liv_area < 1800:
        return 'Medium'
    elif gr_liv_area < 2500:
        return 'Large'
    else:
        return 'Mansion'

# Apply the function and convert to category type with logical category order.
# The categories are declared up front rather than through reorder_categories(),
# which raises ValueError when one of the groups has no rows in the data.
# The result is still an unordered categorical.
size_group_dtype = pd.CategoricalDtype(
    categories=['Compact', 'Medium', 'Large', 'Mansion']
)
df['SizeGroup'] = df['GrLivArea'].apply(create_size_groups).astype(size_group_dtype)

print("SizeGroup feature created successfully!")


SizeGroup feature created successfully!


In [76]:
# Sanity-check the new SizeGroup column
print("SizeGroup distribution:")
size_group_counts = df['SizeGroup'].value_counts().sort_index()
print(size_group_counts)

# Share of houses in each group, in percent
print("\nSizeGroup percentage distribution:")
size_group_pct = df['SizeGroup'].value_counts(normalize=True).sort_index() * 100
for label, share in size_group_pct.items():
    print(f"{label}: {share:.1f}%")

# Mean SalePrice within each group (only groups that occur in the data)
print("\nAverage SalePrice by SizeGroup:")
size_price_avg = (
    df.groupby('SizeGroup', observed=True)['SalePrice']
    .mean()
    .sort_index()
)
for label, mean_price in size_price_avg.items():
    print(f"{label}: ${mean_price:,.0f}")


SizeGroup distribution:
SizeGroup
Compact    426
Medium     690
Large      274
Mansion     68
Name: count, dtype: int64

SizeGroup percentage distribution:
Compact: 29.2%
Medium: 47.3%
Large: 18.8%
Mansion: 4.7%

Average SalePrice by SizeGroup:
Compact: $121,067
Medium: $178,018
Large: $243,420
Mansion: $333,774


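> **Note:** No `AreaPerSpace` feature is created in this notebook. The column is expected to already be present in the loaded data and is only used in the correlation check in section 5.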

## 5. Data validation and summary


In [77]:
# Report the final shape and how many columns are categorical
print(f"Final dataset shape: {df.shape}")
print(f"\nTotal category columns: {(df.dtypes == 'category').sum()}")

# Grouped features this notebook is expected to have added
new_features = ['NeighborhoodGroup', 'AgeGroup', 'QualityGroup', 'SizeGroup']
print("\nNew grouped features created:")
for feature in new_features:
    if feature not in df.columns:
        print(f"✗ {feature}: Not found")
        continue
    print(f"✓ {feature}: {df[feature].nunique()} categories")

# Missing-value count for every grouped feature that exists in the frame
print("\nNull values in new features:")
present_features = [name for name in new_features if name in df.columns]
for feature in present_features:
    null_count = df[feature].isnull().sum()
    print(f"{feature}: {null_count} nulls")


Final dataset shape: (1458, 94)

Total category columns: 51

New grouped features created:
✓ NeighborhoodGroup: 3 categories
✓ AgeGroup: 4 categories
✓ QualityGroup: 4 categories
✓ SizeGroup: 4 categories

Null values in new features:
NeighborhoodGroup: 11 nulls
AgeGroup: 0 nulls
QualityGroup: 0 nulls
SizeGroup: 0 nulls


In [78]:
# Correlation of selected numeric features with SalePrice, ahead of hypothesis validation
print("Correlation of key features with SalePrice:")
key_features = ['HouseAge', 'AreaPerSpace', 'TotRmsAbvGrd', 'OverallQual']
for feature in key_features:
    # Features absent from the frame are skipped silently
    if feature not in df.columns:
        continue
    corr = df[feature].corr(df['SalePrice'])
    print(f"{feature}: {corr:.3f}")

# First rows of the source columns shown next to their grouped counterparts
print("\nSample data with new grouped features:")
sample_cols = [
    'SalePrice', 'HouseAge', 'AgeGroup', 'NeighborhoodGroup',
    'OverallQual', 'QualityGroup', 'GrLivArea', 'SizeGroup',
]
available_cols = [col for col in sample_cols if col in df.columns]
print(df.head()[available_cols])


Correlation of key features with SalePrice:
HouseAge: -0.524
AreaPerSpace: 0.551
TotRmsAbvGrd: 0.538
OverallQual: 0.796

Sample data with new grouped features:
   SalePrice  HouseAge AgeGroup NeighborhoodGroup OverallQual QualityGroup  \
0     208500         5      New            Middle           7      Premium   
1     181500        31   Mature           Premium           6     Standard   
2     223500         7      New            Middle           7      Premium   
3     140000        91      Old            Middle           7      Premium   
4     250000         8      New           Premium           8      Premium   

   GrLivArea SizeGroup  
0       1710    Medium  
1       1262    Medium  
2       1786    Medium  
3       1717    Medium  
4       2198     Large  


In [79]:
# Display the (rows, columns) shape of the frame as this cell's output, just before saving
df.shape


(1458, 94)

In [80]:
# Persist the frame, now including the grouped features, for the next notebook
output_path = '../../data/processed/feature_grouped_df.pkl'
df.to_pickle(output_path)

feature_count = len(df.columns)
print(f"✓ Dataset saved to {output_path}")
print(f"✓ Ready for hypothesis validation with {feature_count} features")


✓ Dataset saved to ../../data/processed/feature_grouped_df.pkl
✓ Ready for hypothesis validation with 94 features
