# **Feature Selection with CatBoost**

Predicting housing Prices: Kaggle competition dataset

In [185]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error as MSE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder 
from sklearn.feature_selection import RFE, RFECV, SelectFromModel 
from sklearn.linear_model import LassoCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor, Pool

### RFE with CatBoost

In [187]:
df = pd.read_csv("extended_df.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,nbd_price_sqf,nbd_avg_price,SubClass_avg_price,Zoning_avg_price,yearbuilt_avg_price,age_avg_price,age_afterRemodel_price,rooms_avg_price,bldgtype_avg_price,overalcond_avg_price
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,62.038341,197965.773333,65.750673,59.919799,227408.577778,235136.666667,189917.391304,213427.529412,185763.807377,203146.914738
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,68.934594,238772.727273,58.865992,59.919799,163831.969697,168832.419355,163939.464286,161303.29602,185763.807377,155651.736111
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,62.038341,197965.773333,65.750673,59.919799,242630.0,223333.333333,200525.375,161303.29602,185763.807377,203146.914738
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,65.396994,210624.72549,56.016462,59.919799,134387.5,131566.666667,131429.545455,196666.784195,185763.807377,203146.914738
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,70.024933,335295.317073,65.750673,59.919799,210766.666667,211244.0,209098.809524,252988.173333,185763.807377,203146.914738


In [188]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [189]:
# Split data
#X = df.drop(columns=['SalePrice', "price_sqft", "Id"])
X = df.drop(columns=['SalePrice', "price_sqft", "Id"])
X = X.dropna(axis='columns')
y = df['SalePrice']

# Identify categorical columns (assumes object or category dtype)
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CatBoost model
catboost_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    early_stopping_rounds=50,
    verbose=0,
    random_state=42
)

In [190]:
# Use CatBoost Pool for native categorical handling
train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_test, y_test, cat_features=cat_features)

# Fit the model
catboost_model.fit(train_pool, eval_set=valid_pool)

<catboost.core.CatBoostRegressor at 0x1f7d07dbcb0>

In [191]:
# Get top 40 most important features
feature_importance = catboost_model.get_feature_importance(train_pool, type='FeatureImportance')
feature_names = X_train.columns
top_40_features = pd.Series(feature_importance, index=feature_names).nlargest(40).index.tolist()

# Subset data to top 40 features
X_train_top = X_train[top_40_features]
X_test_top = X_test[top_40_features]

# Update categorical list for reduced feature set
cat_features_top = [col for col in cat_features if col in top_40_features]


In [192]:
# Recreate Pools
train_pool_top = Pool(X_train_top, y_train, cat_features=cat_features_top)
test_pool_top = Pool(X_test_top, y_test, cat_features=cat_features_top)

# Retrain with top 40 features
catboost_model.fit(train_pool_top, eval_set=test_pool_top, verbose=0)

<catboost.core.CatBoostRegressor at 0x1f7d07dbcb0>

In [193]:
# Predict and evaluate
y_pred = catboost_model.predict(test_pool_top)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE using CatBoost with top 40 native features: ${mae:,.0f}")

MAE using CatBoost with top 40 native features: $15,004


In [194]:
best_mae = float('inf')
best_n = 0

for n in range(20, 61, 5):  # Try top 20 to 60 features
    top_n_features = pd.Series(feature_importance, index=feature_names).nlargest(n).index.tolist()
    X_train_n = X_train[top_n_features]
    X_test_n = X_test[top_n_features]
    cat_features_n = [col for col in cat_features if col in top_n_features]
    
    train_pool_n = Pool(X_train_n, y_train, cat_features=cat_features_n)
    test_pool_n = Pool(X_test_n, y_test, cat_features=cat_features_n)

    model_n = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.04,
        depth=6,
        early_stopping_rounds=50,
        verbose=0,
        random_state=42
    )
    model_n.fit(train_pool_n, eval_set=test_pool_n)
    y_pred_n = model_n.predict(test_pool_n)
    mae_n = mean_absolute_error(y_test, y_pred_n)

    if mae_n < best_mae:
        best_mae = mae_n
        best_n = n

print(f"Best MAE: ${best_mae:,.0f} with top {best_n} features")

Best MAE: $14,758 with top 40 features


In [195]:
top_40_features = pd.Series(feature_importance, index=feature_names).nlargest(40).index.tolist()

print("Top 40 features used:")
for i, feat in enumerate(top_40_features, 1):
    print(f"{i:2d}. {feat}")

Top 40 features used:
 1. total_area
 2. OverallQual
 3. nbd_avg_price
 4. GrLivArea
 5. total_bathrooms
 6. KitchenQual
 7. LotArea
 8. age_avg_price
 9. BsmtFinSF1
10. nbd_price_sqf
11. GarageCars
12. FireplaceQu
13. BsmtQual
14. FullBath
15. yearbuilt_avg_price
16. OverallCond
17. 2ndFlrSF
18. Fireplaces
19. 1stFlrSF
20. TotalBsmtSF
21. age
22. YearRemodAdd
23. ExterQual
24. YearBuilt
25. rooms_avg_price
26. BsmtUnfSF
27. SaleCondition
28. LandContour
29. BsmtExposure
30. age_afterRemodel
31. age_afterRemodel_price
32. SubClass_avg_price
33. MasVnrArea
34. GarageArea
35. BsmtFinType1
36. BsmtCond
37. OpenPorchSF
38. GarageYrBlt
39. Condition1
40. Exterior1st


In [196]:
cat_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'Foundation',
 'Heating',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

### Combine RFE/Random Forest with Catboost Selected Features and Select again with Catboost

In [198]:
catboost_selected = ['total_area', 'OverallQual', 'nbd_avg_price', 'GrLivArea', 'total_bathrooms',
                'KitchenQual', 'LotArea', 'age_avg_price', 'BsmtFinSF1', 'nbd_price_sqf', 'GarageCars',
                'FireplaceQu', 'BsmtQual', 'FullBath', 'yearbuilt_avg_price', 'OverallCond', '2ndFlrSF',
                'Fireplaces', '1stFlrSF', 'TotalBsmtSF', 'age', 'YearRemodAdd', 'ExterQual', 'YearBuilt',
                'rooms_avg_price', 'BsmtUnfSF', 'SaleCondition', 'LandContour', 'BsmtExposure', 'age_afterRemodel',
                'age_afterRemodel_price', 'SubClass_avg_price', 'MasVnrArea', 'GarageArea', 'BsmtFinType1',
                'BsmtCond', 'OpenPorchSF', 'GarageYrBlt', 'Condition1', 'Exterior1st', 'LotFrontage', 'BsmtFinType2', 
                'HeatingQC', 'CentralAir', 'BsmtFullBath', 'Zoning_avg_price', 'overalcond_avg_price']

In [199]:
rfe_selected = ['LotFrontage_yearbuilt_avg_price', 'OverallQual_GrLivArea', 'OverallQual_total_area',
                'OverallQual_nbd_price_sqf', 'OverallQual_rooms_avg_price', 'OverallCond_total_area',
                'OverallCond_age_avg_price', 'ExterQual_total_area', 'BsmtQual_total_area', 
                'BsmtQual_nbd_price_sqf', 'BsmtCond_total_area', 'BsmtExposure_2ndFlrSF', 
                'BsmtExposure_GrLivArea', 'BsmtFinSF1_nbd_avg_price', 'BsmtFinType2_FullBath',
                'BsmtFinType2_age_avg_price', 'HeatingQC_total_area', 'CentralAir_total_area',
                '1stFlrSF_age_avg_price', '2ndFlrSF_yearbuilt_avg_price', 'GrLivArea_nbd_avg_price',
                'GrLivArea_yearbuilt_avg_price', 'GrLivArea_age_avg_price', 'BsmtFullBath_yearbuilt_avg_price',
                'KitchenQual_total_area', 'Fireplaces_yearbuilt_avg_price', 'Fireplaces_age_afterRemodel_price',
                'total_area_nbd_price_sqf', 'total_area_nbd_avg_price', 'total_area_SubClass_avg_price',
                'total_area_Zoning_avg_price', 'total_area_yearbuilt_avg_price', 'total_area_age_avg_price',
                'total_area_age_afterRemodel_price', 'total_area_overalcond_avg_price', 'total_bathrooms_nbd_avg_price',
                'total_bathrooms_age_avg_price', 'total_bathrooms_age_afterRemodel_price', 'nbd_price_sqf_age_avg_price',
                'nbd_avg_price_rooms_avg_price']


RFE selected only interaction features which do not yet exist in the df. Create those features and then select again using catboost

In [201]:
# Define function to add features safely to train and test
def add_poly_features(df):
    df = df.copy()
    df['expected_price'] = df['GrLivArea'] * df['nbd_avg_price']
    df["YearBuilt_total_area"] = df["YearBuilt"] * df["total_area"]
    df['LotFrontage_yearbuilt_avg_price'] = df['LotFrontage'] * df['yearbuilt_avg_price']
    df['OverallQual_GrLivArea'] = df['OverallQual'] * df['GrLivArea']
    df['OverallQual_total_area'] = df['OverallQual'] * df['total_area']
    df['OverallQual_nbd_price_sqf'] = df['OverallQual'] * df['nbd_price_sqf']
    df['OverallQual_rooms_avg_price'] = df['OverallQual'] * df['rooms_avg_price']
    df["OverallCond_total_area"] = df["OverallCond"] * df["total_area"]
    df['OverallCond_age_avg_price'] = df['OverallCond'] * df['age_avg_price']
    df['ExterQual_total_area'] = df['ExterQual'] * df['total_area']
    df['BsmtQual_total_area'] = df['BsmtQual'] * df['total_area']
    df['BsmtQual_nbd_price_sqf'] = df['BsmtQual'] * df['nbd_price_sqf']
    df['BsmtCond_total_area'] = df['BsmtCond'] * df['total_area']
    df['BsmtExposure_2ndFlrSF'] = df['BsmtExposure'] * df['2ndFlrSF']
    df['BsmtExposure_GrLivArea'] = df['BsmtExposure'] * df['GrLivArea']
    df['BsmtFinSF1_nbd_avg_price'] = df['BsmtFinSF1'] * df['nbd_avg_price']
    df['BsmtFinType2_FullBath'] = df['BsmtFinType2'] * df['FullBath']
    df['BsmtFinType2_age_avg_price'] = df['BsmtFinType2'] * df['age_avg_price']
    df['HeatingQC_total_area'] = df['HeatingQC'] * df['total_area']
    df['CentralAir_total_area'] = df['CentralAir'] * df['total_area']
    df['1stFlrSF_age_avg_price'] = df['1stFlrSF'] * df['age_avg_price']
    df['2ndFlrSF_yearbuilt_avg_price'] = df['2ndFlrSF'] * df['yearbuilt_avg_price']
    df['GrLivArea_nbd_avg_price'] = df['GrLivArea'] * df['nbd_avg_price']
    df['GrLivArea_yearbuilt_avg_price'] = df['GrLivArea'] * df['yearbuilt_avg_price']
    df['GrLivArea_age_avg_price'] = df['GrLivArea'] * df['age_avg_price']
    df['BsmtFullBath_yearbuilt_avg_price'] = df['BsmtFullBath'] * df['yearbuilt_avg_price']
    df['KitchenQual_total_area'] = df['KitchenQual'] * df['total_area']
    df['Fireplaces_yearbuilt_avg_price'] = df['Fireplaces'] * df['yearbuilt_avg_price']
    df['Fireplaces_age_afterRemodel_price'] = df['Fireplaces'] * df['age_afterRemodel_price']
    df['total_area_nbd_price_sqf'] = df['total_area'] * df['nbd_price_sqf']
    df['total_area_nbd_avg_price'] = df['total_area'] * df['nbd_avg_price']
    df['total_area_SubClass_avg_price'] = df['total_area'] * df['SubClass_avg_price']
    df['total_area_Zoning_avg_price'] = df['total_area'] * df['Zoning_avg_price']
    df['total_area_yearbuilt_avg_price'] = df['total_area'] * df['yearbuilt_avg_price']
    df['total_area_age_avg_price'] = df['total_area'] * df['age_avg_price']
    df['total_area_age_afterRemodel_price'] = df['total_area'] * df['age_afterRemodel_price']
    df['total_area_overalcond_avg_price'] = df['total_area'] * df['overalcond_avg_price']
    df['total_bathrooms_nbd_avg_price'] = df['total_bathrooms'] * df['nbd_avg_price']
    df['total_bathrooms_age_avg_price'] = df['total_bathrooms'] * df['age_avg_price']
    df['total_bathrooms_age_afterRemodel_price'] = df['total_bathrooms'] * df['age_afterRemodel_price']
    df['nbd_price_sqf_age_avg_price'] = df['nbd_price_sqf'] * df['age_avg_price']
    df['nbd_avg_price_rooms_avg_price'] = df['nbd_avg_price'] * df['rooms_avg_price']

    return df

In [202]:
X_new = df.loc[:, catboost_selected]
y = df['SalePrice']

In [203]:
X_train_full, X_test_full, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

X_train = add_poly_features(X_train_full)
X_test = add_poly_features(X_test_full)

In [204]:
cat_features = X_new.select_dtypes(include=['object', 'category']).columns.tolist()

In [205]:
# Initialize CatBoost model
catboost_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    early_stopping_rounds=50,
    verbose=0,
    random_state=42
)

In [206]:
# Use CatBoost Pool for native categorical handling
train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_test, y_test, cat_features=cat_features)

# Fit the model
catboost_model.fit(train_pool, eval_set=valid_pool)

<catboost.core.CatBoostRegressor at 0x1f7d28397f0>

In [207]:
# Get top 40 most important features
feature_importance = catboost_model.get_feature_importance(train_pool, type='FeatureImportance')
feature_names = X_train.columns
top_40_features = pd.Series(feature_importance, index=feature_names).nlargest(40).index.tolist()

# Subset data to top 40 features
X_train_top = X_train[top_40_features]
X_test_top = X_test[top_40_features]

# Update categorical list for reduced feature set
cat_features_top = [col for col in cat_features if col in top_40_features]

In [208]:
# Recreate Pools
train_pool_top = Pool(X_train_top, y_train, cat_features=cat_features_top)
test_pool_top = Pool(X_test_top, y_test, cat_features=cat_features_top)

# Retrain with top 40 features
catboost_model.fit(train_pool_top, eval_set=test_pool_top, verbose=0)

<catboost.core.CatBoostRegressor at 0x1f7d28397f0>

In [209]:
# Predict and evaluate
y_pred = catboost_model.predict(test_pool_top)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE using CatBoost with top 40 native features: ${mae:,.0f}")

MAE using CatBoost with top 40 native features: $15,042


In [231]:
best_mae = float('inf')
best_n = 0

for n in range(20, 61, 5):  # Try top 20 to 60 features
    top_n_features = pd.Series(feature_importance, index=feature_names).nlargest(n).index.tolist()
    X_train_n = X_train[top_n_features]
    X_test_n = X_test[top_n_features]
    cat_features_n = [col for col in cat_features if col in top_n_features]
    
    train_pool_n = Pool(X_train_n, y_train, cat_features=cat_features_n)
    test_pool_n = Pool(X_test_n, y_test, cat_features=cat_features_n)

    model_n = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.038,
        depth = 10,
        #boosting_type = 'Ordered',
        early_stopping_rounds=50,
        verbose=0,
        random_state=42
    )
    model_n.fit(train_pool_n, eval_set=test_pool_n)
    y_pred_n = model_n.predict(test_pool_n)
    mae_n = mean_absolute_error(y_test, y_pred_n)

    if mae_n < best_mae:
        best_mae = mae_n
        best_n = n

print(f"Best MAE: ${best_mae:,.0f} with top {best_n} features")

Best MAE: $14,118 with top 30 features


In [None]:
Best MAE: $14,737 with top 55 features 0.03

In [238]:
top_30_features = pd.Series(feature_importance, index=feature_names).nlargest(30).index.tolist()

print("Top 30 features used:")
for i, feat in enumerate(top_30_features, 1):
    print(f"{i:2d}. {feat}")

Top 30 features used:
 1. OverallQual_total_area
 2. KitchenQual_total_area
 3. total_area_nbd_avg_price
 4. total_area_yearbuilt_avg_price
 5. BsmtQual_total_area
 6. OverallCond_total_area
 7. OverallQual
 8. ExterQual_total_area
 9. total_area_nbd_price_sqf
10. GrLivArea_nbd_avg_price
11. BsmtExposure_GrLivArea
12. nbd_avg_price_rooms_avg_price
13. nbd_price_sqf
14. nbd_avg_price
15. total_area_age_avg_price
16. BsmtQual_nbd_price_sqf
17. total_bathrooms_nbd_avg_price
18. OverallQual_GrLivArea
19. LotArea
20. Fireplaces_age_afterRemodel_price
21. HeatingQC_total_area
22. Fireplaces_yearbuilt_avg_price
23. total_area_age_afterRemodel_price
24. total_bathrooms
25. OverallCond_age_avg_price
26. OverallQual_rooms_avg_price
27. total_area_Zoning_avg_price
28. total_area_SubClass_avg_price
29. CentralAir_total_area
30. BsmtFinSF1
