### Post-Model Feature Pruning (Gradient Boosting)

After selecting Gradient Boosting as our best model based on cross-validation RMSE, we performed model-specific feature pruning. This was done using permutation importance to identify the top 15 most predictive features. We then retrained and evaluated the model on this reduced feature set.


In [None]:
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Number of highest-ranked features to keep after pruning.
N_TOP_FEATURES = 15

# Hyperparameters of the selected Gradient Boosting model, shared by the
# ranking model and the final model so both are configured identically.
gb_params = dict(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)

# Rank features on a validation split carved out of the TRAINING data.
# Ranking on X_test and then reporting RMSE on that same X_test would leak
# test information into feature selection and bias the reported score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
ranking_model = GradientBoostingRegressor(**gb_params)
ranking_model.fit(X_fit, y_fit)

# Compute permutation importance (mean score drop over n_repeats shuffles),
# then order feature indices from most to least important.
perm = permutation_importance(ranking_model, X_val, y_val, n_repeats=5, random_state=42)
sorted_idx = perm.importances_mean.argsort()[::-1]

# Map indices back to names using the columns of the frame the model was
# trained on; the indices refer to that column order, not to any other frame.
top_features = X_train.columns[sorted_idx[:N_TOP_FEATURES]]
print(f"Top {len(top_features)} Features:", list(top_features))

# Filter train/test data down to the selected columns
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# Retrain on the full training set with the reduced feature set and evaluate
# once on the untouched test set.
final_model = GradientBoostingRegressor(**gb_params)
final_model.fit(X_train_top, y_train)
y_pred_top = final_model.predict(X_test_top)
rmse_top = np.sqrt(mean_squared_error(y_test, y_pred_top))
# Neutral wording: whether this is an improvement is only known once it is
# compared against the all-features RMSE.
print(f"RMSE using top {len(top_features)} features: {rmse_top:.2f}")


In [None]:
# ======= RMSE Visualization =======
import matplotlib.pyplot as plt

# Baseline: the test RMSE recorded for Gradient Boosting in results_df.
# NOTE(review): assumes that row was produced with the same hyperparameters
# and test split as rmse_top — confirm, otherwise the comparison is not
# like-for-like.
gb_rmse = results_df.loc[results_df['Model'] == 'GradientBoosting', 'Test RMSE']
if gb_rmse.empty:
    # Fail with an explicit message instead of an opaque IndexError.
    raise KeyError("No 'GradientBoosting' row found in results_df['Model']")
rmse_all_features = gb_rmse.iloc[0]

labels = ['All Features', 'Top 15 Features']
rmse_values = [rmse_all_features, rmse_top]

plt.figure(figsize=(8, 5))
plt.bar(labels, rmse_values)
plt.ylabel('RMSE')
plt.title('RMSE Comparison: All Features vs. Top 15 Features')
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# Positive change means the reduced feature set has a lower (better) RMSE.
rmse_change = rmse_all_features - rmse_top
# Guard the percentage against a zero baseline.
improvement_pct = (rmse_change / rmse_all_features) * 100 if rmse_all_features else float('nan')

# Report the direction honestly: pruning can also make the error worse.
if rmse_change >= 0:
    print(f"RMSE improved by: {rmse_change:.2f} ({improvement_pct:.2f}%)")
else:
    print(f"RMSE worsened by: {-rmse_change:.2f} ({-improvement_pct:.2f}%)")
