In [1]:
from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("data/preprocessed/train_data.csv")
X_train, y_train = df_train.drop(columns=['Price']), df_train[['Price']]
preprocessor = ExtendedTransformation()
filter = SimpleFilter()
preprocessor.fit(X_train, y_train)
X_processed, y_processed = preprocessor.transform(X_train, y_train)
filter.fit(X_processed, y_processed)
X_filtered, y_filtered = filter.transform(X_processed, y_processed)

X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37
X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)
(20974, 6835)
(20974, 4173)
(20974, 3193)
(20974, 1635)
(20974, 4173)
(20974, 3193)
(20974, 1635)


In [19]:
df_test = pd.read_csv("data/preprocessed/test_data.csv")
X_test, y_test = df_test.drop(columns=['Price']), df_test[['Price']]
X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)
(8989, 4173)
(8989, 3193)
(8989, 1635)


In [None]:
import optuna
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor
import sklearn.model_selection
def objective(trial):
    x, y = X_filtered, y_filtered.flatten()

    ensemble_model = trial.suggest_categorical("ensemble_model", ["GradientBoost", "RandomForest", "HistGradientBoost"])
    n_estimators = trial.suggest_int("n_estimators", 10, 500, log=True)
    max_depth = trial.suggest_int("max_depth", 5, 32, log=True)
    
    if (ensemble_model == 'GradientBoost'):
        learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
        estimator = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
    elif (ensemble_model == 'HistGradientBoost'):
        learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
        estimator = HistGradientBoostingRegressor(max_iter=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
    else:
        min_split = trial.suggest_int("min_split", 5, 50, log=True)
        estimator = RandomForestRegressor(min_samples_split=min_split, max_depth=max_depth, n_estimators=n_estimators)
    
    scoring = sklearn.model_selection.cross_val_score(estimator, x, y, n_jobs=-1, cv=3, scoring='neg_mean_squared_error')

    return scoring.mean()


In [24]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
print(study.best_trial)

[I 2025-03-25 08:29:20,316] A new study created in memory with name: no-name-bf79be1a-4e2d-4b96-ae46-fbe93fe4e11f
[I 2025-03-25 08:30:00,953] Trial 0 finished with value: -0.6136882588682313 and parameters: {'ensemble_model': 'HistGradientBoost', 'n_estimators': 148, 'max_depth': 12, 'learning_rate': 0.007832458877827188}. Best is trial 0 with value: -0.6136882588682313.
[I 2025-03-25 08:36:53,331] Trial 1 finished with value: -0.6198213672194938 and parameters: {'ensemble_model': 'GradientBoost', 'n_estimators': 410, 'max_depth': 12, 'learning_rate': 0.08923403782642546}. Best is trial 0 with value: -0.6136882588682313.
[I 2025-03-25 08:37:32,908] Trial 2 finished with value: -0.6152557218888043 and parameters: {'ensemble_model': 'HistGradientBoost', 'n_estimators': 154, 'max_depth': 9, 'learning_rate': 0.007467875542970141}. Best is trial 0 with value: -0.6136882588682313.
[I 2025-03-25 08:37:41,646] Trial 3 finished with value: -0.7697623621522366 and parameters: {'ensemble_model': 

FrozenTrial(number=73, state=1, values=[-0.5156532504794074], datetime_start=datetime.datetime(2025, 3, 25, 11, 46, 56, 176827), datetime_complete=datetime.datetime(2025, 3, 25, 11, 49, 10, 667713), params={'ensemble_model': 'GradientBoost', 'n_estimators': 268, 'max_depth': 6, 'learning_rate': 0.0465703966438768}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'ensemble_model': CategoricalDistribution(choices=('GradientBoost', 'RandomForest', 'HistGradientBoost')), 'n_estimators': IntDistribution(high=500, log=True, low=10, step=1), 'max_depth': IntDistribution(high=32, log=True, low=5, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None)}, trial_id=73, value=None)


In [25]:
study.best_trial.params

{'ensemble_model': 'GradientBoost',
 'n_estimators': 268,
 'max_depth': 6,
 'learning_rate': 0.0465703966438768}

In [26]:
final_model = GradientBoostingRegressor(n_estimators=268, max_depth=6, learning_rate=0.0465703966438768)
final_model.fit(X_filtered, y_filtered.flatten())

In [27]:
y_hat_scaled = final_model.predict(X_test_filtered)

In [28]:
y_hat = preprocessor.inverse_transform(y_hat_scaled.reshape(-1,1))



In [29]:
from sklearn.metrics import ( root_mean_squared_error, 
                             mean_absolute_error, 
                             mean_absolute_percentage_error )

In [30]:
y_true = y_test.values
y_pred = y_hat
rmse = root_mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)

# Format metrics with units and percentages
metrics = {
    "RMSE (₹)": f"{rmse:,.2f}",
    "MAE (₹)": f"{mae:,.2f}",
    "MAPE (%)": f"{mape:.2%}"
}

# Display metrics in a formatted table
print("\nModel Performance Metrics:\n")
print(f"{'Metric':<15} {'Value':>15}")
print("-" * 30)
for metric, value in metrics.items():
    print(f"{metric:<15} {value:>15}")

# Add a summary interpretation
print("\nInterpretation:")
print(f"- RMSE: The model's predictions are typically off by ₹{rmse:,.2f} on average")
print(f"- MAE: The average absolute error is ₹{mae:,.2f}")
print(f"- MAPE: The predictions are off by {mape:.1%} on average")


Model Performance Metrics:

Metric                    Value
------------------------------
RMSE (₹)          26,233,075.94
MAE (₹)            6,268,114.94
MAPE (%)                 41.15%

Interpretation:
- RMSE: The model's predictions are typically off by ₹26,233,075.94 on average
- MAE: The average absolute error is ₹6,268,114.94
- MAPE: The predictions are off by 41.2% on average
