In [11]:
# modeling_all_models.py
# Trains 6 ML models on all 8 pre-processed versions
# Saves every model + best ones

import pandas as pd
import numpy as np
import joblib
import os
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


In [12]:
# === 1. CONFIG ===
DATA_DIR = "versions"
MODEL_DIR = "models_all"
os.makedirs(MODEL_DIR, exist_ok=True)

versions = [
    "Raw_All", "Raw_Selected", "Out_All", "Out_Selected",
    "Norm_All", "Norm_Selected", "Clean_All", "Clean_Selected"
]

# Model-specific version preference (to reduce overfitting)
MODEL_VERSION = {
    "Linear Regression": "Clean_All",        # Needs scaling + no high-dim
    "Decision Tree":     "Out_Selected",     # Trees love clean data
    "Random Forest":     "Clean_Selected",   # Best combo
    "Gradient Boosting": "Clean_Selected",
    "XGBoost":           "Clean_Selected",
    "CatBoost":         "Clean_All",
    "AdaBoost":          "Out_All"           # Simpler data
}


In [13]:
# === 2. Define Models ===
# models = {
#     "Linear Regression": LinearRegression(),
#     "Decision Tree":     DecisionTreeRegressor(max_depth=15, random_state=42),
#     "Random Forest":     RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42, n_jobs=-1),
#     "Gradient Boosting": GradientBoostingRegressor(n_estimators=300, max_depth=6, random_state=42),
#     "XGBoost":           XGBRegressor(n_estimators=400, max_depth=6, learning_rate=0.05, random_state=42, colsample_bytree = 0.8, subsample= 0.8),
#     "CatBoost":         CatBoostRegressor(iterations=300, depth=8, learning_rate=0.1, random_state=42, verbose=0),
#     "AdaBoost":          AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=200, random_state=42)
# }

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree":     DecisionTreeRegressor(max_depth=10, min_samples_leaf= 2, min_samples_split= 10, random_state=42),
    "Random Forest":     RandomForestRegressor(n_estimators=400, max_depth=None, random_state=42, min_samples_leaf= 2, min_samples_split= 2),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=300, max_depth=7, random_state=42, learning_rate= 0.05, subsample= 0.8),
    "XGBoost":           XGBRegressor(n_estimators=400, max_depth=6, learning_rate=0.05, random_state=42, colsample_bytree = 0.8, subsample= 0.8),
    "CatBoost":         CatBoostRegressor(iterations=500, depth=8, learning_rate=0.1, random_state=42, l2_leaf_reg= 1),
    "AdaBoost":          AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=100, learning_rate= 0.5, random_state=42)
}

In [14]:
# === 3. Train & Save All ===
results = {}
best_overall_mae = np.inf
best_model = None
best_name = None
best_version = None

for model_name, model in models.items():
    v = MODEL_VERSION[model_name]
    print(f"\nTraining {model_name} on {v}...")

    # Load data
    train_path = os.path.join(DATA_DIR, f"{v}_train.csv")
    test_path  = os.path.join(DATA_DIR, f"{v}_test.csv")
    train_df = pd.read_csv(train_path)
    test_df  = pd.read_csv(test_path)

    y_train = np.log1p(train_df['price'])
    X_train = train_df.drop('price', axis=1)
    y_test  = np.log1p(test_df['price'])
    X_test  = test_df.drop('price', axis=1)

    # Fit
    model.fit(X_train, y_train)
    pred_log = model.predict(X_test)
    pred = np.expm1(pred_log)
    actual = np.expm1(y_test)

    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)

    key = f"{model_name}__{v}"
    results[key] = {"MAE": mae, "R2": r2, "model": model, "version": v}

    # Save model
    model_path = os.path.join(MODEL_DIR, f"model_{model_name.replace(' ', '_')}__{v}.pkl")
    joblib.dump(model, model_path)

    print(f"   MAE: ${mae:,.0f} | R²: {r2:.4f} → saved: {os.path.basename(model_path)}")

    # Track best
    if mae < best_overall_mae:
        best_overall_mae = mae
        best_model = model
        best_name = model_name
        best_version = v
        best_path = model_path




Training Linear Regression on Clean_All...
   MAE: $88,697 | R²: 0.6714 → saved: model_Linear_Regression__Clean_All.pkl

Training Decision Tree on Out_Selected...
   MAE: $71,636 | R²: 0.7188 → saved: model_Decision_Tree__Out_Selected.pkl

Training Random Forest on Clean_Selected...
   MAE: $55,292 | R²: 0.8423 → saved: model_Random_Forest__Clean_Selected.pkl

Training Gradient Boosting on Clean_Selected...
   MAE: $52,212 | R²: 0.8661 → saved: model_Gradient_Boosting__Clean_Selected.pkl

Training XGBoost on Clean_Selected...
   MAE: $52,153 | R²: 0.8673 → saved: model_XGBoost__Clean_Selected.pkl

Training CatBoost on Clean_All...
0:	learn: 0.4249528	total: 11ms	remaining: 5.5s
1:	learn: 0.3962338	total: 32.3ms	remaining: 8.04s
2:	learn: 0.3715964	total: 37.6ms	remaining: 6.23s
3:	learn: 0.3481918	total: 41.9ms	remaining: 5.19s
4:	learn: 0.3276336	total: 46.9ms	remaining: 4.64s
5:	learn: 0.3096822	total: 50.9ms	remaining: 4.19s
6:	learn: 0.2937778	total: 54.8ms	remaining: 3.86s
7:	lea

In [15]:
results = []

print("Training 7 models and computing % metrics...")

for name, model in models.items():
    v = MODEL_VERSION[name]
    print(f"  → {name:<15} on {v}")

    # Load data
    train_path = os.path.join(DATA_DIR, f"{v}_train.csv")
    test_path  = os.path.join(DATA_DIR, f"{v}_test.csv")

    if not os.path.exists(train_path) or not os.path.exists(test_path):
        print(f"     Missing data for {v}")
        continue

    train_df = pd.read_csv(train_path)
    test_df  = pd.read_csv(test_path)

    # Actual prices (original scale)
    actual_train = train_df['price']
    actual_test  = test_df['price']

    # Mean price for percentage
    mean_price_train = actual_train.mean()
    mean_price_test  = actual_test.mean()

    # Log transform
    y_train = np.log1p(train_df['price'])
    X_train = train_df.drop('price', axis=1)
    y_test  = np.log1p(test_df['price'])
    X_test  = test_df.drop('price', axis=1)

    # Fit
    model.fit(X_train, y_train)

    # Predictions
    pred_train = np.expm1(model.predict(X_train))
    pred_test  = np.expm1(model.predict(X_test))

    # Metrics function
    def calc_metrics(y_true, y_pred, mean_price):
        mae  = mean_absolute_error(y_true, y_pred)
        mse  = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        return {
            "MAE":  mae,
            "MAE%": mae / mean_price * 100,
            "MSE":  mse,
            "MSE%": mse / (mean_price ** 2) * 100,
            "RMSE": rmse,
            "RMSE%": rmse / mean_price * 100,
            "R2":   r2_score(y_true, y_pred)
        }

    train_metrics = calc_metrics(actual_train, pred_train, mean_price_train)
    test_metrics  = calc_metrics(actual_test,  pred_test,  mean_price_test)

    # Append
    for split, metrics, mean_price in [
        ("Train", train_metrics, mean_price_train),
        ("Test",  test_metrics,  mean_price_test)
    ]:
        results.append({
            "Model": name,
            "Version": v,
            "Split": split,
            "MAE":  metrics["MAE"],
            "MAE%": metrics["MAE%"],
            "MSE":  metrics["MSE"],
            "MSE%": metrics["MSE%"],
            "RMSE": metrics["RMSE"],
            "RMSE%": metrics["RMSE%"],
            "R2":   metrics["R2"]
        })

Training 7 models and computing % metrics...
  → Linear Regression on Clean_All
  → Decision Tree   on Out_Selected
  → Random Forest   on Clean_Selected
  → Gradient Boosting on Clean_Selected
  → XGBoost         on Clean_Selected
  → CatBoost        on Clean_All
0:	learn: 0.4249528	total: 5.32ms	remaining: 2.66s
1:	learn: 0.3962338	total: 10.4ms	remaining: 2.59s
2:	learn: 0.3715964	total: 14.4ms	remaining: 2.39s
3:	learn: 0.3481918	total: 18.5ms	remaining: 2.29s
4:	learn: 0.3276336	total: 24.1ms	remaining: 2.39s
5:	learn: 0.3096822	total: 28ms	remaining: 2.31s
6:	learn: 0.2937778	total: 32ms	remaining: 2.26s
7:	learn: 0.2801916	total: 36.3ms	remaining: 2.23s
8:	learn: 0.2680866	total: 41ms	remaining: 2.24s
9:	learn: 0.2564270	total: 45ms	remaining: 2.2s
10:	learn: 0.2469764	total: 48.9ms	remaining: 2.17s
11:	learn: 0.2376661	total: 53.7ms	remaining: 2.19s
12:	learn: 0.2295665	total: 59.4ms	remaining: 2.22s
13:	learn: 0.2230336	total: 63.4ms	remaining: 2.2s
14:	learn: 0.2179118	total:

In [16]:
# === FINAL TABLE ===
df_results = pd.DataFrame(results)

# Format
df_results["MAE"]   = df_results["MAE"].round(0).astype(int)
df_results["MAE%"]  = df_results["MAE%"].round(2)
df_results["MSE"]   = df_results["MSE"].round(0).astype(int)
df_results["MSE%"]  = df_results["MSE%"].round(2)
df_results["RMSE"]  = df_results["RMSE"].round(0).astype(int)
df_results["RMSE%"] = df_results["RMSE%"].round(2)
df_results["R2"]    = df_results["R2"].round(4)

# Sort
df_results = df_results.sort_values(["Model", "Version", "Split"]).reset_index(drop=True)

# === DISPLAY ===
print("\n" + "="*130)
print("FULL METRICS WITH % RELATIVE TO MEAN PRICE (7 MODELS)".center(130))
print("="*130)
print(df_results.to_string(index=False))

# === SAVE ===
df_results.to_csv("model_metrics_with_percent.csv", index=False)
print(f"\nResults saved → model_metrics_with_percent.csv")


                                      FULL METRICS WITH % RELATIVE TO MEAN PRICE (7 MODELS)                                       
            Model        Version Split   MAE  MAE%         MSE  MSE%   RMSE  RMSE%     R2
         AdaBoost        Out_All  Test 55796 11.79  9058010648  4.05  95174  20.11 0.8382
         AdaBoost        Out_All Train   243  0.05     3591054  0.00   1895   0.40 0.9999
         CatBoost      Clean_All  Test 50860 10.75  6858240875  3.06  82814  17.50 0.8775
         CatBoost      Clean_All Train 33573  7.15  2458951166  1.12  49588  10.56 0.9547
    Decision Tree   Out_Selected  Test 71636 15.14 15741403750  7.03 125465  26.51 0.7188
    Decision Tree   Out_Selected Train 54663 11.64  7602247851  3.45  87191  18.57 0.8599
Gradient Boosting Clean_Selected  Test 52212 11.03  7493604635  3.35  86566  18.29 0.8661
Gradient Boosting Clean_Selected Train 32363  6.89  2179741937  0.99  46688   9.95 0.9598
Linear Regression      Clean_All  Test 88697 18.74 1839486