In [11]:
# notebooks/model_training.ipynb

import pandas as pd
import joblib
import sys
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Add app/ to path so we can import pipeline
sys.path.append(os.path.abspath('../app'))
from pipeline import clean_data

# Read the raw CSV and hand it to the project's `clean_data` pipeline, which
# returns four values unpacked here as the feature matrix, the target, a
# scaler object and the list of feature names.
# NOTE(review): the exact types/shapes depend on `clean_data` in app/pipeline.py,
# which is not visible from this cell — confirm there.
raw_csv_path = '../data/raw/Bengaluru_House_Data.csv'
df = pd.read_csv(raw_csv_path)
cleaned = clean_data(df)
X, y, scaler, feature_names = cleaned

# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical from run to run.
split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

# Candidate regressors, keyed by the display name used in the printed report
# and in the best-model lookup.
#
# The tree-based estimators are randomized (feature/sample selection), so they
# are seeded with the same value used for the train/test split. Without a
# fixed `random_state` the reported metrics — and potentially which model wins
# and gets saved — change on every Restart & Run All.
# LinearRegression is deterministic and takes no seed.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# Fit each candidate on the training split, score it on the held-out split,
# and print a short per-model report.
# `results` collects one (name, rmse, mae, r2) tuple per model, in the
# iteration order of `models`.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # RMSE is derived as the square root of the mean squared error.
    mse = mean_squared_error(y_test, preds)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results.append((name, rmse, mae, r2))

    # One print call, newline-separated: emits exactly the same text as four
    # consecutive prints would.
    report = (
        f"\n{name}",
        f"  RMSE: {rmse:.2f}",
        f"  MAE : {mae:.2f}",
        f"  R²  : {r2:.2f}",
    )
    print(*report, sep="\n")

# Rank the evaluated candidates by RMSE (tuple position 1), lowest error
# first. The sort is in place, so `results` stays ordered best-to-worst
# after this point.
results.sort(key=lambda entry: entry[1])

# The head of the sorted list is the winner; look its fitted estimator up by name.
best_entry = results[0]
best_model_name = best_entry[0]
best_model = models[best_model_name]
print(f"\n✅ Best Model: {best_model_name}")

# Persist the winning estimator together with the scaler and the feature-name
# list returned by `clean_data`, so they can be loaded together later.
#
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout without `../models`
# fails here, after all the training work has already been done.
# `exist_ok=True` makes this a no-op when the folder is already present.
os.makedirs('../models', exist_ok=True)

joblib.dump(best_model, '../models/price_predictor.pkl')
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(feature_names, '../models/features.pkl')



Linear Regression
  RMSE: 93.07
  MAE : 43.40
  R²  : 0.51

Decision Tree
  RMSE: 90.64
  MAE : 36.40
  R²  : 0.54

Random Forest
  RMSE: 82.90
  MAE : 33.26
  R²  : 0.61

✅ Best Model: Random Forest


['../models/features.pkl']