In [None]:
# -- Imports --
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# -- Load Preprocessed Data --
# Relative path: it is resolved against the working directory of the kernel.
processed_data_path = "../data/processed_toronto_hpi.csv"
df = pd.read_csv(processed_data_path)

print("✅ Preprocessed data loaded.")

# -- Features and Target --
# Column names the model is trained on, and the column it predicts.
features = ['Year', 'Month']
target = 'HPI'

# X is a DataFrame restricted to the feature columns; y is the target Series.
X, y = df[features], df[target]

# -- Train-Test Split --
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
# NOTE(review): train_test_split shuffles rows by default, so with Year/Month
# as features the test rows are interleaved in time with the training rows.
# Confirm that a random (rather than chronological) hold-out is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("✅ Data split into training and testing sets.")

# -- Model Training --
# Gradient-boosted tree regressor; hyperparameters are spelled out one per
# line so they are easy to tune, and the seed is fixed for reproducibility.
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
# Fit on the training portion only; the test portion is kept for evaluation.
model.fit(X_train, y_train)
print("✅ Model trained.")

# -- Model Evaluation --
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error, which puts the error
# back in the same units as the target column.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"\nModel Evaluation:\nRMSE: {rmse:.2f}\nR²: {r2:.2f}")

# -- Feature Importance --
# One importance score per entry in `features`, in the same order.
importance = model.feature_importances_

# Draw on an explicit Axes instead of relying on pyplot's "current axes".
importance_fig, importance_ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=features, y=importance, ax=importance_ax)
importance_ax.set_title('Feature Importance')
plt.show()

# -- Prediction Visualization --
# Actual vs. predicted values for the held-out rows; points near the
# diagonal are predictions close to the true value.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(y_test, y_pred, alpha=0.7)
scatter_ax.set_xlabel("Actual HPI")
scatter_ax.set_ylabel("Predicted HPI")
scatter_ax.set_title("Actual vs Predicted Home Price Index")
scatter_ax.grid(True)
plt.show()

# -- Save Trained Model --
from pathlib import Path

import joblib

model_path = "../model/price_model.pkl"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails with
# FileNotFoundError only after training has already been done.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): this stores a pickle of the estimator, which is presumably
# only safe to load with a compatible xgboost/scikit-learn version — confirm
# that whatever loads this file pins the same versions.
joblib.dump(model, model_path)
print(f"✅ Model saved to: {model_path}")
