<img src="../phData.png">

### 0 - Setup

In [11]:
import pandas as pd
import os, sys
import numpy as np
import json

# Data Viz
import matplotlib.pyplot as plt
import seaborn as sns

# ML
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

# Adjusting relative paths (going up one level from the notebooks folder to the root).
# NOTE: this assumes the kernel's working directory is the notebooks folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Adding the root to sys.path to be able to import things like config.py.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
from api.config import (
    DATA_PATH,
    MODELS_PATH,
    MODEL_REQUIRED_FEATURES,
    MAIN_ENDPOINT_INPUT_SCHEMA,
    SALES_DATA_COLUMNS,
    SALES_DATA_FEATURES,
)

# Data to load (all files live under DATA_PATH)
kc_sales_data_path = os.path.join(DATA_PATH, "kc_house_data.csv")
demographic_data_path = os.path.join(DATA_PATH, "zipcode_demographics.csv")
future_unseen_data = os.path.join(DATA_PATH, 'future_unseen_examples.csv')

### 1 - Model & Data Loading

In [12]:
# Load the serialized model artifact.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load(
    os.path.join(MODELS_PATH, "model.pkl")
)

# Read both CSVs, keeping zipcode as a string so the join key keeps its exact text.
kc_sales_data = pd.read_csv(
    kc_sales_data_path,
    usecols=SALES_DATA_COLUMNS,
    dtype={'zipcode': str},
)
demographics_data = pd.read_csv(
    demographic_data_path,
    dtype={'zipcode': str},
)

# Left-join the demographics onto every sale, then discard the join key.
merged_data = pd.merge(
    kc_sales_data,
    demographics_data,
    how='left',
    on='zipcode',
)
merged_data = merged_data.drop(columns='zipcode')

#### Splitting

In [None]:
# Split X and y
# Select/drop instead of ``pop``: ``pop`` removes the column from ``merged_data``
# in place, so re-running this cell would raise KeyError('price') and the merged
# frame would silently lose its target column.
y = merged_data['price']
X = merged_data.drop(columns='price')

# Train-test split
# NOTE(review): the "Test" metrics below are only a true hold-out estimate if the
# loaded model was fitted on this exact split (test_size=0.2, random_state=42) --
# confirm against the training script.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Predicting & Evaluating

In [None]:
# Score both partitions (train first, then test) with the loaded model
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)

# Define metrics
def evaluate(y_true, y_pred, dataset_name=''):
    """Print and return RMSE, MAE and R² for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.
    dataset_name : str, optional
        Label used in the printed header (e.g. "Train" or "Test").

    Returns
    -------
    tuple
        ``(rmse, mae, r2)`` in that order.
    """
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while this form works on every version (identical for a single target).
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{dataset_name} Metrics:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE:  {mae:.2f}")
    print(f"R²:   {r2:.3f}")
    return rmse, mae, r2

# Evaluate
# Keep the returned (rmse, mae, r2) tuples: this makes the numbers reusable in
# later cells and stops the cell from echoing a raw tuple under the printed report.
train_metrics = evaluate(y_train, y_pred_train, "Train")
test_metrics = evaluate(y_test, y_pred_test, "Test")

In [None]:
# Residual plot
# The residuals (actual - predicted) are computed explicitly and drawn with
# regplot rather than residplot. residplot first regresses y on x and plots the
# residuals of *that* fit, so feeding it precomputed residuals would strip any
# linear trend out of them and hide systematic bias in the model.
predicted_test = np.asarray(y_pred_test)
residuals_test = np.asarray(y_test) - predicted_test

fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    x=predicted_test,
    y=residuals_test,
    lowess=True,  # smoothed trend of the residuals (needs statsmodels, as before)
    color="purple",
    ax=ax,
)
ax.axhline(0, linestyle=":", color=".2")  # zero-error reference line
ax.set_title("Residual Plot (Test Set)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Residuals")
ax.grid(True)
plt.show()

In [None]:
# Predicted vs. actual prices on the test set
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred_test, alpha=0.6, ax=ax)

# Dashed red diagonal (y = x) spanning the range of actual values:
# points on this line are perfect predictions.
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'r--')

ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Predicted vs. Actual (Test Set)")
ax.grid(True)
plt.show()

In [None]:
# Optional: 5-fold cross-validated R² over the full dataset.
# cross_val_score fits a fresh clone of the estimator on each fold, so this
# measures the model specification rather than the already-fitted artifact.
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='r2',
    cv=5,
)
print(f"Cross-validated R² scores: {cv_scores}")
print(f"Mean CV R²: {cv_scores.mean():.3f}")