# KKD Real Estate Price Prediction



## Data loading and preprocessing
### Load data

In [None]:
import os
# Display the current working directory; the relative "data/..." and
# "./models" paths used further down are resolved against it.
os.getcwd()

'c:\\Users\\oleho\\ikt110h25\\handin2\\handin2\\knutKnutOppgave'

In [None]:
# Imports
import numpy as np
import json
from pathlib import Path

# Load data function
def load_data(path: str):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the ``.jsonl`` file (a ``str`` or ``pathlib.Path``).

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument the platform
    # default is used (e.g. a legacy code page on Windows), which corrupts or
    # rejects non-ASCII characters in the data files.
    with Path(path).open('r', encoding='utf-8') as f:
        # Blank / whitespace-only lines are skipped rather than parsed.
        return [json.loads(line) for line in f if line.strip()]

# Read the four JSONL sources into lists of records
houses = load_data("data/houses_clean.jsonl")
agents = load_data("data/agents.jsonl")
districts = load_data("data/districts.jsonl")
schools = load_data("data/schools.jsonl")

# Index the reference tables by identifier so each house can look up its
# agent / district / school record directly
agent_map = {record["agent_id"]: record for record in agents}
district_map_full = {record["id"]: record for record in districts}
school_map_full = {record["id"]: record for record in schools}

# One-hot encoding function
def one_hot_encode_int_matrix(int_matrix, n_classes_per_feature):
    """
    One-hot encode every column of an integer-coded categorical matrix.

    int_matrix: shape (n_samples, n_cat_features); each entry is a class index.
        Negative entries are treated as "no category" and leave that
        feature's columns all zero for the row.
    n_classes_per_feature: list with number of classes for each categorical feature
    Returns:
        one_hot: float np.array shape (n_samples, sum(n_classes_per_feature)),
        with the per-feature groups laid out left to right in column order
    """
    n_rows, n_cols = int_matrix.shape
    widths = [n_classes_per_feature[k] for k in range(n_cols)]

    # One preallocated result; each feature fills its own column range.
    encoded = np.zeros((n_rows, sum(widths)), float)

    offset = 0
    for k, width in enumerate(widths):
        codes = int_matrix[:, k].astype(int)
        # Basic slicing yields a view, so writes land in `encoded`, while an
        # out-of-range code still raises instead of spilling into a neighbour.
        segment = encoded[:, offset:offset + width]

        keep = codes >= 0
        segment[np.flatnonzero(keep), codes[keep]] = 1.0

        offset += width

    return encoded

## Feature engineering
Add district and school features

In [None]:
for h in houses:
    # District ratings; an unknown district or a missing field counts as 0
    district_info = district_map_full.get(h["district_id"], {})
    h["district_crime"] = district_info.get("crime_rating", 0)
    h["district_transport"] = district_info.get("public_transport_rating", 0)

    # School attributes; age is measured against 2025 and defaults to 0
    school_info = school_map_full.get(h["school_id"], {})
    h["school_rating"] = school_info.get("rating", 0)
    h["school_capacity"] = school_info.get("capacity", 0)
    h["school_age"] = 2025 - school_info.get("built_year", 2025)

    # Effective house age: a positive remodel year takes precedence over
    # the construction year
    built = h.get("year", 2025)
    remodeled = h.get("remodeled", -1)
    h["effective_age"] = 2025 - (remodeled if remodeled > 0 else built)

    # Price per square metre; 0 when the size is missing or non-positive
    area = h.get("size", 0)
    h["price_per_m2"] = h["price"] / area if area > 0 else 0

## Define features 

In [None]:
# ==========================
# Price prediction features
# ==========================

# Numeric inputs used by the price model.
# A wider set was tried earlier and is currently disabled:
#   condition_rating, external_storage_m2, kitchens, lot_w, size,
#   storage_rating, sun_factor, district_crime, district_transport,
#   school_rating, school_capacity, school_age, effective_age
price_numeric_features = ['size', 'external_storage_m2', 'lot_w']

# Categorical inputs used by the price model.
# Earlier (disabled) alternative: advertisement, color, fireplace, parking, rooms
price_categorical_features = ['sold_in_month', 'agent_id']

# Distinct values seen per categorical feature, in sorted order
# (a house lacking the field contributes "")
price_category_values = {
    name: sorted(set(house.get(name, "") for house in houses))
    for name in price_categorical_features
}

# value -> integer code, following the sorted order above
price_category_maps = {
    name: dict(zip(levels, range(len(levels))))
    for name, levels in price_category_values.items()
}

# ID encoding: identifier -> position in the corresponding source list
agent_id_map = {record["agent_id"]: pos for pos, record in enumerate(agents)}
district_id_map = {record["id"]: pos for pos, record in enumerate(districts)}
school_id_map = {record["id"]: pos for pos, record in enumerate(schools)}

# ======================================
# Days-on-market ("marked") features
# ======================================

# Numeric inputs used by the days-on-market model
marked_numeric_features = [
    "condition_rating", "external_storage_m2", "kitchens", "lot_w",
    "storage_rating", "sun_factor",
    "district_crime", "district_transport", "school_capacity",
    "price_per_m2", "size",
]

# Categorical inputs, later converted to integer codes and one-hot columns
marked_categorical_features = [
    "advertisement", "color",
    "rooms", "agent_id",
]

# Distinct values seen per categorical feature, in sorted order
# (a house lacking the field contributes "")
marked_category_values = {
    name: sorted(set(house.get(name, "") for house in houses))
    for name in marked_categorical_features
}

# value -> integer code, following the sorted order above
marked_category_maps = {
    name: dict(zip(levels, range(len(levels))))
    for name, levels in marked_category_values.items()
}

# ID encoding: identifier -> position in the corresponding source list
agent_id_map = {record["agent_id"]: pos for pos, record in enumerate(agents)}
district_id_map = {record["id"]: pos for pos, record in enumerate(districts)}
school_id_map = {record["id"]: pos for pos, record in enumerate(schools)}

## Feature matrix

In [None]:
# =========================
# Price prediction matrix (fixed)
# =========================

# ----- Numeric block: one row per house, missing fields become 0 -----
price_num_data = np.array(
    [[house.get(name, 0) for name in price_numeric_features] for house in houses],
    dtype=float,
)

# ----- Categorical block as integer codes (-1 = value not in the map) -----
price_cat_int = np.array(
    [
        [price_category_maps[name].get(house.get(name, ""), -1)
         for name in price_categorical_features]
        for house in houses
    ],
    dtype=int,
)

# ----- Expand the integer codes into one-hot columns -----
price_n_classes = [len(price_category_maps[name]) for name in price_categorical_features]
price_cat_oh = one_hot_encode_int_matrix(price_cat_int, price_n_classes)

# ----- Design matrix (numeric columns first) and target column -----
X_price = np.concatenate([price_num_data, price_cat_oh], axis=1)
y_price = np.array([[house["price"]] for house in houses], dtype=float)

# ----- Train/test split: first 80% of rows train, the rest test (no shuffling) -----
n = len(X_price)
split = int(0.8 * n)

X_train_price, X_test_price = X_price[:split], X_price[split:]
y_train_price, y_test_price = y_price[:split], y_price[split:]

# ----- Scaling -----
# Statistics come from the training rows only and cover just the leading
# numeric columns; the one-hot columns stay as 0/1.
n_price_num = len(price_numeric_features)
X_mean = X_train_price[:, :n_price_num].mean(axis=0)
X_std = X_train_price[:, :n_price_num].std(axis=0) + 1e-8  # avoid division by zero

X_train_price_scaled = X_train_price.copy()
X_train_price_scaled[:, :n_price_num] -= X_mean
X_train_price_scaled[:, :n_price_num] /= X_std

# The test rows reuse the training statistics
X_test_price_scaled = X_test_price.copy()
X_test_price_scaled[:, :n_price_num] -= X_mean
X_test_price_scaled[:, :n_price_num] /= X_std

# ----- Target: log1p transform only (no standardization) -----
y_train_price_scaled = np.log1p(y_train_price)
y_test_price_scaled = np.log1p(y_test_price)


# ======================================
# Days-on-market ("marked") matrix
# ======================================

# ----- Numeric block: one row per house, missing fields become 0 -----
marked_num_data = np.array(
    [[house.get(name, 0) for name in marked_numeric_features] for house in houses],
    dtype=float,
)

# ----- Categorical block as integer codes (-1 = value not in the map) -----
marked_cat_int = np.array(
    [
        [marked_category_maps[name].get(house.get(name, ""), -1)
         for name in marked_categorical_features]
        for house in houses
    ],
    dtype=int,
)

# Number of classes per categorical field
marked_n_classes = [len(marked_category_maps[name]) for name in marked_categorical_features]

# ----- Expand the integer codes into one-hot columns -----
marked_cat_oh = one_hot_encode_int_matrix(marked_cat_int, marked_n_classes)

# ----- Design matrix (numeric columns first) and target column -----
X_marked = np.concatenate([marked_num_data, marked_cat_oh], axis=1)
y_marked = np.array([[house["days_on_marked"]] for house in houses], dtype=float)

# ----- Train/test split: first 80% of rows train, the rest test (no shuffling) -----
n = len(X_marked)
split = int(0.8 * n)

X_train_marked, X_test_marked = X_marked[:split], X_marked[split:]
y_train_marked, y_test_marked = y_marked[:split], y_marked[split:]

# ----- Scaling -----
# Unlike the price matrix, every column (one-hot included) is standardized,
# using statistics from the training rows only.
# NOTE(review): a one-hot column that is constant in the training rows gets
# std ~= 1e-8, so a differing test value would be scaled to a huge number.
X_mean = np.mean(X_train_marked, axis=0)
X_std = np.std(X_train_marked, axis=0) + 1e-8

X_train_marked_scaled = (X_train_marked - X_mean) / X_std
X_test_marked_scaled = (X_test_marked - X_mean) / X_std

# ----- Target: log1p transform only (no standardization) -----
y_train_marked_scaled = np.log1p(y_train_marked)
y_test_marked_scaled = np.log1p(y_test_marked)


In [None]:
import pickle
from pathlib import Path

# Output directory for everything that gets pickled; created on demand
models_dir = Path("./models")
models_dir.mkdir(parents=True, exist_ok=True)

# Price scaler: statistics of the leading numeric columns of the training rows
price_num_len = len(price_numeric_features)
price_numeric_block = X_train_price[:, :price_num_len]
X_mean_price = price_numeric_block.mean(axis=0)
X_std_price = price_numeric_block.std(axis=0) + 1e-8

# Marked scaler: statistics of every column of the training rows,
# because that model standardizes the whole matrix
X_mean_marked = X_train_marked.mean(axis=0)
X_std_marked = X_train_marked.std(axis=0) + 1e-8

price_scaler = {
    "mean": X_mean_price,
    "std": X_std_price,
    "numeric_features": price_numeric_features,
}
marked_scaler = {
    "mean": X_mean_marked,
    "std": X_std_marked,
    "numeric_features": marked_numeric_features,
}
scalers = {"price": price_scaler, "marked": marked_scaler}

# Both scalers go into a single pickle file
with (models_dir / "scalers.pkl").open("wb") as f:
    pickle.dump(scalers, f)

print(f"Saved scalers to {models_dir / 'scalers.pkl'}")

Saved scalers to models\scalers.pkl


## Simple dataset for testing SGD

In [None]:
# Synthetic 1-D regression problem: y = 3x + 2 + Gaussian noise (std 1.5)
np.random.seed(0)  # fixed seed so the noise is reproducible
x_vals = np.linspace(-5, 5, 1000)
noise = np.random.normal(scale=1.5, size=x_vals.shape)
y_vals = 3 * x_vals + 2 + noise

# Column vectors of shape (m, 1), the layout the model functions work with
xs = x_vals[:, np.newaxis]
ys = y_vals[:, np.newaxis]

# The first 80% of the points are used for training, the remainder for testing
train_split = 0.8
n_samples = int(xs.shape[0] * train_split)
x_train, x_test = xs[:n_samples], xs[n_samples:]
y_train, y_test = ys[:n_samples], ys[n_samples:]

## Model

In [None]:
def predict(theta, xs, bias):
    """Evaluate the linear model: the product of xs and theta, plus bias.

    theta: weights; xs: inputs; bias: offset added to every prediction.
    """
    linear_term = np.dot(xs, theta)
    return linear_term + bias

def J_squared_residual(theta, bias, xs, y):
    """Return the sum of squared residuals between predict(theta, xs, bias) and y."""
    residual = predict(theta, xs, bias) - y
    return (residual ** 2).sum()

def gradient_J_squared_residual(theta, bias, xs, y):
    """Return gradient wrt theta (weights) and bias separately.

    Both parts are built from the residual (prediction minus target):
    the weight part is xs transposed times the residual, the bias part is
    the residual summed over all entries. The constant factor 2 of the
    exact derivative of the squared residual is not included.
    """
    residual = predict(theta, xs, bias) - y
    grad_theta = np.dot(xs.T, residual)
    grad_bias = residual.sum()
    return grad_theta, grad_bias

def mse(theta, bias, xs, y):
    """Return the mean of the squared residuals between the predictions and y."""
    residual = predict(theta, xs, bias) - y
    return np.mean(residual ** 2)

def train_model(xs, y, lr=0.01, batch_size=128, epochs=500, rng=None):
    """Fit the linear model with mini-batch stochastic gradient descent.

    Args:
        xs: Feature matrix with one row per sample and one column per feature.
        y: Targets, one per row of ``xs``; shape (m, 1) or (m,). A flat
            vector is reshaped to a column so residuals do not broadcast
            into an (m, m) matrix.
        lr: Learning rate applied to the batch-averaged gradient.
        batch_size: Number of rows per mini-batch (the last batch of an
            epoch may be smaller).
        epochs: Number of passes over the data.
        rng: Optional ``numpy.random.Generator`` used for shuffling. When
            ``None`` the global ``np.random`` state is used, as before.

    Returns:
        ``(theta, bias, j_history)``: weights of shape (n_features, 1), the
        scalar bias, and the sum of squared residuals on the full data after
        each epoch.

    Raises:
        ValueError: If ``batch_size`` is not positive, or ``y`` does not hold
            exactly one value per row of ``xs``.
    """
    if batch_size <= 0:
        # A negative step would make the batch loop empty and silently skip
        # training; zero would fail inside range() with an unrelated message.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    xs = np.asarray(xs, dtype=float)
    m = xs.shape[0]
    n_features = xs.shape[1]
    y = np.asarray(y, dtype=float).reshape(m, 1)

    # initialize parameters
    theta = np.zeros((n_features, 1))
    bias = 0.0
    j_history = []

    permutation = np.random.permutation if rng is None else rng.permutation

    for epoch in range(epochs):
        # shuffle data at the start of each epoch
        perm = permutation(m)
        xs_shuffled = xs[perm]
        y_shuffled = y[perm]

        # process mini-batches; every slice holds at least one row because
        # the start index is always below m
        for start in range(0, m, batch_size):
            x_batch = xs_shuffled[start:start + batch_size]
            y_batch = y_shuffled[start:start + batch_size]
            b = x_batch.shape[0]

            grad_theta, grad_bias = gradient_J_squared_residual(theta, bias, x_batch, y_batch)
            # average over the actual batch size so a short final batch
            # takes a step of comparable scale
            theta -= lr * (grad_theta / b)
            bias -= lr * (grad_bias / b)

        # record the full-data cost at epoch end
        j_history.append(J_squared_residual(theta, bias, xs, y))

        # the MSE is only needed for the progress line, so compute it only then
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {mse(theta, bias, xs, y)}")

    return theta, bias, j_history

## Training

In [None]:
# =====================
# Fit the price model
# =====================
theta_price, bias_price, j_history_price = train_model(
    X_train_price_scaled,
    y_train_price_scaled,
)

print("theta shape:", theta_price.shape)
print("bias:", bias_price)

# Final training cost (sum of squared residuals on the log1p targets),
# also appended to the history
j = J_squared_residual(theta_price, bias_price, X_train_price_scaled, y_train_price_scaled)
j_history_price.append(j)
print("The L2 error is: {:.2f}".format(j))

# Training-set predictions and their residuals
y_pred = predict(theta_price, X_train_price_scaled, bias_price)
price_residual = y_train_price_scaled - y_pred

# L1 error: sum of absolute residuals
l1_error = np.abs(price_residual).sum()
print("The L1 error is: {:.2f}".format(l1_error))

# R^2 = 1 - (residual sum of squares / total sum of squares)
u_price = (price_residual ** 2).sum()
v_price = ((y_train_price_scaled - y_train_price_scaled.mean()) ** 2).sum()
print("R^2: {:.2f}".format(1 - (u_price / v_price)))

Epoch 0, Loss: 174.33478104578919
Epoch 100, Loss: 0.06638156289723694
Epoch 200, Loss: 0.06427085054364365
Epoch 300, Loss: 0.0638305966401651
Epoch 400, Loss: 0.0637097532526886
theta shape: (39, 1)
bias: 13.887471423245778
The L2 error is: 119.70
The L1 error is: 349.76
R^2: 0.94


In [None]:
# ======================================
# Fit the days-on-market ("marked") model
# ======================================

theta_marked, bias_marked, j_history_marked = train_model(
    X_train_marked_scaled,
    y_train_marked_scaled,
)

print("theta shape:", theta_marked.shape)
print("bias:", bias_marked)

# Final training cost (sum of squared residuals on the log1p targets),
# also appended to the history
j = J_squared_residual(theta_marked, bias_marked, X_train_marked_scaled, y_train_marked_scaled)
j_history_marked.append(j)
print("The L2 error is: {:.2f}".format(j))

# Training-set predictions and their residuals
y_pred = predict(theta_marked, X_train_marked_scaled, bias_marked)
marked_residual = y_train_marked_scaled - y_pred

# L1 error: sum of absolute residuals
l1_error = np.abs(marked_residual).sum()
print("The L1 error is: {:.2f}".format(l1_error))

# R^2 = 1 - (residual sum of squares / total sum of squares)
u_marked = (marked_residual ** 2).sum()
v_marked = ((y_train_marked_scaled - y_train_marked_scaled.mean()) ** 2).sum()
print("R^2: {:.2f}".format(1 - (u_marked / v_marked)))

Epoch 0, Loss: 5.191511926996053
Epoch 100, Loss: 0.7992865687277668
Epoch 200, Loss: 0.7964251567098368
Epoch 300, Loss: 0.7959819139605903
Epoch 400, Loss: 0.7959188287325456
theta shape: (51, 1)
bias: 2.398645657988257
The L2 error is: 1496.35
The L1 error is: 1442.83
R^2: 0.17


In [None]:
# =======================
# Evaluate price model
# =======================

# Predictions for the held-out rows, in log1p(price) space
y_pred_log = predict(theta_price, X_test_price_scaled, bias_price)

# MSE (on the log1p scale)
test_mse = ((y_pred_log - y_test_price_scaled) ** 2).mean()
print("Test MSE:", test_mse)

# MAE (L1, on the log1p scale)
test_mae = np.abs(y_pred_log - y_test_price_scaled).mean()
print("Test MAE:", test_mae)

# R^2 = 1 - (residual sum of squares / total sum of squares)
u = ((y_test_price_scaled - y_pred_log) ** 2).sum()
v = ((y_test_price_scaled - y_test_price_scaled.mean()) ** 2).sum()
test_r2 = 1 - (u / v)
print("Test R²:", test_r2)

# Undo the log1p transform to get back to price units
y_pred = np.expm1(y_pred_log)
y = np.expm1(y_test_price_scaled)

# Show at most five examples; the bound keeps a test split with fewer than
# five rows from raising IndexError (same guard as the days-on-market cell).
# The value printed after "error" is the ratio y / y_hat (1.0 = exact), not
# a difference.
for i in range(min(5, len(y))):
    print(f"y_hat: {y_pred[i]}, y: {y[i]} , error {y[i] / y_pred[i]}")


Test MSE: 0.05952922718938116
Test MAE: 0.17849082771471583
Test R²: 0.9322930304754892
y_hat: [2312387.04557469], y: [1654000.] , error [0.71527818]
y_hat: [17326887.91398527], y: [17175228.99999997] , error [0.99124719]
y_hat: [1098477.00874823], y: [1920489.] , error [1.74831971]
y_hat: [12821574.83582105], y: [15386719.] , error [1.20006467]
y_hat: [956991.59850959], y: [1691010.] , error [1.76700611]


In [None]:
# ==========================================
# Evaluate the days-on-market ("marked") model
# ==========================================

# Predictions for the held-out rows, in log1p(days) space
y_pred_log = predict(theta_marked, X_test_marked_scaled, bias_marked)
log_residual = y_test_marked_scaled - y_pred_log

# MSE and RMSE on the log scale
mse_log = (log_residual ** 2).mean()
rmse_log = np.sqrt(mse_log)

# Normalized RMSE: RMSE divided by the std of the log targets
# (the small constant guards against a zero std)
rmse_normalized = rmse_log / (y_test_marked_scaled.std() + 1e-8)
print(f"[DAYS] RMSE (normalized): {rmse_normalized}")

# R^2 on the log scale
u = (log_residual ** 2).sum()
v = ((y_test_marked_scaled - y_test_marked_scaled.mean()) ** 2).sum()
test_r2 = 1 - (u / v)
print(f"[DAYS] R^2 (normalized): {test_r2:.3f}")

# Undo the log1p transform to get back to days, as flat vectors
y_pred_days = np.ravel(np.expm1(y_pred_log))
y_true_days = np.ravel(np.expm1(y_test_marked_scaled))

# RMSE and MAE in days
days_diff = y_pred_days - y_true_days
rmse_days = np.sqrt((days_diff ** 2).mean())
mae_days = np.abs(days_diff).mean()
print(f"[DAYS] RMSE (days): {rmse_days:.2f}")
print(f"[DAYS] MAE (days): {mae_days:.2f}")

# A few example predictions (days); at most five, fewer for a short test set
for i in range(min(5, len(y_true_days))):
    print(f"y_hat: {y_pred_days[i]:.2f}, y: {y_true_days[i]:.2f}, error {y_true_days[i] - y_pred_days[i]:.2f}")

[DAYS] RMSE (normalized): 0.9138718400833414
[DAYS] R^2 (normalized): 0.165
[DAYS] RMSE (days): 18.63
[DAYS] MAE (days): 12.53
y_hat: 10.35, y: 4.40, error -5.95
y_hat: 8.66, y: 4.00, error -4.66
y_hat: 5.11, y: 4.00, error -1.11
y_hat: 11.88, y: 4.90, error -6.98
y_hat: 2.85, y: 2.10, error -0.75


In [None]:
# Human-readable labels for the one-hot columns of the price model.
# NOTE(review): the labels are hard-coded; they are only correct if their
# order matches the integer codes in price_category_maps (built from the
# sorted values found in the data) — confirm against the data.
month_labels = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]  # sold_in_month
agent_labels = [
    'Alice Vadasy', 'Rhonda Bowers', 'Pedro Sasser', 'Porfirio Wueste',
    'Donna Mcclintock', 'Frank Schmidt', 'John Rios', 'Michael Hanners',
    'John Jiminez', 'Frank Scheetz', 'Donald Campbell', 'Kristen Webb',
    'Michael Rowland', 'Charlotte Nodeine', 'Ann Perez', 'Shirly Delrio',
    'Lucy Moffatt', 'Gonzalo Ramos', 'Cheryl Dunlap', 'Mary Chavez', 'Pat Daniels',
    'Johnnie Stanley', 'Misty Wallace', 'Gregory Dixon',
]  # agent names
categories_per_feature = [month_labels, agent_labels]

n_classes_per_feature = [len(c) for c in categories_per_feature]

# One "<feature>_<category>" name per one-hot column, in column order
one_hot_feature_names = [
    f"{feature_name}_{cat}"
    for feature_name, categories in zip(price_categorical_features, categories_per_feature)
    for cat in categories
]

# price feature weights: numeric columns first, then the one-hot columns
weights = np.ravel(theta_price)
all_features = price_numeric_features + one_hot_feature_names

# Structured array pairing every feature name with its learned weight
arr = np.zeros(len(weights), dtype=[('feature', 'U50'), ('weight', 'f8')])
arr['feature'] = all_features
arr['weight'] = weights

arr

array([('size',  0.48334179), ('external_storage_m2', -0.34670913),
       ('lot_w',  0.21201762), ('sold_in_month_january',  1.18981163),
       ('sold_in_month_february',  1.20434879),
       ('sold_in_month_march',  1.03328772),
       ('sold_in_month_april',  1.09166812),
       ('sold_in_month_may',  1.03749455),
       ('sold_in_month_june',  1.27071307),
       ('sold_in_month_july',  1.33323048),
       ('sold_in_month_august',  1.14292134),
       ('sold_in_month_september',  1.2812051 ),
       ('sold_in_month_october',  1.04048911),
       ('sold_in_month_november',  1.12393895),
       ('sold_in_month_december',  1.13836258),
       ('agent_id_Alice Vadasy',  0.55906852),
       ('agent_id_Rhonda Bowers',  0.59864523),
       ('agent_id_Pedro Sasser',  0.61865167),
       ('agent_id_Porfirio Wueste',  0.58783482),
       ('agent_id_Donna Mcclintock',  0.56498134),
       ('agent_id_Frank Schmidt',  0.57904482),
       ('agent_id_John Rios',  0.56054699),
       ('agent_id_M

In [None]:
# Days-on-market ("marked") feature weights.
# NOTE(review): the labels are hard-coded; they are only correct if their
# order matches the integer codes in marked_category_maps (built from the
# sorted values found in the data) — confirm against the data.
advertisement_labels = ['no', 'regular', 'premium']  # advertisement
color_labels = ['black', 'blue', 'gray', 'green', 'red', 'unknown', 'white']  # color
rooms_labels = ['', '1 rooms', '2 rooms', '3 rooms', '4 rooms', '5 rooms']  # rooms
agent_name_labels = [
    'Alice Vadasy', 'Rhonda Bowers', 'Pedro Sasser', 'Porfirio Wueste',
    'Donna Mcclintock', 'Frank Schmidt', 'John Rios', 'Michael Hanners',
    'John Jiminez', 'Frank Scheetz', 'Donald Campbell', 'Kristen Webb',
    'Michael Rowland', 'Charlotte Nodeine', 'Ann Perez', 'Shirly Delrio',
    'Lucy Moffatt', 'Gonzalo Ramos', 'Cheryl Dunlap', 'Mary Chavez',
    'Pat Daniels', 'Johnnie Stanley', 'Misty Wallace', 'Gregory Dixon',
]  # agent names
marked_categories_per_feature = [
    advertisement_labels,
    color_labels,
    rooms_labels,
    agent_name_labels,
]

# Number of classes per categorical feature
marked_n_classes_per_feature = [len(c) for c in marked_categories_per_feature]

# One "<feature>_<category>" name per one-hot column, in column order
marked_one_hot_feature_names = [
    f"{feature_name}_{cat}"
    for feature_name, categories in zip(marked_categorical_features, marked_categories_per_feature)
    for cat in categories
]

# Numeric columns first, then the one-hot columns
all_features = marked_numeric_features + marked_one_hot_feature_names

# Flatten weights to a 1-D vector
weights = np.ravel(theta_marked)

# Safety check: exactly one name per learned weight
assert len(all_features) == len(weights), f"Length mismatch: features={len(all_features)}, weights={len(weights)}"

# Structured array pairing every feature name with its learned weight
arr = np.zeros(len(weights), dtype=[('feature', 'U50'), ('weight', 'f8')])
arr['feature'] = all_features
arr['weight'] = weights

# Largest weight first (ascending sort, then reversed)
arr_sorted = np.sort(arr, order='weight')[::-1]

arr_sorted

array([('price_per_m2',  0.59429279), ('size',  0.11431869),
       ('district_crime',  0.04548097), ('rooms_',  0.04484099),
       ('agent_id_Gregory Dixon',  0.04360975),
       ('color_gray',  0.03592666),
       ('agent_id_Alice Vadasy',  0.03379704),
       ('district_transport',  0.02932396), ('color_white',  0.02794031),
       ('advertisement_premium',  0.0263739 ),
       ('agent_id_Pat Daniels',  0.02344812),
       ('agent_id_John Rios',  0.01829175),
       ('agent_id_Kristen Webb',  0.01770776),
       ('rooms_3 rooms',  0.01754181), ('rooms_5 rooms',  0.01745443),
       ('agent_id_Shirly Delrio',  0.01518605),
       ('advertisement_no',  0.01501774),
       ('agent_id_Cheryl Dunlap',  0.01398681),
       ('agent_id_Mary Chavez',  0.01121092),
       ('color_unknown',  0.01087315),
       ('agent_id_Gonzalo Ramos',  0.0073066 ),
       ('agent_id_Rhonda Bowers',  0.00724697),
       ('agent_id_Porfirio Wueste',  0.00680958),
       ('agent_id_Pedro Sasser',  0.0055412 )

# Pickle model and weights

In [None]:
# Save weights using pickle
import pickle
# Persist the trained price-model parameters: weights and bias each go into
# their own pickle file under ./models
for target_path, payload in (
    ("./models/price_model.pkl", theta_price),
    ("./models/price_model_bias.pkl", bias_price),
):
    with open(target_path, "wb") as f:
        pickle.dump(payload, f)

# NOTE: the days-on-market weights (theta_marked) are not saved here; the
# dump to "./models/marked_model.pkl" is currently disabled.



In [None]:
# Predict on known (training) data and report the fit quality
y_pred_log = predict(theta_price, X_train_price_scaled, bias_price)
y_pred = np.expm1(y_pred_log)

# Take the targets from the training split itself instead of a hard-coded
# row count (previously 1880), so they always line up with the predictions
# whatever the dataset size.
y_true = y_train_price
y_true_scaled = y_train_price_scaled

# Element-wise ratio y / y_hat, wrapped in one extra leading axis
e = np.array([y_true / y_pred])

# Show at most ten examples; the value printed after "error" is the ratio
# y / y_hat (1.0 = exact), not a difference
for i in range(min(10, len(y_true))):
    print(f"y_hat: {y_pred[i]}, y: {y_true[i]} , error {y_true[i] / y_pred[i]}")

# The results are stored under train_* names: assigning to `mse` would
# replace the mse() function that train_model calls, breaking any later
# re-run of the training cells.

# calculate MSE (log1p scale)
train_mse = ((y_pred_log - y_true_scaled) ** 2).mean()
print("MSE:", train_mse)
# calculate MAE (log1p scale)
train_mae = np.abs(y_pred_log - y_true_scaled).mean()
print("MAE:", train_mae)
# R^2 = 1 - (residual sum of squares / total sum of squares)
u = ((y_true_scaled - y_pred_log) ** 2).sum()
v = ((y_true_scaled - y_true_scaled.mean()) ** 2).sum()
r2 = 1 - (u / v)
print("R²:", r2)

y_hat: [2554259.81776624], y: [2687103.] , error [1.05200848]
y_hat: [5232496.54722276], y: [4824632.] , error [0.92205164]
y_hat: [6368907.00085829], y: [6992047.] , error [1.09784096]
y_hat: [11033697.43697369], y: [13268780.] , error [1.20256877]
y_hat: [5693032.10024077], y: [5351483.] , error [0.94000577]
y_hat: [5705989.00769182], y: [5699283.] , error [0.99882474]
y_hat: [12202257.72720428], y: [12744741.] , error [1.04445761]
y_hat: [8125956.15858636], y: [8636067.] , error [1.06277549]
y_hat: [2165215.03377162], y: [1947098.] , error [0.89926311]
y_hat: [12584415.18221549], y: [12605686.] , error [1.00169025]
MSE: 0.06367283570915204
MAE: 0.18604484977446092
R²: 0.9357306746458752


In [None]:
import pandas as pd
import numpy as np

# Ground truth: use y_true when it exists in the notebook namespace,
# otherwise fall back to ys
if 'y_true' in globals():
    truth_source = y_true
elif 'ys' in globals():
    truth_source = ys
else:
    raise NameError("y_true or ys not found in the notebook namespace.")
y_true_arr = np.ravel(truth_source)

# Predictions: first available of y_pred, y_pred_days, y_pred_log
if 'y_pred' in globals():
    y_pred_arr = np.ravel(y_pred)
elif 'y_pred_days' in globals():
    y_pred_arr = np.ravel(y_pred_days)
elif 'y_pred_log' in globals():
    # log-scaled predictions: try to invert the transform, else use them as-is
    try:
        y_pred_arr = np.ravel(np.expm1(y_pred_log))
    except Exception:
        y_pred_arr = np.ravel(y_pred_log)
else:
    raise NameError("y_pred (or y_pred_days / y_pred_log) not found in the notebook namespace.")

# Trim both vectors to the shorter length so they can be compared row by row
n = min(len(y_true_arr), len(y_pred_arr))
y_true_arr, y_pred_arr = y_true_arr[:n], y_pred_arr[:n]

# Error measures; rows whose denominator is (almost) zero become NaN
eps = 1e-8
with np.errstate(divide='ignore', invalid='ignore'):
    true_is_nonzero = np.abs(y_true_arr) > eps
    pred_is_nonzero = np.abs(y_pred_arr) > eps

    error = y_true_arr - y_pred_arr
    pct_error = error / np.where(true_is_nonzero, y_true_arr, np.nan) * 100.0
    abs_pct_error = np.abs(error) / np.where(true_is_nonzero, np.abs(y_true_arr), np.nan) * 100.0
    ratio = np.where(pred_is_nonzero, y_true_arr / y_pred_arr, np.nan)

error_columns = {
    'y_true': y_true_arr,
    'y_pred': y_pred_arr,
    'error': error,
    'pct_error': pct_error,            # (y_true - y_pred) / y_true * 100
    'abs_pct_error': abs_pct_error,    # absolute percent error
    'y_true_over_y_pred': ratio,       # y_true / y_pred
}
df_errors = pd.DataFrame(error_columns)

df_errors.head()

Unnamed: 0,y_true,y_pred,error,pct_error,abs_pct_error,y_true_over_y_pred
0,2687103.0,2554260.0,132843.2,4.943732,4.943732,1.052008
1,4824632.0,5232497.0,-407864.5,-8.453796,8.453796,0.922052
2,6992047.0,6368907.0,623140.0,8.912125,8.912125,1.097841
3,13268780.0,11033700.0,2235083.0,16.844673,16.844673,1.202569
4,5351483.0,5693032.0,-341549.1,-6.382326,6.382326,0.940006
