# Multimodal House Price Valuation — Modeling & Fusion

This notebook implements and compares **tabular-only**, **image-only**, and **multimodal fusion** models for predicting house prices.

Key goals:
- Establish strong **tabular baselines** (linear and tree-based).
- Use a **pretrained ResNet** as an image feature extractor (frozen backbone, CPU-friendly).
- Compare **late fusion** and **feature-level fusion** strategies.
- Evaluate whether satellite imagery provides **robust, economically meaningful gains** over tabular features alone.



## Data and Problem Setup

We use the following datasets:

- `data/raw/train.csv` (or `train.csv` in project root): training data with observed prices.
- `data/raw/test.xlsx` (or `test.xlsx` in project root): features-only test set for blind prediction.
- `data/processed/train.parquet`, `data/processed/val.parquet`: **leakage-aware splits** created in `preprocessing.ipynb` using spatial grouping.
- `data/satellite/image_metadata.csv`: mapping from property IDs to Sentinel tiles (generated by `data_fetcher.py`).

Our target is typically `price`, modelled on the log scale (`log_price`) for numerical stability. Predictions are mapped back with `np.expm1`, so `log_price` is assumed to be `log1p(price)`. We always compare models on the **original price scale** via RMSE and R².


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

from PIL import Image

# Directory layout, resolved relative to the parent of the working directory.
PROJECT_ROOT = Path("..").resolve()
_DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = _DATA_DIR / "raw"
PROCESSED_DIR = _DATA_DIR / "processed"
SATELLITE_DIR = _DATA_DIR / "satellite"
EMBEDDINGS_DIR = _DATA_DIR / "embeddings"
REPORTS_DIR = PROJECT_ROOT / "reports"

# Output folders are created up front; input folders must already exist.
for out_dir in (EMBEDDINGS_DIR, REPORTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Load processed splits (created in preprocessing.ipynb)
train_path = PROCESSED_DIR / "train.parquet"
val_path = PROCESSED_DIR / "val.parquet"

if not (train_path.exists() and val_path.exists()):
    raise FileNotFoundError(
        f"Expected processed splits at {train_path} and {val_path}. "
        "Run preprocessing.ipynb first to generate leakage-aware splits."
    )

train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)

print("Train split:", train_df.shape)
print("Val split:", val_df.shape)

# Infer key columns (must match preprocessing)
from collections import Counter


def _first_present(candidates, available):
    """Return the first name in `candidates` found in `available`, else None."""
    return next((name for name in candidates if name in available), None)


cols = train_df.columns

# The target defaults to "price" whenever no "log_price" column is present.
TARGET_COL = "log_price" if "log_price" in cols else "price"
ID_COL = _first_present(("id", "Id"), cols)
LAT_COL = _first_present(("lat", "latitude"), cols)
LON_COL = _first_present(("long", "lon", "longitude"), cols)

if ID_COL is None:
    raise ValueError("Could not infer ID column in processed data.")

print("TARGET_COL =", TARGET_COL)
print("ID_COL =", ID_COL)
print("LAT_COL =", LAT_COL)
print("LON_COL =", LON_COL)

# If target is on log scale, keep a copy of original price if available;
# otherwise the metric column is simply the target column itself.
PRICE_COL = "price" if (TARGET_COL == "log_price" and "price" in cols) else TARGET_COL

print("PRICE_COL (for metrics) =", PRICE_COL)


def rmse(y_true, y_pred):
    """Return the root-mean-squared error of `y_pred` against `y_true` as a float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))



In [None]:
# Build tabular feature matrix and baseline models

# Separate target and features.
# Every price-derived column is removed from the feature matrix: when the
# target is `log_price`, a surviving raw `price` column would be selected as a
# numeric feature and leak the target into every model fitted below.
leak_cols = list(dict.fromkeys([TARGET_COL, PRICE_COL, "price", "log_price"]))

y_train = train_df[TARGET_COL].values
X_train = train_df.drop(columns=leak_cols, errors="ignore")

y_val = val_df[TARGET_COL].values
X_val = val_df.drop(columns=leak_cols, errors="ignore")

# Identify numeric and categorical columns (the ID is never a feature)
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
if ID_COL in numeric_features:
    numeric_features.remove(ID_COL)

categorical_features = [
    c for c in X_train.columns if c not in numeric_features and c != ID_COL
]

print("Numeric features (tabular):", len(numeric_features))
print("Categorical features (tabular):", len(categorical_features))

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    steps=[
        (
            "onehot",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        )
    ]
)

# NOTE: this single ColumnTransformer instance is shared by both pipelines
# below (Pipeline does not clone its steps), so each `fit` call refits it on
# the same training data.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


def _to_price_scale(pred):
    """Map predictions on the target scale back to the price scale."""
    # `log_price` is inverted with expm1, i.e. it is assumed to be log1p(price).
    return np.expm1(pred) if TARGET_COL == "log_price" else pred


# Ground-truth prices for evaluation. Prefer the stored raw price column; when
# only the log target exists, invert it instead of comparing log-scale truth
# against price-scale predictions.
if TARGET_COL == "log_price":
    if PRICE_COL != TARGET_COL and PRICE_COL in val_df.columns:
        y_val_price = val_df[PRICE_COL].values
    else:
        y_val_price = np.expm1(y_val)
else:
    y_val_price = y_val

# Linear baseline (on log-price if available)
lin_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", Ridge(alpha=1.0)),
    ]
)

lin_model.fit(X_train, y_train)

y_val_pred_lin = lin_model.predict(X_val)
y_val_price_pred = _to_price_scale(y_val_pred_lin)

baseline_rmse = rmse(y_val_price, y_val_price_pred)
baseline_r2 = r2_score(y_val_price, y_val_price_pred)

print(f"Linear baseline RMSE (price scale): {baseline_rmse:,.2f}")
print(f"Linear baseline R^2: {baseline_r2:.3f}")

# Tree-based baseline (Random Forest)
rf_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        (
            "regressor",
            RandomForestRegressor(
                n_estimators=200,
                max_depth=None,
                min_samples_leaf=2,
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

rf_model.fit(X_train, y_train)

y_val_pred_rf = rf_model.predict(X_val)
y_val_price_pred_rf = _to_price_scale(y_val_pred_rf)

rf_rmse = rmse(y_val_price, y_val_price_pred_rf)
rf_r2 = r2_score(y_val_price, y_val_price_pred_rf)

print(f"Random Forest RMSE (price scale): {rf_rmse:,.2f}")
print(f"Random Forest R^2: {rf_r2:.3f}")

# Best value of each metric across the two baselines (full validation split)
TABULAR_BASELINE_RMSE = min(baseline_rmse, rf_rmse)
TABULAR_BASELINE_R2 = max(baseline_r2, rf_r2)

print("Best tabular baseline RMSE:", TABULAR_BASELINE_RMSE)
print("Best tabular baseline R^2:", TABULAR_BASELINE_R2)


## CNN Feature Extraction with Pretrained ResNet

We now use a pretrained CNN as a **feature extractor** for the satellite imagery, rather than training a standalone image model:

- Use a pretrained **ResNet18** (or similar) from `torchvision.models`.
- **Freeze** the convolutional backbone so we only reuse generic spatial filters.
- Resize tiles to a standard input size (e.g., 224×224) and normalise using ImageNet statistics.
- Extract **fixed 512-D embeddings** (after global average pooling) and cache them to `data/embeddings/`.

These embeddings serve as image-derived features for both **image-only** and **multimodal** models.


In [None]:
# Prepare image metadata and ResNet feature extractor

meta_path = SATELLITE_DIR / "image_metadata.csv"
if not meta_path.exists():
    raise FileNotFoundError(
        f"No satellite metadata found at {meta_path}. "
        "Run data_fetcher.py first to download Sentinel tiles."
    )


def _image_file_exists(path_value):
    """Tell whether the recorded image path points at an existing file."""
    return Path(path_value).exists()


meta_df = pd.read_csv(meta_path)

# Only rows whose status is "ok" or "cached" are kept
_usable_status = meta_df["status"].isin(["ok", "cached"])
meta_df = meta_df[_usable_status]

# Align ID column naming with the processed splits
if "id" in meta_df.columns and ID_COL != "id":
    meta_df = meta_df.rename(columns={"id": ID_COL})

# Keep only rows with existing image files
_on_disk = meta_df["image_path"].apply(_image_file_exists)
meta_df = meta_df[_on_disk]

print("Usable satellite images:", len(meta_df))

IMG_SIZE = 224

# Resize -> tensor -> per-channel normalisation with the listed mean/std
_resize_step = transforms.Resize((IMG_SIZE, IMG_SIZE))
_normalize_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
img_transform = transforms.Compose(
    [_resize_step, transforms.ToTensor(), _normalize_step]
)


# Prefer the GPU when torch reports one, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


class SatelliteDataset(Dataset):
    """Dataset yielding `(image, target, sample_id)` triples from a metadata frame.

    Parameters
    ----------
    df : DataFrame with an ``"image_path"`` column plus the ID and target
        columns named below. Its index is reset so positional access is safe.
    id_col : name of the column holding the sample identifier.
    target_col : name of the column holding the regression target.
    transform : optional callable applied to the RGB PIL image.
    """

    def __init__(self, df, id_col, target_col, transform=None):
        self.df = df.reset_index(drop=True)
        self.id_col = id_col
        self.target_col = target_col
        self.transform = transform

    def __len__(self):  # noqa: D401
        """Number of available image samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at position `idx` and return it with target and ID."""
        row = self.df.iloc[idx]
        # Close the file handle deterministically instead of relying on garbage
        # collection; `convert` returns an image that no longer needs the file.
        with Image.open(row["image_path"]) as raw_img:
            img = raw_img.convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        target = float(row[self.target_col])
        sample_id = row[self.id_col]
        return img, target, sample_id


# Load a pretrained ResNet18 and turn it into a pure feature extractor
try:
    resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
except AttributeError:
    # Older torchvision versions
    resnet = models.resnet18(pretrained=True)

# Freeze every weight: the network is only used for inference
resnet.requires_grad_(False)

# Record the width of the pooled features before swapping out the classifier
feature_dim = resnet.fc.in_features
resnet.fc = nn.Identity()  # outputs pooled features directly

resnet = resnet.to(device)
resnet.eval()


def extract_resnet_embeddings(split_df, split_name, batch_size=32):
    """Compute (or load) ResNet embeddings for a given split.

    Parameters
    ----------
    split_df : DataFrame containing at least ``ID_COL`` and ``TARGET_COL``.
    split_name : label used in the cache file name and in log messages.
    batch_size : number of images per forward pass.

    Returns a DataFrame with one row per property ID and columns:
    - ID_COL
    - img_emb_0, ..., img_emb_{feature_dim-1}

    Raises
    ------
    RuntimeError
        If no ID of `split_df` has a usable image in the satellite metadata.

    Notes
    -----
    If the cache file for `split_name` already exists it is returned as-is,
    without checking that it matches `split_df`; delete the file to recompute.
    """

    out_path = EMBEDDINGS_DIR / f"resnet18_{split_name}_embeddings.parquet"
    if out_path.exists():
        print(f"Loading cached embeddings from {out_path}")
        return pd.read_parquet(out_path)

    # Inner join keeps only properties with a usable image. Duplicate IDs (in
    # either the split or the metadata) are collapsed to their first match so
    # the result honours the "one row per property ID" contract and downstream
    # joins on ID do not multiply rows.
    merged = (
        split_df[[ID_COL, TARGET_COL]]
        .merge(meta_df[[ID_COL, "image_path"]], on=ID_COL, how="inner")
        .drop_duplicates(subset=ID_COL, keep="first")
    )

    if merged.empty:
        raise RuntimeError(
            f"No overlapping IDs between {split_name} split and satellite metadata."
        )

    dataset = SatelliteDataset(
        merged[[ID_COL, "image_path", TARGET_COL]],
        id_col=ID_COL,
        target_col=TARGET_COL,
        transform=img_transform,
    )

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # keep embeddings in the same order as `merged`
        num_workers=0,  # safer on Windows / notebooks
    )

    all_embeddings = []
    all_ids = []

    with torch.no_grad():
        for imgs, _, ids in loader:
            feats = resnet(imgs.to(device))  # (B, feature_dim)
            all_embeddings.append(feats.cpu().numpy())
            # Batched IDs arrive as tensor elements for numeric IDs and as
            # plain strings otherwise; unwrap the former to Python scalars
            # (keeps parquet happy) and pass the latter through unchanged.
            all_ids.extend(i.item() if hasattr(i, "item") else i for i in ids)

    emb_array = np.concatenate(all_embeddings, axis=0)
    emb_cols = [f"img_emb_{i}" for i in range(emb_array.shape[1])]
    emb_df = pd.DataFrame(emb_array, columns=emb_cols)
    emb_df[ID_COL] = all_ids

    emb_df.to_parquet(out_path, index=False)
    print(f"Saved embeddings for {split_name} to {out_path}")

    return emb_df


# Embeddings for both processed splits (loaded from cache when available)
train_img_emb = extract_resnet_embeddings(train_df, "train", batch_size=32)
val_img_emb = extract_resnet_embeddings(val_df, "val", batch_size=32)

print("Train image embeddings:", train_img_emb.shape)
print("Val image embeddings:", val_img_emb.shape)


## Strategy A — Late Fusion

We now build **separate models** for tabular and image features, then combine them at the prediction level:

1. **Tabular model**: best-performing baseline from above (linear or Random Forest).
2. **Image-only model**: regression on ResNet embeddings (e.g., Random Forest).
3. **Fusion model**: a simple linear combiner on top of tabular and image predictions.

This lets us ask: *Given a strong tabular model, does adding image-based predictions materially improve RMSE/R² in a stable way?*


In [None]:
# Build image-only and late-fusion models

from sklearn.model_selection import KFold, cross_val_predict

# Join embeddings with tabular splits (rows without an image are dropped)

train_with_img = train_df.merge(train_img_emb, on=ID_COL, how="inner")
val_with_img = val_df.merge(val_img_emb, on=ID_COL, how="inner")

print("Train with image features:", train_with_img.shape)
print("Val with image features:", val_with_img.shape)

# Image-only features (embeddings only)
emb_cols = [c for c in train_with_img.columns if c.startswith("img_emb_")]

X_train_img = train_with_img[emb_cols].values
X_val_img = val_with_img[emb_cols].values

y_train_img = train_with_img[TARGET_COL].values
_y_val_img = val_with_img[TARGET_COL].values  # target for the image subset of val

img_rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)

img_rf.fit(X_train_img, y_train_img)

y_val_pred_img = img_rf.predict(X_val_img)

# Price-scale truth/predictions for the image subset. Prefer the stored raw
# price; if only the log target exists, invert it rather than comparing
# log-scale truth with price-scale predictions.
if TARGET_COL == "log_price":
    y_val_price_pred_img = np.expm1(y_val_pred_img)
    if PRICE_COL != TARGET_COL and PRICE_COL in val_with_img.columns:
        y_val_price_img = val_with_img[PRICE_COL].values
    else:
        y_val_price_img = np.expm1(_y_val_img)
else:
    y_val_price_img = _y_val_img
    y_val_price_pred_img = y_val_pred_img

img_rmse = rmse(y_val_price_img, y_val_price_pred_img)
img_r2 = r2_score(y_val_price_img, y_val_price_pred_img)

print(f"Image-only RF RMSE (price scale): {img_rmse:,.2f}")
print(f"Image-only RF R^2: {img_r2:.3f}")

# Tabular predictions on exactly the same rows (and row order) as the image
# predictions. Predicting on the joined frame guarantees alignment; the fitted
# pipeline selects its feature columns by name and ignores the extra ones.
best_tab_model = rf_model if rf_rmse < baseline_rmse else lin_model
X_val_subset = val_with_img.drop(columns=emb_cols)

tab_subset_pred = best_tab_model.predict(X_val_subset)
tab_price_pred = np.expm1(tab_subset_pred) if TARGET_COL == "log_price" else tab_subset_pred
tab_price_true = y_val_price_img

fusion_df = pd.DataFrame(
    {
        ID_COL: val_with_img[ID_COL].values,
        "tab_pred": tab_price_pred,
        "tab_true": tab_price_true,
        "img_pred": y_val_price_pred_img,
    }
)

print(f"Fusion evaluation on {len(fusion_df)} validation examples.")

# Simple linear combiner. Its weights must be learned from validation
# predictions, so it is scored with out-of-fold predictions: every row is
# predicted by a combiner that never saw that row. Fitting and scoring on the
# same rows would report an optimistic, in-sample error.

fusion_X = fusion_df[["tab_pred", "img_pred"]].values
fusion_y = fusion_df["tab_true"].values

fusion_reg = LinearRegression()
n_folds = min(5, len(fusion_df))
if n_folds >= 2:
    fusion_cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    fusion_pred = cross_val_predict(fusion_reg, fusion_X, fusion_y, cv=fusion_cv)
    fusion_reg.fit(fusion_X, fusion_y)  # final combiner for later reuse
else:
    # Too few rows to cross-validate; metrics below are in-sample.
    print("Warning: too few rows for cross-validation; late-fusion metrics are in-sample.")
    fusion_reg.fit(fusion_X, fusion_y)
    fusion_pred = fusion_reg.predict(fusion_X)

fusion_rmse = rmse(fusion_y, fusion_pred)
fusion_r2 = r2_score(fusion_y, fusion_pred)

print(f"Late fusion RMSE (price scale): {fusion_rmse:,.2f}")
print(f"Late fusion R^2: {fusion_r2:.3f}")

# Compare against the tabular model on the same validation rows; the
# full-split baseline RMSE covers a different set of examples.
tab_subset_rmse = rmse(fusion_y, tab_price_pred)
print(f"Tabular RMSE on the same subset (price scale): {tab_subset_rmse:,.2f}")

improvement_vs_tab = 100.0 * (tab_subset_rmse - fusion_rmse) / tab_subset_rmse
print(f"% RMSE improvement over best tabular baseline: {improvement_vs_tab:.2f}%")


## Strategy B — Feature-Level Fusion

In feature-level fusion, we **concatenate** processed tabular features and CNN embeddings, then train a **single regression head**:

- Tabular branch: standardised numeric features + one-hot encoded categoricals.
- Image branch: fixed 512-D ResNet embeddings (used as-is; only the input images are normalised with ImageNet statistics).
- Fusion model: a modest-capacity regressor (e.g., Random Forest or MLP) on top of the concatenated vector.

This tests whether **jointly** using tabular and image features allows the model to exploit interactions that late fusion cannot capture.


In [None]:
# Construct feature-level fusion dataset

# Preprocessed tabular design matrices: the preprocessor is (re)fitted on the
# training features and then applied to the validation features.
X_train_tab = preprocessor.fit_transform(X_train)
X_val_tab = preprocessor.transform(X_val)

print("Tabular feature matrix shapes:", X_train_tab.shape, X_val_tab.shape)

# Align rows with embeddings (inner join on ID)
_train_ids_all = train_df[ID_COL].values
_val_ids_all = val_df[ID_COL].values

# ID -> row position in the tabular matrix (a repeated ID keeps its last position)
train_idx_map = dict(zip(_train_ids_all, range(len(_train_ids_all))))
val_idx_map = dict(zip(_val_ids_all, range(len(_val_ids_all))))

# Sorted unique IDs present in both the split and its embeddings
train_fusion_ids = np.intersect1d(_train_ids_all, train_img_emb[ID_COL].values)
val_fusion_ids = np.intersect1d(_val_ids_all, val_img_emb[ID_COL].values)

print("Train fusion IDs:", len(train_fusion_ids))
print("Val fusion IDs:", len(val_fusion_ids))

# Build aligned matrices

def build_fusion_matrices(ids, df, idx_map, X_tab, emb_df):
    """Assemble row-aligned tabular, image and target arrays for `ids`.

    Parameters
    ----------
    ids : iterable of property IDs; the output rows follow this order.
    df : DataFrame holding ``ID_COL`` and ``TARGET_COL``.
    idx_map : mapping from property ID to row position in `X_tab`.
    X_tab : preprocessed tabular feature matrix.
    emb_df : DataFrame holding ``ID_COL`` and the ``emb_cols`` columns.

    Returns
    -------
    tuple of (tabular rows, embedding rows, targets), one row per ID.

    Raises
    ------
    ValueError
        If `ids` is empty.
    KeyError
        If an ID is missing from `idx_map`, `df` or `emb_df`.
    """
    ids = list(ids)
    if not ids:
        raise ValueError("build_fusion_matrices needs at least one ID.")

    # Build each ID index once instead of scanning the frames per ID; the first
    # occurrence wins when an ID is repeated.
    emb_lookup = emb_df.drop_duplicates(subset=ID_COL).set_index(ID_COL)[emb_cols]
    target_lookup = df.drop_duplicates(subset=ID_COL).set_index(ID_COL)[TARGET_COL]

    tab_positions = [idx_map[pid] for pid in ids]

    return (
        X_tab[tab_positions],
        emb_lookup.loc[ids].to_numpy(),
        target_lookup.loc[ids].to_numpy(),
    )

X_train_tab_f, X_train_img_f, y_train_f = build_fusion_matrices(
    train_fusion_ids, train_df, train_idx_map, X_train_tab, train_img_emb
)
X_val_tab_f, X_val_img_f, y_val_f = build_fusion_matrices(
    val_fusion_ids, val_df, val_idx_map, X_val_tab, val_img_emb
)

# Fused design matrix: tabular columns first, image embedding columns after
X_train_fusion = np.hstack([X_train_tab_f, X_train_img_f])
X_val_fusion = np.hstack([X_val_tab_f, X_val_img_f])

print("Fusion feature matrices:", X_train_fusion.shape, X_val_fusion.shape)

# Train a modest-capacity Random Forest on fused features
fusion_rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=123,
)

fusion_rf.fit(X_train_fusion, y_train_f)

fusion_val_pred = fusion_rf.predict(X_val_fusion)

# Validation rows in the same order as `val_fusion_ids`, selected through the
# same ID -> position map that produced the tabular feature rows.
val_fusion_rows = val_df.iloc[[val_idx_map[pid] for pid in val_fusion_ids]]

# Price-scale truth and predictions. Prefer the stored raw price; if only the
# log target exists, invert it rather than comparing log-scale truth with
# price-scale predictions.
if TARGET_COL == "log_price":
    fusion_val_price_pred = np.expm1(fusion_val_pred)
    if PRICE_COL != TARGET_COL and PRICE_COL in val_df.columns:
        val_price_aligned = val_fusion_rows[PRICE_COL].to_numpy()
    else:
        val_price_aligned = np.expm1(y_val_f)
else:
    val_price_aligned = y_val_f
    fusion_val_price_pred = fusion_val_pred

fusion_rmse_feat = rmse(val_price_aligned, fusion_val_price_pred)
fusion_r2_feat = r2_score(val_price_aligned, fusion_val_price_pred)

print(f"Feature-level fusion RF RMSE (price scale): {fusion_rmse_feat:,.2f}")
print(f"Feature-level fusion RF R^2: {fusion_r2_feat:.3f}")

# Score the tabular baselines on the same validation rows: the full-split
# baseline RMSE covers a different set of examples, which would confound the
# improvement figure with the change of evaluation subset.
tab_subset_rmse_feat = min(
    rmse(
        val_price_aligned,
        np.expm1(tab_pred) if TARGET_COL == "log_price" else tab_pred,
    )
    for tab_pred in (
        lin_model.predict(val_fusion_rows),
        rf_model.predict(val_fusion_rows),
    )
)
print(f"Best tabular RMSE on the same subset (price scale): {tab_subset_rmse_feat:,.2f}")

improvement_vs_tab_feat = 100.0 * (tab_subset_rmse_feat - fusion_rmse_feat) / tab_subset_rmse_feat
print(f"% RMSE improvement over best tabular baseline: {improvement_vs_tab_feat:.2f}%")


## Final Model Choice and Test-Set Predictions

We now select the model that offers the **best trade-off** between:

- Predictive performance (RMSE, R²) on the validation set.
- Stability of improvements from imagery over tabular-only baselines.
- Interpretability and complexity.

For illustration, we assume the **feature-level fusion Random Forest** is chosen (you may override this choice after inspecting metrics), and use it to generate predictions for `test.xlsx`.



In [None]:
# Load raw test features and generate predictions with the chosen model

# Load test

import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Support data/raw/test.xlsx, project-root test.xlsx, or Provided/test.xlsx

PROVIDED_DIR = PROJECT_ROOT / "Provided"

test_candidates = [
    RAW_DIR / "test.xlsx",
    PROJECT_ROOT / "test.xlsx",
    PROVIDED_DIR / "test.xlsx",
]
for p in test_candidates:
    if p.exists():
        TEST_PATH = p
        break
else:
    raise FileNotFoundError(f"Could not find test.xlsx in {test_candidates}.")

print("Using TEST_PATH =", TEST_PATH)

test_df_raw = pd.read_excel(TEST_PATH)

# Ensure ID column exists
if ID_COL not in test_df_raw.columns:
    raise ValueError(f"Expected ID column '{ID_COL}' in test data.")

# Apply same preprocessing as training (never feed price columns as features)
X_test_tab = test_df_raw.drop(columns=[c for c in [TARGET_COL, "log_price", "price"] if c in test_df_raw.columns])

# Align columns with training features; absent columns get placeholder values
missing_numeric = [c for c in numeric_features if c not in X_test_tab.columns]
for c in missing_numeric:
    X_test_tab[c] = np.nan

missing_cats = [c for c in categorical_features if c not in X_test_tab.columns]
for c in missing_cats:
    X_test_tab[c] = "missing"

X_test_tab = X_test_tab[numeric_features + categorical_features]

X_test_tab_proc = preprocessor.transform(X_test_tab)

# Join with image embeddings for test set (if available)

# Build embeddings for test if not already computed. The placeholder target
# column only satisfies the extractor's expected input columns.
try:
    test_img_emb = extract_resnet_embeddings(test_df_raw.assign(**{TARGET_COL: np.nan}), split_name="test", batch_size=32)
except RuntimeError as e:
    print("Warning: could not compute test embeddings:", e)
    test_img_emb = None

test_ids = test_df_raw[ID_COL].values

# Flag the test rows that have an image embedding. One indexed lookup table
# replaces a per-ID scan of the embedding frame.
if test_img_emb is not None:
    test_emb_lookup = test_img_emb.drop_duplicates(subset=ID_COL).set_index(ID_COL)[emb_cols]
    has_img = test_df_raw[ID_COL].isin(test_emb_lookup.index).to_numpy()
else:
    test_emb_lookup = None
    has_img = np.zeros(len(test_ids), dtype=bool)

# Predictions on the target scale, one per test row, in test-file order
test_pred_log = np.full(len(test_ids), np.nan, dtype=float)

if has_img.any():
    X_test_tab_f = X_test_tab_proc[has_img]
    X_test_img_f = test_emb_lookup.loc[test_ids[has_img]].to_numpy()
    X_test_fusion = np.hstack([X_test_tab_f, X_test_img_f])

    # Use feature-level fusion RF as the final model (adjust if you prefer a different model)
    test_pred_log[has_img] = fusion_rf.predict(X_test_fusion)

# Rows without imagery would otherwise be missing from the output; predict
# them with the better tabular baseline so every test row receives a price.
no_img = ~has_img
if no_img.any():
    fallback_model = rf_model if rf_rmse < baseline_rmse else lin_model
    test_pred_log[no_img] = fallback_model.predict(X_test_tab[no_img])
    print(
        f"{int(no_img.sum())} of {len(test_ids)} test rows have no image embedding; "
        "predicted with the tabular-only model."
    )

test_pred_price = np.expm1(test_pred_log) if TARGET_COL == "log_price" else test_pred_log

pred_df = pd.DataFrame(
    {
        ID_COL: test_ids,
        "actual_price": np.nan,  # blind test; actuals unknown at prediction time
        "predicted_price": test_pred_price,
    }
)

out_csv = REPORTS_DIR / "predictions_test.csv"
pred_df.to_csv(out_csv, index=False)
print("Saved test predictions to", out_csv)
