### 0. Install & imports

In [None]:
# In terminal (once):
# pip install torch torchvision timm opencv-python pandas scikit-learn tqdm

import os
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import timm
from tqdm import tqdm
import torchvision.transforms as T

### 1. Basic config & label mappings

In [None]:
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Your semantic ids (from PolygonTrans)
class PolygonTrans:
    """Holds the mapping from semantic class name to its integer id."""

    def __init__(self):
        # Ids are assigned 1..9 in exactly this order.
        class_names = (
            "building", "window", "sky", "roof", "door",
            "tree", "people", "car", "sign",
        )
        self.binary = {
            name: class_id for class_id, name in enumerate(class_names, start=1)
        }


polygon_trans = PolygonTrans()

# We focus on these elements for classification
TARGET_ELEMENTS = ["roof", "door", "window"]

# Condition labels (example).
# Map your survey scores onto these classes (0/1/2); check the housing
# condition survey for how the scores are defined.
COND_LABELS = {
    "poor": 0,
    "fair": 1,
    "good": 2,
}
# Derived from the mapping so the number of classes cannot drift out of sync
# with COND_LABELS when a class is added or removed.
NUM_CLASSES = len(COND_LABELS)  # poor/fair/good

# Image size for classification models
IMG_SIZE = 224

### 2. Data format assumption (CSV)

Create a CSV whose header row is `image_path,mask_path,element,condition`, followed by one row per labelled element, for example:
/path/to/img1.jpg,/path/to/mask1.png,roof,poor
/path/to/img1.jpg,/path/to/mask1.png,window,fair

In [None]:
# Load the label table
csv_path = "facade_elements_labels.csv"  # change to your file
df = pd.read_csv(csv_path)

# Map condition text to numeric labels (text missing from COND_LABELS -> NaN)
df["label"] = df["condition"].map(COND_LABELS)

# Fail early with a readable message: a NaN label on a row we actually train
# on would otherwise only surface later as an error in int(row["label"]).
# Rows for elements outside TARGET_ELEMENTS are never used, so they are
# deliberately not checked.
_unmapped = df["element"].isin(TARGET_ELEMENTS) & df["label"].isna()
if _unmapped.any():
    _bad_values = sorted(df.loc[_unmapped, "condition"].astype(str).unique())
    raise ValueError(
        f"Unrecognised condition value(s) {_bad_values} in {csv_path}; "
        f"expected one of {sorted(COND_LABELS)}"
    )

df.head()

### 3. Dataset: crop patches using the mask

In [None]:
class FacadeElementDataset(Dataset):
    """Dataset of fixed-size image patches cropped around one facade element.

    Each item is a ``(patch, label)`` pair: the image region spanned by the
    mask pixels of the target element (plus a margin), resized to
    ``IMG_SIZE x IMG_SIZE``, together with the row's integer label.
    """

    def __init__(self, df, target_element, transforms=None, margin=10):
        """
        df: subset of dataframe with columns [image_path, mask_path, element, label]
        target_element: 'roof' or 'door' or 'window'
        transforms: torchvision transforms for augmentation / normalization
        margin: pixels of context kept on every side of the element's
            bounding box (clipped at the image border)
        """
        self.df = df.reset_index(drop=True)
        self.target_element = target_element
        self.transforms = transforms
        self.margin = margin

        # Integer id that marks this element's pixels in the mask images
        self.element_id = polygon_trans.binary[target_element]

    def __len__(self):
        """Return the number of samples (rows of the dataframe)."""
        return len(self.df)

    @staticmethod
    def _crop_to_element(img, elem_mask, margin):
        """Crop `img` to the bounding box of the non-zero pixels of `elem_mask`.

        The box is grown by `margin` pixels on each side and clipped to the
        image. When the mask has no non-zero pixel, the centred square crop
        of the image is returned instead.
        """
        ys, xs = np.nonzero(elem_mask)

        if ys.size == 0:
            # Fallback (just in case): largest centred square
            height, width = img.shape[:2]
            side = min(height, width)
            top = (height - side) // 2
            left = (width - side) // 2
            return img[top:top + side, left:left + side]

        # Bounding box of the element, widened by the margin and clipped
        y_min = max(0, int(ys.min()) - margin)
        y_max = min(img.shape[0] - 1, int(ys.max()) + margin)
        x_min = max(0, int(xs.min()) - margin)
        x_max = min(img.shape[1] - 1, int(xs.max()) + margin)

        # y_max / x_max are inclusive, hence the +1
        return img[y_min:y_max + 1, x_min:x_max + 1]

    def __getitem__(self, idx):
        """Load sample `idx` and return ``(patch, label)``.

        Raises FileNotFoundError when the image or the mask cannot be read.
        """
        # Look up paths and label
        row = self.df.iloc[idx]
        img_path = row["image_path"]
        mask_path = row["mask_path"]
        label = int(row["label"])

        # Load image; OpenCV reads BGR, so convert to RGB
        img_bgr = cv2.imread(img_path)
        if img_bgr is None:
            raise FileNotFoundError(f"Image not found: {img_path}")
        img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

        # Load mask (single-channel, IDs)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if mask is None:
            raise FileNotFoundError(f"Mask not found: {mask_path}")

        # Binary mask of the pixels that belong to this element
        elem_mask = mask == self.element_id
        patch = self._crop_to_element(img, elem_mask, self.margin)

        # Resize to fixed size
        patch = cv2.resize(patch, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_AREA)

        # To PIL for torchvision transforms
        patch = T.ToPILImage()(patch)

        if self.transforms is not None:
            patch = self.transforms(patch)

        return patch, label

### Transforms

In [None]:
# Per-channel statistics used for normalisation (the commonly used ImageNet
# mean / std). Normalize computes (x - mean) / std for each channel.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# For training: augment + normalize
train_transforms = T.Compose(
    [
        T.RandomHorizontalFlip(),
        T.RandomRotation(10),
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        T.Resize((IMG_SIZE, IMG_SIZE)),
        T.ToTensor(),
        T.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# For validation/test: no augmentation, same resize + normalisation
val_transforms = T.Compose(
    [
        T.Resize((IMG_SIZE, IMG_SIZE)),
        T.ToTensor(),
        T.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

### 4. Define the data-split function

In [None]:
def split_by_element(df, element, test_size=0.1, val_size=0.1, random_state=42):
    """Split the rows of one element into stratified train / val / test frames.

    df: dataframe with at least the columns ``element`` and ``label``.
    element: value of the ``element`` column to keep.
    test_size, val_size: fractions of the element's rows used for the test
        and validation sets.
    random_state: seed passed to both splits.

    Returns ``(train, val, test)``.
    """
    subset = df.loc[df["element"] == element].copy()

    # Stage 1: carve off the test set. Stratifying keeps the label
    # proportions the same on both sides of the split.
    remainder, test = train_test_split(
        subset,
        test_size=test_size,
        stratify=subset["label"],
        random_state=random_state,
    )

    # Stage 2: val_size is expressed relative to the full subset, so rescale
    # it to a fraction of what is left after removing the test rows.
    val_fraction = val_size / (1 - test_size)
    train, val = train_test_split(
        remainder,
        test_size=val_fraction,
        stratify=remainder["label"],
        random_state=random_state,
    )
    return train, val, test

### 5. Branch 1: Roof learning

In [None]:
# Stratified train / val / test split of the roof rows
train_roof, val_roof, test_roof = split_by_element(df, "roof")

# Only the training set gets the augmenting transforms
train_dataset_roof = FacadeElementDataset(
    df=train_roof, target_element="roof", transforms=train_transforms
)
val_dataset_roof = FacadeElementDataset(
    df=val_roof, target_element="roof", transforms=val_transforms
)
test_dataset_roof = FacadeElementDataset(
    df=test_roof, target_element="roof", transforms=val_transforms
)

# Wrap in data loaders; only the training loader shuffles
train_loader_roof = DataLoader(
    dataset=train_dataset_roof, batch_size=16, shuffle=True, num_workers=4
)
val_loader_roof = DataLoader(
    dataset=val_dataset_roof, batch_size=16, shuffle=False, num_workers=4
)
test_loader_roof = DataLoader(
    dataset=test_dataset_roof, batch_size=16, shuffle=False, num_workers=4
)

In [None]:
def build_timm_model(model_name, num_classes=NUM_CLASSES):
    """Create a pretrained timm model whose classifier outputs `num_classes`.

    model_name: timm model identifier, so different architectures can be
        swapped in and compared.
    num_classes: size of the new classification head.

    Returns the model with its classifier reset via timm's helper.
    """
    model = timm.create_model(model_name, pretrained=True)
    # timm's helper swaps the head for a freshly initialised one; there is no
    # need to read the old head's in_features first.
    model.reset_classifier(num_classes)
    return model

In [None]:
#import Accuracy & F1
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model(model, loader, device=device):
    """Run `model` over every batch of `loader` and score its predictions.

    The model is put in eval mode and gradients are disabled. The predicted
    class of each sample is the argmax over dimension 1 of the model output.

    Returns ``(accuracy, macro_f1)``.
    """
    model.eval()
    predicted = []
    targets = []

    with torch.no_grad():
        for batch_imgs, batch_labels in loader:
            batch_imgs = batch_imgs.to(device)
            batch_labels = batch_labels.to(device)

            batch_preds = model(batch_imgs).argmax(dim=1)

            # Accumulate on the CPU as numpy values for sklearn
            predicted.extend(batch_preds.cpu().numpy())
            targets.extend(batch_labels.cpu().numpy())

    accuracy = accuracy_score(targets, predicted)
    macro_f1 = f1_score(targets, predicted, average="macro")
    return accuracy, macro_f1

In [None]:
#Train Loop
def train_model(model, train_loader, val_loader,
                epochs=20, lr=1e-4, weight_decay=1e-4,  # change epochs here
                device=device, model_name="model", element="roof"):
    """Train `model` and keep the weights with the best validation macro-F1.

    model: network to train; it is moved to `device`.
    train_loader, val_loader: loaders yielding ``(imgs, labels)`` batches.
    epochs, lr, weight_decay: number of epochs and AdamW hyper-parameters.
    device: device the model and batches are moved to.
    model_name, element: used in the progress-bar text and in the checkpoint
        path ``models/{element}_{model_name}_best.pth``.

    Returns ``(model, best_val_f1)``; the model carries the best weights when
    at least one epoch had a validation F1 above 0.
    """
    import copy  # local import keeps this cell self-contained

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_f1 = 0.0
    best_state_dict = None

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0

        for imgs, labels in tqdm(train_loader, desc=f"[{element}][{model_name}] Epoch {epoch}/{epochs}"):
            imgs = imgs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(imgs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean of the per-batch losses
        train_loss = running_loss / len(train_loader)
        val_acc, val_f1 = evaluate_model(model, val_loader, device)

        print(f"Epoch {epoch}: train_loss={train_loss:.4f}, val_acc={val_acc:.4f}, val_f1={val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            # state_dict() hands back references to the live tensors, and
            # dict.copy() is shallow, so a deep copy is required; otherwise
            # later epochs silently overwrite the "best" snapshot.
            best_state_dict = copy.deepcopy(model.state_dict())
            os.makedirs("models", exist_ok=True)
            save_path = f"models/{element}_{model_name}_best.pth"
            torch.save(best_state_dict, save_path)
            print(f"  ✅ New best model saved to {save_path}")

    # Load best weights back into model
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    return model, best_val_f1

In [None]:
#Train multiple models for one element
# Candidate architectures: short display name -> timm model identifier
model_names = dict(
    vgg16="vgg16",
    effnet_b3="efficientnet_b3",
    swin_tiny="swin_tiny_patch4_window7_224",
)

# Filled per architecture with validation F1 and test metrics
results_roof = {}

for short_name, timm_name in model_names.items():
    print(f"\n=== Training {short_name} for ROOF ===")
    model = build_timm_model(timm_name, num_classes=NUM_CLASSES)
    trained_model, best_val_f1 = train_model(
        model,
        train_loader_roof,
        val_loader_roof,
        epochs=20,
        lr=1e-4,
        model_name=short_name,
        element="roof",
    )

    # Score the model returned by train_model on the held-out test set
    test_acc, test_f1 = evaluate_model(trained_model, test_loader_roof, device)
    results_roof[short_name] = dict(
        val_f1=best_val_f1,
        test_acc=test_acc,
        test_f1=test_f1,
    )
    print(f"[ROOF][{short_name}] Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")

### 5. Branch 2: Window learning

In [None]:
# Stratified train / val / test split of the window rows
train_window, val_window, test_window = split_by_element(df, "window")

# Only the training set gets the augmenting transforms
train_dataset_window = FacadeElementDataset(
    df=train_window, target_element="window", transforms=train_transforms
)
val_dataset_window = FacadeElementDataset(
    df=val_window, target_element="window", transforms=val_transforms
)
test_dataset_window = FacadeElementDataset(
    df=test_window, target_element="window", transforms=val_transforms
)

# Wrap in data loaders; only the training loader shuffles
train_loader_window = DataLoader(
    dataset=train_dataset_window, batch_size=16, shuffle=True, num_workers=4
)
val_loader_window = DataLoader(
    dataset=val_dataset_window, batch_size=16, shuffle=False, num_workers=4
)
test_loader_window = DataLoader(
    dataset=test_dataset_window, batch_size=16, shuffle=False, num_workers=4
)

In [None]:
#Train Loop
def train_model(model, train_loader, val_loader,
                epochs=20, lr=1e-4, weight_decay=1e-4,  # change epochs here
                device=device, model_name="model", element="window"):
    """Train `model` and keep the weights with the best validation macro-F1.

    model: network to train; it is moved to `device`.
    train_loader, val_loader: loaders yielding ``(imgs, labels)`` batches.
    epochs, lr, weight_decay: number of epochs and AdamW hyper-parameters.
    device: device the model and batches are moved to.
    model_name, element: used in the progress-bar text and in the checkpoint
        path ``models/{element}_{model_name}_best.pth``.

    Returns ``(model, best_val_f1)``; the model carries the best weights when
    at least one epoch had a validation F1 above 0.
    """
    import copy  # local import keeps this cell self-contained

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_f1 = 0.0
    best_state_dict = None

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0

        for imgs, labels in tqdm(train_loader, desc=f"[{element}][{model_name}] Epoch {epoch}/{epochs}"):
            imgs = imgs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(imgs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean of the per-batch losses
        train_loss = running_loss / len(train_loader)
        val_acc, val_f1 = evaluate_model(model, val_loader, device)

        print(f"Epoch {epoch}: train_loss={train_loss:.4f}, val_acc={val_acc:.4f}, val_f1={val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            # state_dict() hands back references to the live tensors, and
            # dict.copy() is shallow, so a deep copy is required; otherwise
            # later epochs silently overwrite the "best" snapshot.
            best_state_dict = copy.deepcopy(model.state_dict())
            os.makedirs("models", exist_ok=True)
            save_path = f"models/{element}_{model_name}_best.pth"
            torch.save(best_state_dict, save_path)
            print(f"  New best model saved to {save_path}")

    # Load best weights back into model
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    return model, best_val_f1

In [None]:
# Candidate architectures: short display name -> timm model identifier
model_names = dict(
    vgg16="vgg16",
    effnet_b3="efficientnet_b3",
    swin_tiny="swin_tiny_patch4_window7_224",
)

# Filled per architecture with validation F1 and test metrics
results_window = {}

for short_name, timm_name in model_names.items():
    print(f"\n=== Training {short_name} for WINDOW ===")
    model = build_timm_model(timm_name, num_classes=NUM_CLASSES)
    trained_model, best_val_f1 = train_model(
        model,
        train_loader_window,
        val_loader_window,
        epochs=20,
        lr=1e-4,
        model_name=short_name,
        element="window",
    )

    # Score the model returned by train_model on the held-out test set
    test_acc, test_f1 = evaluate_model(trained_model, test_loader_window, device)
    results_window[short_name] = dict(
        val_f1=best_val_f1,
        test_acc=test_acc,
        test_f1=test_f1,
    )
    print(f"[WINDOW][{short_name}] Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")

### 5. Branch 3: Door learning

In [None]:
# Stratified train / val / test split of the door rows
train_door, val_door, test_door = split_by_element(df, "door")

# Only the training set gets the augmenting transforms
train_dataset_door = FacadeElementDataset(
    df=train_door, target_element="door", transforms=train_transforms
)
val_dataset_door = FacadeElementDataset(
    df=val_door, target_element="door", transforms=val_transforms
)
test_dataset_door = FacadeElementDataset(
    df=test_door, target_element="door", transforms=val_transforms
)

# Wrap in data loaders; only the training loader shuffles
train_loader_door = DataLoader(
    dataset=train_dataset_door, batch_size=16, shuffle=True, num_workers=4
)
val_loader_door = DataLoader(
    dataset=val_dataset_door, batch_size=16, shuffle=False, num_workers=4
)
test_loader_door = DataLoader(
    dataset=test_dataset_door, batch_size=16, shuffle=False, num_workers=4
)

In [None]:
#Train Loop
def train_model(model, train_loader, val_loader,
                epochs=20, lr=1e-4, weight_decay=1e-4,  # change epochs here
                device=device, model_name="model", element="door"):
    """Train `model` and keep the weights with the best validation macro-F1.

    model: network to train; it is moved to `device`.
    train_loader, val_loader: loaders yielding ``(imgs, labels)`` batches.
    epochs, lr, weight_decay: number of epochs and AdamW hyper-parameters.
    device: device the model and batches are moved to.
    model_name, element: used in the progress-bar text and in the checkpoint
        path ``models/{element}_{model_name}_best.pth``.

    Returns ``(model, best_val_f1)``; the model carries the best weights when
    at least one epoch had a validation F1 above 0.
    """
    import copy  # local import keeps this cell self-contained

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_f1 = 0.0
    best_state_dict = None

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0

        for imgs, labels in tqdm(train_loader, desc=f"[{element}][{model_name}] Epoch {epoch}/{epochs}"):
            imgs = imgs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(imgs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean of the per-batch losses
        train_loss = running_loss / len(train_loader)
        val_acc, val_f1 = evaluate_model(model, val_loader, device)

        print(f"Epoch {epoch}: train_loss={train_loss:.4f}, val_acc={val_acc:.4f}, val_f1={val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            # state_dict() hands back references to the live tensors, and
            # dict.copy() is shallow, so a deep copy is required; otherwise
            # later epochs silently overwrite the "best" snapshot.
            best_state_dict = copy.deepcopy(model.state_dict())
            os.makedirs("models", exist_ok=True)
            save_path = f"models/{element}_{model_name}_best.pth"
            torch.save(best_state_dict, save_path)
            print(f"  New best model saved to {save_path}")

    # Load best weights back into model
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    return model, best_val_f1

In [None]:
# Candidate architectures: short display name -> timm model identifier
model_names = dict(
    vgg16="vgg16",
    effnet_b3="efficientnet_b3",
    swin_tiny="swin_tiny_patch4_window7_224",
)

# Filled per architecture with validation F1 and test metrics
results_door = {}

for short_name, timm_name in model_names.items():
    print(f"\n=== Training {short_name} for DOOR ===")
    model = build_timm_model(timm_name, num_classes=NUM_CLASSES)
    trained_model, best_val_f1 = train_model(
        model,
        train_loader_door,
        val_loader_door,
        epochs=20,
        lr=1e-4,
        model_name=short_name,
        element="door",
    )

    # Score the model returned by train_model on the held-out test set
    test_acc, test_f1 = evaluate_model(trained_model, test_loader_door, device)
    results_door[short_name] = dict(
        val_f1=best_val_f1,
        test_acc=test_acc,
        test_f1=test_f1,
    )
    print(f"[DOOR][{short_name}] Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")