In [2]:
"""
README: Semantic Segmentation Model Citywise Metrics Evaluator

This script evaluates multiple trained DeepLabV3-MobileNetV3 models on corresponding test datasets,
and computes classwise segmentation metrics (precision, recall, F1-score, IoU) for each city represented in the data.

Features:
- Loads trained segmentation models and their test sets.
- Splits evaluation by city keywords parsed from image file names.
- Computes per-class and per-city precision, recall, F1-score, and IoU using torchmetrics.
- Outputs results as CSV files (one per model/test set).

How to use:
- Place your trained models and corresponding test image folders.
- Configure model/test set pairs in `model_dataset_pairs`.
- Update city keywords in `city_keywords` and their English translations if needed.
- Run the script to get per-city, per-class metrics as CSV.

Requirements:
- torch
- torchvision
- albumentations
- opencv-python
- numpy
- pandas
- tqdm
- torchmetrics

Author: Bahadir Akin Akgul
Date: 13.07.2025
"""

import os
import torch
import torchvision
import numpy as np
import pandas as pd
import cv2
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
from torchmetrics.functional import precision, recall, f1_score, jaccard_index

# Device configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# City keywords and their corresponding English names
city_keywords = {
    'istanbul': ['libadiye', 'levent', 'taksim', 'ciragan', 'barbaros', 'dolmabahce', 'bagdat', 'muallim', 'katar'],
    'paris': ['paris-champs'],
    'munich': ['munih'],
    'marseille': ['marsilya']
}

city_translation = {
    'istanbul': 'Istanbul',
    'paris': 'Paris',
    'munich': 'Munich',
    'marseille': 'Marseille'
}

class_names = ['Background', 'Person', 'Road', 'Vehicle']

# Custom Dataset class for segmentation
class SegmentationDataset(Dataset):
    def __init__(self, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.images = [f for f in os.listdir(img_dir) if f.endswith(".jpg")]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_name = self.images[idx]
        img_path = os.path.join(self.img_dir, img_name)
        mask_path = img_path.replace(".jpg", "_mask.png")

        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        if mask is None or image.shape[:2] != mask.shape:
            mask = np.zeros((image.shape[0], image.shape[1]), dtype=np.uint8)
        else:
            mask = cv2.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)

        mask = mask.astype(np.uint8)

        if self.transform:
            transformed = self.transform(image=image, mask=mask)
            image = transformed["image"]
            mask = transformed["mask"].long()

        return image, mask, img_name

# Albumentations transformations
transform = A.Compose([
    A.Resize(1024, 768),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
], additional_targets={'mask': 'mask'})


# Evaluation function for per-city, per-class metrics
def evaluate_and_collect(model_dir, dataset_dir):
    print(f"\nEvaluating model in: {model_dir}")
    model_path = os.path.join(model_dir, "trained_model.pth")

    NUM_CLASSES = 4
    model = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(weights=None)
    model.classifier[4] = torch.nn.Conv2d(256, NUM_CLASSES, kernel_size=1)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model = model.to(DEVICE)
    model.eval()

    test_dataset = SegmentationDataset(dataset_dir, transform=transform)
    test_loader = DataLoader(test_dataset, batch_size=40, shuffle=False, num_workers=8, pin_memory=True)

    city_class_metrics = {
        city: {cls: {'precision': 0, 'recall': 0, 'f1': 0, 'iou': 0, 'count': 0}
               for cls in range(NUM_CLASSES)} for city in city_keywords
    }

    with torch.no_grad():
        for images, masks, names in tqdm(test_loader):
            images, masks = images.to(DEVICE), masks.to(DEVICE)
            outputs = model(images)["out"]
            preds = torch.argmax(outputs, dim=1)

            for i, name in enumerate(names):
                lower_name = name.lower()
                city = None
                for key, keywords in city_keywords.items():
                    if any(keyword in lower_name for keyword in keywords):
                        city = key
                        break

                if city:
                    pred_single = preds[i].unsqueeze(0)
                    mask_single = masks[i].unsqueeze(0)

                    precision_per_class = precision(pred_single, mask_single, task="multiclass", num_classes=NUM_CLASSES, average=None)
                    recall_per_class = recall(pred_single, mask_single, task="multiclass", num_classes=NUM_CLASSES, average=None)
                    f1_per_class = f1_score(pred_single, mask_single, task="multiclass", num_classes=NUM_CLASSES, average=None)
                    iou_per_class = jaccard_index(pred_single, mask_single, task="multiclass", num_classes=NUM_CLASSES, average=None)

                    for cls in range(NUM_CLASSES):
                        city_class_metrics[city][cls]['precision'] += precision_per_class[cls].item()
                        city_class_metrics[city][cls]['recall'] += recall_per_class[cls].item()
                        city_class_metrics[city][cls]['f1'] += f1_per_class[cls].item()
                        city_class_metrics[city][cls]['iou'] += iou_per_class[cls].item()
                        city_class_metrics[city][cls]['count'] += 1

    # Build the DataFrame for reporting
    all_records = []
    for city, class_metrics in city_class_metrics.items():
        for cls_idx, metrics in class_metrics.items():
            count = metrics['count']
            all_records.append({
                "City": city_translation.get(city, city.capitalize()),
                "Class": class_names[cls_idx],
                "Precision": metrics['precision'] / count if count else 0,
                "Recall": metrics['recall'] / count if count else 0,
                "F1": metrics['f1'] / count if count else 0,
                "IoU": metrics['iou'] / count if count else 0
            })

    return pd.DataFrame(all_records)


# List of (model results directory, test dataset directory) pairs to evaluate
model_dataset_pairs = [
    ("YOUR_RESULTS_DIR_65_35", "YOUR_DATASET_DIR_65_35/test"),
    ("YOUR_RESULTS_DIR_70_30", "YOUR_DATASET_DIR_70_30/test"),
    ("YOUR_RESULTS_DIR_75_25", "YOUR_DATASET_DIR_75_25/test"),
    ("YOUR_RESULTS_DIR_95_5", "YOUR_DATASET_DIR_95_5/test")
]

for model_dir, dataset_dir in model_dataset_pairs:
    df = evaluate_and_collect(model_dir, dataset_dir)
    split_name = model_dir.split('_')[-2] + "-" + model_dir.split('_')[-1]  # Adjust to match your naming pattern
    df.to_csv(f"deeplabv3_metrics_{split_name}.csv", index=False)
    print(f"✅ Saved: deeplabv3_metrics_{split_name}.csv")


  check_for_updates()


Using device: cuda

Evaluating model in: road-tr-od-ss-65-35-results


100%|██████████| 34/34 [00:47<00:00,  1.40s/it]


✅ Saved: deeplabv3_metrics_35-.csv

Evaluating model in: road-tr-od-ss-70-30-results


100%|██████████| 29/29 [00:34<00:00,  1.18s/it]


✅ Saved: deeplabv3_metrics_30-.csv

Evaluating model in: road-tr-od-ss-75-25-results


100%|██████████| 24/24 [00:29<00:00,  1.22s/it]


✅ Saved: deeplabv3_metrics_25-.csv

Evaluating model in: road-tr-od-ss-95-5-results


100%|██████████| 5/5 [00:08<00:00,  1.68s/it]

✅ Saved: deeplabv3_metrics_5-.csv



