In [None]:
"""
README: DeepLabV3-ResNet101 Semantic Segmentation Pipeline (PyTorch, Albumentations)

This script fine-tunes torchvision's DeepLabV3 model (ResNet-101 backbone) for semantic segmentation on a custom dataset.
It uses PyTorch, Albumentations for augmentation, and supports multi-GPU training via DataParallel.

Pipeline:
1. Custom Dataset class loads images and corresponding masks from a given directory.
2. Data augmentations (resize, flip, normalization) are applied with Albumentations.
3. DataLoaders are created for training, validation, and testing.
4. The model (DeepLabV3 with a ResNet-101 backbone) is initialized from pretrained weights, and its output layers are replaced to predict a custom number of classes.
5. Weighted CrossEntropyLoss is used for imbalanced datasets.
6. Training supports checkpointing and multi-GPU (DataParallel).
7. The final model weights are saved at the end of training.

How to use:
- Place your images as JPGs and masks as PNGs in the appropriate 'train', 'valid', and 'test' folders:
    YOUR_DATASET_DIR/
        train/
            image1.jpg
            image1_mask.png
            ...
        valid/
        test/
- Set the number of classes and class weights as needed.
- Adjust batch size and number of workers based on your GPU memory.

Author: Bahadir Akin Akgul
Date: 13.07.2025

Requirements:
- torch
- torchvision
- albumentations
- opencv-python
- numpy
- tqdm

"""

import os
import torch
import torchvision
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import gc

# Device configuration: use CUDA when a GPU is visible, otherwise fall back to CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}, GPU count: {torch.cuda.device_count()}")

# Data paths (CHANGE these to your own directory)
DATA_DIR = "YOUR_DATASET_DIR"
# One sub-directory per dataset split, all located directly under DATA_DIR
TRAIN_DIR, VALID_DIR, TEST_DIR = (
    os.path.join(DATA_DIR, split) for split in ("train", "valid", "test")
)

# Albumentations transforms, applied jointly to an image and its mask
_pipeline_steps = [
    A.Resize(1024, 768),  # NOTE(review): Albumentations takes (height, width) here — confirm intended orientation
    A.HorizontalFlip(p=0.5),  # random left-right flip on half of the samples
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),  # convert the arrays to torch tensors
]
transform = A.Compose(_pipeline_steps, additional_targets={'mask': 'mask'})

# Custom Dataset class for segmentation
class SegmentationDataset(Dataset):
    """Pairs every ``<name>.jpg`` image in a directory with ``<name>_mask.png``.

    Args:
        img_dir: Directory that holds both the images and their masks.
        transform: Optional callable invoked as ``transform(image=..., mask=...)``
            that returns a mapping with ``"image"`` and ``"mask"`` entries
            (e.g. an Albumentations ``Compose``).

    Each item is an ``(image, mask)`` pair. The image is read with OpenCV and
    converted from BGR to RGB; the mask is read as a single-channel uint8 array.
    Without a transform both are returned as NumPy arrays; with a transform the
    transformed image is returned and the transformed mask is cast with
    ``.long()``.
    """

    def __init__(self, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        # os.listdir returns entries in arbitrary order; sort so that sample
        # indices are reproducible between runs and machines.
        self.images = sorted(f for f in os.listdir(img_dir) if f.endswith(".jpg"))

    def __len__(self):
        return len(self.images)

    @staticmethod
    def _mask_path_for(img_path):
        """Return the mask path for an image path (``x.jpg`` -> ``x_mask.png``).

        Only the file extension is replaced, so a ".jpg" occurring elsewhere in
        the path (for example in a directory name) is left untouched.
        """
        stem, _ext = os.path.splitext(img_path)
        return stem + "_mask.png"

    def __getitem__(self, idx):
        """Load, validate and (optionally) transform the sample at ``idx``.

        Raises:
            OSError: If the image file cannot be read.
        """
        img_name = self.images[idx]
        img_path = os.path.join(self.img_dir, img_name)
        mask_path = self._mask_path_for(img_path)

        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail with a clear
            # message instead of an opaque error inside cvtColor.
            raise OSError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        if mask is None:
            # Missing/unreadable mask: fall back to an all-zero mask so the
            # sample can still be batched.
            print(f"Warning: {mask_path} could not be loaded! Image shape: {image.shape}, Mask shape: None")
            mask = np.zeros((image.shape[0], image.shape[1]), dtype=np.uint8)
        elif image.shape[:2] != mask.shape:
            # Size mismatch: keep the labels and bring the mask to the image
            # size. Nearest-neighbour interpolation avoids inventing
            # intermediate label values.
            print(f"Warning: {mask_path} has shape {mask.shape} but the image has shape {image.shape[:2]}; resizing mask.")
            mask = cv2.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)

        mask = mask.astype(np.uint8)

        if self.transform:
            transformed = self.transform(image=image, mask=mask)
            image = transformed["image"]
            mask = transformed["mask"].long()

        return image, mask

# DataLoaders
BATCH_SIZE = 4  # Adjust according to your GPU memory

# Evaluation data must be deterministic: apply the same resize and
# normalization as the training transform, but without the random flip.
# Keep the size and normalization constants in sync with the training transform.
eval_transform = A.Compose([
    A.Resize(1024, 768),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

train_dataset = SegmentationDataset(TRAIN_DIR, transform=transform)
valid_dataset = SegmentationDataset(VALID_DIR, transform=eval_transform)
test_dataset = SegmentationDataset(TEST_DIR, transform=eval_transform)

# Training: shuffle and drop the last incomplete batch for batch consistency.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=True)
# Validation/test: keep every sample (no drop_last) so metrics cover the whole split.
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, drop_last=False)

# Clean up memory
torch.cuda.empty_cache()
gc.collect()

# Model setup: load torchvision's DeepLabV3-ResNet101 with its default weights,
# then replace the last layer of each head so it outputs NUM_CLASSES channels.
NUM_CLASSES = 4
model = torchvision.models.segmentation.deeplabv3_resnet101(weights="DEFAULT")
model.classifier[4] = torch.nn.Conv2d(256, NUM_CLASSES, kernel_size=1)
# The auxiliary head is optional; patch it only when the model has one.
_aux_head = getattr(model, "aux_classifier", None)
if _aux_head is not None:
    _aux_head[4] = torch.nn.Conv2d(256, NUM_CLASSES, kernel_size=1)
model = model.to(DEVICE)

# Multi-GPU support: wrap the model in DataParallel when several GPUs are visible
if torch.cuda.device_count() > 1:
    print(f"Parallelizing model on {torch.cuda.device_count()} GPUs...")
    model = torch.nn.DataParallel(model)

# Loss function and optimizer (adjust weights for your own dataset)
# One weight per class, in class-index order; tune according to class distribution.
class_weights = torch.tensor([1.0, 2.0, 2.0, 4.0], device=DEVICE)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4,
    weight_decay=1e-4,
)

# --- CHECKPOINT FUNCTIONS ---
# File used both to resume training and to store the per-epoch checkpoint
checkpoint_path = "checkpoint_resnet101.pth"

def load_checkpoint(model, optimizer, checkpoint_path):
    """Restore model and optimizer state from ``checkpoint_path`` if it exists.

    Args:
        model: Model to restore; may be wrapped in ``torch.nn.DataParallel``.
        optimizer: Optimizer whose state is restored alongside the model.
        checkpoint_path: Path of the checkpoint file to look for.

    Returns:
        ``(model, optimizer, start_epoch)``. ``start_epoch`` is the value stored
        under the checkpoint's ``"epoch"`` key, or 0 when no checkpoint exists.
    """
    # Guard clause: nothing to restore, so training starts from epoch 0.
    if not os.path.exists(checkpoint_path):
        print("No checkpoint found. Starting training from scratch.")
        return model, optimizer, 0

    saved_state = torch.load(checkpoint_path, map_location=DEVICE)

    # For a DataParallel wrapper the weights are loaded into the wrapped module.
    if isinstance(model, torch.nn.DataParallel):
        target_module = model.module
    else:
        target_module = model
    target_module.load_state_dict(saved_state["model_state_dict"])
    optimizer.load_state_dict(saved_state["optimizer_state_dict"])

    start_epoch = saved_state["epoch"]
    print(f"Checkpoint loaded. Continuing training from epoch {start_epoch}.")
    return model, optimizer, start_epoch

def _evaluate_loss(model, loader, criterion):
    """Return the mean loss of ``model`` over ``loader``, or None if it yields no batches."""
    was_training = model.training
    model.eval()
    total_loss = 0.0
    num_batches = 0
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for images, masks in loader:
                images, masks = images.to(DEVICE), masks.to(DEVICE)
                outputs = model(images)["out"]
                total_loss += criterion(outputs, masks).item()
                num_batches += 1
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return total_loss / num_batches if num_batches else None


def train_model(model, train_loader, valid_loader, optimizer, criterion, start_epoch=0, epochs=100, checkpoint_path=checkpoint_path):
    """Train ``model`` from ``start_epoch`` up to ``epochs``, checkpointing every epoch.

    Args:
        model: Segmentation model whose forward pass returns a mapping with an
            ``"out"`` entry; may be wrapped in ``torch.nn.DataParallel``.
        train_loader: Iterable of ``(images, masks)`` training batches.
        valid_loader: Iterable of ``(images, masks)`` validation batches, or
            None to skip validation. The validation loss is reported after
            each epoch when the loader yields at least one batch.
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss called as ``criterion(outputs, masks)``.
        start_epoch: Zero-based epoch index to resume from.
        epochs: Total number of epochs to reach.
        checkpoint_path: File that receives the checkpoint after every epoch.

    Raises:
        ValueError: If training would run but ``train_loader`` has no batches.
    """
    if start_epoch < epochs and len(train_loader) == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # the average loss is computed.
        raise ValueError(
            "train_loader yields no batches; check the dataset directory, "
            "batch size and drop_last setting."
        )

    for epoch in range(start_epoch, epochs):
        model.train()
        running_loss = 0.0
        for images, masks in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            images, masks = images.to(DEVICE), masks.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(images)["out"]
            loss = criterion(outputs, masks)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_loss:.4f}")

        # Report the validation loss so over-fitting is visible during training.
        if valid_loader is not None:
            val_loss = _evaluate_loss(model, valid_loader, criterion)
            if val_loss is not None:
                print(f"Epoch [{epoch+1}/{epochs}], Valid Loss: {val_loss:.4f}")

        # Save checkpoint (weights are taken from the wrapped module when the
        # model is a DataParallel wrapper). "epoch" is the next epoch to run.
        checkpoint = {
            "epoch": epoch + 1,
            "model_state_dict": model.module.state_dict() if isinstance(model, torch.nn.DataParallel) else model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        }
        # Write to a temporary file and rename it into place, so an interrupted
        # save cannot leave a truncated checkpoint at checkpoint_path.
        tmp_path = f"{checkpoint_path}.tmp"
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, checkpoint_path)
        print(f"Checkpoint saved: {checkpoint_path}")

# Training and checkpoint loading: resume from the checkpoint file when it exists
model, optimizer, start_epoch = load_checkpoint(
    model, optimizer, checkpoint_path=checkpoint_path
)

train_model(
    model,
    train_loader,
    valid_loader,
    optimizer,
    criterion,
    start_epoch=start_epoch,
    epochs=100,
    checkpoint_path=checkpoint_path,
)

# Save the final trained model; for a DataParallel wrapper, save the wrapped
# module's weights.
if isinstance(model, torch.nn.DataParallel):
    model_to_save = model.module
else:
    model_to_save = model
torch.save(model_to_save.state_dict(), "trained_model_resnet101.pth")
print("Model saved successfully!")


Using device: cuda, GPU count: 2
2 GPU ile model paralelleştiriliyor...
Checkpoint bulunamadı. Eğitim 0'dan başlıyor.


Epoch 1/100:  91%|█████████▏| 1219/1333 [34:49<03:17,  1.74s/it] 