# Clothing Segmentation
### (with human parsing) by DeepLabV3 with ResNet101 Backbone

## Requirements Installation

In [1]:
!pip install -q torch torchvision datasets tqdm -U

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m113.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.

In [2]:
!pip install albumentations -q

### Upload to HF 

In [None]:
from huggingface_hub import login
login()

In [4]:
from huggingface_hub import HfApi
import torch
import os
import json

api = HfApi()

def upload_to_hf(model, val_history, repo_id):

    folder = f"deeplab_model_checkpoint"
    os.makedirs(folder, exist_ok=True)

    # Save model state dict (PyTorch format)
    model_path = os.path.join(folder, 'model.pt')
    torch.save(model.state_dict(), model_path)

    
    metrics_path = os.path.join(folder,"val_metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(val_history, f, indent=2)

    try:
        api.upload_folder(
            repo_id=repo_id,
            folder_path=folder,
            path_in_repo=folder,
        )
        print(f"✅ Uploaded to HuggingFace repo: {repo_id}")
    except Exception as e:
        print(f"⚠️ Failed to push to HuggingFace: {e}")
        print(f"   Model saved locally at: {model_path}")


#### Test HF Uploading

In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoConfig
from huggingface_hub import HfApi, Repository, login

hf_token = "<hf_token>"  # get it from https://huggingface.co/settings/tokens
login(token=hf_token)

# Define a dummy PyTorch model
class DummyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10, 2)
        
    def forward(self, x):
        return self.fc(x)

model = DummyModel()

# HF repo setup
repo_id = "oshaban/deeplabv3_clothes"  
local_dir = "./hf_dummy_model"

# Create local folder
from pathlib import Path
Path(local_dir).mkdir(exist_ok=True, parents=True)

# Save model + dummy config
torch.save(model.state_dict(), f"{local_dir}/pytorch_model.bin")

# Push to Hugging Face Hub
try:
    api.upload_folder(
        repo_id=repo_id,
        folder_path=local_dir,
        path_in_repo=local_dir,
    )
    print(f"✅ Uploaded to HuggingFace repo: {repo_id}")
except Exception as e:
    print(f"⚠️ Failed to push to HuggingFace: {e}")
    print(f"   Model saved locally at: {model_path}")


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Uploaded to HuggingFace repo: oshaban/deeplabv3_clothes


#### **Trial:** Subset from iMaterialist

In [None]:
# from datasets import Dataset
# import pandas as pd
# import random
# import shutil

# DATASET_DIR = "/kaggle/input/imaterialist-fashion-2020-fgvc7/train"
# OUTPUT_DIR = "/kaggle/working/subset_imaterialist"

# os.makedirs(OUTPUT_DIR, exist_ok=True)

# images = os.listdir(DATASET_DIR)

# # Random subset
# subset = random.sample(images, 500)

# for img in subset:
#     shutil.copy(
#         os.path.join(DATASET_DIR, img),
#         os.path.join(OUTPUT_DIR, img)
#     )

## Code Start

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torchvision.models.segmentation import deeplabv3_mobilenet_v3_large, deeplabv3_resnet101
from torchvision import transforms

from datasets import load_dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2

import numpy as np
from PIL import Image
from tqdm import tqdm


In [7]:
dataset = load_dataset("mattmdjaga/human_parsing_dataset")

full_train = dataset["train"]

# 70 / 15 / 15 split
full_train = full_train.train_test_split(test_size=0.30, seed=42)
val_test = full_train["test"].train_test_split(test_size=0.50, seed=42)

train_ds = full_train["train"]
val_ds   = val_test["train"]
test_ds  = val_test["test"]

NUM_CLASSES = 18
BATCH_SIZE = 8
NUM_EPOCHS = 20

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002-f3a663f7140ee7(…):   0%|          | 0.00/394M [00:00<?, ?B/s]

data/train-00001-of-00002-74610e243c32d5(…):   0%|          | 0.00/403M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17706 [00:00<?, ? examples/s]

In [8]:
class HumanParsingDataset(Dataset):
    def __init__(self, hf_dataset, augment=False):
        self.ds = hf_dataset
        self.augment = augment
        self.transform = A.Compose([
            A.Resize(512, 512),
            A.HorizontalFlip(p=0.5 if augment else 0.0),
            A.Normalize(mean=(0.485, 0.456, 0.406),
                        std=(0.229, 0.224, 0.225)),
            ToTensorV2(),
        ])
    
    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, idx):
        item = self.ds[idx]
        image = item["image"].convert("RGB")
        mask  = item["mask"]
        mask = np.array(mask, dtype=np.int64)
        
        augmented = self.transform(image=np.array(image), mask=mask)
        
        image_tensor = augmented["image"]
        mask_tensor = augmented["mask"].long() 
        
        return image_tensor, mask_tensor

In [9]:
train_loader = DataLoader(
    HumanParsingDataset(train_ds, augment=True),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

val_loader = DataLoader(
    HumanParsingDataset(val_ds),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = deeplabv3_resnet101(weights="DEFAULT")

model.classifier[4] = torch.nn.Conv2d(256, NUM_CLASSES, kernel_size=1)

if hasattr(model, 'aux_classifier'):
    model.aux_classifier[4] = torch.nn.Conv2d(256, NUM_CLASSES, kernel_size=1)

model = model.to(device)

Downloading: "https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth" to /root/.cache/torch/hub/checkpoints/deeplabv3_resnet101_coco-586e9e4e.pth


100%|██████████| 233M/233M [00:01<00:00, 192MB/s] 


In [11]:
class DiceLoss(nn.Module):
    def __init__(self, smooth=1.0):
        super().__init__()
        self.smooth = smooth

    def forward(self, logits, targets):
        """
        logits: [B, C, H, W]
        targets: [B, H, W]
        """
        num_classes = logits.shape[1]
        probs = torch.softmax(logits, dim=1)

        targets_onehot = torch.nn.functional.one_hot(
            targets, num_classes
        ).permute(0, 3, 1, 2).float()

        dims = (0, 2, 3)
        intersection = torch.sum(probs * targets_onehot, dims)
        union = torch.sum(probs + targets_onehot, dims)

        dice = (2. * intersection + self.smooth) / (union + self.smooth)
        return 1 - dice.mean()
ce_loss = nn.CrossEntropyLoss(ignore_index=255)
dice_loss = DiceLoss()

def combined_loss(logits, targets, dice_weight=0.4):
    return ce_loss(logits, targets) + dice_weight * dice_loss(logits, targets)

# Define optimizer and scalar
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
scaler = torch.cuda.amp.GradScaler()


  scaler = torch.cuda.amp.GradScaler()


In [12]:
@torch.no_grad()
def pixel_accuracy(preds, targets):
    """
    preds: [B, H, W]
    targets: [B, H, W]
    """
    valid = targets != 255
    correct = (preds[valid] == targets[valid]).sum()
    total = valid.sum()
    return (correct.float() / total.float()).item()


@torch.no_grad()
def mean_iou(preds, targets, num_classes):
    ious = []

    for cls in range(num_classes):
        pred_i = preds == cls
        target_i = targets == cls

        intersection = (pred_i & target_i).sum().float()
        union = (pred_i | target_i).sum().float()

        if union == 0:
            continue

        ious.append((intersection / union).item())

    if len(ious) == 0:
        return 0.0

    return sum(ious) / len(ious)


In [13]:
@torch.no_grad()
def validate(model, loader):
    model.eval()

    total_loss = 0
    total_acc = 0
    total_iou = 0
    count = 0

    for images, masks in loader:
        images = images.to(device)
        masks = masks.to(device)

        outputs = model(images)["out"]
        loss = combined_loss(outputs, masks)

        preds = torch.argmax(outputs, dim=1)

        total_loss += loss.item()
        total_acc += pixel_accuracy(preds, masks)
        total_iou += mean_iou(preds, masks, NUM_CLASSES)
        count += 1

    return (
        total_loss / count,
        total_acc / count,
        total_iou / count
    )


In [14]:
def train_one_epoch(model, loader):
    model.train()
    total_loss = 0

    for images, masks in tqdm(loader):
        images = images.to(device)
        masks  = masks.to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(images)["out"]
            loss = combined_loss(outputs, masks)


        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    return total_loss / len(loader)


In [15]:
REPO_ID = "oshaban/deeplabv3_clothes"

In [None]:
for epoch in range(NUM_EPOCHS):
    print("Training...")
    train_loss = train_one_epoch(model, train_loader)
    print("Validation...")
    val_loss, val_acc, val_iou = validate(model, val_loader)

    val_history = []

    val_history.append({
        "epoch": epoch,
        "val_loss": val_loss,
        "val_accuracy": val_acc,
        "val_iou": val_iou
    })
    
    print("Uploading...")
    upload_to_hf(model, val_history, REPO_ID)

    print(
        f"Epoch {epoch+1}/{NUM_EPOCHS} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"Pixel Acc: {val_acc:.4f} | "
        f"mIoU: {val_iou:.4f}"
    )

  with torch.cuda.amp.autocast():
100%|██████████| 1550/1550 [29:31<00:00,  1.14s/it]


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Uploaded to HuggingFace repo: oshaban/deeplabv3_clothes
Epoch 1/20 | Train Loss: 0.6504 | Val Loss: 0.3955 | Pixel Acc: 0.9328 | mIoU: 0.4906


100%|██████████| 1550/1550 [29:48<00:00,  1.15s/it]


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Uploaded to HuggingFace repo: oshaban/deeplabv3_clothes
Epoch 2/20 | Train Loss: 0.3667 | Val Loss: 0.3498 | Pixel Acc: 0.9368 | mIoU: 0.5120


100%|██████████| 1550/1550 [29:48<00:00,  1.15s/it]


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Uploaded to HuggingFace repo: oshaban/deeplabv3_clothes
Epoch 3/20 | Train Loss: 0.3308 | Val Loss: 0.3519 | Pixel Acc: 0.9339 | mIoU: 0.5036


100%|██████████| 1550/1550 [29:46<00:00,  1.15s/it]


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Uploaded to HuggingFace repo: oshaban/deeplabv3_clothes
Epoch 4/20 | Train Loss: 0.3107 | Val Loss: 0.3301 | Pixel Acc: 0.9401 | mIoU: 0.5457


 77%|███████▋  | 1201/1550 [23:03<06:39,  1.14s/it]