Skip to content

Question: Best way to structure multi-label image dataset for custom PPE classification? #531

@DNJ-JINI

Description

@DNJ-JINI

Hello OpenAI Team,

We are building a custom PPE compliance system where each cropped image contains a single person, and we want to classify multiple PPE attributes per image.

Each image may have multiple labels such as:

Helmet: absent / wearing properly / not wearing properly
Vest: absent / wearing properly / not wearing properly
Gloves: absent / wearing properly / not wearing properly
Glasses: absent / wearing properly / not wearing properly
Boots: absent / wearing properly / not wearing properly

For example:
Image: image1101.jpg

Helmet: absent
Vest: wearing properly
Gloves: not wearing properly
Glasses: wearing properly
Boots: wearing properly

Currently, our CSV format looks like this:

image helmet vest gloves glass boots   caption
image1101.jpg absent wearing properly not wearing properly wearing properly wearing properly

caption for this image is- " Helmet: absent, Vest: wearing properly, Gloves: present but not worn properly, Glasses: wearing properly, Boots: wearing properly."

We would like guidance on:

What is the recommended data structure for multi-label image classification using OpenAI models?
What is the best caption format for this type of task?
Are there best practices for training or fine-tuning when each image contains multiple related attributes?

I am also sharing the basic training code here for your reference:
import torch
import clip
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader

# --------------------
# CONFIG
# --------------------

# Corrected train/val CSVs (columns include: image, helmet, vest, gloves,
# glass, boots, caption) and the directory holding the cropped person images.
train_csv = "train_corrected.csv"
val_csv = "val_corrected.csv"
image_dir = "data/images"
model_name = "clip_ppe_best.pt"  # checkpoint path for the best-val-loss model
print("......", model_name)

batch_size = 8
epochs = 50
lr = 1e-5            # small LR: only the projection heads are trained below
weight_decay = 1e-4

device = "cuda" if torch.cuda.is_available() else "cpu"

# --------------------
# LOAD CLIP (OpenAI)
# --------------------

# ViT-L/14 backbone. Cast to full precision for fine-tuning stability
# (clip.load may return half-precision weights on CUDA — confirm for
# the installed clip version).
model, preprocess = clip.load("ViT-L/14", device=device)
model = model.float()

# --------------------
# FREEZE (SMALL DATA SAFE)
# --------------------

# Freeze the entire backbone; with a small dataset, fully fine-tuning
# ViT-L/14 would overfit quickly.
for param in model.parameters():
    param.requires_grad = False

# Train the projection heads only. NOTE(review): this assumes visual.proj
# and text_projection are nn.Parameter tensors in this CLIP build, so
# toggling requires_grad directly is sufficient — confirm.
model.visual.proj.requires_grad = True
model.text_projection.requires_grad = True

# Keep the contrastive temperature fixed.
model.logit_scale.requires_grad = False

model.train()

print("Context length:", model.context_length)
print("Vocab size:", model.vocab_size)
# numel() per tensor is the idiomatic way to count parameters.
print("Model parameters:", f"{sum(p.numel() for p in model.parameters()):,}")

# --------------------
# DATASET
# --------------------

class PPEClipDataset(Dataset):
    """Image/caption pairs for CLIP contrastive fine-tuning.

    Expects a CSV with at least an ``image`` column (file name relative to
    ``image_dir``) and a ``caption`` column describing the PPE state of the
    person in the crop.
    """

    def __init__(self, csv_path, image_dir):
        # BUG FIX: this was named ``init`` and therefore never ran as the
        # constructor, so instances could not be created with arguments.
        # latin1 tolerates stray non-UTF-8 bytes common in exported CSVs.
        self.data = pd.read_csv(csv_path, encoding="latin1")

        # Normalize captions: collapse non-breaking spaces, trim surrounding
        # whitespace, and lowercase so tokenization is consistent.
        self.data["caption"] = (
            self.data["caption"]
            .astype(str)
            .str.replace("\u00a0", " ")
            .str.strip()
            .str.lower()
        )

        self.image_dir = image_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = f"{self.image_dir}/{row['image']}"

        image = Image.open(image_path).convert("RGB")
        image = preprocess(image)  # CLIP's own preprocessing transform

        # clip.tokenize returns shape (1, context_length); take the one row.
        text = clip.tokenize([row['caption']])[0].long()

        return image, text

train_dataset = PPEClipDataset(train_csv, image_dir)
val_dataset = PPEClipDataset(val_csv, image_dir)

# Shared loader settings; only shuffling differs between train and val.
_loader_kwargs = dict(batch_size=batch_size, num_workers=2, pin_memory=True)

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

# --------------------
# OPTIMIZER & LOSS
# --------------------

# Only the parameters left trainable above (the two projection heads)
# are handed to the optimizer.
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=lr,
    weight_decay=weight_decay,
)

# Symmetric InfoNCE: cross-entropy over image->text and text->image logits.
loss_img = torch.nn.CrossEntropyLoss()
loss_txt = torch.nn.CrossEntropyLoss()

# --------------------
# VALIDATION FUNCTION
# --------------------

@torch.no_grad()
def validate(model, loader):
    """Return the mean symmetric contrastive loss over ``loader``.

    Switches the model to eval mode for the pass and restores train mode
    before returning. NOTE(review): the in-batch diagonal targets assume
    every caption in a batch is unique; duplicated captions (likely with a
    small fixed label vocabulary) make the contrastive target ambiguous —
    worth verifying for this dataset.
    """
    model.eval()

    total_loss = 0.0
    batches = 0

    for images, texts in loader:
        images = images.to(device)
        texts = texts.to(device)

        logits_per_image, logits_per_text = model(images, texts)

        # Matching pairs lie on the diagonal of the similarity matrix.
        ground_truth = torch.arange(len(images), device=device)

        loss = (
            loss_img(logits_per_image, ground_truth)
            + loss_txt(logits_per_text, ground_truth)
        ) / 2

        total_loss += loss.item()
        batches += 1

    model.train()
    # max(..., 1) guards against an empty loader.
    return total_loss / max(batches, 1)

# --------------------
# TRAIN LOOP
# --------------------

best_val_loss = float("inf")

for epoch in range(epochs):
    total_loss = 0.0
    batches = 0

    for images, texts in train_loader:
        images = images.to(device)
        texts = texts.to(device)

        logits_per_image, logits_per_text = model(images, texts)

        # In-batch contrastive targets: image i matches caption i.
        ground_truth = torch.arange(len(images), device=device)

        loss = (
            loss_img(logits_per_image, ground_truth)
            + loss_txt(logits_per_text, ground_truth)
        ) / 2

        optimizer.zero_grad()
        loss.backward()
        # Conservative clipping; only the projection heads carry gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        # Clamp log temperature to [0, ln(100) ≈ 4.6052]. NOTE(review):
        # logit_scale is frozen and excluded from the optimizer above, so
        # this clamp never changes anything here — harmless but redundant.
        model.logit_scale.data.clamp_(0, 4.6052)

        total_loss += loss.item()
        batches += 1

    train_loss = total_loss / max(batches, 1)
    val_loss = validate(model, val_loader)

    print(
        f"Epoch [{epoch+1}/{epochs}] "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f}"
    )

    # Save best model (checkpoint on validation improvement only).
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(
            model.state_dict(),
            model_name
        )
        print("✅ Saved best model", model_name)

print("🎉 Training complete")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions