Requirements to utilize all available GPUs for 1 dataset, n models, m GPUs:
- Each model processes the entire dataset    
- Dataset can be split across multiple GPUs for slow models    


In [1]:
import concurrent.futures
import logging
import math
import os
import re
import sys
import time
from difflib import get_close_matches

import fiftyone as fo
import torch
import torch.multiprocessing as mp
import wandb
from fiftyone.utils.huggingface import load_from_hub
from torch.nn.parallel import DataParallel
from torch.utils.data import DataLoader, Subset
from torch.utils.tensorboard import SummaryWriter
from torchvision import models, transforms
from torchvision.transforms.functional import to_pil_image
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForObjectDetection,
    AutoModelForZeroShotObjectDetection,
    AutoProcessor,
    BatchEncoding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

# The repository-local `utils` package lives one directory up; extend the path
# before importing from it.
sys.path.append("..")

from utils.data_loader import FiftyOneTorchDatasetCOCO

In [2]:
# Load dataset with V51 from HF
# Export every KEY=VALUE pair of the local secrets file into the environment.
# NOTE(review): the absolute path ties this notebook to one machine — consider
# making it configurable.
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        line = line.strip()
        # Blank lines and comment lines carry no key/value pair.
        if not line or line.startswith("#"):
            continue
        # Split on the first "=" only, so values that contain "=" stay intact.
        key, value = line.split("=", 1)
        os.environ[key] = value

try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51")
except Exception as load_error:
    # Fall back to a dataset of the same name that is already stored locally.
    # `Exception` (instead of a bare `except`) keeps Ctrl-C / kernel interrupts
    # working while the download is in progress.
    logging.warning(
        "load_from_hub failed (%s); loading the existing local dataset instead",
        load_error,
    )
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")

Downloading config file fiftyone.yml from dbogdollumich/mcity_fisheye_v51
Loading dataset
Importing samples...
 100% |███████████████| 2744/2744 [87.8ms elapsed, 0s remaining, 31.2K samples/s]   


In [None]:
# Upper bound on the number of samples used for this experiment.
MAX_SAMPLES = 130
# NOTE(review): `dataset_v51` is rebound to the result of `take`, so re-running
# this cell applies `take` to the already-reduced collection instead of the
# originally loaded dataset — confirm this is intended.
dataset_v51 = dataset_v51.take(MAX_SAMPLES)

In [124]:
# Convert to torch dataset
# The project-local FiftyOneTorchDatasetCOCO wrapper is what the `Subset` and
# `DataLoader` objects further down are built on.
dataset_torch = FiftyOneTorchDatasetCOCO(dataset_v51)

Processing Voxel51 dataset: 100%|██████████| 130/130 [00:00<00:00, 271.67it/s]


In [125]:
def _initialize_zero_shot_processor(hf_model_config, model_name, batch_size, object_classes, device
    ):
        processor, batch_classes, tokenized_text, batch_tasks = None, None, None, None
        if type(hf_model_config).__name__ == "GroundingDinoConfig":
            processor = AutoProcessor.from_pretrained(model_name)  # , do_rescale=False
            # https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/grounding-dino
            classes = " . ".join(object_classes) + " . "
            batch_classes = [classes] * batch_size
            tokenized_text = processor.tokenizer(
                batch_classes,
                padding="max_length",
                return_tensors="pt",
                max_length=256,  # Adjust max_length to match vision hidden state
            ).to(device)

        elif type(hf_model_config).__name__ == "Owlv2Config":
            processor = CustomOwlv2Processor.from_pretrained(model_name)
            batch_classes = object_classes * batch_size
            tokenized_text = processor.tokenizer(
                batch_classes, padding="max_length", return_tensors="pt"
            ).to(device)
        elif type(hf_model_config).__name__ == "OwlViTConfig":
            processor = AutoProcessor.from_pretrained(model_name)
            batch_classes = object_classes * batch_size
            tokenized_text = processor.tokenizer(
                batch_classes, padding="max_length", return_tensors="pt"
            ).to(device)
        elif type(hf_model_config).__name__ == "OmDetTurboConfig":
            processor = AutoProcessor.from_pretrained(model_name)
            batch_classes = [object_classes] * batch_size
            task = "Detect {}.".format(", ".join(object_classes))
            batch_tasks = [task] * batch_size
        else:
            logging.error(
                "HuggingFace AutoModel does not support " + str(type(hf_model_config))
            )

        return processor, batch_classes, tokenized_text, batch_tasks

In [126]:
def _resolve_grounding_dino_label(label, object_classes):
    """Map a free-text Grounding DINO label onto one of the prompted classes.

    Returns the matching entry of ``object_classes`` (exact match on the first
    word, otherwise the closest fuzzy match), or ``None`` if the label is empty
    or nothing is close enough.
    """
    words = label.split()
    if not words:  # An empty label would otherwise raise IndexError
        return None
    candidate = words[0]
    if candidate in object_classes:
        return candidate
    matches = get_close_matches(candidate, object_classes, n=1, cutoff=0.6)
    return matches[0] if matches else None


def _zero_shot_process_results(
    outputs,
    *,
    hf_model_config,
    processor,
    inputs,
    targets,
    target_sizes,
    batch_classes,
    object_classes,
    class_parts_dict,
    detection_threshold,
    pred_key,
):
    """Post-process one batch of model outputs and store them as V51 detections.

    All batch state is passed in explicitly; previously this function read these
    names as if they were globals although they only exist as locals of
    ``_zero_shot_inference``, which raised ``NameError``.

    Args:
        outputs: Raw model outputs for the batch.
        hf_model_config: Model configuration; its class name selects the branch.
        processor: Processor providing the ``post_process_*`` methods.
        inputs: Processor output fed to the model (``input_ids`` is read for
            Grounding DINO).
        targets: Per-image targets; ``target["image_id"]`` indexes ``dataset_v51``.
        target_sizes: Per-image ``(height, width)`` tuples.
        batch_classes: Per-image class prompts (used for OmDet-Turbo).
        object_classes: Flat list of class-name parts used as prompts.
        class_parts_dict: Maps each class-name part back to its original label.
        detection_threshold: Score threshold handed to the post-processing calls.
        pred_key: Sample field the detections are written to.

    Raises:
        ValueError: If the model configuration is not supported.
        KeyError: If a result contains neither ``"labels"`` nor ``"classes"``.
    """
    config_name = type(hf_model_config).__name__

    # Process results
    if config_name == "GroundingDinoConfig":
        results = processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=detection_threshold,
            text_threshold=detection_threshold,
        )
    elif config_name in ("Owlv2Config", "OwlViTConfig"):
        results = processor.post_process_object_detection(
            outputs=outputs,
            threshold=detection_threshold,
            target_sizes=target_sizes,
        )
    elif config_name == "OmDetTurboConfig":
        results = processor.post_process_grounded_object_detection(
            outputs,
            classes=batch_classes,
            score_threshold=detection_threshold,
            nms_threshold=detection_threshold,
            target_sizes=target_sizes,
        )
    else:
        raise ValueError(f"Unsupported zero-shot model config: {config_name}")

    # Store results in V51 dataset. Each image is normalized with its own size
    # instead of assuming that the whole batch shares the size of the first image.
    for result, target, (img_height, img_width) in zip(results, targets, target_sizes):
        boxes, scores = result["boxes"], result["scores"]

        if "labels" in result:
            labels = result["labels"]
        elif "classes" in result:  # OmDet deviates from the other models
            labels = result["classes"]
        else:
            raise KeyError(
                f"Post-processed result of {config_name} has neither 'labels' nor 'classes'"
            )

        detections = []
        for box, score, label in zip(boxes, scores, labels):
            if config_name == "GroundingDinoConfig":
                processed_label = _resolve_grounding_dino_label(label, object_classes)
                if processed_label is None:
                    logging.info(
                        "Skipped detection with model "
                        + config_name
                        + " due to unclear detection label: "
                        + label
                    )
                    continue
                label = class_parts_dict[processed_label]  # Original label for eval
                # No target_sizes are passed for this model above, so the box
                # coordinates are used as-is (no division by the image size).
                top_left_x = box[0].item()
                top_left_y = box[1].item()
                box_width = (box[2] - box[0]).item()
                box_height = (box[3] - box[1]).item()
            else:
                if config_name in ("Owlv2Config", "OwlViTConfig"):
                    # The label is an index into the prompted classes
                    label = class_parts_dict[object_classes[label]]
                else:  # OmDetTurboConfig: the label is already a class string
                    label = class_parts_dict[label]
                # Convert absolute corner coordinates to relative [x, y, w, h]
                top_left_x = box[0].item() / img_width
                top_left_y = box[1].item() / img_height
                box_width = (box[2].item() - box[0].item()) / img_width
                box_height = (box[3].item() - box[1].item()) / img_height

            detection = fo.Detection(
                label=label,
                bounding_box=[top_left_x, top_left_y, box_width, box_height],
                confidence=score.item(),
            )
            detection["bbox_area"] = (
                detection["bounding_box"][2] * detection["bounding_box"][3]
            )
            detections.append(detection)

        # Attach label to V51 dataset
        sample = dataset_v51[target["image_id"]]
        sample[pred_key] = fo.Detections(detections=detections)
        sample.save()


def _zero_shot_inference(
    data_loader,
    model_name,
    device,
    progress_counter,
    batch_size=16,
    detection_threshold=0.2,
    object_classes=[None],
):
    """Run one zero-shot detector over ``data_loader`` on a single device.

    Intended as the target of one worker process. Predictions are written to the
    module-level ``dataset_v51`` under a field derived from ``model_name`` and
    exported in COCO format at the end.

    Args:
        data_loader: Iterable yielding ``(images, targets)`` batches.
        model_name: HuggingFace model identifier.
        device: Device the model and inputs are moved to.
        progress_counter: Shared counter exposing ``get_lock()`` and ``.value``;
            incremented once per processed batch.
        batch_size: Expected number of images per batch. Batches of a different
            size trigger a re-initialization of the text prompts.
        detection_threshold: Score threshold for the post-processing.
        object_classes: List of class names; combined names such as
            ``"motorbike/cycler"`` are split into separate prompts.

    Returns:
        ``True`` once all batches are processed and the export is finished.

    Raises:
        ValueError: If ``object_classes`` is empty or contains non-strings (the
            ``[None]`` default is only a placeholder), or if the model
            configuration is not supported.
    """
    print(f'Launched process with {device}')
    if not object_classes or any(
        not isinstance(classname, str) for classname in object_classes
    ):
        raise ValueError("object_classes must be a non-empty list of class name strings")

    pred_key = re.sub(r"[\W-]+", "_", "pred_" + model_name)

    hf_model_config = AutoConfig.from_pretrained(model_name)
    config_name = type(hf_model_config).__name__

    # Process combined label types like "motorbike/cycler"
    processed_classes = [
        part for classname in object_classes for part in classname.split("/")
    ]
    class_parts_dict = {
        part: classname
        for classname in object_classes
        for part in classname.split("/")
    }
    object_classes = processed_classes

    processor, batch_classes, tokenized_text, batch_tasks = (
        _initialize_zero_shot_processor(
            hf_model_config=hf_model_config,
            model_name=model_name,
            batch_size=batch_size,
            object_classes=object_classes,
            device=device,
        )
    )
    if processor is None:
        # The initializer only logs unsupported configs; fail here with a clear
        # message instead of hitting an unbound variable later on.
        raise ValueError(f"Unsupported zero-shot model config: {config_name}")
    prepared_batch_size = batch_size  # Batch size the prompts are currently built for

    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name).to(device)

    writer = SummaryWriter(log_dir="logs/tensorboard/teacher_zeroshot")
    try:
        for step, (images, targets) in enumerate(data_loader):
            print(f"Started batch {step} for GPU {device}")
            start_time = time.time()
            n_images = len(images)
            if n_images != prepared_batch_size:
                # E.g. the final, partially filled batch: rebuild the prompts so
                # their count matches the number of images.
                processor, batch_classes, tokenized_text, batch_tasks = (
                    _initialize_zero_shot_processor(
                        hf_model_config,
                        model_name,
                        n_images,  # Key difference
                        object_classes,
                        device,
                    )
                )
                prepared_batch_size = n_images

            # (height, width) per image, taken before any conversion to PIL
            target_sizes = [tuple(img.shape[1:]) for img in images]

            if config_name == "OmDetTurboConfig":
                images = [to_pil_image(image) for image in images]
            else:
                images = [image.to(device, non_blocking=True) for image in images]

            # Process inputs
            if config_name == "GroundingDinoConfig":
                inputs = processor(
                    text=None, images=images, return_tensors="pt"
                ).to(device)
                inputs.update(tokenized_text)
            elif config_name == "Owlv2Config":
                inputs = processor(
                    text=None, images=images, return_tensors="pt"
                ).to(device, non_blocking=True)
                inputs.update(tokenized_text)
            elif config_name == "OwlViTConfig":
                inputs = processor(
                    text=batch_classes, images=images, return_tensors="pt"
                ).to(device)
            elif config_name == "OmDetTurboConfig":
                inputs = processor(
                    text=batch_classes,
                    images=images,
                    task=batch_tasks,
                    return_tensors="pt",
                ).to(device)

            # Model inference
            print(f"Finished pre-processing of batch {step} for GPU {device}")
            with torch.amp.autocast("cuda"), torch.no_grad():
                outputs = model(**inputs)
            print(f"Finished model inference of batch {step} for GPU {device}")

            _zero_shot_process_results(
                outputs,
                hf_model_config=hf_model_config,
                processor=processor,
                inputs=inputs,
                targets=targets,
                target_sizes=target_sizes,
                batch_classes=batch_classes,
                object_classes=object_classes,
                class_parts_dict=class_parts_dict,
                detection_threshold=detection_threshold,
                pred_key=pred_key,
            )
            print(f"Finished post-processing of batch {step} for GPU {device}")

            # Log inference performance based on the number of images that were
            # actually processed (the last batch may be smaller).
            batch_duration = time.time() - start_time
            frames_per_second = n_images / batch_duration
            writer.add_scalar(
                f"inference/{device}/frames_per_second", frames_per_second, step
            )

            # Update the shared progress counter and read its value under the
            # lock; the counter object itself can neither be formatted as a
            # float nor logged as a scalar.
            with progress_counter.get_lock():
                progress_counter.value += 1
                completed_batches = progress_counter.value
            tqdm.write(
                f"Progress {device}: {completed_batches} batches finished across all workers"
            )
            writer.add_scalar("inference/total_steps", completed_batches, step)

        # Store labels https://docs.voxel51.com/api/fiftyone.core.collections.html#fiftyone.core.collections.SampleCollection.export
        # NOTE(review): every worker exports to the same directory; workers that
        # finish at the same time may overwrite each other's files — confirm.
        dataset_v51.export(
            export_dir="output/testmultigpu/",
            dataset_type=fo.types.COCODetectionDataset,
            data_path="data.json",
            export_media=None,  # "manifest",
            label_field=pred_key,
            progress=True,
        )
    finally:
        # Flush and release the TensorBoard writer even if a batch fails
        writer.close()

    return True

In [131]:
# Explicitly set the HuggingFace tokenizers parallelism flag so the library does
# not have to pick a default at runtime.
# NOTE(review): the workers below are separate processes created via
# torch.multiprocessing; the tokenizers library warns about deadlocks when a
# process is forked after parallelism was used — consider "false" and confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [128]:
# Split dataset and launch
# Per-model settings: into how many parts the dataset is divided (one worker per
# part) and the batch size used for that model.
models_dict = {
    "IDEA-Research/grounding-dino-tiny": {"dataset_splits": 2, "batch_size": 4},
    "omlab/omdet-turbo-swin-tiny-hf": {"dataset_splits": 1, "batch_size": 8},
}

dataset_length = len(dataset_torch)


def _partition_dataset(dataset, n_parts):
    """Divide ``dataset`` into ``n_parts`` contiguous ``Subset`` objects.

    All parts have ``len(dataset) // n_parts`` samples, except the last one,
    which additionally receives the remainder. For ``n_parts <= 1`` the dataset
    itself is returned as the only element.
    """
    if n_parts <= 1:
        return [dataset]  # Entire dataset for models with dataset_splits=1

    total = len(dataset)
    chunk = total // n_parts
    bounds = [(part * chunk, (part + 1) * chunk) for part in range(n_parts)]
    # Handle any remainder: stretch the last part to the end of the dataset
    bounds[-1] = (bounds[-1][0], total)
    return [Subset(dataset, list(range(start, stop))) for start, stop in bounds]


# Create dataset subsets for models with splits
subsets = {
    name: _partition_dataset(dataset_torch, settings["dataset_splits"])
    for name, settings in models_dict.items()
}

In [129]:
def _collate_batch(batch):
    """Transpose a list of ``(image, target)`` samples into ``[images, targets]``."""
    return list(zip(*batch))


def main():
    """Launch one inference process per (model, dataset subset) pair.

    Reads the module-level ``models_dict`` and ``subsets``, assigns the jobs to
    the available CUDA devices, starts one process per job running
    ``_zero_shot_inference`` and waits for all of them. The W&B run is finished
    with exit code 0 only if every worker exited successfully.
    """
    run = None
    processes = []
    try:
        run = wandb.init(
            name="Multi GPU Teacher",
            allow_val_change=True,
            sync_tensorboard=True,
            project="Teacher Dev",
        )

        # Device assignment (manually assigning GPUs)
        if not torch.cuda.is_available():
            # Stop here; continuing would fail later on the undefined device list.
            print("Multiprocessing not possible, CUDA not avaiable.")
            run.finish(exit_code=1)
            return

        devices = [torch.device(f"cuda:{i}") for i in range(torch.cuda.device_count())]
        n_gpus = len(devices)
        print(f"Using {n_gpus} GPUs: {devices}")

        # Run inference for each model and its subsets
        NUM_WORKERS = 32
        n_workers_per_gpu = math.floor(NUM_WORKERS / n_gpus)

        object_classes = ["pedestrian", "cyclist", "vehicle"]

        n_jobs = sum(len(subsets[model_name]) for model_name in models_dict)
        if n_jobs > n_gpus:
            logging.warning(
                "%d jobs for %d GPUs: devices are assigned round-robin and shared",
                n_jobs,
                n_gpus,
            )

        # Shared progress counter (using Value to be shared between processes)
        progress_counter = mp.Value('i', 0)  # Shared integer for progress tracking

        assigned_gpu_index = 0
        for model_name, config in models_dict.items():
            for subset_idx, subset in enumerate(subsets[model_name]):
                dataloader = DataLoader(
                    subset,
                    batch_size=config["batch_size"],
                    num_workers=n_workers_per_gpu,
                    pin_memory=True,
                    collate_fn=_collate_batch,
                    shuffle=False,
                )

                # Wrap around instead of indexing past the end of the device list
                assigned_device = devices[assigned_gpu_index % n_gpus]
                assigned_gpu_index += 1
                print(f"Running {model_name} on {assigned_device} with {len(subset)} samples (subset {subset_idx + 1})...")
                # Hand over the batched loader (not the raw subset) together with
                # the batch size it was built with, so the prompts match the batches.
                p = mp.Process(
                    target=_zero_shot_inference,
                    args=(
                        dataloader,
                        model_name,
                        assigned_device,
                        progress_counter,
                        config["batch_size"],
                        0.2,
                        object_classes,
                    ),
                )
                processes.append(p)

        # Start all processes
        for p in processes:
            p.start()

        # Wait for all processes to finish
        for p in processes:
            p.join()

        # A worker that raised ends with a non-zero exit code
        failed = [p for p in processes if p.exitcode != 0]
        if failed:
            logging.error("%d of %d worker processes failed", len(failed), len(processes))
        run.finish(exit_code=1 if failed else 0)
    except Exception as e:
        logging.exception("Multi-GPU zero-shot inference failed")
        print(e)
        if run:
            run.finish(exit_code=1)
    finally:
        # Do not leave workers running, e.g. after a keyboard interrupt
        for p in processes:
            if p.is_alive():
                p.terminate()

In [132]:
# Start the multi-GPU run; this blocks until all worker processes have been joined.
main()


Using 3 GPUs: [device(type='cuda', index=0), device(type='cuda', index=1), device(type='cuda', index=2)]
Running IDEA-Research/grounding-dino-tiny on cuda:0 with 65 samples (subset 1)...
Running IDEA-Research/grounding-dino-tiny on cuda:1 with 65 samples (subset 2)...
Running omlab/omdet-turbo-swin-tiny-hf on cuda:2 with 130 samples (subset 1)...
Launched process with cuda:0
Launched process with cuda:1
Launched process with cuda:2


KeyboardInterrupt: 

: 