In [None]:
# Mount Google Drive; the dataset and results paths used later in this
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Remove the torch build already present in the runtime and install a pinned
# triton / torch pair instead.
%pip uninstall torch -y -q
%pip install triton==3.1.0 -q
%pip install torch==2.5.1 -q

# NOTE(review): torchinfo and ultralytics are installed without a version pin,
# so a re-run may pick up different releases than the ones benchmarked here.
%pip install torchinfo --quiet
%pip install ultralytics --quiet

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.2/797.2 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━

In [None]:
import torch
from torchinfo import summary
from ultralytics import YOLO
from torchvision.io import read_image
from torchvision.transforms import Resize, Compose
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict
import pandas as pd
import time
from tqdm import tqdm, trange

# Switch autograd off globally: the cells in this notebook only call
# model.predict, so no gradients are needed.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7a0095a07100>

In [None]:
# Choose the device used by every benchmark below: the first CUDA GPU when one
# is present, otherwise the CPU. A notice is printed when the GPU's compute
# capability is not one of the three listed reference values.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("GPU unavailable")
else:
    print('device count:', torch.cuda.device_count())
    device = torch.device(0)
    device_cap = torch.cuda.get_device_capability()
    print(f"GPU {torch.cuda.get_device_name(0)} available with compatibility {device_cap}")
    reference_caps = ((7, 0), (8, 0), (9, 0))
    is_reference_gpu = device_cap in reference_caps
    if not is_reference_gpu:
        print("GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.")

device count: 1
GPU Tesla T4 available with compatibility (7, 5)
GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.


In [None]:
class Coco2017Dataset(Dataset):
  """Image-only dataset over the ``*.jpg`` files located directly in a folder.

  The file list is sorted, so a given index refers to the same image on every
  run regardless of the order in which the filesystem lists the directory.
  """

  def __init__(self, img_dir: str, transform=None) -> None:
    """Collect the JPEG paths in ``img_dir``.

    Args:
      img_dir: Directory that holds the images (not searched recursively).
      transform: Optional callable applied to each image after it has been
        scaled by 1/255.

    Raises:
      AssertionError: If ``img_dir`` does not exist or is not a directory.
    """
    root = Path(img_dir)
    assert root.exists() and root.is_dir(), f"Image directory {img_dir} does not exist"
    # Directory listing order is filesystem dependent; sorting keeps the
    # index -> image mapping stable between runs and machines.
    self.img_paths = sorted(str(path) for path in root.glob("*.jpg"))
    self.transform = transform

  def __len__(self) -> int:
    """Return the number of images that were found."""
    return len(self.img_paths)

  def __getitem__(self, idx: int) -> torch.Tensor:
    """Load the image at position ``idx``.

    With a transform, the image is divided by 255, transformed and cast to
    float32. Without one, the tensor produced by ``read_image`` is returned
    unchanged (no scaling, no cast).
    """
    img = read_image(self.img_paths[idx])
    # Compare against None explicitly so that a transform object which happens
    # to be falsy (e.g. an empty container) is still applied.
    if self.transform is not None:
      img = self.transform(img / 255).to(torch.float32)
    return img


class Expand(object):
  """Transform that turns a tensor whose first dimension is not 3 into a 3-wide one."""

  def __call__(self, sample: torch.Tensor) -> torch.Tensor:
    """Return ``sample`` unchanged if its first dimension is 3.

    Otherwise a detached copy is made and expanded to size 3 along the first
    dimension, leaving the remaining two dimensions as they are.
    """
    leading_dim = sample.shape[0]
    if leading_dim == 3:
      return sample
    detached_copy = sample.detach().clone()
    return detached_copy.expand(3, -1, -1)


def latency(model, sample):
  """Time a single ``model.predict(sample, verbose=False)`` call.

  When CUDA is available the call is bracketed by CUDA events. Otherwise
  (the notebook falls back to a CPU device when no GPU is found) a
  ``time.perf_counter`` wall clock is used, so the helper works on both.

  Args:
    model: Object exposing ``predict(sample, verbose=...)`` that returns a
      sequence whose first element has a ``speed`` mapping with an
      ``"inference"`` entry.
    sample: Input forwarded to ``model.predict`` as-is.

  Returns:
    Tuple ``(full_elapsed_time, infer_elapsed_time)``: the time measured
    around the whole ``predict`` call in milliseconds, and the value the
    model itself reports in ``res[0].speed["inference"]``
    (presumably also milliseconds -- confirm against the model library).
  """
  if torch.cuda.is_available():
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    res = model.predict(sample, verbose=False)
    end.record()
    # Both events must have completed before elapsed_time() can be queried.
    torch.cuda.synchronize()
    full_elapsed_time = start.elapsed_time(end)
  else:
    # No GPU: CUDA events cannot be used, so fall back to the host clock and
    # convert seconds to milliseconds to keep the unit of the CUDA path.
    t_begin = time.perf_counter()
    res = model.predict(sample, verbose=False)
    full_elapsed_time = (time.perf_counter() - t_begin) * 1000.0

  infer_elapsed_time = res[0].speed["inference"]
  return full_elapsed_time, infer_elapsed_time

In [None]:
# Images from the Drive folder, resized to 640x640 and passed through Expand
# so that every sample has 3 entries along its first dimension.
coco_ds = Coco2017Dataset(
  img_dir="/content/drive/MyDrive/colab/datasets/coco2017_val",
  transform=Compose([
      Resize(size=(640, 640), antialias=True),
      Expand()
  ])
)
# plt.imshow(coco_ds[0].permute(1, 2, 0))

# CUDA events shared by the two long benchmark loops below.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

# A nanosecond timestamp makes the results file name unique per run.
timestamp = time.time_ns()
print(timestamp)
results_filepath = f'/content/drive/MyDrive/colab/results/pytorch-yolo-{timestamp}.csv'
# The CSV is written only after the long benchmarks have finished and
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists now rather than failing after the measurements are done.
Path(results_filepath).parent.mkdir(parents=True, exist_ok=True)
# Column name -> list of values; converted to a DataFrame when saving.
telemetry = defaultdict(list)

1725224601861425976


## YOLOv8 latency in PyTorch

In [None]:
# Eager-mode baseline: one predict call per dataset image, timed with the
# shared CUDA events.
model = YOLO("yolov8m.pt", verbose=False).to(device)
assert model.device.type == "cuda"

# A separate warmup pass over the dataset was considered and dropped: with 5k
# measured images it was judged unnecessary.

# latency benchmark
for i, img in enumerate(tqdm(coco_ds, desc="Benchmark"), start=1):
  img = img.unsqueeze(0).to(device)  # add the batch dimension, move to the device

  start.record()
  res = model.predict(img, device=device, verbose=False)
  end.record()
  torch.cuda.synchronize()

  # "performance" holds the CUDA-event time around the whole predict call,
  # "elapsed_time" the inference time reported by the result object; both are
  # kept so the two measurements can be compared.
  for column, value in (
      ("framework", "PyTorch"),
      ("model_name", "YOLOv8m"),
      ("phase", "latency"),
      ("epoch", i),
      ("loss", -1),
      ("performance", start.elapsed_time(end)),
      ("elapsed_time", res[0].speed["inference"]),
  ):
    telemetry[column].append(value)

Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8m.pt to 'yolov8m.pt'...


100%|██████████| 49.7M/49.7M [00:00<00:00, 124MB/s]
Benchmark: 100%|██████████| 5000/5000 [28:35<00:00,  2.91it/s]


## YOLOv8 latency in PyTorch with `torch.compile`

In [None]:
# Same benchmark as the eager one, but with the underlying network wrapped in
# torch.compile (mode="reduce-overhead").
model = YOLO("yolov8m.pt", verbose=False).to(device)
torch._dynamo.reset()  # drop compilation state left over from earlier cells
model.model = torch.compile(model.model, mode="reduce-overhead")
assert model.device.type == "cuda"

# A separate warmup pass over the dataset was considered and dropped: with 5k
# measured images it was judged unnecessary.

# latency benchmark
# A failing predict call is retried, but only a bounded number of times so a
# persistent failure surfaces instead of spinning forever.
max_attempts = 5
runtime_errors = 0
for i, img in enumerate(tqdm(coco_ds, desc="Compiled benchmark"), start=1):
  img = img.unsqueeze(dim=0).to(device)

  for attempt in range(1, max_attempts + 1):
    try:
      start.record()
      res = model.predict(img, device=device, verbose=False)
      end.record()
      torch.cuda.synchronize()
    except Exception:
      # `Exception` rather than a bare `except` so that KeyboardInterrupt and
      # SystemExit still stop the cell.
      runtime_errors += 1
      if attempt == max_attempts:
        raise
    else:
      break

  telemetry["framework"].append("PyTorch_compile")
  telemetry["model_name"].append("YOLOv8m")
  telemetry["phase"].append("latency")
  telemetry["epoch"].append(i)
  telemetry["loss"].append(-1)
  # CUDA-event time around the whole predict call vs. the inference time
  # reported by the result object; both are kept for comparison.
  telemetry["performance"].append(start.elapsed_time(end))
  telemetry["elapsed_time"].append(res[0].speed["inference"])

# Errors relative to the number of images (0 errors -> 0%); guarded against an
# empty dataset.
n_images = len(coco_ds)
error_pct = runtime_errors / n_images * 100 if n_images else 0.0
print(f"Runtime error count: {runtime_errors} ({error_pct:.2f}%)")

Compiled benchmark: 100%|██████████| 5000/5000 [03:00<00:00, 27.77it/s]

Runtime error count: 0 (100.0%)





## Graph compilation

In [None]:
# First-call latency: on every repetition both an eager and a compiled model
# are built from scratch and each runs exactly one predict call, so the
# compiled measurement includes whatever work happens on its first call.
for i in trange(12, desc="Graph compilaton"):
  sample = coco_ds[i].unsqueeze(dim=0).to(device)

  model = YOLO("yolov8m.pt", verbose=False).to(device)
  model_comp = YOLO("yolov8m.pt", verbose=False).to(device)
  torch._dynamo.reset()  # clear earlier compilation state before compiling again
  model_comp.model = torch.compile(model_comp.model, mode="reduce-overhead")

  e_full, e_infer = latency(model, sample)
  c_full, c_infer = latency(model_comp, sample)

  # Two rows per repetition: eager first, compiled second.
  for column, pair in (
      ("framework", ["PyTorch", "PyTorch_compile"]),
      ("model_name", ["YOLOv8m", "YOLOv8m"]),
      ("phase", ["graph_compilation", "graph_compilation"]),
      ("epoch", [i, i]),
      ("loss", [-1, -1]),
      ("performance", [e_full, c_full]),
      ("elapsed_time", [e_infer, c_infer]),
  ):
    telemetry[column].extend(pair)

# Persist everything collected so far.
pd.DataFrame(telemetry).to_csv(results_filepath, index=False)

Graph compilaton: 100%|██████████| 12/12 [00:14<00:00,  1.19s/it]


## Warm up


In [None]:
# Warm-up curve: 12 repetitions, each with a freshly built eager model and a
# freshly compiled one, timed on dataset items 1..20 in order. The item index
# is stored as "epoch".
for _ in tqdm(range(12)):
  model = YOLO("yolov8m.pt", verbose=False).to(device)
  model_comp = YOLO("yolov8m.pt", verbose=False).to(device)
  torch._dynamo.reset()  # clear earlier compilation state before compiling again
  model_comp.model = torch.compile(model_comp.model, mode="reduce-overhead")

  for i in range(1, 21):
    sample = coco_ds[i].unsqueeze(dim=0).to(device)

    e_full, e_infer = latency(model, sample)
    c_full, c_infer = latency(model_comp, sample)

    # Two rows per step: eager first, compiled second.
    for column, pair in (
        ("framework", ["PyTorch", "PyTorch_compile"]),
        ("model_name", ["YOLOv8m", "YOLOv8m"]),
        ("phase", ["warmup", "warmup"]),
        ("epoch", [i, i]),
        ("loss", [-1, -1]),
        ("performance", [e_full, c_full]),
        ("elapsed_time", [e_infer, c_infer]),
    ):
      telemetry[column].extend(pair)

# Overwrite the results file with the full telemetry collected so far.
pd.DataFrame(telemetry).to_csv(results_filepath, index=False)

100%|██████████| 12/12 [00:20<00:00,  1.72s/it]
