In [1]:
import os
import json
import pandas as pd
from collections import Counter

# Dataset locations (everything is resolved relative to base_path)
base_path = '.'  # Replace with your actual path
rgb_path = os.path.join(base_path, 'rgb')
train_json, test_json, vocab_json = (
    os.path.join(base_path, file_name)
    for file_name in (
        'Diving48_V2_train.json',
        'Diving48_V2_test.json',
        'Diving48_vocab.json',
    )
)

# Parse the annotation files one after another (train, test, vocab)
loaded = []
for json_file in (train_json, test_json, vocab_json):
    with open(json_file, 'r') as f:
        loaded.append(json.load(f))
train_data, test_data, vocab_data = loaded

# Tabular view of both splits
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

# Preview the first rows of each split
print("Train Data Head:", df_train.head(), sep="\n")
print("\nTest Data Head:", df_test.head(), sep="\n")

# Distinct labels present in each split
train_classes = df_train['label'].unique()
test_classes = df_test['label'].unique()
print(
    f"\nUnique Classes in Train: {len(train_classes)}",
    f"Unique Classes in Test: {len(test_classes)}",
    sep="\n",
)

# Number of clips per label, most frequent first (value_counts default order)
print("\nTrain Class Distribution:", df_train['label'].value_counts(), sep="\n")
print("\nTest Class Distribution:", df_test['label'].value_counts(), sep="\n")

# Count the .mp4 entries sitting directly inside the rgb folder
mp4_files = list(filter(lambda entry: entry.endswith('.mp4'), os.listdir(rgb_path)))
print(f"\nTotal video files in rgb/: {len(mp4_files)}")

Train Data Head:
            vid_name  label  start_frame  end_frame
0  -mmq0PT-u8k_00155      0            0         48
1  -mmq0PT-u8k_00156      0            0         70
2  -mmq0PT-u8k_00157      0            0         90
3  3qq031609lA_00002      0            0        123
4  3qq031609lA_00004      0            0        102

Test Data Head:
            vid_name  label  start_frame  end_frame
0  8qRmKunCjtY_00016      0            0         78
1  CVAfPfVFulQ_00038      0            0        100
2  CVAfPfVFulQ_00040      0            0         79
3  CVAfPfVFulQ_00048      0            0         76
4  CVAfPfVFulQ_00049      0            0         83

Unique Classes in Train: 47
Unique Classes in Test: 47

Train Class Distribution:
label
26    1053
35     992
7      901
46     816
21     689
31     683
19     604
15     591
34     571
33     568
45     559
44     518
24     484
5      467
8      424
12     421
43     406
22     355
17     349
36     346
28     326
3      259
27     250


In [2]:
import cv2
import numpy as np
from ultralytics import YOLO

def read_video_and_detect_yolo(video_path, start_frame=0, end_frame=None, model_path="yolo11x.pt"):
    """
    Read a frame range from a video and run a YOLO model on its first frame.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        start_frame: Zero-based index of the first frame to keep.
        end_frame: Zero-based index of the last frame to keep (inclusive).
            ``None`` keeps every frame up to the end of the video.
        model_path: Weights file handed to ``YOLO`` (defaults to
            ``"yolo11x.pt"``).

    Returns:
        A ``(frames, yolo_boxes)`` tuple. ``frames`` holds the kept frames in
        reading order. ``yolo_boxes`` holds one
        ``[x1, y1, x2, y2, conf, cls_id]`` list per detection on ``frames[0]``
        (prediction is run with ``conf=0.25``).

    Raises:
        IOError: The video could not be opened.
        ValueError: No frame fell inside the requested range.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []

    # try/finally guarantees the capture is released on every exit path,
    # including the "cannot open" error and failures while reading.
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video: {video_path}")

        current_frame = 0
        while end_frame is None or current_frame <= end_frame:
            if current_frame < start_frame:
                # Frames before the range are only skipped: grab() advances
                # the stream without handing back a decoded image.
                if not cap.grab():
                    break
            else:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(frame)
            current_frame += 1
    finally:
        cap.release()

    if not frames:
        raise ValueError("No frames extracted from video")

    # Load the weights only once there is a frame to run them on, so an
    # empty range fails fast without paying for model initialisation.
    model = YOLO(model_path)

    # Run YOLO only on first frame
    results = model.predict(source=frames[0], classes=None, conf=0.25, verbose=False)
    yolo_boxes = []

    if len(results) > 0:
        for box in results[0].boxes:
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            conf = box.conf[0].item()
            cls_id = int(box.cls[0].item())
            yolo_boxes.append([x1, y1, x2, y2, conf, cls_id])

    return frames, yolo_boxes

In [3]:
# Sample clip, located relative to the dataset's rgb/ folder (rgb_path is set
# in the first cell) instead of a machine-specific absolute path.
video_path = os.path.join(rgb_path, "VNvb5oLOpLg_00000.mp4")
start, end = 0, 156  # from json

frames, detections = read_video_and_detect_yolo(video_path, start, end)

print("Total Frames:", len(frames))
print("Detections in first frame:")
for det in detections:
    print(det)  # x1, y1, x2, y2, confidence, class_id

Total Frames: 157
Detections in first frame:
[431.69757080078125, 9.231010437011719, 460.45721435546875, 69.33294677734375, 0.4667394757270813, 56]


In [8]:
import cv2
import numpy as np
from ultralytics import YOLO

def read_video_detect_draw(video_path, start_frame=0, end_frame=None, model_path="yolov8x.pt"):
    """
    Read a frame range from a video, run YOLO on the first frame and show it.

    The detections are drawn on a copy of the first kept frame, so the
    returned frames stay unannotated. The annotated copy is shown in an
    OpenCV window and the call blocks in ``cv2.waitKey(0)`` until it returns.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        start_frame: Zero-based index of the first frame to keep.
        end_frame: Zero-based index of the last frame to keep (inclusive).
            ``None`` keeps every frame up to the end of the video.
        model_path: Weights file handed to ``YOLO`` (defaults to
            ``"yolov8x.pt"``).

    Returns:
        A ``(frames, yolo_boxes)`` tuple. ``frames`` holds the kept frames in
        reading order. ``yolo_boxes`` holds one
        ``[x1, y1, x2, y2, conf, cls_id]`` list per detection, with the
        coordinates truncated to ``int`` (prediction is run with ``conf=0.1``).

    Raises:
        IOError: The video could not be opened.
        ValueError: No frame fell inside the requested range.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []

    # try/finally guarantees the capture is released on every exit path,
    # including the "cannot open" error and failures while reading.
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video: {video_path}")

        current_frame = 0
        while end_frame is None or current_frame <= end_frame:
            if current_frame < start_frame:
                # Frames before the range are only skipped: grab() advances
                # the stream without handing back a decoded image.
                if not cap.grab():
                    break
            else:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(frame)
            current_frame += 1
    finally:
        cap.release()

    if not frames:
        raise ValueError("No frames extracted")

    # Load the weights only once there is a frame to run them on.
    model = YOLO(model_path)

    # Run YOLO on a copy so the frame returned to the caller is not drawn on
    first_frame = frames[0].copy()
    results = model.predict(source=first_frame, classes=None, conf=0.1, verbose=False)
    yolo_boxes = []

    # Draw YOLO detections
    if len(results) > 0:
        for box in results[0].boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            conf = box.conf[0].item()
            cls_id = int(box.cls[0].item())
            yolo_boxes.append([x1, y1, x2, y2, conf, cls_id])

            # Draw box
            cv2.rectangle(first_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label = f"Class {cls_id} ({conf:.2f})"
            # The text origin is the label's bottom-left corner; clamp it so
            # labels of boxes touching the top edge are not drawn off-image.
            label_y = max(y1 - 10, 15)
            cv2.putText(first_frame, label, (x1, label_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    # Show the frame; always close the window, even if the wait is interrupted
    cv2.imshow("YOLO Detection - First Frame", first_frame)
    try:
        cv2.waitKey(0)
    finally:
        cv2.destroyAllWindows()

    return frames, yolo_boxes

In [9]:
# Frame window of the clip to load (both indices are handed on unchanged)
start_frame, end_frame = 0, 156

# Load the clip and show the YOLOv8x detections drawn on its first frame
frames, boxes = read_video_detect_draw(
    video_path,
    start_frame,
    end_frame,
    model_path="yolov8x.pt",
)

# One line per detection: [x1, y1, x2, y2, confidence, class_id]
print("YOLO Detections:")
for detection in boxes:
    print(f"Box: {detection}")

YOLO Detections:
Box: [431, 8, 460, 68, 0.2939814627170563, 56]
Box: [5, 0, 51, 43, 0.2411947250366211, 56]
Box: [332, 102, 378, 343, 0.20497339963912964, 0]
Box: [253, 2, 290, 63, 0.1445455402135849, 56]
Box: [186, 4, 223, 68, 0.11279432475566864, 56]
Box: [220, 5, 257, 70, 0.10830151289701462, 56]
