In [3]:
import os

import math
import cv2
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from collections import deque
from ultralytics import YOLO

### Load data

In [8]:
# Metadata tables; each row carries the `id` used to locate a video file
train_df = pd.read_csv("../data/raw/train.csv")
test_df = pd.read_csv("../data/raw/test.csv")

# Quick sanity check on table sizes
for split_label, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_label} shape: {split_df.shape}")

Train shape: (1500, 4)
Test shape: (1344, 1)


### Utils

In [None]:
# Prefer CUDA, then Apple MPS, and fall back to CPU
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

# Segmentation model used by `extract_mask`
model = YOLO("yolov8m-seg.pt")
model.to(device)

# Class names treated as vehicles when building masks.
# The COCO label set shipped with YOLOv8 names the class "motorcycle";
# "motorbike" alone never matches, so motorcycles were silently dropped.
# "motorbike" is kept so label sets that use that spelling still match.
vehicle_classes = {"car", "truck", "bus", "motorcycle", "motorbike", "bicycle"}

@torch.no_grad()
def extract_mask(frame: np.ndarray, model: YOLO) -> np.ndarray | None:
    """Build a single binary vehicle mask for `frame` from the model's instance masks.

    Args:
        frame: Image array whose first two dimensions are (H, W).
        model: Segmentation model; called as `model(frame, verbose=False)`.

    Returns:
        `None` when the model produces no instance masks at all. Otherwise a
        uint8 array of shape (1, H, W) that is 255 wherever an instance whose
        class name is in the module-level `vehicle_classes` set covers the
        pixel and 0 elsewhere. Note that the mask can be all zeros when
        detections exist but none of them is a vehicle.
    """
    model.eval()

    prediction = model(frame, verbose=False)[0]
    instance_masks = prediction.masks
    class_ids = prediction.boxes.cls

    has_masks = (
        instance_masks is not None
        and instance_masks.data is not None
        and len(instance_masks.data) > 0
    )
    if not has_masks:
        return None

    height, width = frame.shape[:2]
    combined = np.zeros((height, width), dtype=np.uint8)

    # Move everything to NumPy in one go instead of per instance
    masks_arr = instance_masks.data.cpu().numpy().astype(np.uint8)
    ids_arr = class_ids.cpu().numpy()

    for instance, class_id in zip(masks_arr, ids_arr):
        if model.model.names[int(class_id)] not in vehicle_classes:
            continue
        # Instance masks are resized to the frame resolution before merging
        resized = cv2.resize(instance, (width, height), interpolation=cv2.INTER_NEAREST)
        combined[resized > 0] = 255

    return combined[np.newaxis, ...]  # Shape: (1, H, W)

In [137]:
def compute_flow_channels(frame1: np.ndarray, frame2: np.ndarray) -> np.ndarray:
    """Dense optical flow from `frame1` to `frame2`, encoded as two channels.

    Both frames are converted from RGB to grayscale, Farneback flow is
    computed, and the per-pixel displacement is expressed in polar form.

    Returns:
        Array of shape (2, H, W):
        - Channel 0: flow magnitude, min-max normalized to [0, 255] per call
        - Channel 1: flow angle, scaled by 180 / pi / 2 and truncated to uint8
    """
    gray_prev, gray_next = (
        cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) for img in (frame1, frame2)
    )

    # Farneback dense flow; positional parameters follow the initial flow (None)
    flow = cv2.calcOpticalFlowFarneback(
        gray_prev,
        gray_next,
        None,
        0.5,
        3,
        15,
        3,
        5,
        1.2,
        0,
    )

    # Cartesian displacement -> polar (magnitude, angle)
    dx = flow[..., 0]
    dy = flow[..., 1]
    magnitude, angle = cv2.cartToPolar(dx, dy)

    # Rescale magnitude to [0, 255]; this is relative to the current frame pair only
    magnitude = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX)
    half_degrees = angle * 180 / np.pi / 2
    angle = half_degrees.astype(np.uint8)

    # (H, W, 2) -> (2, H, W)
    stacked = np.stack((magnitude, angle), axis=-1)
    return stacked.transpose(2, 0, 1)


In [138]:
def downsample_image(image: np.ndarray, factor: int) -> np.ndarray:
    """Shrink an image by an integer factor and return it channels-first.

    The input may be (C, H, W) or (H, W, C). An array whose first dimension is
    at most 3 is treated as channels-first. The output is always (C, H, W)
    with spatial size (H // factor, W // factor), resized with bilinear
    interpolation.
    """
    channels_first = image.shape[0] <= 3
    # cv2.resize expects (H, W, C), so move the channel axis last when needed
    hwc = np.transpose(image, (1, 2, 0)) if channels_first else image

    src_h, src_w = hwc.shape[:2]
    target_size = (src_w // factor, src_h // factor)  # cv2 takes (width, height)
    resized = cv2.resize(hwc, target_size, interpolation=cv2.INTER_LINEAR)

    # If the resize result is 2-D, restore a trailing channel axis: (H, W) -> (H, W, 1)
    if resized.ndim == 2:
        resized = resized[..., np.newaxis]

    # Back to (C, H, W)
    return resized.transpose(2, 0, 1)

### Process negative training samples

In [139]:
SAVE_EVERY = 30  # Every 1 second @30 FPS

for row in tqdm(train_df[train_df["target"] == 0].to_dict(orient="records")):
    # Load video
    video_id = str(row["id"]).zfill(5)
    video = cv2.VideoCapture(f"../data/raw/train/{video_id}.mp4")
    save_dir = f"../data/processed/train/{video_id}"

    # Fresh buffer per video. A buffer shared across videos made frame 0 of every
    # video after the first compute optical flow against the previous video's frames.
    dq = deque(maxlen=3)

    # Iterate frames
    frame_index = 0
    save_index = 0
    while True:
        ret, frame = video.read()
        if not ret:
            break

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB

        # Append before processing so dq[0] is two frames behind the current one,
        # the same flow gap the positive and test loops use.
        dq.append(frame)

        # Extract information once per SAVE_EVERY frames, once the buffer is full
        # (so frame 0 is never saved; the first saved frame is SAVE_EVERY).
        if frame_index % SAVE_EVERY == 0 and len(dq) == 3:
            mask = extract_mask(frame, model)           # Shape: (1, H, W)
            flow = compute_flow_channels(dq[0], frame)  # Shape: (2, H, W)

            flow_downsampled = downsample_image(flow, 3)
            frame_downsampled = downsample_image(frame, 3)

            os.makedirs(save_dir, exist_ok=True)

            flow_path = save_dir + f"/flows/{str(save_index).zfill(2)}.pt"
            os.makedirs(os.path.dirname(flow_path), exist_ok=True)
            torch.save(torch.tensor(flow_downsampled).to(dtype=torch.float32), flow_path)

            frame_path = save_dir + f"/frames/{str(save_index).zfill(2)}.pt"
            os.makedirs(os.path.dirname(frame_path), exist_ok=True)
            torch.save(torch.tensor(frame_downsampled).to(dtype=torch.int16), frame_path)

            # Save mask if it exists (extract_mask returns None when nothing was segmented)
            if mask is not None:
                mask_downsampled = downsample_image(mask, 3)
                mask_path = save_dir + f"/masks/{str(save_index).zfill(2)}.pt"
                os.makedirs(os.path.dirname(mask_path), exist_ok=True)
                torch.save(torch.tensor(mask_downsampled).to(dtype=torch.int16), mask_path)

            save_index += 1

        frame_index += 1

    video.release()


100%|██████████| 750/750 [2:05:10<00:00, 10.01s/it]  


### Process positive training samples

In [140]:
FPS = 30  # Frames per second

for row in tqdm(train_df[train_df["target"] == 1].to_dict(orient="records")):
    # Load video
    video_id = str(row["id"]).zfill(5)
    video = cv2.VideoCapture(f"../data/raw/train/{video_id}.mp4")
    save_dir = f"../data/processed/train/{video_id}"

    # Window of interest: from the alert to the event, in frame indices
    time_alert = float(row["time_of_alert"])
    time_event = float(row["time_of_event"])
    start_frame = math.floor(time_alert * FPS)
    end_frame = math.ceil(time_event * FPS)

    # Fresh buffer per video. A buffer shared across videos made the first two
    # saved frames of each window use optical flow against the previous video.
    dq = deque(maxlen=3)

    # Seek slightly before the window so the buffer is already filled with
    # same-video context when the first frame of the window is reached.
    context_frames = dq.maxlen - 1
    seek_frame = max(0, start_frame - context_frames)
    video.set(cv2.CAP_PROP_POS_FRAMES, seek_frame)
    frame_index = seek_frame
    save_index = 0

    while frame_index <= end_frame:
        ret, frame = video.read()
        if not ret:
            break

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        dq.append(frame)

        # Context frames before start_frame only feed the buffer and are not saved
        if frame_index >= start_frame and len(dq) == 3:
            mask = extract_mask(frame, model)           # Shape: (1, H, W)
            flow = compute_flow_channels(dq[0], frame)  # Shape: (2, H, W)

            flow_downsampled = downsample_image(flow, 3)
            frame_downsampled = downsample_image(frame, 3)

            os.makedirs(save_dir, exist_ok=True)

            flow_path = save_dir + f"/flows/{str(save_index).zfill(2)}.pt"
            os.makedirs(os.path.dirname(flow_path), exist_ok=True)
            torch.save(torch.tensor(flow_downsampled).to(dtype=torch.float32), flow_path)

            frame_path = save_dir + f"/frames/{str(save_index).zfill(2)}.pt"
            os.makedirs(os.path.dirname(frame_path), exist_ok=True)
            torch.save(torch.tensor(frame_downsampled).to(dtype=torch.int16), frame_path)

            # Save mask if it exists (extract_mask returns None when nothing was segmented)
            if mask is not None:
                mask_downsampled = downsample_image(mask, 3)
                mask_path = save_dir + f"/masks/{str(save_index).zfill(2)}.pt"
                os.makedirs(os.path.dirname(mask_path), exist_ok=True)
                torch.save(torch.tensor(mask_downsampled).to(dtype=torch.int16), mask_path)

            save_index += 1

        frame_index += 1

    video.release()


100%|██████████| 750/750 [2:24:00<00:00, 11.52s/it]  


### Process test data

In [None]:
FRAMES_NEEDED = 6  # 3 for optical flow context + 3 to process

for row in tqdm(test_df.to_dict(orient="records")):
    video_id = str(row["id"]).zfill(5)
    video = cv2.VideoCapture(f"../data/raw/test/{video_id}.mp4")
    save_dir = f"../data/processed/test/{video_id}"

    # Get total number of frames (as reported by the container)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    start_frame = max(0, total_frames - FRAMES_NEEDED)

    # Fresh buffer per video. With a buffer shared across videos, a very short
    # video could compute optical flow against frames of the previous video.
    dq = deque(maxlen=3)

    # Seek to frame N - 6
    video.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
    frame_index = start_frame
    save_index = 0

    while True:
        ret, frame = video.read()
        if not ret:
            break

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        dq.append(frame)

        # Only save the *last 3 frames* (from N-3 to N-1)
        if frame_index >= total_frames - 3 and len(dq) == 3:
            mask = extract_mask(frame, model)           # Shape: (1, H, W)
            flow = compute_flow_channels(dq[0], frame)  # Shape: (2, H, W)

            flow_downsampled = downsample_image(flow, 3)
            frame_downsampled = downsample_image(frame, 3)

            os.makedirs(save_dir, exist_ok=True)

            flow_path = save_dir + f"/flows/{str(save_index).zfill(2)}.pt"
            os.makedirs(os.path.dirname(flow_path), exist_ok=True)
            torch.save(torch.tensor(flow_downsampled).to(dtype=torch.float32), flow_path)

            frame_path = save_dir + f"/frames/{str(save_index).zfill(2)}.pt"
            os.makedirs(os.path.dirname(frame_path), exist_ok=True)
            torch.save(torch.tensor(frame_downsampled).to(dtype=torch.int16), frame_path)

            # Save mask if it exists (extract_mask returns None when nothing was segmented)
            if mask is not None:
                mask_downsampled = downsample_image(mask, 3)
                mask_path = save_dir + f"/masks/{str(save_index).zfill(2)}.pt"
                os.makedirs(os.path.dirname(mask_path), exist_ok=True)
                torch.save(torch.tensor(mask_downsampled).to(dtype=torch.int16), mask_path)

            save_index += 1

        frame_index += 1

    video.release()


100%|██████████| 1344/1344 [16:33<00:00,  1.35it/s]


## Prepare datasets

In [6]:
def _count_saved_frames(features_path: str) -> int:
    """Count the entries in `<features_path>/frames`; 0 if that directory does not exist.

    The processing loops only create a video's output directory when they save
    at least one frame, so a video that produced nothing has no directory.
    """
    frames_dir = os.path.join(features_path, "frames")
    if not os.path.isdir(frames_dir):
        return 0
    return len(os.listdir(frames_dir))


# Add path to features
train_df["features_path"] = train_df["id"].apply(lambda x: f"../data/processed/train/{str(x).zfill(5)}")
test_df["features_path"] = test_df["id"].apply(lambda x: f"../data/processed/test/{str(x).zfill(5)}")

# Add number of frames (for sampling)
train_df["n_frames"] = train_df["features_path"].apply(_count_saved_frames)
test_df["n_frames"] = test_df["features_path"].apply(_count_saved_frames)

# Surface videos without any processed frames instead of crashing on a missing directory
for split_name, split_df in (("train", train_df), ("test", test_df)):
    n_empty = int((split_df["n_frames"] == 0).sum())
    if n_empty:
        print(f"Warning: {n_empty} {split_name} videos have no processed frames")

# Save to parquet
train_df.to_parquet("../data/processed/train.parquet", index=False)
test_df.to_parquet("../data/processed/test.parquet", index=False)