In [1]:
!pip install rfdetr supervision opencv-python numpy

Collecting rfdetr
  Downloading rfdetr-1.2.1-py3-none-any.whl.metadata (13 kB)
Collecting supervision
  Downloading supervision-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting fairscale (from rfdetr)
  Downloading fairscale-0.4.13.tar.gz (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pylabel (from rfdetr)
  Downloading pylabel-0.1.55-py3-none-any.whl.metadata (3.8 kB)
Collecting polygraphy (from rfdetr)
  Downloading polygraphy-0.49.26-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting open_clip_torch (from rfdetr)
  Downloading open_clip_torch-3.0.0-py3-none-any.whl.metadata (32 kB)
Collecting rf100vl (from rfdetr)
  Downloading rf100vl-1.0.0-py3-none-any.whl.me

In [13]:
import torch
import cv2
import supervision as sv
import numpy as np
import os

# --- Configuration ---
# 1. UPDATE these paths to match your environment (the defaults are Kaggle paths).
CHECKPOINT_PATH = "/kaggle/input/rfdter_ppe_pretrained/pytorch/default/1/checkpoint_best_total.pth"
INPUT_VIDEO_PATH = "/kaggle/input/violation-nonviolation/resized_870.mp4"  # Must include the .mp4 extension
OUTPUT_VIDEO_PATH = "/kaggle/working/cctv_0.25_2.mp4"  # Annotated result is written as an .mp4 file

# 2. Model and dataset configuration
# Passed to model.predict() as the score threshold; adjust as needed.
CONFIDENCE_THRESHOLD = 0.25
# Class names, indexed by the class id the model emits. These must match the
# dataset the checkpoint was trained on (same names, same order).
CLASS_NAMES = [
    'boots', 'gloves', 'goggles', 'helmet', 'no-boots',
    'no-gloves', 'no-goggles', 'no-helmet', 'no-vest', 'vest'
]
# Derived from CLASS_NAMES so the two can never drift apart.
NUM_CLASSES = len(CLASS_NAMES)

# --- Main Script ---

def _format_labels(class_ids, confidences, class_names):
    """
    Build one "<name> <score>" caption per detection.

    Args:
        class_ids: Iterable of integer class ids, one per detection.
        confidences: Iterable of float scores, parallel to ``class_ids``.
        class_names: Sequence mapping class id -> human-readable name.

    Returns:
        List of strings such as ``"helmet 0.87"``. An id outside
        ``0 .. len(class_names) - 1`` is rendered as ``"class_<id>"`` instead of
        raising IndexError (or silently wrapping around for negative ids).
    """
    labels = []
    for class_id, confidence in zip(class_ids, confidences):
        idx = int(class_id)
        name = class_names[idx] if 0 <= idx < len(class_names) else f"class_{idx}"
        labels.append(f"{name} {confidence:0.2f}")
    return labels


def main():
    """
    Run PPE detection on INPUT_VIDEO_PATH and write an annotated copy to
    OUTPUT_VIDEO_PATH.

    Steps: validate that the rfdetr library, the checkpoint and the input video
    are available; build the RF-DETR model from CHECKPOINT_PATH; optimize it for
    inference; then annotate every frame with boxes and "<class> <score>" labels
    via supervision's process_video.

    Expected failures (missing library or files, model-loading errors, video
    processing errors) are reported with print() and end the run early.

    Returns:
        None
    """
    # --- 1. Setup Device (GPU or CPU) ---
    # Only used for the log line below; it is never passed to the model.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # --- 2. Validate prerequisites (cheap checks first) ---
    # The 'rfdetr' library needs to be installed. If not, report and stop.
    try:
        from rfdetr import RFDETRBase
    except ImportError:
        print("Error: The 'rfdetr' library is not installed.")
        print("In a Colab cell, run: !pip install rfdetr")
        return

    # Ensure the checkpoint file exists before trying to load it.
    if not os.path.exists(CHECKPOINT_PATH):
        print(f"Error: Checkpoint file not found at: {CHECKPOINT_PATH}")
        print("Please make sure you have uploaded the file to your Colab session.")
        return

    # Check the input video *before* the expensive model load/optimization so a
    # bad path fails fast.
    if not os.path.exists(INPUT_VIDEO_PATH):
        print(f"Error: Input video not found at: {INPUT_VIDEO_PATH}")
        print("Please make sure you have uploaded the video and the name is correct (including the extension like .mp4).")
        return

    # --- 3. Initialize the Model and Load Weights ---
    print("Initializing RF-DETR model and loading weights...")
    try:
        # The rfdetr library loads custom weights via the
        # 'pretrain_weights' argument in its constructor.
        model = RFDETRBase(
            num_classes=NUM_CLASSES,
            size="base",          # Should match the size of your trained model
            resolution=672,       # Should match the resolution of your trained model
            pretrain_weights=CHECKPOINT_PATH
        )
        print("Model initialized and weights loaded successfully.")
    except Exception as e:
        print(f"An error occurred during model initialization or weight loading: {e}")
        return

    # --- 4. Prepare Model for Inference ---
    print("Optimizing model for inference...")
    model.optimize_for_inference()
    print("Model is ready for inference.")

    # --- 5. Setup Annotators ---
    # In recent versions of 'supervision', text and box
    # annotation are handled by separate classes.
    box_annotator = sv.BoxAnnotator(
        thickness=2
    )
    label_annotator = sv.LabelAnnotator(
        text_thickness=1,
        text_scale=0.5
    )

    # --- 6. Define the Frame Processing Callback ---
    # This function is passed to sv.process_video as its per-frame callback.
    def process_frame(frame: np.ndarray, frame_index: int) -> np.ndarray:
        """
        Performs inference on a single frame and returns the annotated frame.

        Args:
            frame: Frame supplied by sv.process_video (OpenCV-decoded, so BGR).
            frame_index: Position of the frame in the video; unused here but
                part of the callback signature.
        """
        # Video frames arrive in OpenCV's BGR order, while RF-DETR's predict()
        # expects RGB, so reverse the channel axis. ascontiguousarray makes a
        # real copy: the reversed view has a negative stride, which torch
        # cannot wrap directly.
        rgb_frame = np.ascontiguousarray(frame[:, :, ::-1])

        with torch.no_grad():  # Disables gradient calculations for efficiency
            results = model.predict(rgb_frame, threshold=CONFIDENCE_THRESHOLD)

        # 'results' is a supervision.Detections object; read its parallel
        # class_id / confidence arrays to build one caption per detection.
        labels = _format_labels(results.class_id, results.confidence, CLASS_NAMES)

        # Annotate the original (BGR) frame so the written video keeps its
        # colors. First boxes, then labels.
        annotated_frame = box_annotator.annotate(
            scene=frame.copy(),
            detections=results
        )
        annotated_frame = label_annotator.annotate(
            scene=annotated_frame,
            detections=results,
            labels=labels
        )
        return annotated_frame

    # --- 7. Run Inference on the Video ---
    print("Starting inference on the video...")
    # supervision's process_video handles reading frames, calling the callback,
    # and writing the output video file.
    try:
        # Make sure the destination directory exists before writing the video.
        output_dir = os.path.dirname(OUTPUT_VIDEO_PATH)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        sv.process_video(
            source_path=INPUT_VIDEO_PATH,
            target_path=OUTPUT_VIDEO_PATH,
            callback=process_frame
        )
        print(f"Inference complete. Annotated video saved to: {OUTPUT_VIDEO_PATH}")
        print("You can find it in the file browser on the left.")
    except Exception as e:
        print(f"An error occurred during video processing: {e}")
        print("This might be due to an issue with the input video file or an out-of-memory error.")


# Entry point: run the pipeline when this cell/script is executed directly,
# but not when the code is imported as a module.
if __name__ == "__main__":
    main()


Using device: cuda
Initializing RF-DETR model and loading weights...
Loading pretrain weights
Model initialized and weights loaded successfully.
Optimizing model for inference...
Model is ready for inference.
Starting inference on the video...
Inference complete. Annotated video saved to: /kaggle/working/cctv_0.25_2.mp4
You can find it in the file browser on the left.
