In [1]:
import os
import json
import av
from ultralytics import YOLO
from PIL import Image
from datetime import timedelta

# Paths and settings shared by the cells below.
VIDEOS_DIR = '.'
# Default input video; a later cell rebinds video_path before main() is called.
video_path = os.path.join(VIDEOS_DIR, 'sample_video.mp4')
# NOTE(review): not referenced by the visible code; generate_output_json
# falls back to its own 'output.json' default instead.
output_json_path = 'output.json'
model_path = os.path.join('.', 'runs', 'detect', 'train', 'weights', 'best.pt')

# Load YOLOv8 model
model = YOLO(model_path)  # Weights file at model_path; detect_logos calls this global once per frame

# detect_logos keeps a box only when its confidence is strictly greater than this value.
threshold = 0.5

def format_timestamp(seconds):
    """Render a duration given in seconds as text via ``str(timedelta)``.

    The result is whatever ``datetime.timedelta.__str__`` produces:
    ``H:MM:SS`` (the hour is not zero-padded), followed by ``.ffffff``
    when the value has a fractional part, and prefixed with
    ``N day, `` / ``N days, `` once the duration reaches 24 hours.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The formatted duration string.
    """
    return str(timedelta(seconds=seconds))

def extract_frames(video_path):
    """Decode every frame of the first video stream into memory.

    Args:
        video_path: Path handed to ``av.open``.

    Returns:
        A list of ``(image, timestamp)`` tuples in decode order, where
        ``image`` is ``frame.to_image()`` and ``timestamp`` is
        ``frame.pts * frame.time_base`` converted to float seconds.
    """
    frames = []
    # The context manager closes the container even when decoding raises;
    # without it the opened file stayed open after the function returned.
    with av.open(video_path) as container:
        for frame in container.decode(video=0):
            # NOTE(review): if a frame carries no pts (None), this
            # multiplication raises TypeError - confirm whether such frames
            # can occur for the videos processed here.
            timestamp = float(frame.pts * frame.time_base)
            frames.append((frame.to_image(), timestamp))
    return frames

def detect_logos(frames):
    """Run the module-level ``model`` on each frame and collect logo hits.

    Args:
        frames: Iterable of ``(img, timestamp)`` pairs. ``img`` is passed to
            ``model`` and must expose ``width`` and ``height``; ``timestamp``
            is passed to ``format_timestamp``.

    Returns:
        ``(pepsi_pts, cocacola_pts)``: two lists of dicts, one dict per kept
        box, with keys ``"timestamp"``, ``"size"`` (``width``/``height`` of
        the box) and ``"distance_from_center"`` (Euclidean distance between
        the box centre and the frame centre).
    """
    pepsi_pts, cocacola_pts = [], []
    # Upper-cased class name -> list collecting its detections.
    # Boxes of any other class are dropped.
    sinks = {'PEPSI': pepsi_pts, 'COCA-COLA': cocacola_pts}

    for img, timestamp in frames:
        for result in model(img):
            for box in result.boxes:
                left, top, right, bottom = box.xyxy[0].tolist()
                confidence = box.conf[0].item()
                label_index = int(box.cls[0].item())

                # Keep only boxes strictly above the module-level threshold.
                # Written as "not >" so a NaN score is rejected as before.
                if not confidence > threshold:
                    continue

                label = result.names[label_index].upper()
                offset_x = (left + right) / 2 - img.width / 2
                offset_y = (top + bottom) / 2 - img.height / 2
                entry = {
                    "timestamp": format_timestamp(timestamp),
                    "size": {"width": right - left, "height": bottom - top},
                    "distance_from_center": (offset_x ** 2 + offset_y ** 2) ** 0.5,
                }

                sink = sinks.get(label)
                if sink is not None:
                    sink.append(entry)

    return pepsi_pts, cocacola_pts

def generate_output_json(pepsi_pts, cocacola_pts, output_path='output.json'):
    """Write the detection lists to a JSON file.

    Args:
        pepsi_pts: List of detection dicts; each must contain a
            ``"timestamp"`` key.
        cocacola_pts: Same structure as ``pepsi_pts``.
        output_path: Destination file; overwritten if it exists.

    The file holds four keys: ``"Pepsi_pts"`` and ``"CocaCola_pts"`` (the
    timestamps only) and ``"Pepsi_details"`` and ``"CocaCola_details"`` (every
    entry with its values made JSON-serializable).
    """
    def to_serializable(obj):
        # JSON-native scalars pass through unchanged. Plain floats used to
        # fall into the str() fallback, so "distance_from_center" was written
        # as a string while the numbers inside "size" stayed numeric.
        if obj is None or isinstance(obj, (str, bool, int, float)):
            return obj
        # Recurse so that array-like values nested in containers are
        # converted too, instead of failing inside json.dump.
        if isinstance(obj, dict):
            return {key: to_serializable(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [to_serializable(item) for item in obj]
        if hasattr(obj, 'tolist'):
            return obj.tolist()  # Array-like objects exposing tolist()
        if hasattr(obj, 'item'):
            return obj.item()  # Single-element objects exposing item()
        return str(obj)  # Last resort for anything else

    def details(entries):
        return [
            {key: to_serializable(value) for key, value in entry.items()}
            for entry in entries
        ]

    output = {
        "Pepsi_pts": [entry["timestamp"] for entry in pepsi_pts],
        "CocaCola_pts": [entry["timestamp"] for entry in cocacola_pts],
        "Pepsi_details": details(pepsi_pts),
        "CocaCola_details": details(cocacola_pts),
    }
    with open(output_path, 'w') as f:
        json.dump(output, f, indent=4)




In [2]:
def main(video_path):
    """Extract the frames of ``video_path`` and detect logos in them.

    Args:
        video_path: Path of the video, forwarded to ``extract_frames``.

    Returns:
        ``(pepsi_pts, cocacola_pts)`` exactly as returned by
        ``detect_logos``. Writing the JSON file is left to the caller
        (see ``generate_output_json``).
    """
    frames = extract_frames(video_path)
    # Return the detections instead of discarding them, so callers do not
    # have to repeat the decode + inference pass to get at the results.
    return detect_logos(frames)

video_path = "sample/output_video.mp4"
# Bind the results; a bare main(...) as the last line of the cell would echo
# the whole tuple as cell output.
pepsi_pts, cocacola_pts = main(video_path)


0: 384x640 (no detections), 125.0ms
Speed: 4.8ms preprocess, 125.0ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 123.5ms
Speed: 1.8ms preprocess, 123.5ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 117.3ms
Speed: 2.9ms preprocess, 117.3ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 121.0ms
Speed: 3.0ms preprocess, 121.0ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 118.9ms
Speed: 3.2ms preprocess, 118.9ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 120.4ms
Speed: 3.1ms preprocess, 120.4ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 pepsis, 116.5ms
Speed: 3.0ms preprocess, 116.5ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 coca-colas, 2 pepsis, 117.8ms
Speed: 3.2ms prepr

In [6]:
# Bind the decoded frames and the detection lists as notebook-level
# variables; the next cell passes pepsi_pts and cocacola_pts to
# generate_output_json.
# Cost note: extract_frames keeps every decoded frame of video_path in a
# list, and detect_logos calls the model once per frame.
frames = extract_frames(video_path)
pepsi_pts, cocacola_pts = detect_logos(frames)


0: 384x640 (no detections), 170.7ms
Speed: 1.9ms preprocess, 170.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 170.9ms
Speed: 2.6ms preprocess, 170.9ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 173.4ms
Speed: 2.1ms preprocess, 173.4ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 164.2ms
Speed: 2.5ms preprocess, 164.2ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 154.8ms
Speed: 2.1ms preprocess, 154.8ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 149.2ms
Speed: 2.0ms preprocess, 149.2ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 pepsis, 144.7ms
Speed: 2.1ms preprocess, 144.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 coca-colas, 2 pepsis, 148.1ms
Speed: 1.7ms prepr

In [7]:
# Write the detections to JSON. No output_path is passed, so the function's
# default 'output.json' is used.
generate_output_json(pepsi_pts, cocacola_pts)