## MOLMO

In [3]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import os
import re
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import cv2
import pickle

path = "/home/cermavo3/projects/kaggle/coool/COOOL-Benchmark"


def extract_points(molmo_output, image):
    """Parse point coordinates out of Molmo's text output.

    The text is scanned for attribute pairs of the form ``x="12.5" y="40"``
    (an optional numeric suffix such as ``x1=`` / ``y1=`` is accepted). Each
    pair is interpreted as a percentage of the image size and converted to
    pixel coordinates.

    Args:
        molmo_output (str): Text generated by the model.
        image (numpy.ndarray): Image the points refer to. Only ``shape[0]``
            (height) and ``shape[1]`` (width) are read.

    Returns:
        numpy.ndarray: Integer array of shape ``(N, 2)`` with one ``(x, y)``
        pixel coordinate per row. When no valid point is found the array has
        shape ``(0, 2)`` so that callers always see the same layout.
    """
    image_h, image_w = image.shape[:2]
    image_size = np.array([image_w, image_h])
    pattern = r'x\d*="\s*([0-9]+(?:\.[0-9]+)?)"\s+y\d*="\s*([0-9]+(?:\.[0-9]+)?)"'

    points = []
    for match in re.finditer(pattern, molmo_output):
        # The pattern only captures plain decimal numbers, so float() cannot
        # fail here and no exception handling is needed.
        point = np.array([float(match.group(1)), float(match.group(2))])
        if np.max(point) > 100:
            # Coordinates are percentages; anything above 100 is treated as
            # an invalid model output and skipped.
            continue
        points.append((point / 100.0) * image_size)

    if not points:
        return np.empty((0, 2), dtype=int)
    return np.stack(points).round().astype(int)


def get_text(image, prompt, processor, model):
    """Ask the model a question about one image and return its text answer.

    Args:
        image: Array accepted by ``PIL.Image.fromarray``.
        prompt (str): Text prompt passed to the processor alongside the image.
        processor: Object exposing ``process(images=..., text=...)`` and a
            ``tokenizer`` attribute.
        model: Object exposing ``device`` and ``generate_from_batch``.

    Returns:
        str: Decoded text of the newly generated tokens only (the prompt part
        of the output sequence is dropped, special tokens are skipped).
    """
    pil_image = Image.fromarray(image)
    processed = processor.process(images=[pil_image], text=prompt)

    # Put every tensor on the model's device and add a leading batch axis of
    # size 1.
    batch = {}
    for name, tensor in processed.items():
        batch[name] = tensor.to(model.device).unsqueeze(0)

    # Generation is capped at 400 new tokens and stops at "<|endoftext|>".
    # NOTE: the original author left a disabled bfloat16 autocast wrapper
    # around this call; it is intentionally not active.
    generation_config = GenerationConfig(max_new_tokens=400, stop_strings="<|endoftext|>")
    output = model.generate_from_batch(
        batch,
        generation_config,
        tokenizer=processor.tokenizer,
    )

    # The output sequence begins with the prompt tokens; keep what follows.
    prompt_length = batch['input_ids'].size(1)
    new_tokens = output[0, prompt_length:]
    return processor.tokenizer.decode(new_tokens, skip_special_tokens=True)


def visualise_boxes(image, objects):
    """Draw the bounding box of every object onto a copy of ``image``.

    Args:
        image (numpy.ndarray): Image to draw on; it is copied, not modified.
        objects: Iterable of mappings, each with a ``'bbox'`` entry holding
            ``(x1, y1, x2, y2)``.

    Returns:
        PIL.Image.Image: The annotated image.
    """
    canvas = image.copy()
    green = (0, 255, 0)
    for entry in objects:
        # Box coordinates are rounded to whole pixels before drawing.
        left, top, right, bottom = np.array(entry['bbox']).round().astype(int)
        canvas = cv2.rectangle(canvas, (left, top), (right, bottom), color=green, thickness=2)
    return Image.fromarray(canvas)


def visualize_points(image, points):
    """Draw a filled green dot for every point onto a copy of ``image``.

    Args:
        image (numpy.ndarray): Image to draw on; it is copied, not modified.
        points: Iterable of ``(x, y)`` pixel coordinates, for example the
            array returned by ``extract_points``. An empty iterable yields an
            unmodified copy.

    Returns:
        PIL.Image.Image: The annotated image.
    """
    canvas = image.copy()
    for point in points:
        # Hand OpenCV a plain tuple of Python ints: NumPy rows / NumPy scalar
        # types are not accepted as a centre by every OpenCV Python binding.
        center = (int(point[0]), int(point[1]))
        canvas = cv2.circle(canvas, center, radius=5, color=(0, 255, 0), thickness=-1)
    return Image.fromarray(canvas)


def crop_with_context(image, bounding_box, context_percent, min_size=20):
    """
    Crops a region from the image defined by a bounding box, ensuring minimum size
    and adding additional context as a percentage of box size.

    Parameters:
        image (numpy array): The original image.
        bounding_box (tuple): The bounding box (x1, y1, x2, y2).
        context_percent (float): Percentage of box size to add as context (e.g., 0.1 for 10%).
        min_size (int): Minimum size for the width and height of the box.

    Returns:
        numpy array: Cropped image region. The region is clipped to the image
        bounds, so it can be smaller than requested near the borders.
    """
    x1, y1, x2, y2 = np.array(bounding_box).round().astype(int)

    # Grow undersized boxes to exactly min_size. When the missing amount is
    # odd the extra pixel goes to the right/bottom side; padding both sides
    # with `deficit // 2` would leave the box one pixel short.
    width_deficit = min_size - (x2 - x1)
    if width_deficit > 0:
        x1 -= width_deficit // 2
        x2 += width_deficit - width_deficit // 2

    height_deficit = min_size - (y2 - y1)
    if height_deficit > 0:
        y1 -= height_deficit // 2
        y2 += height_deficit - height_deficit // 2

    # Context is measured on the (possibly enlarged) box, per axis.
    context_x = int((x2 - x1) * context_percent)
    context_y = int((y2 - y1) * context_percent)

    # Add context, ensuring we stay within image bounds
    height, width = image.shape[:2]
    x1 = max(0, x1 - context_x)
    y1 = max(0, y1 - context_y)
    x2 = min(width, x2 + context_x)
    y2 = min(height, y2 + context_y)

    return image[y1:y2, x1:x2]




def crop_square_with_context(image, bounding_box, context_percent, min_size=10):
    """
    Crops a square region from the image defined by a bounding box, ensuring minimum size
    and adding additional context as a percentage of box size.

    Parameters:
        image (numpy array): The original image.
        bounding_box (tuple): The bounding box (x1, y1, x2, y2).
        context_percent (float): Percentage of box size to add as context (e.g., 0.1 for 10%).
        min_size (int): Minimum size for the width and height of the box.

    Returns:
        numpy array: Cropped image region. It is square unless the square
        window extends past the image, in which case it is clipped to the
        image bounds.
    """
    x1, y1, x2, y2 = np.array(bounding_box).round().astype(int)

    # Grow undersized boxes to exactly min_size. When the missing amount is
    # odd the extra pixel goes to the right/bottom side; padding both sides
    # with `deficit // 2` would leave the box one pixel short.
    width_deficit = min_size - (x2 - x1)
    if width_deficit > 0:
        x1 -= width_deficit // 2
        x2 += width_deficit - width_deficit // 2

    height_deficit = min_size - (y2 - y1)
    if height_deficit > 0:
        y1 -= height_deficit // 2
        y2 += height_deficit - height_deficit // 2

    # Ensure the box is square by making both sides equal to the larger dimension
    side_length = max(x2 - x1, y2 - y1)

    # Center the square box around the original bounding box
    x_center = (x1 + x2) // 2
    y_center = (y1 + y2) // 2

    # Derive the far edge from the near edge so the window is exactly
    # side_length wide. Using `center + side_length // 2` for the far edge
    # would drop one pixel for odd sizes and cut into the original box.
    x1 = x_center - side_length // 2
    x2 = x1 + side_length
    y1 = y_center - side_length // 2
    y2 = y1 + side_length

    # Add context as a percentage of the side length
    context = int(side_length * context_percent)
    x1 -= context
    y1 -= context
    x2 += context
    y2 += context

    # Ensure the box stays within image bounds
    height, width = image.shape[:2]
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(width, x2)
    y2 = min(height, y2)

    return image[y1:y2, x1:x2]



def get_img(filename, frame_id, anns=None, folder_path="/home/cermavo3/projects/kaggle/coool/data"):
    """Read a single frame from a video, optionally with its boxes drawn.

    Frames are decoded sequentially from the start of the video until the
    frame with index ``frame_id`` is reached.

    Args:
        filename (str): Video file name inside ``folder_path``.
        frame_id (int): Zero-based index of the frame to return.
        anns (dict, optional): Annotations indexed as
            ``anns[video][frame]['challenge_object']``, a list of mappings
            with a ``'bbox'`` entry ``(x1, y1, x2, y2)``. ``video`` is the
            file name up to its first dot. When given, the boxes of the
            requested frame are drawn in green.
        folder_path (str): Directory containing the video.

    Returns:
        numpy.ndarray: The frame in RGB channel order.

    Raises:
        ValueError: If the frame cannot be read (the video could not be
            opened or has ``frame_id`` or fewer frames).
    """
    video_path = os.path.join(folder_path, filename)
    cap = cv2.VideoCapture(video_path)
    image = None
    try:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count == frame_id:
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if anns is not None:
                    video = filename.split('.')[0]
                    objects = anns[video][frame_count]['challenge_object']
                    for obj in objects:
                        x1, y1, x2, y2 = np.array(obj['bbox']).round().astype(int)
                        image = cv2.rectangle(image, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=2)
                break
            frame_count += 1
    finally:
        # Release the capture even if decoding or the annotation lookup fails.
        cap.release()

    if image is None:
        # Without this check the function would fail with an opaque
        # UnboundLocalError on `image`.
        raise ValueError(f"Could not read frame {frame_id} from {video_path!r}")
    return image


In [None]:
# The processor and the model come from the same checkpoint and are loaded
# with identical options.
molmo_checkpoint = 'allenai/Molmo-7B-D-0924'
load_options = dict(
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)

# load the processor
processor = AutoProcessor.from_pretrained(molmo_checkpoint, **load_options)

# load the model and move it to the GPU
# NOTE(review): the model is already loaded with device_map='auto'; the
# explicit move to 'cuda' may be redundant - confirm before removing.
model = AutoModelForCausalLM.from_pretrained(molmo_checkpoint, **load_options)
model = model.to('cuda')

# Open annotations
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
with open('COOOL_benchmark/annotations_public.pkl', 'rb') as f:
    anns = pickle.load(f)

# Caption each object - Short square

In [None]:

def process(image, objects):
    """Caption every object of a frame with one short sentence.

    Each object's bounding box is cropped as a square with no extra context,
    the crop is sent to the model, and the answer (with surrounding spaces
    removed) is stored under the object's ``'caption'`` key.

    Args:
        image (numpy.ndarray): Full frame the boxes refer to.
        objects: List of mappings with a ``'bbox'`` entry; modified in place.

    Returns:
        The same ``objects`` list, now carrying captions.
    """
    prompt = 'Considering the context of traffic, caption the hazard in one short sentence of maximum 30 characters and 6 words.'
    for entry in objects:
        crop = crop_square_with_context(image, entry['bbox'], 0.0)
        answer = get_text(crop, prompt, processor, model)
        entry['caption'] = answer.strip(' ')
    return objects


folder_path = "/home/cermavo3/projects/kaggle/coool/data"
target_fps = 5
output_dir = 'results/molmo-obj-caption-short-square'

# Create the output directory before the (hours-long) run so the final save
# cannot fail on a missing folder.
os.makedirs(output_dir, exist_ok=True)

# results[video][frame index] -> captioned objects of that frame
results = {}
for filename in tqdm(os.listdir(folder_path)):
    if not filename.endswith(".mp4"):
        continue

    # Annotations are keyed by the file name up to its first dot.
    video = filename.split('.')[0]
    cap = cv2.VideoCapture(os.path.join(folder_path, filename))
    video_results = {}
    try:
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        # Never let the step drop to 0: that happens when the reported FPS is
        # below target_fps (or 0 because it could not be read) and would make
        # the modulo below raise ZeroDivisionError.
        frame_skip = max(1, int(original_fps / target_fps))

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Process every Nth frame to match the target FPS
            if frame_count % frame_skip == 0:
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                objects = anns[video][frame_count]['challenge_object']
                video_results[frame_count] = process(image, objects)
            frame_count += 1
    finally:
        # Release the capture even if captioning a frame fails.
        cap.release()

    results[video] = video_results
torch.save(results, f'{output_dir}/all.pkl')

  required_scale_d = candidate_resolutions.astype(np.float32) / original_size
100%|██████████| 201/201 [7:01:32<00:00, 125.84s/it]  


# Caption each object - 5 words

In [None]:
def process(image, objects):
    """Ask the model for five candidate class labels per object.

    Each object's bounding box is cropped as a square with no extra context,
    the crop is sent to the model, and the answer (with surrounding spaces
    removed) is stored under the object's ``'caption'`` key.

    Args:
        image (numpy.ndarray): Full frame the boxes refer to.
        objects: List of mappings with a ``'bbox'`` entry; modified in place.

    Returns:
        The same ``objects`` list, now carrying the label strings.
    """
    # The prompt deliberately keeps the leading/trailing newline and the
    # 8-space indentation of the original text, so the model receives exactly
    # the same characters as before.
    prompt = (
        '\n'
        '        Propose 5 most likely class labels of the object, context of the image is traffic and unusual hazards such as various animals on the road. Write only the class names separated by spaces.\n'
        '        '
    )
    for entry in objects:
        crop = crop_square_with_context(image, entry['bbox'], 0.0)
        answer = get_text(crop, prompt, processor, model)
        entry['caption'] = answer.strip(' ')
    return objects


folder_path = "/home/cermavo3/projects/kaggle/coool/data"
target_fps = 5
output_dir = 'results/molmo-obj-caption-words'

# Create the output directory before the (hours-long) run so neither the
# per-video checkpoints nor the final save can fail on a missing folder.
os.makedirs(output_dir, exist_ok=True)

# results[video][frame index] -> labelled objects of that frame
results = {}
for filename in tqdm(os.listdir(folder_path)):
    if not filename.endswith(".mp4"):
        continue

    # Annotations are keyed by the file name up to its first dot.
    video = filename.split('.')[0]
    cap = cv2.VideoCapture(os.path.join(folder_path, filename))
    video_results = {}
    try:
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        # Never let the step drop to 0: that happens when the reported FPS is
        # below target_fps (or 0 because it could not be read) and would make
        # the modulo below raise ZeroDivisionError.
        frame_skip = max(1, int(original_fps / target_fps))

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Process every Nth frame to match the target FPS
            if frame_count % frame_skip == 0:
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                objects = anns[video][frame_count]['challenge_object']
                video_results[frame_count] = process(image, objects)
            frame_count += 1
    finally:
        # Release the capture even if labelling a frame fails.
        cap.release()

    # Per-video checkpoint; the file name keeps the full video file name
    # (including ".mp4") followed by ".pkl".
    torch.save(video_results, f'{output_dir}/{filename}.pkl')
    results[video] = video_results
torch.save(results, f'{output_dir}/all.pkl')

100%|██████████| 201/201 [8:40:01<00:00, 155.23s/it]   
