In [None]:
!pip install -q git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
!pip install accelerate

In [None]:
# Log in to the Hugging Face Hub from the command line.
# NOTE(review): presumably needed so the checkpoint loaded below can be
# downloaded — confirm whether it is gated/private or purely local.
!huggingface-cli login

Let's load the model.

In [None]:
import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

# Checkpoint identifier shared by the processor and the model.
ckpt = "noah_local/gemma-3-4b-it"

# The processor is used later to apply the chat template and decode output.
processor = AutoProcessor.from_pretrained(ckpt)

# Load the weights in bfloat16 with automatic device placement.
model_kwargs = {"device_map": "auto", "torch_dtype": torch.bfloat16}
model = Gemma3ForConditionalGeneration.from_pretrained(ckpt, **model_kwargs)


Download the video (it must be available as a local file), then downsample it by picking 10 evenly spaced frames and recording the timestamp of each one.

In [None]:
import cv2
from PIL import Image
import numpy as np

def downsample_video(video_path, num_frames=10):
    """Sample evenly spaced frames from a video file.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to sample (default 10). Fewer
            frames are returned when the video has fewer frames than this
            or when a selected frame cannot be read.

    Returns:
        A list of ``(pil_image, timestamp)`` tuples in playback order.
        ``pil_image`` is the frame converted to RGB as a PIL image and
        ``timestamp`` is a float: frame index / FPS, rounded to 2 decimals.
        The list is empty when the video reports no frames.

    Raises:
        OSError: If the video cannot be opened.
        ValueError: If ``num_frames`` is smaller than 1, or if the video
            reports a non-positive FPS (timestamps cannot be computed).
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened():
            raise OSError(f"Could not open video: {video_path!r}")

        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = vidcap.get(cv2.CAP_PROP_FPS)

        if total_frames <= 0:
            return []
        # `not fps > 0` also rejects NaN, which `fps <= 0` would let through.
        if not fps > 0:
            raise ValueError(f"Video reports an invalid FPS: {fps}")

        # linspace with an integer dtype repeats indices when the video has
        # fewer frames than requested; np.unique drops the repeats and keeps
        # the indices sorted.
        frame_indices = np.unique(
            np.linspace(0, total_frames - 1, num_frames, dtype=int)
        )

        frames = []
        # .tolist() yields plain Python ints, so timestamps are plain floats.
        for i in frame_indices.tolist():
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
            success, image = vidcap.read()
            if not success:
                # Frames that fail to decode are skipped.
                continue
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
        return frames
    finally:
        # Always free the capture handle, even when an error is raised above.
        vidcap.release()


In [None]:
# Video file to analyse, given as a path relative to the working directory.
VIDEO_PATH = "thermal_data20200626_154313_mlx90640_01_light_none.mp4"
frames = downsample_video(VIDEO_PATH)

In [None]:
# Display the sampled (PIL image, timestamp) pairs.
frames

Here are our system prompt and the instruction. We will then append a timestamp label and the corresponding image for each sampled frame to the user message.

In [None]:
# Instruction text that opens the user turn.
instruction = (
    "Generate bullet points of fall down event for the video only. "
    "Place each bullet point into an object sent to set_timecodes "
    "with the timecode of the bullet point in the video."
)

system_message = {
    "role": "system",
    "content": [{"type": "text", "text": "You are a helpful assistant."}],
}
user_message = {
    "role": "user",
    "content": [{"type": "text", "text": instruction}],
}

# Conversation in chat-template format: system prompt, then the user turn.
messages = [system_message, user_message]

In [None]:
# Display the first content item of the user message (the instruction text).
messages[1]["content"][0]

In [None]:
# Keep only the instruction (item 0) of the user message, so that re-running
# this cell does not append the same frames a second time.
user_content = messages[1]["content"]
del user_content[1:]

# For every sampled frame add a text label carrying its timestamp, followed by
# the image itself. The image is written to disk and referenced by its path.
for image, timestamp in frames:
    frame_path = f"image_{timestamp}.png"
    user_content.append({"type": "text", "text": f"Frame {timestamp}:"})
    image.save(frame_path)
    user_content.append({"type": "image", "url": frame_path})

In [None]:
# Display the full conversation, including the appended frame labels and images.
messages

Preprocess the conversation with the chat template, then run inference.

In [None]:
# Options for rendering the conversation: add the generation prompt, tokenize,
# and return the result as a dict of PyTorch tensors.
template_options = dict(
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
encoded = processor.apply_chat_template(messages, **template_options)

# Move the encoded inputs to the model's device.
inputs = encoded.to(model.device)

In [None]:
# Length of the prompt in tokens; used below to slice the prompt off the output.
input_len = inputs["input_ids"].shape[-1]

# Generate without sampling (do_sample=False), capped at 500 new tokens.
output_sequences = model.generate(**inputs, max_new_tokens=500, do_sample=False)

# Take the first sequence in the batch and drop its first `input_len` tokens.
generation = output_sequences[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)