In [None]:
import os
# os.environ["HF_HOME"] = "/workspace/huggingface"
# or
# os.environ["TRANSFORMERS_CACHE"] = "/workspace/hf_models"

import base64
import io
import json
import torch
import numpy as np
import cv2
from ultralytics import YOLO
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoImageProcessor
from utils import DINOv2Classifier
from safetensors.torch import load_file as load_safetensors
from IPython.display import display, HTML

In [None]:
# CONFIGURATION
# Repository root: the part of the current working directory up to and
# including the first "dslab25" path fragment.
repo_dir = os.getcwd().split('dslab25')[0] + 'dslab25/'

# Evaluation video and the text file with its ground-truth label ranges.
video_path = os.path.join(repo_dir, "assets/vacuum_pump/videos/01_run1_cam_2_1024x1024_15fps_3mbps.mp4")
labels_path = os.path.join(repo_dir, "assets/vacuum_pump/videos/output.txt")

# Scratch directory for crops/frames written by the evaluation cells,
# per-image annotation files, and the training asset directory.
temp_images_dir = os.path.join(repo_dir, "temp_images")
anno_dir = os.path.join(repo_dir, "assets/vacuum_pump/eval/anno")
base_dir = os.path.join(repo_dir, "training/vacuum_pump")

# COCO annotations (category id -> name), defined once relative to base_dir.
coco_path = os.path.join(base_dir, "coco_annotations.json")

# Minimum YOLO confidence for a detection to be considered.
YOL_THRESHOLD = 0.38

os.makedirs(temp_images_dir, exist_ok=True)

In [None]:
# Checkpoint name handed to the image processor and to DINOv2Classifier.
pretrained_model = "facebook/dinov2-with-registers-base"

# Directory searched for the fine-tuned classifier weights
# (model.safetensors or pytorch_model.bin).
model_dir = os.path.join(repo_dir, "obj_detection/dino/dinov2_finetune/base/final_model/")

# Weights file of the trained YOLO detector.
yolo_model_path = os.path.join(
	repo_dir,
	"obj_detection/dino/yolo_runs/yolov12_boundingbox2",
	"weights",
	"best.pt",
)

In [None]:
def load_labels(labels_path):
	"""Build a frame-index -> class-id lookup from a label range file.

	Every usable line holds exactly three whitespace-separated integers,
	``<class> <start_frame> <end_frame>``; the frame range is inclusive at
	both ends. Lines with any other number of fields are ignored. When ranges
	overlap, the line that appears later in the file wins.

	Args:
		labels_path: Path of the text file to read.

	Returns:
		dict mapping each covered frame index (int) to its class id (int).

	Raises:
		ValueError: If a three-field line contains a non-integer token.
	"""
	lookup = {}
	with open(labels_path, 'r') as handle:
		for raw_line in handle:
			fields = raw_line.split()
			if len(fields) != 3:
				continue
			label, first, last = map(int, fields)
			lookup.update(dict.fromkeys(range(first, last + 1), label))
	return lookup

# Ground truth: frame index -> class id, parsed from the label range file.
print(f"Loading labels from: {labels_path}")
frame_to_class = load_labels(labels_path)

# Category names: COCO category id -> display name. A missing or malformed
# annotation file is reported and leaves the mapping empty (best effort).
print(f"Loading COCO annotations from: {coco_path}")
category_id_to_name = {}
try:
	with open(coco_path, 'r') as coco_file:
		coco_data = json.load(coco_file)
except (FileNotFoundError, json.JSONDecodeError) as load_error:
	print(f"Error loading COCO annotations: {load_error}")
else:
	for category in coco_data.get('categories', []):
		category_id = category['id']
		# Categories without a name fall back to "category_<id>".
		category_id_to_name[category_id] = category.get('name', f'category_{category_id}')

# Yolo

In [None]:
# Define label grouping logic
def labels_match(pred, true):
	"""Decide whether a predicted label counts as correct for a true label.

	Two labels match when they are equal, or when both belong to the same
	equivalence group: {4, 5} or {6, 7}.

	Args:
		pred: Predicted class id.
		true: Ground-truth class id.

	Returns:
		True if the labels are equal or share a group, otherwise False.
	"""
	if pred == true:
		return True
	equivalent_groups = ({4, 5}, {6, 7})
	return any(pred in group and true in group for group in equivalent_groups)

# Font for annotations; Pillow's built-in default is used when the Arial
# file cannot be opened.
try:
	font = ImageFont.truetype("arial.ttf", 16)
except IOError:
	font = ImageFont.load_default()

# Only every FRAME_STRIDE-th frame that also has a ground-truth label is kept.
FRAME_STRIDE = 5

print(f"Loading video from: {video_path}")
video = cv2.VideoCapture(video_path)
try:
	if not video.isOpened():
		# IOError is a subclass of Exception, so existing handlers still apply.
		raise IOError("Error: Could not open video file.")

	total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
	fps = video.get(cv2.CAP_PROP_FPS)
	print(f"Video info: {total_frames} frames, {fps} fps")

	print("\n--- Starting YOLO Evaluation ---")
	frame_idx = 0
	frames_to_process = []	# list of (frame index, BGR frame as read by OpenCV)
	while True:
		ret, frame = video.read()
		if not ret:
			break
		if frame_idx % FRAME_STRIDE == 0 and frame_idx in frame_to_class:
			frames_to_process.append((frame_idx, frame))
		frame_idx += 1
finally:
	# Release the capture even when opening or decoding fails.
	video.release()
print(f"Total frames to evaluate: {len(frames_to_process)}")

os.makedirs(temp_images_dir, exist_ok=True)


In [None]:
# Load the detector from the weights file configured in `yolo_model_path`.
print("Loading YOLO model...")
yolo_model = YOLO(yolo_model_path)

In [None]:
# Evaluate the YOLO detector on the sampled frames. For each frame the
# highest-confidence detection at or above YOL_THRESHOLD is the prediction;
# its crop is saved to temp_images_dir as frame_<zero-padded index>.jpg.
correct_predictions = 0
total_predictions = 0
skipped_frames = 0	# frames that produced no usable detection

for frame_idx, frame in frames_to_process:
	# The PIL image (RGB) is only used for cropping; the detector receives the
	# frame exactly as OpenCV delivered it.
	image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

	yolo_results = yolo_model(frame)
	if len(yolo_results) == 0 or len(yolo_results[0].boxes) == 0:
		print(f"Frame {frame_idx}: No detection found. Skipping frame.")
		skipped_frames += 1
		continue

	boxes = yolo_results[0].boxes.data	# Each row: [x1, y1, x2, y2, conf, cls]
	true_label = frame_to_class[frame_idx]
	true_label_name = category_id_to_name.get(true_label, f"Class_{true_label}")

	# Highest-confidence box among those passing the threshold (first one on ties).
	best_box = max(
		(box for box in boxes if box[4].item() >= YOL_THRESHOLD),
		key=lambda b: b[4].item(),
		default=None,
	)
	if best_box is None:
		# Previously these frames vanished from the statistics without a trace.
		print(f"Frame {frame_idx}: No detection with confidence >= {YOL_THRESHOLD}. Skipping frame.")
		skipped_frames += 1
		continue

	confidence = best_box[4].item()
	predicted_label = int(best_box[5].item())
	predicted_label_name = category_id_to_name.get(predicted_label, f"Class_{predicted_label}")

	# Clamp the box to the image bounds before cropping.
	x1, y1, x2, y2 = map(int, best_box[:4].tolist())
	x1 = max(0, x1)
	y1 = max(0, y1)
	x2 = min(image.width, x2)
	y2 = min(image.height, y2)

	if x2 > x1 and y2 > y1:
		cropped_image = image.crop((x1, y1, x2, y2))
		cropped_path = os.path.join(temp_images_dir, f"frame_{str(frame_idx).zfill(4)}.jpg")
		cropped_image.save(cropped_path)
		print(f"Saved cropped image: {cropped_path}")
	else:
		# An empty crop cannot be written as an image; the class prediction is still scored.
		print(f"Frame {frame_idx}: degenerate box ({x1}, {y1}, {x2}, {y2}); crop not saved.")

	is_correct = labels_match(predicted_label, true_label)
	if is_correct:
		correct_predictions += 1
	total_predictions += 1

	print(f"Frame {frame_idx}")
	print(f"	True:	  {true_label_name} (ID: {true_label})")
	print(f"	Predicted: {predicted_label_name} (ID: {predicted_label}) | Conf: {confidence:.2f}")
	print(f"	Correct:   {'✅ Yes' if is_correct else '❌ No'}")
	print("-" * 30)

# Accuracy is computed over frames with a usable detection only.
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
print("\nEvaluation Summary:")
print(f"	Total predictions: {total_predictions}")
print(f"	Correct predictions: {correct_predictions}")
print(f"	Skipped frames (no usable detection): {skipped_frames}")
print(f"	Accuracy: {accuracy:.2f}")

# SAM 2

In [None]:
import cv2, imageio, numpy as np, torch, torch.nn.functional as F
from IPython.display import Video
from sam2.sam2_image_predictor  import SAM2ImagePredictor
from sam2.sam2_video_predictor  import SAM2VideoPredictor
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from transformers import AutoModel, AutoImageProcessor

# 2.  PATHS & OPTIONS  ---------------------------------------------------------
# Reference renders 1..9 of the stage-0 part; their embeddings are compared
# against mask crops to pick the object to track.
refs = [
    os.path.join(base_dir, f"images/original/stage_0/stage_0_var_0_case_render_{render_no}.jpg")
    for render_no in range(1, 10)
]
boxed_out   = "sam2_boxed.mp4"               # output video with the tracked box drawn
device      = "cuda" if torch.cuda.is_available() else "cpu"
skip_frames = 70                             # index of the seed frame; earlier frames are not written

# ── 2. UTILS ─────────────────────────────────────────────────────────────────
def read_video_rgb(path):
    """Decode every frame of a video into memory as RGB arrays.

    Args:
        path: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        ``(frames, fps)`` where ``frames`` is the list of frames in read
        order, each converted from BGR to RGB, and ``fps`` is the reported
        frame rate truncated to ``int`` (30 when the capture reports 0).

    Raises:
        IOError: If the capture cannot be opened.
    """
    cap = cv2.VideoCapture(path)
    try:
        if not cap.isOpened():
            raise IOError(f"cannot open {path}")
        frames = []
        while True:
            ret, frm = cap.read()
            if not ret:
                break
            frames.append(cv2.cvtColor(frm, cv2.COLOR_BGR2RGB))
        fps = cap.get(cv2.CAP_PROP_FPS) or 30
    finally:
        # Release the capture on every path, including open/decoding errors.
        cap.release()
    return frames, int(fps)

def get_feat(img_rgb):
    """Embed one image with the DINOv2 backbone and return a unit-length vector.

    Relies on the module-level ``dinov2_proc``, ``dinov2_backbone`` and
    ``device``. The embedding is the token at index 0 of the backbone's
    ``last_hidden_state`` (presumably the CLS token - confirm against the
    model docs), L2-normalised along the last dimension.

    Args:
        img_rgb: Image accepted by the processor; callers pass RGB arrays.

    Returns:
        Normalised feature tensor on the CPU.
    """
    model_inputs = dinov2_proc(images=img_rgb, return_tensors="pt").to(device)
    with torch.no_grad():
        hidden = dinov2_backbone(**model_inputs).last_hidden_state
    first_token = hidden[:, 0].squeeze(0)
    return F.normalize(first_token, dim=-1).cpu()

# ── 3. LOAD VIDEO ────────────────────────────────────────────────────────────
# All frames are held in memory as RGB arrays; H and W come from the first one.
frames, FPS = read_video_rgb(video_path)
assert len(frames) > skip_frames, f"Video must have more than {skip_frames} frames!"
H, W = frames[0].shape[:2]

# ── 4. INITIALISE MODELS ────────────────────────────────────────────────────
backbone_name   = "facebook/dinov2-with-registers-small"
dinov2_proc     = AutoImageProcessor.from_pretrained(backbone_name)
dinov2_backbone = AutoModel.from_pretrained(backbone_name).to(device).eval()

# NOTE(review): no device is passed to the SAM2 predictors here - confirm they
# end up on `device`.
sam2_checkpoint = "facebook/sam2.1-hiera-tiny"
img_pred = SAM2ImagePredictor.from_pretrained(sam2_checkpoint)
vid_pred = SAM2VideoPredictor.from_pretrained(sam2_checkpoint)

# ── 5. REFERENCE EMBEDDINGS ─────────────────────────────────────────────────
def _reference_feature(image_path):
    """Read one reference image from disk and return its get_feat embedding."""
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise IOError(f"cannot open reference image {image_path}")
    return get_feat(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))

ref_feats = [_reference_feature(p) for p in refs]

# ── 6. AUTO MASKS ON SEED FRAME ─────────────────────────────────────────────
seed_frame = frames[skip_frames]
mask_gen = SAM2AutomaticMaskGenerator(
    img_pred.model,
    points_per_side=32,
    pred_iou_thresh=0.7,          # only keep masks with IoU-pred confidence ≥ 0.7
    stability_score_thresh=0.9,   # only keep very stable masks
    box_nms_thresh=0.3,           # merge overlapping boxes more aggressively
    min_mask_region_area=1000     # drop very small regions
)
masks = mask_gen.generate(seed_frame)

# ── 7. PICK BEST MASK BY COS-SIM ────────────────────────────────────────────
# Reference features are unit-length (see get_feat), so a dot product with
# another unit-length feature is their cosine similarity. The matrix is built
# once instead of once per mask.
ref_matrix = torch.stack(ref_feats)          # [n_refs, dim]

best_m, best_sim = None, -1.0
for m in masks:
    # The bbox is used as (x, y, width, height); clamp it to the frame.
    x_f, y_f, w_f, h_f = m["bbox"]
    x0 = max(0, int(round(x_f)))
    y0 = max(0, int(round(y_f)))
    x1 = min(W, int(round(x_f + w_f)))
    y1 = min(H, int(round(y_f + h_f)))
    if x1 <= x0 or y1 <= y0:
        # Empty after clamping; this also guarantees the crop below is non-empty.
        continue

    crop = seed_frame[y0:y1, x0:x1]
    sims = ref_matrix @ get_feat(crop)       # [n_refs]
    sim  = sims.max().item()                 # score = best match over all references
    if sim > best_sim:
        best_m, best_sim = m, sim

if best_m is None:
    raise RuntimeError("No mask matched the reference images!")

mask0 = torch.from_numpy(best_m["segmentation"]).to(device).bool()

# ── 8. TRACK & DRAW BOX (FRAMES BEFORE `skip_frames` ARE NOT WRITTEN) ───────
# --- use the MP4 path rather than a tensor ---
state = vid_pred.init_state(video_path=video_path)

# Seed the tracker with the selected mask on the seed frame.
vid_pred.add_new_mask(state, frame_idx=skip_frames, mask=mask0, obj_id=0)
writer = imageio.get_writer(
    boxed_out,
    format="FFMPEG",    # force the FFmpeg plugin
    codec="libx264",    # MP4/H.264 codec
    fps=FPS,
    ffmpeg_params=["-pix_fmt", "yuv420p"]  # ensures broad compatibility
)
try:
    with torch.inference_mode(), torch.autocast(device_type=device, dtype=torch.bfloat16):
        # The second yielded item is unused; it gets a real name because `_`
        # is IPython's last-output variable.
        for f_idx, _obj_ids, logits in vid_pred.propagate_in_video(state):
            if f_idx < skip_frames:
                continue

            # First object's mask, squeezed to 2-D (assumes a leading
            # singleton channel dim - TODO confirm), then thresholded at 0.5.
            mask2d = logits.sigmoid()[0].squeeze(0)
            bin_m = (mask2d > 0.5).cpu().numpy().astype(np.uint8)

            frame = frames[f_idx].copy()
            if bin_m.any():
                ys, xs = np.where(bin_m)
                # Plain Python ints: numpy integer scalars may be rejected by
                # OpenCV drawing functions depending on the build.
                top_left = (int(xs.min()), int(ys.min()))
                bottom_right = (int(xs.max()), int(ys.max()))
                cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0), 2)
            writer.append_data(frame)
finally:
    # Close the output file even if propagation fails midway.
    writer.close()

# ── 9. DISPLAY RESULT ───────────────────────────────────────────────────────
Video(boxed_out, embed=True, width=min(W, 640))

# DinoV2

In [None]:
from glob import glob

# Crops written by the YOLO evaluation cell are the classifier's inputs.
image_files = sorted(glob(os.path.join(temp_images_dir, "*.jpg")))
correct_predictions = 0
total_predictions = 0

# Load image processor and model
print("Loading image processor...")
processor = AutoImageProcessor.from_pretrained(pretrained_model)

# Determine device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Number of classes is derived from the highest ground-truth class id
# (defaults to 8 when no labels were loaded).
num_labels = max(frame_to_class.values()) + 1 if frame_to_class else 8
print(f"Number of classes: {num_labels}")

# Load model
print("Loading model...")
model = DINOv2Classifier(num_labels=num_labels, pretrained_model=pretrained_model)

# Locate the weights file: safetensors is preferred over the .bin checkpoint.
safetensors_path = os.path.join(model_dir, "model.safetensors")
bin_path = os.path.join(model_dir, "pytorch_model.bin")

model_weights_path = None
if os.path.exists(safetensors_path):
	model_weights_path = safetensors_path
elif os.path.exists(bin_path):
	model_weights_path = bin_path

if model_weights_path is None:
	# FileNotFoundError is a subclass of Exception, so existing handlers still apply.
	raise FileNotFoundError(f"Error: Model weights not found in {model_dir}")

print(f"Loading model weights from: {model_weights_path}")
if model_weights_path.endswith(".safetensors"):
	state_dict = load_safetensors(model_weights_path, device=str(device))
else:
	state_dict = torch.load(model_weights_path, map_location=str(device), weights_only=True)

# Handle potential DDP prefix. Only a *leading* "module." is removed, and keys
# without the prefix are left untouched (the earlier partition-based version
# mapped them to an empty string).
ddp_prefix = 'module.'
if next(iter(state_dict), '').startswith(ddp_prefix):
	state_dict = {k.removeprefix(ddp_prefix): v for k, v in state_dict.items()}

model.load_state_dict(state_dict)

model.to(device)
model.eval()


In [None]:
# Classify every saved crop and score it against its annotation file
# (anno_dir/<crop file name>.txt, whose first line holds the true class id).
correct_predictions = 0
total_predictions = 0

for image_path in image_files:
	# Infer annotation file path
	filename, _extension = os.path.splitext(os.path.basename(image_path))
	anno_path = os.path.join(anno_dir, f"{filename}.txt")
	print(anno_path)
	if not os.path.exists(anno_path):
		print(f"Annotation not found for {filename}, skipping.")
		continue

	# Read true class label from the first line of the annotation file.
	with open(anno_path, "r") as anno_file:
		first_line = anno_file.readline()
	try:
		true_label = int(first_line.strip())
	except ValueError:
		print(f"Invalid label in {anno_path}, skipping.")
		continue

	# Load image
	image = Image.open(image_path).convert("RGB")
	display(image)

	# Preprocess and run the classifier without tracking gradients.
	pixel_values = processor(images=image, return_tensors="pt")["pixel_values"].to(device)
	with torch.no_grad():
		logits = model(pixel_values=pixel_values)["logits"]

	print("Probabilities:", torch.softmax(logits, dim=-1))
	predicted_label = logits.argmax(-1).item()

	# Grouped labels ({4, 5} and {6, 7}) count as a match.
	is_correct = labels_match(predicted_label, true_label)
	total_predictions += 1
	correct_predictions += int(is_correct)

	print(f"{filename}:")
	print(f"	True:	{true_label}")
	print(f"	Predicted: {predicted_label}")
	print(f"	Correct:	 {'✅ Yes' if is_correct else '❌ No'}")
	print("-" * 30)

accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
print("\nEvaluation Summary:")
print(f"	Total evaluated: {total_predictions}")
print(f"	Correct:		 {correct_predictions}")
print(f"	Accuracy:		{accuracy:.2f}")

# Qwen VL

In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info


# Description of each assembly state, in order; the dict below keys them as
# 'state_0' .. 'state_7'.
_state_descriptions = [
	'Base block metal piece',
	'Cylinder metal piece which gets stick on the base block stage_0',
	'A Big metal piece which gets stick on the cylinder piece of stage_1',
	'A smaller thin metal piece which gets put onto the center of the big metal piece of stage_2',
	'A tiny metal ring which gets placed onto the center of the thing metal piece of stage_3',
	'3 screws now get screwed onto the piece',
	'A darker metal plate now gets placed on top of the piece',
	'5 screws now get screwed onto the piece',
]
states = {f'state_{index}': text for index, text in enumerate(_state_descriptions)}

# Model and processor are loaded from the same checkpoint.
QWEN_VL_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# (pass attn_implementation="flash_attention_2" to from_pretrained to enable it)
qwen_vl_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	QWEN_VL_MODEL_ID,
	torch_dtype=torch.bfloat16,
	device_map="auto",
)

# default processor
qwen_vl_processor = AutoProcessor.from_pretrained(QWEN_VL_MODEL_ID)

# Messages containing a local video path and a text query

In [None]:
# Messages containing a local video path and a text query

def display_images_horizontal(image_paths, size=(100, 100)):
	"""Show the given images side by side in the notebook output.

	Each image is resized to ``size``, encoded as PNG and embedded as a
	base64 data URI inside a flex-row ``<div>``.

	Args:
		image_paths: Iterable of local image file paths.
		size: ``(width, height)`` each image is resized to.
	"""
	tags = []
	for path in image_paths:
		# Close the source file as soon as the resized copy exists.
		with Image.open(path) as source:
			thumb = source.resize(size)
		buffer = io.BytesIO()
		thumb.save(buffer, format='PNG')
		img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
		tags.append(f'<img src="data:image/png;base64,{img_str}" style="margin:2px;" />')
	html = '<div style="display: flex; flex-direction: row;">' + ''.join(tags) + '</div>'
	display(HTML(html))
	
def qwen_vl(image_paths: list[str], current_dino_state: str, previous_qwen_state: str, previous_information: str):
	"""Ask Qwen2.5-VL which assembly state a short clip ends in.

	Relies on the module-level ``qwen_vl_model``, ``qwen_vl_processor`` and
	``fps``.

	Args:
		image_paths: Frames of the clip in temporal order, given as URIs, e.g.
			``["file:///path/to/frame1.jpg", "file:///path/to/frame2.jpg"]``.
			They are submitted to the model as one "video" entry.
		current_dino_state: Text inserted into the prompt as the classifier's
			estimate of the current state.
		previous_qwen_state: Text inserted into the prompt as the state at the
			beginning of the clip.
		previous_information: Text inserted into the prompt describing what
			happened before the clip.

	Returns:
		list[str]: Decoded generations, one per batch item (a single item is
		submitted). The prompt asks for a JSON object with the keys "state"
		and "action_description"; the reply is returned unvalidated.
	"""
	messages = [
		{
			"role": "user",
			"content": [
			{
				"type": "video",
				"video": image_paths
			},
			{
				"type": "text",
				"text": f'''
					You are given a short video clip of a person assembling an object. The clip is part of a longer clip of the person assembling the object.
					Your task is it to tell me what you see in the video and based on that and the previous action, some more information below make a prediction on which state the piece is at the end of the clip.
					Note that the clip may end while the person is still in the middle of an action.
					If the state is in between two states, return the state it was in before, so if the person is moving the cylinder piece to the base block, return state_0, only when the cylinder piece is on the base block, return state_1.
					Here are the possible states:
						'state_0': 'First part of the object: Base block metal piece',
						'state_1': 'Second part of the object: Cylinder metal piece which gets stick on the base block stage_0',
						'state_2': 'Third part of the object: A Big metal piece which gets stick on the cylinder piece of stage_1',
						'state_3': 'Fourth part of the object: A smaller thin metal piece which gets put onto the center of the big metal piece of stage_2',
						'state_4': 'Fifth part of the object: A tiny metal ring which gets placed onto the center of the thing metal piece of stage_3',
						'state_5': 'Sixth part of the object: 3 screws now get screwed onto the piece',
						'state_6': 'Seventh part of the object: A darker metal plate now gets placed on top of the piece',
						'state_7': 'Eighth part of the object: 5 screws now get screwed onto the piece'
					Additionally I will give you the state in which the classifier thinks the piece is at.
					Here is the information (it might be none id it didnt predict anything):
						{current_dino_state}
					Additionally I will give you the state in which the classifier thinks the piece was at at the beginning of the clip.
					Here is the information ((it might be none id it didnt predict anything)):
						{previous_qwen_state}
					Additionally, I will give you some information about what the person did 5 seconds before the clip starts.
					Here is the information:
						{previous_information}

					Please return the output in the following format:
					{{
						"state": "state_name",
						"action_description": "action_description"
					}}
				'''},
				],
			}
		]

	text = qwen_vl_processor.apply_chat_template(
		messages, tokenize=False, add_generation_prompt=True
	)
	image_inputs, video_inputs = process_vision_info(messages)
	# NOTE(review): `fps` is the source video's frame rate, while callers pass
	# sub-sampled frames - confirm this is the intended value.
	inputs = qwen_vl_processor(
		text=[text],
		images=image_inputs,
		videos=video_inputs,
		fps=fps,
		padding=True,
		return_tensors="pt",
	)
	# Follow the model's own device instead of assuming "cuda": the model is
	# loaded with device_map="auto".
	inputs = inputs.to(qwen_vl_model.device)

	# Inference
	generated_ids = qwen_vl_model.generate(**inputs, max_new_tokens=128)
	# Drop the prompt tokens so that only the newly generated part is decoded.
	generated_ids_trimmed = [
		out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	]
	output_text = qwen_vl_processor.batch_decode(
		generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
	)
	print(output_text)
	return output_text

def _parse_qwen_response(raw_text):
	"""Return the JSON object embedded in a model reply, or None if there is none.

	Everything between the first '{' and the last '}' is parsed, so text
	around the object (such as a Markdown code fence) is tolerated.
	"""
	start, end = raw_text.find('{'), raw_text.rfind('}')
	if start == -1 or end <= start:
		return None
	try:
		parsed = json.loads(raw_text[start:end + 1])
	except json.JSONDecodeError:
		return None
	return parsed if isinstance(parsed, dict) else None


def _state_to_label(state_name):
	"""Convert a state name such as "state_3" to the integer 3; None if not possible."""
	try:
		return int(str(state_name).rsplit('_', 1)[-1])
	except ValueError:
		return None


previous_qwen_state = "None because its the first clip"
previous_information = "None because its the first clip"
# No classifier prediction is computed in this cell, so this placeholder is
# what every prompt receives.
current_dino_state = "None because its the first clip"

correct_predictions = 0
total_predictions = 0

CLIP_LENGTH = 9	# number of sampled frames shown to the model per clip

for clip_start in range(0, len(frames_to_process), CLIP_LENGTH):
	# Write the clip's frames to disk; the model receives them as file:// URIs.
	# NOTE(review): these .jpg files share temp_images_dir with the YOLO crops
	# that the DINOv2 cell globs - confirm that is acceptable on re-runs.
	image_paths = []
	for frame_idx, frame in frames_to_process[clip_start:clip_start + CLIP_LENGTH]:
		frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
		image = Image.fromarray(frame_rgb)
		image_path = os.path.join(temp_images_dir, f"frame_{frame_idx}_original.jpg")
		image.save(image_path)
		image_paths.append(f'file://{image_path}')
	print("image_paths:", image_paths)

	# Ground truth is the label of the last frame of the clip (the prompt asks
	# for the state at the end of the clip).
	true_label = frame_to_class[frame_idx]

	display_images_horizontal([img.replace('file://', '') for img in image_paths])
	raw_reply = qwen_vl(image_paths, current_dino_state, previous_qwen_state, previous_information)[0]
	res = _parse_qwen_response(raw_reply)

	if res is None or "state" not in res:
		# Unusable reply: score it as wrong and keep the previous context.
		print(f"Frame {frame_idx}: could not parse the model reply; counting it as incorrect.")
		predicted_label = None
		is_correct = False
	else:
		previous_qwen_state = res["state"]
		previous_information = res.get("action_description", previous_information)
		predicted_label = previous_qwen_state
		# labels_match compares integer class ids, so "state_N" is mapped to N
		# first (assumes class id N corresponds to state_N - TODO confirm).
		predicted_id = _state_to_label(previous_qwen_state)
		is_correct = predicted_id is not None and labels_match(predicted_id, true_label)

	if is_correct:
		correct_predictions += 1
	total_predictions += 1

	print(f"Frame {frame_idx}")
	print(f"	True:	{true_label}")
	print(f"	Predicted: {predicted_label}")
	print(f"	Correct:	 {'✅ Yes' if is_correct else '❌ No'}")
	print("-" * 30)



accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
print("\nEvaluation Summary:")
print(f"	Total predictions: {total_predictions}")
print(f"	Correct predictions: {correct_predictions}")
print(f"	Accuracy: {accuracy:.2f}")
