In [1]:
import json
import os
import random

import numpy as np
import evaluate
import torch
import torch.multiprocessing as mp
from datasets import Dataset
from PIL import Image
from IPython.display import display
from safetensors.torch import load_file as load_safetensors
from transformers import AutoImageProcessor, TrainingArguments, Trainer
from transformers.trainer_utils import get_last_checkpoint
from utils import DINOv2Dataset, DINOv2Classifier


# From YOLO to COCO annotation

In [2]:
# Repository root: everything in the working directory path that precedes the
# first 'dslab25', with 'dslab25/' appended again (note the trailing slash).
repo_dir = os.getcwd().partition('dslab25')[0] + 'dslab25/'

# Vacuum-pump data: augmented images, their per-image label files, and the
# location of the COCO JSON file.
root_dir = f"{repo_dir}training/vacuum_pump"
image_dir = os.path.join(root_dir, "images/augmented")
label_dir = os.path.join(root_dir, "annotation/augmented")
coco_path = os.path.join(root_dir, "coco_annotations.json")

# DINO working directory, training directory and the Hugging Face checkpoint id.
dino_dir = os.path.join(repo_dir, "obj_detection/dino")
training_dir = os.path.join(repo_dir, "training/vacuum_pump")
pretrained_model = "facebook/dinov2-with-registers-large"

In [3]:
# One class per stage folder: "stage_0" ... "stage_7".
stage_folders = ["stage_%d" % stage_idx for stage_idx in range(8)]

# Folder name -> integer category id (the folder's position in the list).
category_mapping = dict(zip(stage_folders, range(len(stage_folders))))

# COCO-style category records derived from the mapping above.
categories = [
	{"id": cat_id, "name": cat_name}
	for cat_name, cat_id in category_mapping.items()
]

# Empty COCO container that already carries the category list.
coco_output = dict(images=[], annotations=[], categories=categories)

In [None]:
import os
import json
from concurrent.futures import ThreadPoolExecutor
from PIL import Image

def process_stage(args):
    """Convert one stage folder of YOLO-style labels into COCO records.

    Args:
        args: 5-tuple ``(img_folder, label_folder, category_id,
            start_image_id, start_annotation_id)``. Packed into a single
            argument so the function can be used directly with
            ``executor.map``.

    Returns:
        ``(images, annotations, next_image_id, next_annotation_id)`` where
        ``images`` and ``annotations`` are lists of COCO-style dicts and the
        two ids are the first unused image / annotation id after this stage.

    Notes:
        * Only files ending in ``.jpg`` are considered. They are processed in
          sorted order so that the assigned ids are reproducible across runs
          (``os.listdir`` order is arbitrary).
        * The class id stored in each label line is ignored; every box gets
          the ``category_id`` passed in for the folder.
        * Label lines that do not have exactly five fields are skipped.
    """
    img_folder, label_folder, category_id, start_image_id, start_annotation_id = args
    images = []
    annotations = []
    local_image_id = start_image_id
    local_annotation_id = start_annotation_id
    stage_name = os.path.basename(img_folder)

    # Sorted for deterministic id assignment.
    img_filenames = sorted(f for f in os.listdir(img_folder) if f.endswith(".jpg"))

    for filename in img_filenames:
        image_path = os.path.join(img_folder, filename)
        # Swap only the final extension. str.replace would also rewrite a
        # ".jpg" that appears in the middle of the name and miss the label.
        label_name = os.path.splitext(filename)[0] + ".txt"
        label_path = os.path.join(label_folder, label_name)

        # Read image size (needed to de-normalise the box coordinates)
        with Image.open(image_path) as img:
            width, height = img.size

        images.append({
            "id": local_image_id,
            "file_name": f"{stage_name}/{filename}",
            "width": width,
            "height": height
        })

        # Process annotation if exists; images without a label file still get
        # an image record but no annotation.
        if os.path.isfile(label_path):
            with open(label_path, "r") as f:
                for line in f:
                    parts = line.split()
                    if len(parts) != 5:
                        continue
                    # First field is the class id from the label file (unused).
                    _, x_center, y_center, w, h = map(float, parts)

                    # Normalised centre/size -> absolute top-left/size.
                    x = (x_center - w / 2) * width
                    y = (y_center - h / 2) * height
                    box_width = w * width
                    box_height = h * height

                    annotations.append({
                        "id": local_annotation_id,
                        "image_id": local_image_id,
                        "category_id": category_id,
                        "bbox": [x, y, box_width, box_height],
                        "area": box_width * box_height,
                        "iscrowd": 0
                    })
                    local_annotation_id += 1

        local_image_id += 1

    return images, annotations, local_image_id, local_annotation_id

# Start from a fresh container so re-running this cell never appends to the
# results of a previous run. The category list built in the earlier cell is
# carried over; with an empty list the saved JSON has no class names and
# anything reading "categories" from it sees zero classes.
coco_output = {"images": [], "annotations": [], "categories": categories}

# One work item per stage folder. The start ids are placeholders (0, 0): every
# stage numbers its records locally and the ids are rewritten during the merge.
stage_paths = [
    (
        os.path.join(image_dir, stage),
        os.path.join(label_dir, stage),
        category_mapping[stage],
        0,
        0,
    )
    for stage in stage_folders
]

# Process stages in parallel. executor.map yields results in input order, so
# the merged output follows the order of stage_folders.
with ThreadPoolExecutor() as executor:
    all_results = list(executor.map(process_stage, stage_paths))

# Merge results and assign globally unique, consecutive ids.
image_id = 0
annotation_id = 0
for images, annotations, _, _ in all_results:
    # Local (per-stage) image id -> global image id, rebuilt for every stage
    # because local ids restart at 0 in each one.
    id_mapping = {}
    for img in images:
        old_id = img["id"]
        img["id"] = image_id
        id_mapping[old_id] = image_id
        coco_output["images"].append(img)
        image_id += 1

    for anno in annotations:
        anno["id"] = annotation_id
        anno["image_id"] = id_mapping[anno["image_id"]]  # fix image id
        coco_output["annotations"].append(anno)
        annotation_id += 1

# Save to JSON
with open(coco_path, "w") as f:
    json.dump(coco_output, f, indent=2)

print(f"COCO annotations saved to {coco_path}")

# Train DINO

In [5]:

# Backbone checkpoint used for fine-tuning (overrides the value set in the
# configuration cell).
pretrained_model = "facebook/dinov2-with-registers-base"

# Determine device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Initial device check: {device}")

# Load COCO annotations and build the dataset dictionary
with open(coco_path, 'r') as f:
	coco_data = json.load(f)

# One label per image; if an image has several annotations the last one wins.
image_to_category = {ann["image_id"]: ann["category_id"] for ann in coco_data["annotations"]}
dataset_dict = {"image_path": [], "label": []}
for image_info in coco_data["images"]:
	image_id = image_info["id"]
	file_name = image_info["file_name"]
	full_path = os.path.join(image_dir, file_name)
	# Images without any annotation are left out of the dataset.
	if image_id in image_to_category:
		dataset_dict["image_path"].append(full_path)
		dataset_dict["label"].append(image_to_category[image_id])

# Split the dataset into train and validation (90/10)
dataset = Dataset.from_dict(dataset_dict)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

print("Initializing Image Processor...")
processor = AutoImageProcessor.from_pretrained(pretrained_model)

print("Preparing datasets...")
train_dataset = DINOv2Dataset(dataset["train"], processor)
eval_dataset = DINOv2Dataset(dataset["test"], processor)

print("Train dataset size:", len(train_dataset))
print("Eval dataset size:", len(eval_dataset))

print("Initializing DINOv2 Classifier model for fine-tuning...")
# Size the head from the largest label id (and the declared categories, when
# the COCO file lists them) rather than from the number of distinct labels:
# if an intermediate class has no samples, counting distinct labels would
# produce a head too small for the highest label id.
num_labels = max(len(coco_data.get("categories", [])), max(dataset_dict["label"]) + 1)
model = DINOv2Classifier(num_labels=num_labels, pretrained_model=pretrained_model)

training_args = TrainingArguments(
	output_dir=os.path.join(dino_dir, "dinov2_finetune/base"),
	learning_rate=1e-5,  # Lower learning rate for fine-tuning
	per_device_train_batch_size=16,  # Adjust batch size to your GPU memory
	per_device_eval_batch_size=16,
	num_train_epochs=8,  # Fewer epochs may suffice for fine-tuning
	weight_decay=0.01,
	eval_strategy="epoch",
	save_strategy="epoch",
	load_best_model_at_end=True,
	dataloader_num_workers=4,
	logging_steps=10,
	fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

# Accuracy metric object used by compute_metrics below.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
	"""Score an evaluation pass with the module-level ``metric`` object.

	Args:
		eval_pred: Pair that unpacks into ``(logits, labels)``.

	Returns:
		Whatever ``metric.compute`` returns for the arg-max class of each
		row of ``logits`` compared against ``labels``.
	"""
	scores, references = eval_pred
	# Highest-scoring class per sample (axis 1 is indexed as the class axis).
	predicted_ids = np.argmax(scores, axis=1)
	result = metric.compute(predictions=predicted_ids, references=references)
	return result

trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=eval_dataset,
	compute_metrics=compute_metrics,
)

print("Starting training...")
# Resume only when a checkpoint actually exists. Passing
# resume_from_checkpoint=True unconditionally makes the very first run fail,
# because there is nothing to resume from in the output directory yet.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
	last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is not None:
	print(f"Resuming from checkpoint: {last_checkpoint}")
trainer.train(resume_from_checkpoint=last_checkpoint)

# Persist the final weights next to the intermediate checkpoints.
model_save_path = os.path.join(training_args.output_dir, "final_model")
trainer.save_model(model_save_path)
print(f"Model saved to {model_save_path}")

print("Evaluating final model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Initial device check: cuda
Initializing Image Processor...
Preparing datasets...
Train dataset size: 354196
Eval dataset size: 39356
Initializing DINOv2 Classifier model for fine-tuning...


Some weights of Dinov2WithRegistersForImageClassification were not initialized from the model checkpoint at facebook/dinov2-with-registers-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
6,0.0003,0.107267,0.985898
7,0.0,0.100237,0.988032
8,0.0666,0.093606,0.989201


Model saved to /workspace/dslab25/obj_detection/dino/dinov2_finetune/base/final_model
Evaluating final model...


Evaluation results: {'eval_loss': 0.09360600262880325, 'eval_accuracy': 0.9892011383270658, 'eval_runtime': 70.9364, 'eval_samples_per_second': 554.807, 'eval_steps_per_second': 34.679, 'epoch': 8.0}


# Evaluate DINO (on rendered images)

In [6]:
# Path of saved model. The training cell writes its final weights to
# <dino_dir>/dinov2_finetune/base/final_model, so build the path from dino_dir
# to load exactly the model that was just trained.
model_dir = os.path.join(dino_dir, "dinov2_finetune/base/final_model/")

In [7]:
# Create mappings for easy lookup
image_id_to_info = {img['id']: img for img in coco_data.get('images', [])}
# One annotation per image: if an image has several, the last one listed wins.
image_id_to_annotation = {ann['image_id']: ann for ann in coco_data.get('annotations', [])}
category_id_to_name = {cat['id']: cat.get('name', f'category_{cat["id"]}') for cat in coco_data.get('categories', [])}

# --- Prepare list of all images with labels ---
all_samples = []
for img_id, annotation in image_id_to_annotation.items():
	if img_id not in image_id_to_info:
		print(f"Warning: Image ID {img_id} found in annotations but not in images list.")
		continue

	image_info = image_id_to_info[img_id]
	file_name = image_info['file_name']
	full_path = os.path.join(image_dir, file_name)
	true_category_id = annotation['category_id']
	if os.path.exists(full_path): # Check if image file exists
		all_samples.append({
			"image_path": full_path,
			"true_label_id": true_category_id
		})
	else:
		print(f"Warning: Image file not found: {full_path}")

if not all_samples:
	raise Exception("Error: No valid image samples found. Check image paths and COCO file.")

print(f"Total valid samples found: {len(all_samples)}")

# --- Select 10 random samples ---
num_samples_to_test = min(10, len(all_samples))
if num_samples_to_test < 10:
	print(f"Warning: Fewer than 10 valid samples available. Testing on {num_samples_to_test}.")
random_samples = random.sample(all_samples, num_samples_to_test)

# --- Load Model and Processor ---
print("Loading image processor...")
# Use the same checkpoint id as training so preprocessing and the backbone
# architecture match the weights that are loaded below.
processor = AutoImageProcessor.from_pretrained(pretrained_model)

# Determine device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Loading model structure...")
# Assuming the number of labels can be inferred or is known (e.g., from categories in coco)
num_labels = len(category_id_to_name) if category_id_to_name else 8 # Default to 8 if no categories
if not category_id_to_name:
	print("Warning: Could not determine number of labels from COCO categories. Defaulting to 8.")

model = DINOv2Classifier(num_labels=num_labels, pretrained_model=pretrained_model)

# Construct the expected path to the weights file
# Transformers Trainer saves as model.safetensors or pytorch_model.bin
safetensors_path = os.path.join(model_dir, "model.safetensors")
bin_path = os.path.join(model_dir, "pytorch_model.bin")

model_weights_path = None
if os.path.exists(safetensors_path):
	model_weights_path = safetensors_path
elif os.path.exists(bin_path):
	model_weights_path = bin_path

if model_weights_path is None:
	print(f"Error: Model weights not found in {model_dir} (checked for model.safetensors and pytorch_model.bin)")
	raise Exception("Model weights not found")

print(f"Loading model weights from: {model_weights_path}")
try:
	# Use the correct loading function based on file type
	if model_weights_path.endswith(".safetensors"):
		state_dict = load_safetensors(model_weights_path, device=str(device))
	else: # Assume .bin or other torch.load compatible format
		# weights_only=True restricts unpickling to tensors/containers (safer default)
		state_dict = torch.load(model_weights_path, map_location=str(device), weights_only=True)

	# Adjust for potential DDP prefix ('module.') if saved during distributed training.
	# Strip it only when every key carries it, and only from the front of the key,
	# so a 'module.' occurring elsewhere in a name is left untouched and keys
	# without the prefix are never collapsed to an empty string.
	ddp_prefix = 'module.'
	if state_dict and all(key.startswith(ddp_prefix) for key in state_dict):
		state_dict = {key[len(ddp_prefix):]: value for key, value in state_dict.items()}

	model.load_state_dict(state_dict)
except Exception as e:
	print(f"Error loading model weights: {e}")
	raise  # re-raise with the original traceback


model.to(device)
model.eval() # Set model to evaluation mode
print("\n--- Starting Evaluation on Random Samples ---")
# --- Perform Inference ---
correct_predictions = 0
for i, sample in enumerate(random_samples):
	image_path = sample["image_path"]
	true_label_id = sample["true_label_id"]
	true_label_name = category_id_to_name.get(true_label_id, f"ID_{true_label_id}")

	try:
		# Close the file handle as soon as the pixels are converted; convert()
		# returns an independent in-memory image.
		with Image.open(image_path) as image_file:
			image = image_file.convert("RGB")
		display(image)  # Display the image in Jupyter

		inputs = processor(images=image, return_tensors="pt")
		pixel_values = inputs['pixel_values'].to(device)

		with torch.no_grad():
			outputs = model(pixel_values=pixel_values)

		logits = outputs['logits']
		predicted_label_id = logits.argmax(-1).item()
		predicted_label_name = category_id_to_name.get(predicted_label_id, f"ID_{predicted_label_id}")

		print(f"Sample {i+1}/{num_samples_to_test}:")
		print(f"Image: {os.path.basename(image_path)}")
		print(f"True Label: {true_label_name} (ID: {true_label_id})")
		print(f"Predicted Label: {predicted_label_name} (ID: {predicted_label_id})")
		if predicted_label_id == true_label_id:
			correct_predictions += 1
			print("Result: ✅ CORRECT")
		else:
			print("Result: ❌ INCORRECT")
		print("-" * 40)

	except Exception as e:
		# Best effort: report the failing sample and keep going. A failed
		# sample still counts in the denominator of the summary below.
		print(f"Error processing sample {i+1} ({image_path}): {e}")
		print("-" * 40)

print(f"\nEvaluation Summary: {correct_predictions}/{num_samples_to_test} correct.")


KeyboardInterrupt: 