In [None]:
%%capture
!pip install -q -U transformers accelerate datasets qwen-vl-utils

In [None]:
import gc
import json
import re
from datetime import datetime
from io import BytesIO

import pandas as pd
import torch
from datasets import load_dataset
from PIL import Image
from tqdm.auto import tqdm
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

# Lift Pillow's pixel-count ceiling so very large images can be opened.
# NOTE(review): this also switches off Pillow's guard against maliciously
# oversized images — acceptable only while the dataset source is trusted.
Image.MAX_IMAGE_PIXELS = None

In [None]:
# Single place for every tunable of the run; later cells read these by key
# and the whole mapping is written into the saved summary.
CONFIG = dict(
    model_name="Qwen/Qwen3-VL-4B-Instruct",
    dataset_name="yosubshin/m2sv",
    split="train",
    max_new_tokens=128,
    temperature=0.1,
    output_dir="./qwen3_eval_results",
    save_predictions=True,
)

# Echo the key settings so the run is identifiable from the cell output.
print(
    "Configuration:",
    f"  Model: {CONFIG['model_name']}",
    f"  Dataset: {CONFIG['dataset_name']}",
    f"  Device: {'cuda' if torch.cuda.is_available() else 'cpu'}",
    sep="\n",
)

In [None]:
print("[1/4] Loading Qwen3-VL model...")
print(f"  Model: {CONFIG['model_name']}")

# `device` is also read by later cells (periodic CUDA cache clearing, final cleanup).
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    CONFIG['model_name'],
    # Half precision on GPU, full precision on CPU. `dtype` is the current
    # keyword; `torch_dtype` is deprecated in recent transformers releases.
    dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
)

# The processor handles both chat templating and decoding in the evaluation loop.
processor = AutoProcessor.from_pretrained(CONFIG['model_name'])

print("Model loaded successfully")
print(f"  Parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")
print(f"  Device: {device}")

In [None]:
def safe_load_image(image_obj):
    """Return an independent RGB copy of a PIL image.

    Args:
        image_obj: A ``PIL.Image.Image``, ``None``, or any other object.

    Returns:
        ``None`` for ``None``; a new RGB image for PIL images (the image
        passed in is closed once it has been copied); any other object is
        handed back unchanged.
    """
    if image_obj is None:
        return None
    if not isinstance(image_obj, Image.Image):
        # Not a PIL image: nothing to copy or convert.
        return image_obj

    # Copy first so the pixel data survives closing the source image.
    detached = image_obj.copy()
    image_obj.close()
    return detached.convert('RGB')

print("[2/4] Loading m2sv dataset...")

# Dataset name and split both come from CONFIG.
dataset = load_dataset(
    CONFIG['dataset_name'],
    split=CONFIG['split'],
)
print(f"Loaded {len(dataset)} samples")

# Show the first record as a quick check of the fields the evaluation reads.
sample = dataset[0]
print("\nSample question:")
print(f"  Q: {sample['question']}")
print(f"  Options: {sample['options']}")
print(f"  Answer: {sample['answer']}")

In [None]:
# Stage banner for the evaluation cell.
print("[3/4] Running evaluation...")

def format_prompt(question, options):
    """Build the multiple-choice prompt text sent to the model.

    Args:
        question: The question text.
        options: Iterable of option strings, emitted one per line in order.

    Returns:
        The question, an "Options:" section, and a closing instruction to
        answer with a single letter.
    """
    option_lines = "".join(f"{opt}\n" for opt in options)
    instruction = "\nProvide only the letter of the correct answer (A, B, C, or D)."
    return f"{question}\n\nOptions:\n{option_lines}{instruction}"

def extract_answer(response):
    """Pull the chosen option letter (A-D) out of a model reply.

    Matching is token-based rather than substring-based: a plain
    ``'A' in response`` test would score "B. Because ..." or
    "The answer is C" as "A", since both contain the letter "a".

    Resolution order:
      1. an option letter that opens the reply, ignoring leading
         punctuation/whitespace ("B", "b.", "(C) ...", "**D**");
      2. the first stand-alone upper-case A-D in the reply as written, so a
         lower-case article "a" is not mistaken for option A;
      3. the first stand-alone A-D ignoring case;
      4. the first character of the upper-cased reply.

    Args:
        response: Decoded model output. ``None`` or an empty/blank string is
            treated as "no answer".

    Returns:
        A single upper-case character, or ``""`` when the reply is empty.
    """
    text = (response or "").strip()
    if not text:
        return ""
    upper = text.upper()

    # 1. Letter at the very start, not followed by another letter/digit.
    leading = re.match(r"[^A-Z0-9]*([ABCD])(?![A-Z0-9])", upper)
    if leading:
        return leading.group(1)

    # 2./3. Stand-alone letter anywhere: case-sensitive pass first, then the
    # upper-cased text so lower-case answers such as "answer: b" still count.
    standalone = r"(?<![A-Za-z0-9])([ABCD])(?![A-Za-z0-9])"
    for candidate in (text, upper):
        found = re.search(standalone, candidate)
        if found:
            return found.group(1)

    # 4. Nothing looks like an option letter; fall back to the first character.
    return upper[0]

# Accumulators read by the results cell: one dict per sample in `results`,
# the number of matching predictions in `correct`, and the denominator `total`.
results = []
correct = 0
total = len(dataset)


def _make_record(item, prediction, raw_response, is_correct):
    """Build one row of the predictions table for `item`.

    Fields are read with ``.get`` so a sample that lacks one still yields a
    row; this keeps the error handler below from raising a second exception
    and aborting the whole run.
    """
    return {
        'id': item.get('id'),
        'question': item.get('question'),
        'options': item.get('options'),
        'ground_truth': item.get('answer'),
        'prediction': prediction,
        'raw_response': raw_response,
        'correct': is_correct,
    }


# Loop-invariant generation settings. `do_sample` is set explicitly so the
# configured temperature is honoured regardless of the checkpoint's default
# generation config; a temperature of 0 selects greedy decoding.
use_sampling = CONFIG['temperature'] > 0
generation_kwargs = {
    'max_new_tokens': CONFIG['max_new_tokens'],
    'do_sample': use_sampling,
}
if use_sampling:
    generation_kwargs['temperature'] = CONFIG['temperature']
    # Seed the sampler so re-running the cell repeats the same draws
    # (optional CONFIG['seed'], defaulting to 0).
    torch.manual_seed(CONFIG.get('seed', 0))

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(dataset, desc="Evaluating", total=total) as pbar:
    for idx, item in enumerate(pbar):
        try:
            prompt = format_prompt(item['question'], item['options'])

            # Message content: every available image first, then the text prompt.
            content = []
            for image_key in ("image_sv", "image_map"):
                image = safe_load_image(item.get(image_key))
                if image is not None:
                    content.append({"type": "image", "image": image})

            if not content:
                raise ValueError("Sample is missing both scene and map images.")

            content.append({"type": "text", "text": prompt})
            messages = [{"role": "user", "content": content}]

            # The model is placed by device_map="auto", so send the inputs to
            # the model's own device rather than to a separately computed string.
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            ).to(model.device)

            with torch.no_grad():
                outputs = model.generate(**inputs, **generation_kwargs)

            # generate() returns prompt + continuation; keep only the new tokens.
            generated_ids_trimmed = [
                out_ids[len(in_ids):]
                for in_ids, out_ids in zip(inputs['input_ids'], outputs)
            ]

            response = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]

            prediction = extract_answer(response)
            # Compare against a normalised ground truth so stray whitespace or
            # letter case in the label cannot turn a right answer into a miss.
            # NOTE(review): assumes answers are stored as option letters — confirm.
            expected = str(item['answer']).strip().upper()
            is_correct = prediction == expected

            if is_correct:
                correct += 1

            results.append(_make_record(item, prediction, response, is_correct))

            # Running accuracy over the samples seen so far (errors count as misses).
            accuracy = correct / (idx + 1)
            pbar.set_postfix({'accuracy': f'{accuracy:.2%}'})

            # Periodically release Python garbage and cached CUDA blocks.
            if (idx + 1) % 10 == 0:
                gc.collect()
                if device == "cuda":
                    torch.cuda.empty_cache()

        except Exception as e:
            # Best-effort evaluation: log the failure, record it as an
            # incorrect 'ERROR' row, and carry on with the next sample.
            print(f"\nError on sample {idx}: {e}")
            results.append(_make_record(item, 'ERROR', str(e), False))

In [None]:
import os

print("[4/4] Analysis and results...")

# Guard the empty-dataset case so the report still renders instead of
# raising ZeroDivisionError.
accuracy = correct / total if total else 0.0
results_df = pd.DataFrame(results)

if results_df.empty:
    # No rows means no 'correct'/'prediction' columns to filter on.
    correct_samples = incorrect_samples = results_df
    error_count = 0
else:
    hit_mask = results_df['correct'].astype(bool)
    correct_samples = results_df[hit_mask].head(3)
    incorrect_samples = results_df[~hit_mask].head(3)
    # Samples that raised during evaluation are stored with prediction 'ERROR'.
    error_count = int((results_df['prediction'] == 'ERROR').sum())


def _print_examples(heading, rows):
    """Print `heading` followed by a short digest of each row in `rows`.

    Question and response text are truncated (80 and 100 characters) to keep
    the cell output compact; ``str()`` guards against non-string values.
    """
    print(f"\n{heading}")
    for _, row in rows.iterrows():
        print(f"\nQ: {str(row['question'])[:80]}...")
        print(f"  GT: {row['ground_truth']} | Pred: {row['prediction']}")
        print(f"  Response: {str(row['raw_response'])[:100]}")


print("\n" + "=" * 100)
print("EVALUATION RESULTS")
print("=" * 100)
print(f"Model: {CONFIG['model_name']}")
print(f"Dataset: {CONFIG['dataset_name']} ({CONFIG['split']} split)")
print(f"Total samples: {total}")
print(f"Correct: {correct}")
print(f"Incorrect: {total - correct}")
print(f"Errors (counted as incorrect): {error_count}")
print(f"Accuracy: {accuracy:.2%}")
print("=" * 100)

_print_examples("Sample Correct Predictions:", correct_samples)
_print_examples("Sample Incorrect Predictions:", incorrect_samples)

if CONFIG['save_predictions']:
    os.makedirs(CONFIG['output_dir'], exist_ok=True)

    # Per-sample predictions, one row per evaluated item.
    results_path = f"{CONFIG['output_dir']}/predictions.csv"
    results_df.to_csv(results_path, index=False)
    print(f"\nSaved predictions to: {results_path}")

    # Run-level metrics plus the exact configuration that produced them.
    summary = {
        'model': CONFIG['model_name'],
        'dataset': CONFIG['dataset_name'],
        'split': CONFIG['split'],
        'total_samples': total,
        'correct': correct,
        'incorrect': total - correct,
        'errors': error_count,
        'accuracy': float(accuracy),
        'timestamp': datetime.now().isoformat(),
        'config': CONFIG,
    }

    summary_path = f"{CONFIG['output_dir']}/summary.json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)
    print(f"Saved summary to: {summary_path}")

print("\n" + "=" * 100)
print("Evaluation complete!")
print("=" * 100)

# Drop the notebook's references to the model and processor, then reclaim
# host memory and, on GPU runs, cached CUDA memory.
del model, processor
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()