In [1]:
!pip install -q transformers datasets accelerate tqdm bitsandbytes psutil

In [2]:
pip install -U vllm --pre --index-url https://pypi.org/simple --extra-index-url https://wheels.vllm.ai/nightly


Looking in indexes: https://pypi.org/simple, https://wheels.vllm.ai/nightly


In [3]:
"""
MMLU Evaluation - Medium Models (Google Colab Version)

This script evaluates medium-sized models (7B-14B parameters) on MMLU.
Optimized for Google Colab with GPU support.

Medium Models from Assignment:
- Qwen 2.5 7B (7B params)
- Llama 3.1 8B (8B params)
- Qwen 2.5 14B (14B params)

Note: Google Colab's T4 GPU provides ~15 GB of memory.
On a GPU that small, should_quantize() selects 4-bit quantization for all three models.
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import json
from tqdm.auto import tqdm
import os
from datetime import datetime
import sys
import platform
import time
import psutil
import gc

# Detect Google Colab by probing for its helper package. Only a failed import
# means "not in Colab"; anything else (e.g. KeyboardInterrupt) must propagate.
try:
    import google.colab  # noqa: F401 - imported only to test availability
    IN_COLAB = True
    print("✓ Running in Google Colab")
except ImportError:
    IN_COLAB = False
    print("✓ Running locally")

# ============================================================================
# CONFIGURATION - Medium Models
# ============================================================================

# Hugging Face repo ids of the models to evaluate, in evaluation order.
# The trailing sizes are approximate unquantized memory footprints.
MODELS = [
    "Qwen/Qwen2.5-7B-Instruct",      # 7B - ~14GB
    "meta-llama/Llama-3.1-8B-Instruct",  # 8B - ~16GB (gated repo: the recorded run could not load it without a Hugging Face token)
    "Qwen/Qwen2.5-14B-Instruct"      # 14B - ~28GB (use 4-bit quantization)
]

# When False, detect_device() returns "cpu" without probing for CUDA/MPS.
USE_GPU = True

# Quantization settings
# Colab T4 has ~15GB memory, so we need quantization for larger models
# NOTE(review): this constant is never read in the code visible in this cell;
# the bit width is chosen per model by should_quantize() — confirm it can go.
QUANTIZATION_BITS = None  # Will be set per-model based on size

# Passed to model.generate() as max_new_tokens; only the first character of the
# decoded continuation is inspected for an answer letter.
MAX_NEW_TOKENS = 1

# MMLU subject configs (of the "cais/mmlu" dataset) to evaluate; 10 in total.
MMLU_SUBJECTS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine"
]

# When True, evaluate_subject() prints every question/answer and hides the progress bar.
VERBOSE_MODE = False  # Set to True to see detailed output


# ============================================================================
# Timing Utilities
# ============================================================================

class TimingTracker:
    """Measure wall-clock and process CPU time around a block of work.

    Usage: ``start()`` -> do work -> ``stop()`` -> ``get_times()``.
    On a ``"cuda"`` device the tracker synchronizes the GPU at both ends so
    that asynchronously queued kernels are attributed to the right interval.

    Attributes set by ``__init__``:
        device: device string ("cuda", "mps" or "cpu") used to decide whether
            CUDA synchronization is required.
        process: ``psutil.Process`` handle for the current process, used to
            read user/system CPU time.
    """

    def __init__(self, device):
        self.device = device
        self.process = psutil.Process(os.getpid())
        self.reset()

    def reset(self):
        """Forget any previous measurement."""
        self.start_time = None
        self.end_time = None
        self.start_cpu_time = None
        self.end_cpu_time = None

    def _sync(self):
        """Block until all queued CUDA work has finished (no-op off CUDA)."""
        if self.device == "cuda":
            torch.cuda.synchronize()

    def start(self):
        """Begin a measurement, discarding the end markers of any earlier one."""
        # Clear stale end markers so get_times() cannot pair an old stop()
        # with this new start() and report a negative duration.
        self.end_time = None
        self.end_cpu_time = None
        # Drain already-queued GPU work *before* reading the clock so it is
        # not charged to the interval being measured.
        self._sync()
        self.start_cpu_time = self.process.cpu_times()
        # perf_counter is monotonic, unlike time.time(), which can jump when
        # the system clock is adjusted.
        self.start_time = time.perf_counter()

    def stop(self):
        """End the measurement after all GPU work launched so far completes."""
        self._sync()
        self.end_time = time.perf_counter()
        self.end_cpu_time = self.process.cpu_times()

    def get_times(self):
        """Return the measured durations in seconds.

        Returns:
            dict with keys ``real_time`` (wall clock), ``cpu_time``
            (user + system), ``user_time``, ``system_time`` and ``gpu_time``.
            ``gpu_time`` is the synchronized wall-clock time when the device
            is "cuda" (a proxy, not a kernel-level measurement) and 0
            otherwise. All values are 0 until a start()/stop() pair has
            completed.
        """
        if self.start_time is not None and self.end_time is not None:
            real_time = self.end_time - self.start_time
        else:
            real_time = 0

        if self.start_cpu_time is not None and self.end_cpu_time is not None:
            user_time = self.end_cpu_time.user - self.start_cpu_time.user
            system_time = self.end_cpu_time.system - self.start_cpu_time.system
            cpu_time = user_time + system_time
        else:
            cpu_time = user_time = system_time = 0

        return {
            "real_time": real_time,
            "cpu_time": cpu_time,
            "user_time": user_time,
            "system_time": system_time,
            "gpu_time": real_time if self.device == "cuda" else 0
        }


# ============================================================================
# Device Detection
# ============================================================================

def detect_device():
    """Pick the device string to run on.

    Returns "cpu" when USE_GPU is off; otherwise prefers "cuda", then "mps",
    and falls back to "cpu" when neither backend reports itself available.
    """
    if USE_GPU:
        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
    return "cpu"


def check_environment():
    """Print an environment report and return the device to run on.

    In Colab this also tries to mount Google Drive at /content/drive so that
    results can be written there; a failed mount is reported (with its
    reason) but does not abort the run.

    Returns:
        The device string chosen by detect_device(): "cuda", "mps" or "cpu".
    """
    print("="*70)
    print("Environment Check - Medium Models")
    print("="*70)

    if IN_COLAB:
        print("✓ Running in Google Colab")
        # Mount Google Drive to save results
        try:
            from google.colab import drive
            drive.mount('/content/drive')
            print("✓ Google Drive mounted")
        except Exception as exc:
            # Best effort: main() falls back to a local file when the Drive
            # path is not writable. Report why the mount failed instead of
            # hiding it; KeyboardInterrupt is deliberately not caught.
            print(f"⚠️  Could not mount Google Drive: {exc}")
    else:
        print("✓ Running locally")

    print(f"✓ Platform: {platform.system()} ({platform.machine()})")

    device = detect_device()

    if device == "cuda":
        gpu_name = torch.cuda.get_device_name(0)
        # total_memory is in bytes; report decimal gigabytes.
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"✓ GPU Available: {gpu_name}")
        print(f"✓ GPU Memory: {gpu_memory:.2f} GB")

        if gpu_memory < 20:
            print(f"⚠️  Note: GPU has {gpu_memory:.1f}GB. Will use 4-bit quantization for 14B model.")
    elif device == "mps":
        print("✓ Apple Metal (MPS) Available")
    else:
        print("✓ Using CPU")
        print("⚠️  WARNING: CPU inference will be VERY slow for medium models!")

    print("="*70 + "\n")
    return device


# ============================================================================
# Model Loading
# ============================================================================

def should_quantize(model_name, gpu_memory_gb=15):
    """Decide whether a model should be loaded quantized.

    The parameter count is inferred from a size tag in the model name
    ("7B", "8B", "13B", "14B"). Matching is case-insensitive so that repo
    ids spelled with a lowercase tag (e.g. "...-7b-hf") are recognized too.

    Args:
        model_name: Hugging Face repo id or any name containing a size tag.
        gpu_memory_gb: total GPU memory in GB.

    Returns:
        4 when 4-bit quantization should be used, otherwise None
        (load without quantization). Names without a known size tag
        return None.
    """
    size_tag_source = model_name.upper()

    if "14B" in size_tag_source or "13B" in size_tag_source:
        return 4  # 14B/13B models need 4-bit quantization on T4
    if "8B" in size_tag_source and gpu_memory_gb < 20:
        return 4  # 8B might need quantization on smaller GPUs
    if "7B" in size_tag_source and gpu_memory_gb < 16:
        return 4  # 7B might need quantization on very small GPUs
    return None  # No quantization needed


def load_model_and_tokenizer(model_name, device):
    """Load a causal LM and its tokenizer, quantizing automatically on CUDA.

    Args:
        model_name: Hugging Face repo id of the model.
        device: "cuda", "mps" or "cpu". Quantization is only considered on
            "cuda"; should_quantize() picks the bit width from the model
            name and the GPU's total memory.

    Returns:
        (model, tokenizer, quant_bits) where quant_bits is 4, 8 or None.
        The model is returned in eval mode.

    Raises:
        Whatever ``from_pretrained`` raises (e.g. for gated repos or
        out-of-memory); callers are expected to handle it.
    """
    print("="*70)
    print(f"Loading Model: {model_name}")
    print("="*70)
    print(f"Device: {device}")

    # Determine quantization automatically
    if device == "cuda":
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        quant_bits = should_quantize(model_name, gpu_memory)
    else:
        quant_bits = None

    # Build the description first so the "Quantization:" label is printed in
    # both cases (previously it was dropped for unquantized loads).
    quant_label = f"{quant_bits}-bit" if quant_bits else "None (full precision)"
    print(f"Quantization: {quant_label}")

    # Estimate memory
    if "7B" in model_name:
        mem = "~14 GB" if not quant_bits else "~4 GB"
    elif "8B" in model_name:
        mem = "~16 GB" if not quant_bits else "~4 GB"
    elif "14B" in model_name or "13B" in model_name:
        mem = "~28 GB" if not quant_bits else "~7 GB"
    else:
        mem = "~10-30 GB"

    print(f"Estimated memory: {mem}")
    print("="*70 + "\n")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # generate() needs a pad token id; reuse EOS when the tokenizer has none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE: the recorded run shows transformers warning that `torch_dtype` is
    # deprecated in favor of `dtype`; the old keyword is kept here so the
    # cell still works on older transformers releases.
    if quant_bits == 4:
        print("Loading with 4-bit quantization...")
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16
        )
    elif quant_bits == 8:
        print("Loading with 8-bit quantization...")
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto"
        )
    else:
        print("Loading full precision model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto" if device == "cuda" else None
        )
        # Without a device_map the weights land on CPU; move them explicitly.
        if device in ["cpu", "mps"]:
            model = model.to(device)

    model.eval()

    print("✓ Model loaded successfully")
    if device == "cuda":
        memory_allocated = torch.cuda.memory_allocated(0) / 1e9
        memory_reserved = torch.cuda.memory_reserved(0) / 1e9
        print(f"✓ GPU memory allocated: {memory_allocated:.2f} GB")
        print(f"✓ GPU memory reserved: {memory_reserved:.2f} GB\n")

    return model, tokenizer, quant_bits


# ============================================================================
# MMLU Evaluation
# ============================================================================

def format_mmlu_prompt(question, choices):
    """Render a question and its choices as a lettered multiple-choice prompt.

    Choices are labelled A, B, C, ... in order, one per line, and the prompt
    ends with an "Answer (A, B, C, or D):" cue.
    """
    option_block = "".join(
        f"{chr(65 + position)}. {text}\n" for position, text in enumerate(choices)
    )
    return f"Question: {question}\n\nChoices:\n{option_block}\nAnswer (A, B, C, or D):"


def get_model_prediction(model, tokenizer, prompt):
    """Greedy-decode a short continuation of the prompt and extract the answer.

    Returns the first character of the stripped continuation when it is one
    of "A", "B", "C" or "D"; otherwise returns None.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

    # Put the inputs on whichever device holds the model's first parameter.
    target_device = next(model.parameters()).device
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}
    prompt_length = encoded['input_ids'].shape[1]

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    continuation = generated[0][prompt_length:]
    answer_text = tokenizer.decode(continuation, skip_special_tokens=True).strip()

    if answer_text[:1] in ("A", "B", "C", "D"):
        return answer_text[0]
    return None


def evaluate_subject(model, tokenizer, subject, timer, verbose=False):
    """Score the model on the test split of a single MMLU subject.

    Loads the "cais/mmlu" config named by ``subject``, asks the model every
    question and counts exact letter matches. The supplied ``timer`` is
    started before the question loop and stopped after it.

    Returns:
        A dict with "subject", "correct", "total", "accuracy" (percent) and
        "timing" (from ``timer.get_times()``), or None when the dataset
        cannot be loaded.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"Evaluating: {subject}")
    print(f"{rule}")

    try:
        dataset = load_dataset("cais/mmlu", subject, split="test")
    except Exception as e:
        print(f"✗ Failed to load {subject}: {e}")
        return None

    letters = ("A", "B", "C", "D")
    n_correct = 0
    n_seen = 0

    timer.start()

    # The progress bar is hidden in verbose mode, where each question is printed instead.
    progress = tqdm(dataset, desc=f"Testing {subject}", leave=True, disable=verbose)
    for row in progress:
        question = row["question"]
        choices = row["choices"]
        gold_letter = letters[row["answer"]]

        guess = get_model_prediction(
            model, tokenizer, format_mmlu_prompt(question, choices)
        )

        hit = guess == gold_letter
        if hit:
            n_correct += 1
        n_seen += 1

        if not verbose:
            continue

        print(f"\n{rule}")
        print(f"Question {n_seen}/{len(dataset)}:")
        print(f"{rule}")
        print(f"{question}")
        print(f"\nChoices:")
        for position, choice in enumerate(choices):
            print(f"  {chr(65+position)}. {choice}")
        print(f"\nCorrect Answer: {gold_letter}")
        print(f"Model Answer: {guess or 'NO ANSWER'}")
        print(f"Result: {'✓ CORRECT' if hit else '✗ WRONG'}")

    timer.stop()
    timing = timer.get_times()

    if n_seen > 0:
        accuracy = n_correct / n_seen * 100
    else:
        accuracy = 0

    print(f"✓ Result: {n_correct}/{n_seen} correct = {accuracy:.2f}%")
    print(f"  Real time: {timing['real_time']:.2f}s")
    print(f"  CPU time: {timing['cpu_time']:.2f}s")

    return {
        "subject": subject,
        "correct": n_correct,
        "total": n_seen,
        "accuracy": accuracy,
        "timing": timing
    }


def evaluate_model(model_name, device):
    """Evaluate one model on every subject in MMLU_SUBJECTS.

    Args:
        model_name: Hugging Face repo id of the model to load.
        device: device string as returned by check_environment().

    Returns:
        A dict with the model name, device, quantization bits, overall
        accuracy (percent), total correct/question counts, aggregated
        timing and the per-subject results; or None when the model could
        not be loaded.

    Raises:
        Any exception raised while evaluating subjects propagates to the
        caller, but only after the model and tokenizer have been released.
    """
    print("\n" + "="*70)
    print(f"EVALUATING MODEL: {model_name}")
    print("="*70 + "\n")

    try:
        model, tokenizer, quant_used = load_model_and_tokenizer(model_name, device)
    except Exception as e:
        print(f"✗ Failed to load model: {e}")
        import traceback
        traceback.print_exc()
        # A partially loaded model may have left memory behind; reclaim it
        # so the next model starts from a clean slate.
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()
        return None

    try:
        timer = TimingTracker(device)

        results = []
        total_correct = 0
        total_questions = 0
        total_timing = {
            "real_time": 0,
            "cpu_time": 0,
            "user_time": 0,
            "system_time": 0,
            "gpu_time": 0
        }

        print(f"\n{'='*70}")
        print(f"Starting evaluation on {len(MMLU_SUBJECTS)} subjects")
        print(f"{'='*70}\n")

        overall_start = time.time()

        for i, subject in enumerate(MMLU_SUBJECTS, 1):
            print(f"\nProgress: {i}/{len(MMLU_SUBJECTS)} subjects")

            timer.reset()

            result = evaluate_subject(model, tokenizer, subject, timer, verbose=VERBOSE_MODE)

            # Subjects whose dataset failed to load return None and are skipped.
            if result:
                results.append(result)
                total_correct += result["correct"]
                total_questions += result["total"]

                for key in total_timing:
                    total_timing[key] += result["timing"][key]

        overall_end = time.time()
        # Report end-to-end wall-clock time (this includes dataset loading),
        # replacing the sum of per-subject intervals accumulated above.
        total_timing["real_time"] = overall_end - overall_start

        overall_accuracy = (total_correct / total_questions * 100) if total_questions > 0 else 0

        print("\n" + "="*70)
        print(f"MODEL EVALUATION SUMMARY: {model_name}")
        print("="*70)
        print(f"Total Subjects: {len(results)}")
        print(f"Total Questions: {total_questions}")
        print(f"Total Correct: {total_correct}")
        print(f"Overall Accuracy: {overall_accuracy:.2f}%")
        print("\n" + "-"*70)
        print("TIMING BREAKDOWN:")
        print("-"*70)
        print(f"Real Time: {total_timing['real_time']:.2f}s ({total_timing['real_time']/60:.2f} min)")
        print(f"CPU Time: {total_timing['cpu_time']:.2f}s")
        print(f"User Time: {total_timing['user_time']:.2f}s")
        print(f"System Time: {total_timing['system_time']:.2f}s")
        if device == "cuda":
            print(f"GPU Time: {total_timing['gpu_time']:.2f}s")
        print("="*70)
    finally:
        # Always drop the model, even when evaluation raised: otherwise its
        # weights stay resident and the next model in the list can run out
        # of GPU memory.
        del model
        del tokenizer
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

    return {
        "model_name": model_name,
        "device": str(device),
        "quantization_bits": quant_used,
        "overall_accuracy": overall_accuracy,
        "total_correct": total_correct,
        "total_questions": total_questions,
        "timing": total_timing,
        "subject_results": results
    }


# ============================================================================
# Main Function
# ============================================================================

def main():
    """Evaluate every model in MODELS, print a comparison and save the results.

    The results are written as JSON to Google Drive when running in Colab
    and the Drive path is writable; otherwise to the current directory.

    Returns:
        The results file name (without directory), or None when no model
        could be evaluated.
    """
    print("\n" + "="*70)
    print("MMLU Evaluation - Medium Models (7B-14B)")
    print("="*70 + "\n")

    print(f"Models to evaluate: {len(MODELS)}")
    for i, model in enumerate(MODELS, 1):
        print(f"  {i}. {model}")
    print(f"\nSubjects: {len(MMLU_SUBJECTS)}")
    print(f"Verbose mode: {'ON' if VERBOSE_MODE else 'OFF'}")
    print()

    device = check_environment()

    all_results = []

    for model_name in MODELS:
        # One failing model must not abort the remaining evaluations.
        try:
            result = evaluate_model(model_name, device)
            if result:
                all_results.append(result)
        except Exception as e:
            print(f"\n✗ Error evaluating {model_name}: {e}")
            import traceback
            traceback.print_exc()
            continue

    if not all_results:
        print("\n✗ No models were successfully evaluated!")
        return

    print("\n" + "="*70)
    print("COMPARISON ACROSS ALL MODELS")
    print("="*70)
    print(f"\n{'Model':<35} {'Params':<8} {'Accuracy':<12} {'Real Time':<15} {'CPU Time':<15}")
    print("-"*70)
    for result in all_results:
        model_name = result['model_name']

        # First size tag found in the name, checked in this fixed order.
        params = next(
            (tag for tag in ("7B", "8B", "14B", "13B") if tag in model_name),
            "?"
        )

        model_short = model_name.split('/')[-1][:33]
        print(f"{model_short:<35} {params:<8} {result['overall_accuracy']:>6.2f}%     "
              f"{result['timing']['real_time']/60:>6.2f} min     "
              f"{result['timing']['cpu_time']:>8.2f}s")
    print("="*70)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"medium_models_mmlu_results_{timestamp}.json"

    # Build the report once; it is identical wherever it ends up being saved.
    payload = {
        "timestamp": timestamp,
        "device": str(device),
        "num_subjects": len(MMLU_SUBJECTS),
        "subjects": MMLU_SUBJECTS,
        "verbose_mode": VERBOSE_MODE,
        "model_results": all_results
    }

    saved_to_drive = False
    if IN_COLAB:
        drive_path = f"/content/drive/MyDrive/{output_file}"
        try:
            with open(drive_path, "w") as f:
                json.dump(payload, f, indent=2)
            saved_to_drive = True
            print(f"\n✓ Results saved to Google Drive: {output_file}")
        except OSError as exc:
            # Drive not mounted / not writable: fall back to a local file.
            # Only filesystem errors are handled here so that interrupts and
            # programming errors are not silently swallowed.
            print(f"\n⚠️  Could not write to Google Drive ({exc}); saving locally instead")

    if not saved_to_drive:
        with open(output_file, "w") as f:
            json.dump(payload, f, indent=2)
        if IN_COLAB:
            print(f"\n✓ Results saved locally: {output_file}")
        else:
            print(f"\n✓ Results saved: {output_file}")

    print("\n✅ Evaluation complete!")

    return output_file


# Entry point: run the full evaluation when this cell/script is executed directly.
if __name__ == "__main__":
    try:
        # main() returns the results file name, or None if no model could be evaluated.
        output_file = main()
    except KeyboardInterrupt:
        # Manual interruption is reported with a short notice instead of a traceback.
        print("\n\n⚠️  Evaluation interrupted by user")
    except Exception as e:
        # Any other failure: print the message and the full traceback.
        print(f"\n✗ Error during evaluation: {e}")
        import traceback
        traceback.print_exc()

✓ Running in Google Colab

MMLU Evaluation - Medium Models (7B-14B)

Models to evaluate: 3
  1. Qwen/Qwen2.5-7B-Instruct
  2. meta-llama/Llama-3.1-8B-Instruct
  3. Qwen/Qwen2.5-14B-Instruct

Subjects: 10
Verbose mode: OFF

Environment Check - Medium Models
✓ Running in Google Colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted
✓ Platform: Linux (x86_64)
✓ GPU Available: Tesla T4
✓ GPU Memory: 15.83 GB
⚠️  Note: GPU has 15.8GB. Will use 4-bit quantization for 14B model.


EVALUATING MODEL: Qwen/Qwen2.5-7B-Instruct

Loading Model: Qwen/Qwen2.5-7B-Instruct
Device: cuda
Quantization: 4-bit
Estimated memory: ~4 GB



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading with 4-bit quantization...


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Model loaded successfully
✓ GPU memory allocated: 5.56 GB
✓ GPU memory reserved: 7.14 GB


Starting evaluation on 10 subjects


Progress: 1/10 subjects

Evaluating: abstract_algebra


Testing abstract_algebra:   0%|          | 0/100 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✓ Result: 35/100 correct = 35.00%
  Real time: 23.33s
  CPU time: 22.12s

Progress: 2/10 subjects

Evaluating: anatomy


Testing anatomy:   0%|          | 0/135 [00:00<?, ?it/s]

✓ Result: 84/135 correct = 62.22%
  Real time: 31.83s
  CPU time: 31.47s

Progress: 3/10 subjects

Evaluating: astronomy


Testing astronomy:   0%|          | 0/152 [00:00<?, ?it/s]

✓ Result: 114/152 correct = 75.00%
  Real time: 34.96s
  CPU time: 34.46s

Progress: 4/10 subjects

Evaluating: business_ethics


Testing business_ethics:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 72/100 correct = 72.00%
  Real time: 23.21s
  CPU time: 22.89s

Progress: 5/10 subjects

Evaluating: clinical_knowledge


Testing clinical_knowledge:   0%|          | 0/265 [00:00<?, ?it/s]

✓ Result: 194/265 correct = 73.21%
  Real time: 59.69s
  CPU time: 59.08s

Progress: 6/10 subjects

Evaluating: college_biology


Testing college_biology:   0%|          | 0/144 [00:00<?, ?it/s]

✓ Result: 113/144 correct = 78.47%
  Real time: 34.10s
  CPU time: 33.65s

Progress: 7/10 subjects

Evaluating: college_chemistry


Testing college_chemistry:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 34/100 correct = 34.00%
  Real time: 24.06s
  CPU time: 23.75s

Progress: 8/10 subjects

Evaluating: college_computer_science


Testing college_computer_science:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 49/100 correct = 49.00%
  Real time: 26.35s
  CPU time: 26.08s

Progress: 9/10 subjects

Evaluating: college_mathematics


Testing college_mathematics:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 20/100 correct = 20.00%
  Real time: 24.18s
  CPU time: 23.92s

Progress: 10/10 subjects

Evaluating: college_medicine


Testing college_medicine:   0%|          | 0/173 [00:00<?, ?it/s]

✓ Result: 106/173 correct = 61.27%
  Real time: 46.31s
  CPU time: 45.82s

MODEL EVALUATION SUMMARY: Qwen/Qwen2.5-7B-Instruct
Total Subjects: 10
Total Questions: 1369
Total Correct: 821
Overall Accuracy: 59.97%

----------------------------------------------------------------------
TIMING BREAKDOWN:
----------------------------------------------------------------------
Real Time: 337.33s (5.62 min)
CPU Time: 323.24s
User Time: 307.02s
System Time: 16.22s
GPU Time: 328.03s

EVALUATING MODEL: meta-llama/Llama-3.1-8B-Instruct

Loading Model: meta-llama/Llama-3.1-8B-Instruct
Device: cuda
Quantization: 4-bit
Estimated memory: ~4 GB

✗ Failed to load model: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct.
401 Client Error. (Request ID: Root=1-696e95f6-1eb7027217e480c94103fc16;48ccc74f-b483-4934-a4b1-bb28641c6a72)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/ma

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 402, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1026, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files
    hf_hub_download(
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1007, in hf_hub_download
  

Loading with 4-bit quantization...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

✓ Model loaded successfully
✓ GPU memory allocated: 9.98 GB
✓ GPU memory reserved: 11.81 GB


Starting evaluation on 10 subjects


Progress: 1/10 subjects

Evaluating: abstract_algebra


Testing abstract_algebra:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 55/100 correct = 55.00%
  Real time: 42.05s
  CPU time: 41.54s

Progress: 2/10 subjects

Evaluating: anatomy


Testing anatomy:   0%|          | 0/135 [00:00<?, ?it/s]

✓ Result: 99/135 correct = 73.33%
  Real time: 58.28s
  CPU time: 57.66s

Progress: 3/10 subjects

Evaluating: astronomy


Testing astronomy:   0%|          | 0/152 [00:00<?, ?it/s]

✓ Result: 135/152 correct = 88.82%
  Real time: 70.95s
  CPU time: 70.34s

Progress: 4/10 subjects

Evaluating: business_ethics


Testing business_ethics:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 77/100 correct = 77.00%
  Real time: 47.90s
  CPU time: 47.40s

Progress: 5/10 subjects

Evaluating: clinical_knowledge


Testing clinical_knowledge:   0%|          | 0/265 [00:00<?, ?it/s]

✓ Result: 217/265 correct = 81.89%
  Real time: 122.15s
  CPU time: 121.12s

Progress: 6/10 subjects

Evaluating: college_biology


Testing college_biology:   0%|          | 0/144 [00:00<?, ?it/s]

✓ Result: 125/144 correct = 86.81%
  Real time: 70.02s
  CPU time: 69.26s

Progress: 7/10 subjects

Evaluating: college_chemistry


Testing college_chemistry:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 50/100 correct = 50.00%
  Real time: 49.37s
  CPU time: 48.89s

Progress: 8/10 subjects

Evaluating: college_computer_science


Testing college_computer_science:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 63/100 correct = 63.00%
  Real time: 54.06s
  CPU time: 53.57s

Progress: 9/10 subjects

Evaluating: college_mathematics


Testing college_mathematics:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Result: 54/100 correct = 54.00%
  Real time: 49.68s
  CPU time: 49.22s

Progress: 10/10 subjects

Evaluating: college_medicine


Testing college_medicine:   0%|          | 0/173 [00:00<?, ?it/s]

✓ Result: 136/173 correct = 78.61%
  Real time: 95.19s
  CPU time: 94.29s

MODEL EVALUATION SUMMARY: Qwen/Qwen2.5-14B-Instruct
Total Subjects: 10
Total Questions: 1369
Total Correct: 1011
Overall Accuracy: 73.85%

----------------------------------------------------------------------
TIMING BREAKDOWN:
----------------------------------------------------------------------
Real Time: 667.11s (11.12 min)
CPU Time: 653.29s
User Time: 542.69s
System Time: 110.60s
GPU Time: 659.64s

COMPARISON ACROSS ALL MODELS

Model                               Params   Accuracy     Real Time       CPU Time       
----------------------------------------------------------------------
Qwen2.5-7B-Instruct                 7B        59.97%       5.62 min       323.24s
Qwen2.5-14B-Instruct                14B       73.85%      11.12 min       653.29s

✓ Results saved to Google Drive: medium_models_mmlu_results_20260119_205040.json

✅ Evaluation complete!
