In [None]:
!pip install --upgrade bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
!pip install transformers datasets evaluate torch accelerate

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.

In [None]:
import gc
import logging
import os
import re

import torch
from datasets import load_dataset
from evaluate import load as load_metric
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, pipeline, BitsAndBytesConfig

In [None]:
# Module-level logger for this notebook; the root logger is configured to
# emit INFO-level records and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
# Checkpoints to benchmark: display name -> Hugging Face Hub repository id.
# Insertion order is the order in which the benchmark loop visits them.
MODELS = dict(
    [
        ("UI-TARS 7B SFT", "bytedance-research/UI-TARS-7B-SFT"),
        ("UI-TARS 7B DPO", "bytedance-research/UI-TARS-7B-DPO"),
    ]
)

In [None]:
def load_model(model_path):
    """Load an 8-bit quantized Qwen2-VL checkpoint and its tokenizer.

    Args:
        model_path: Repository id (or path) forwarded to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    The Hugging Face access token is read from the ``HUGGINGFACE_TOKEN``
    environment variable, falling back to ``HF_TOKEN``. When neither is set
    the token is ``None`` and is passed through unchanged.
    """
    # Never hard-code the token; read it from the environment instead.
    hf_token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Release cached GPU memory before loading

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map={"": 0},  # "" -> 0 maps the whole model onto device index 0
        offload_folder="offload",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        token=hf_token,  # Needed for gated repositories
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, token=hf_token)
    logger.info(f"{model_path} Model Loaded Successfully!")
    return model, tokenizer

In [None]:
def evaluate_qa(model, tokenizer, dataset_name="squad"):
    """Evaluate question answering on the first 100 validation examples.

    Args:
        model: Generative model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        dataset_name: Dataset id passed to ``load_dataset`` (default ``"squad"``).

    Returns:
        The result of ``compute()`` on the ``squad_v2`` metric.
    """
    dataset = load_dataset(dataset_name, split="validation[:100]")
    metric = load_metric("squad_v2")

    for example in dataset:
        input_text = f"Q: {example['question']} Context: {example['context']} A:"
        # Put the inputs on the device the model actually lives on, rather than
        # guessing from global CUDA availability.
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
        # The generated sequence starts with the prompt tokens; score only the
        # continuation, otherwise the question and context pollute the answer.
        prompt_len = inputs["input_ids"].shape[-1]
        result = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
        metric.add(
            predictions={"id": example["id"], "prediction_text": result, "no_answer_probability": 0.0},
            references={"id": example["id"], "answers": example["answers"]},
        )

    return metric.compute()

In [None]:
def _last_number(text):
    """Return the last number found in ``text`` as a float, or ``None`` if none."""
    # Optional sign (not glued to a preceding digit), digits with optional
    # thousands separators, optional decimal part.
    matches = re.findall(r"(?<!\d)-?\d[\d,]*(?:\.\d+)?", text)
    if not matches:
        return None
    return float(matches[-1].replace(",", ""))


def evaluate_math(model, tokenizer, dataset_name="gsm8k"):
    """Evaluate math word problems on the first 50 test examples.

    Args:
        model: Generative model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        dataset_name: Dataset id passed to ``load_dataset`` (default ``"gsm8k"``).

    Returns:
        The result of ``compute()`` on the ``accuracy`` metric, i.e. the
        fraction of examples whose final number matches the reference.
    """
    dataset = load_dataset(dataset_name, 'main', split="test[:50]")
    metric = load_metric("accuracy")

    for example in dataset:
        input_text = f"Solve: {example['question']}"
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
        # Drop the echoed prompt so numbers from the question are not mistaken
        # for the model's answer.
        prompt_len = inputs["input_ids"].shape[-1]
        completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # GSM8K reference solutions place the final answer after a "####"
        # marker (see the dataset card); fall back to the last number otherwise.
        gold = _last_number(str(example['answer']).split("####")[-1])
        predicted = _last_number(completion)

        # An unparseable prediction is a miss. Each example is recorded as a
        # 1/0 "correct" flag against a constant reference of 1, so the
        # accuracy metric (which works on integer labels) reports the fraction
        # of correct answers even when the answers themselves are decimals.
        is_correct = (
            gold is not None
            and predicted is not None
            and abs(predicted - gold) < 1e-6
        )
        metric.add(prediction=int(is_correct), reference=1)

    return metric.compute()

In [None]:
def evaluate_reasoning(model, tokenizer, dataset_name="hellaswag"):
    """Evaluate multiple-choice reasoning on the first 100 validation examples.

    The model is shown the context with numbered endings and asked for the
    number of the best one; the first integer in its reply is taken as the
    choice.

    Args:
        model: Generative model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        dataset_name: Dataset id passed to ``load_dataset`` (default ``"hellaswag"``).

    Returns:
        The result of ``compute()`` on the ``accuracy`` metric.
    """
    dataset = load_dataset(dataset_name, split="validation[:100]")
    metric = load_metric("accuracy")

    for example in dataset:
        endings = example['endings']
        # Number the endings so the reply can be mapped back to a label index.
        options = "\n".join(f"{idx}. {ending}" for idx, ending in enumerate(endings))
        input_text = (
            f"{example['ctx']} \n Options:\n{options}\n"
            "Answer with the number of the best ending: "
        )
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
        # Ignore the echoed prompt; it contains every option number.
        prompt_len = inputs["input_ids"].shape[-1]
        completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        match = re.search(r"\d+", completion)
        choice = int(match.group()) if match else -1
        if not 0 <= choice < len(endings):
            choice = -1  # No usable answer: can never equal a valid label

        # int() accepts the label whether it arrives as a string or an integer.
        metric.add(prediction=choice, reference=int(example['label']))

    return metric.compute()

In [None]:
def evaluate_summarization(model, tokenizer, dataset_name="cnn_dailymail", subset="3.0.0"):
    """Evaluate summarization on the first 50 test examples.

    Args:
        model: Generative model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode articles and decode outputs.
        dataset_name: Dataset id passed to ``load_dataset``.
        subset: Dataset configuration name passed to ``load_dataset``.

    Returns:
        The result of ``compute()`` on the ``rouge`` metric.
    """
    dataset = load_dataset(dataset_name, subset, split="test[:50]")
    metric = load_metric("rouge")

    for example in dataset:
        # One tokenizer call; the inputs follow the model's own device.
        inputs = tokenizer(example['article'], return_tensors="pt", truncation=True).to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)
        # Score only the newly generated tokens; keeping the echoed article
        # would make ROUGE compare the article itself with the highlights.
        prompt_len = inputs["input_ids"].shape[-1]
        summary = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

        metric.add(prediction=summary, reference=example['highlights'])

    return metric.compute()

In [None]:
# Run benchmark: load each checkpoint in turn and run all four evaluations.
for model_name, model_path in MODELS.items():
    print(f"\nEvaluating {model_name}...")
    model, tokenizer = load_model(model_path)

    try:
        print("SQuAD QA:", evaluate_qa(model, tokenizer))
        print("GSM8K Math:", evaluate_math(model, tokenizer))
        print("HellaSwag Reasoning:", evaluate_reasoning(model, tokenizer))
        print("CNN/DailyMail Summarization:", evaluate_summarization(model, tokenizer))
    finally:
        # Drop this checkpoint before the next one is loaded (or before the
        # error propagates) so two models are not held in memory at once.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


Evaluating UI-TARS 7B SFT...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/3.38G [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/4.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

SQuAD QA: {'exact': 0.0, 'f1': 2.5704197842637626, 'total': 100, 'HasAns_exact': 0.0, 'HasAns_f1': 2.5704197842637626, 'HasAns_total': 100, 'best_exact': 0.0, 'best_exact_thresh': 0.0, 'best_f1': 2.5704197842637626, 'best_f1_thresh': 0.0}


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

ValueError: invalid literal for int() with base 10: '2.5'