In [1]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.7.6-py3-none-any.whl.metadata (47 kB)
Collecting unsloth_zoo>=2025.7.8 (from unsloth)
  Downloading unsloth_zoo-2025.7.8-py3-none-any.whl.metadata (8.1 kB)
Collecting torch>=2.4.0 (from unsloth)
  Downloading torch-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.26-py3-none-any.whl.metadata (12 kB)
Collecting transformers!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,>=4.51.3 (from unsloth)
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Collectin

In [27]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2.5-VL-3B-Instruct",
    load_in_4bit = False, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

==((====))==  Unsloth 2025.7.6: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    NVIDIA A100 80GB PCIe. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.25it/s]


In [28]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

In [4]:
from datasets import load_dataset
data = load_dataset("unsloth/LaTeX_OCR", split = "train")

Generating train split: 100%|██████████| 68686/68686 [00:01<00:00, 43912.71 examples/s]
Generating test split: 100%|██████████| 7632/7632 [00:00<00:00, 111603.61 examples/s]


In [5]:
instruction = "Write the LaTeX representation for this image."

def convert_to_conversation(sample):
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : sample["image"]} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : sample["text"]} ]
        },
    ]
    return { "messages" : conversation }


In [6]:
converted_dataset = [convert_to_conversation(sample) for sample in data]

In [7]:
converted_dataset[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': 'Write the LaTeX representation for this image.'},
    {'type': 'image',
     'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=160x40>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': '{ \\frac { N } { M } } \\in { \\bf Z } , { \\frac { M } { P } } \\in { \\bf Z } , { \\frac { P } { Q } } \\in { \\bf Z }'}]}]}

In [8]:
import json

with open('/workspace/Qwen2.5-VL/qwen-vl-finetune/demo/sequences_session_20250720_165353.json', 'r') as f:
    dataset = json.load(f)

In [9]:
dataset[0]

{'image': 'session_20250720_165353/frames/frame_000003.png',
 'conversations': [{'from': 'human',
   'value': '<image>\nclicks to move e2 to e4'},
  {'from': 'gpt',
   'value': '<tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [505, 679]}}</tool_call><tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [505, 543]}}</tool_call>'}]}

In [10]:
def convert_to_conversation_chess(sample):
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : sample['conversations'][0]['value']},
            {"type" : "image", "image" : sample["image"]} ]
        },
        { "role" : "gpt",
          "content" : [
            {"type" : "text",  "text"  : sample["conversations"][1]["value"]} ]
        },
    ]
    return { "messages" : conversation }

In [11]:
sample = dataset[0]
convert_to_conversation_chess(sample)

{'messages': [{'role': 'user',
   'content': [{'type': 'text', 'text': '<image>\nclicks to move e2 to e4'},
    {'type': 'image',
     'image': 'session_20250720_165353/frames/frame_000003.png'}]},
  {'role': 'gpt',
   'content': [{'type': 'text',
     'text': '<tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [505, 679]}}</tool_call><tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [505, 543]}}</tool_call>'}]}]}

In [12]:
converted_dataset[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': 'Write the LaTeX representation for this image.'},
    {'type': 'image',
     'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=160x40>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': '{ \\frac { N } { M } } \\in { \\bf Z } , { \\frac { M } { P } } \\in { \\bf Z } , { \\frac { P } { Q } } \\in { \\bf Z }'}]}]}

In [14]:
converted_dataset = [convert_to_conversation_chess(sample) for sample in dataset]

In [15]:
converted_dataset[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text', 'text': '<image>\nclicks to move e2 to e4'},
    {'type': 'image',
     'image': 'session_20250720_165353/frames/frame_000003.png'}]},
  {'role': 'gpt',
   'content': [{'type': 'text',
     'text': '<tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [505, 679]}}</tool_call><tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [505, 543]}}</tool_call>'}]}]}

In [16]:
dataset[2]["image"]

'session_20250720_165353/frames/frame_000016.png'

In [17]:
dataset[2].keys()

dict_keys(['image', 'conversations'])

In [18]:
dataset[2]['conversations'][0]['value']

'<image>\nclicks to move g1 to h3'

In [20]:
dataset[2]["image"]

'session_20250720_165353/frames/frame_000016.png'

In [29]:
## BEFORE TRAINING ##

import os
from PIL import Image


FastVisionModel.for_inference(model) # Enable for inference!

image_path = dataset[2]["image"]

image = Image.open(os.path.join("/workspace/Qwen2.5-VL/", image_path))
instruction = dataset[2]['conversations'][0]['value']

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 0.01, min_p = 0.1)

To move the piece from g1 to h3, you need to click on the square where the piece is currently located (g1) and then click on the target square (h3). Here’s how you can do it:

1. Click on the square where the piece is currently located (g1).
2. Then, click on the square where you want to move the piece (h3).

This will move the piece from g1 to h3.<|im_end|>


In [31]:
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!


import torch._dynamo.config

# Increase cache limits for vision model training
torch._dynamo.config.cache_size_limit = 256
torch._dynamo.config.accumulated_cache_size_limit = 512


trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        max_steps = 200,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        max_seq_length = 2048,
    ),
)

Unsloth: Model does not have a default image size - using 512


In [32]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4 | Num Epochs = 200 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 41,084,928 of 3,795,707,904 (1.08% trained)


Step,Training Loss
1,4.6095
2,4.616
3,4.4865
4,4.1632
5,3.6423
6,2.949
7,2.3168
8,1.9173
9,1.6153
10,1.3368


In [33]:
## AFTER TRAINING ##

import os
from PIL import Image

FastVisionModel.for_inference(model) # Enable for inference!
image_path = dataset[2]["image"]

image = Image.open(os.path.join("/workspace/Qwen2.5-VL/", image_path))
instruction = dataset[2]['conversations'][0]['value']

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 0.01, min_p = 0.1)

<tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [1278, 174]}}</tool_call><tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [1333, 174]}}</tool_call><|im_end|>


In [35]:
dataset[2]

{'image': 'session_20250720_165353/frames/frame_000016.png',
 'conversations': [{'from': 'human',
   'value': '<image>\nclicks to move g1 to h3'},
  {'from': 'gpt',
   'value': '<tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [596, 743]}}</tool_call><tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [632, 616]}}</tool_call>'}]}