In [1]:
import torch

from datasets import load_dataset
from datasets import load_from_disk

from tqdm import tqdm
from unsloth import FastVisionModel
from unsloth import is_bf16_supported
from trl import SFTTrainer, SFTConfig
from unsloth.trainer import UnslothVisionDataCollator

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    load_in_4bit = True,
    use_gradient_checkpointing = "unsloth",
)

==((====))==  Unsloth 2025.3.18: Fast Mllama patching. Transformers: 4.50.1.
   \\   /|    NVIDIA RTX A6000. Num GPUs = 1. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, 
    finetune_language_layers   = True, 
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    r = 16,           
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    random_state = 3443,
    use_rslora = False,
    loftq_config = None,
)

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


### Dataset

In [4]:
dataset = load_from_disk("/Utilisateurs/umushtaq/multimodal_er/EmoComics35/datasets/emocomics35_pg_ttv")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
        num_rows: 574
    })
    test: Dataset({
        features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
        num_rows: 156
    })
    validation: Dataset({
        features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
        num_rows: 144
    })
})

In [6]:
dataset['validation'][100]

{'SourceFile': 'QC copy - 1517 - 37 John Carpenter Historias para una noche de Halloween 3.xlsx',
 'Page': 121,
 'PageUtterances': '["I\'M NEVER NOT OVER MY HEAD.", \'MAMA?\', \'WHERE?\', \'GONE.\', "LET\'S LOOK.", \'HARD TO FIND THE " RIGHT THING. "\', \'NO.\', \'EVERYTHING IS CLUTTERED.\', \'CHAOS.\', \'#UH-UH.\', "CAN\'T FEEL HER.", \'YES.\', \'LEAVE.\', \'KEEP LOOKING.\', "WE\'LL FIND HER.", "CAN\'T FIND TRUE NORTH WITHOUT A COMPASS."]',
 'PageSpeakers': "['Franklin', 'ID- 52', 'ID- 53', 'ID- 54', 'ID- 55', 'Franklin', 'ID- 52', 'Franklin', 'Franklin', 'ID- 53', 'ID- 54', 'ID- 52', 'ID- 55', 'ID- 54', 'ID- 53', 'Franklin']",
 'PageEmotions': '["[\'Sadness\']", "[\'Sadness\', \'Surprise\']", "[\'Sadness\', \'Surprise\']", "[\'Sadness\']", \'[Neutral]\', "[\'Sadness\']", "[\'Sadness\']", "[\'Sadness\']", "[\'Sadness\']", "[\'Surprise\']", "[\'Sadness\']", "[\'Sadness\', \'Joy\']", "[\'Sadness\', \'Joy\']", "[\'Sadness\', \'Joy\']", "[\'Sadness\', \'Joy\']", "[\'Sadness\']"]',
 'PageP

In [7]:
#dataset = dataset.select(range(200))

In [8]:
instruction = """
You are an expert Amazon worker who is good at writing product descriptions. 
Write the product description accurately by looking at the image.
"""


def convert_to_conversation(sample):
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction},
                {"type": "image", "image": sample["image"]},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["PageEmotions"]}],
        },
    ]
    return {"messages": conversation}


pass


train_dataset_p = [convert_to_conversation(sample) for sample in tqdm(dataset['train'])]
test_dataset_p = [convert_to_conversation(sample) for sample in tqdm(dataset['test'])]
val_dataset_p = [convert_to_conversation(sample) for sample in tqdm(dataset['validation'])]

  0%|          | 0/574 [00:00<?, ?it/s]

100%|██████████| 574/574 [02:13<00:00,  4.28it/s]
100%|██████████| 156/156 [00:43<00:00,  3.60it/s]
100%|██████████| 144/144 [00:37<00:00,  3.84it/s]


In [9]:
val_dataset_p[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '\nYou are an expert Amazon worker who is good at writing product descriptions. \nWrite the product description accurately by looking at the image.\n'},
    {'type': 'image',
     'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=3968x6070>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': '["[\'Anger\', \'Fear\']", "[\'Anger\']", "[\'Anger\', \'Fear\']", "[\'Anger\', \'Disgust\', \'Fear\']"]'}]}]}

In [10]:
# FastVisionModel.for_inference(model)  # Enable for inference!

# image = dataset[15]["image"]

# messages = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "image"},
#             {"type": "text", "text": instruction},
#         ],
#     }
# ]
# input_text = tokenizer.apply_chat_template(
#     messages, add_generation_prompt=True
# )
# inputs = tokenizer(
#     image,
#     input_text,
#     add_special_tokens=False,
#     return_tensors="pt",
# ).to("cuda")

# from transformers import TextStreamer

# text_streamer = TextStreamer(tokenizer, skip_prompt=True)
# _ = model.generate(
#     **inputs,
#     streamer=text_streamer,
#     max_new_tokens=128,
#     use_cache=True,
#     temperature=1.5,
#     min_p=0.1
# )

In [11]:
FastVisionModel.for_training(model)  # Enable for training!

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer),  # Must use!
    train_dataset=train_dataset_p,
    eval_dataset=val_dataset_p,
    args=SFTConfig(
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        #max_steps=30,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
        logging_steps=5,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="/Utilisateurs/umushtaq/multimodal_er/EmoComics35/model_outputs",
        report_to="none",  # For Weights and Biases
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
    ),
)

In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 574 | Num Epochs = 1 | Total steps = 71
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 67,174,400/11,000,000,000 (0.61% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,3.482
10,1.441
15,0.4639
20,0.3918
25,0.303
30,0.2866
35,0.2697
40,0.2739
45,0.2703
50,0.2412


In [13]:
trainer_stats

TrainOutput(global_step=71, training_loss=0.5969909423253905, metrics={'train_runtime': 1694.8288, 'train_samples_per_second': 0.339, 'train_steps_per_second': 0.042, 'total_flos': 3572241381987756.0, 'train_loss': 0.5969909423253905})

In [14]:
# ok works

In [15]:
# Inference

In [16]:
FastVisionModel.for_inference(model)  # Enable for inference!



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-31): 32 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [17]:
raw_output = []

for sample in tqdm(test_dataset_p):
    
    image = sample['messages'][0]['content'][1]['image']
    
    messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }
    ]
    
    input_text = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True
    )
    
    inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")
    
    output = model.generate(
    **inputs,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1
)
    
    raw_output.append(output)
    
    
    

100%|██████████| 156/156 [16:13<00:00,  6.24s/it]


In [18]:
raw_output[0]

tensor([[128000, 128006,    882, 128007,    271, 128256,    198,   2675,    527,
            459,   6335,   8339,  12128,    889,    374,   1695,    520,   4477,
           2027,  28887,     13,    720,   8144,    279,   2027,   4096,  30357,
            555,   3411,    520,    279,   2217,    627, 128009, 128006,  78191,
         128007,    271,   1204,    681,  60765,   2136,  75847,    330,    681,
          91471,  75847,    330,    681,  91471,  75847,    330,    681,  91471,
          75847,    330,    681,  10976,    261,  75847,    330,    681,  91471,
            518,    364,  60765,   2136,  75847,    330,    681,  10976,    261,
          75847,    330,    681,  10976,    261,  75847,    330,    681,  91471,
          75847,    330,    681,  10976,    261,    518,    364,  60765,   2136,
          75847,    330,    681,  91471,    518,    364,  60765,   2136,  75847,
            330,    681,  91471,    518,    364,  23912,   9868,  75847,    330,
            681,  10976,    

In [21]:
decoded_outputs = []

for op in raw_output:
    
    decoded_outputs.append(tokenizer.decode(op[0], skip_special_tokens=True).split("assistant\n\n")[1])
    

In [22]:
decoded_outputs

['["[\'Sadness\']", "[\'Fear\']", "[\'Fear\']", "[\'Fear\']", "[\'Anger\']", "[\'Fear\', \'Sadness\']", "[\'Anger\']", "[\'Anger\']", "[\'Fear\']", "[\'Anger\', \'Sadness\']", "[\'Fear\', \'Sadness\']", "[\'Fear\', \'Surprise\']", "[\'Anger\', \'Surprise\']", "[\'Fear\', \'Sadness\']", "[\'Fear\']", "[\'Surprise\']"]',
 '["[\'Fear\']"]',
 '["[\'Sadness\']", "[\'Surprise\', \'Joy\']", "[\'Joy\']", "[\'Surprise\']", "[\'Joy\']", "[\'Surprise\', \'Joy\']", "[\'Fear\']", "[\'Anger\', \'Fear\']", "[\'Anger\', \'Fear\']", "[\'Surprise\', \'Joy\']", "[\'Anger\']", "[\'Sadness\']", "[\'Fear\']", "[\'Sadness\']"]',
 '["[\'Anger\']", "[\'Fear\', \'Surprise\']", "[\'Joy\']", "[\'Anger\', \'Surprise\']", "[\'Sadness\', \'Surprise\']", "[\'Anger\', \'Sadness\', \'Surprise\']", "[\'Anger\']", "[\'Anger\']", "[\'Surprise\']", "[\'Fear\']"]',
 '["[\'Anger\', \'Surprise\']", "[\'Anger\', \'Joy\']", "[\'Surprise\', \'Joy\']", "[\'Anger\']", "[\'Anger\']", "[\'Fear\']"]',
 '["[\'Anger\', \'Disgust\']"]',