In [1]:
import unsloth

import ast
import json
import torch
import random
import pandas as pd

from PIL import Image
from tqdm import tqdm
from pathlib import Path

from trl import SFTTrainer, SFTConfig
from unsloth.trainer import UnslothVisionDataCollator
from unsloth import FastVisionModel, is_bf16_supported

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
Path.cwd()
DATAFILES_DIR = Path.cwd().parent / "multimodal_er" / "EmoComics35" / "data_files"

In [3]:
max_seq_length = 4096

model, tokenizer = FastVisionModel.from_pretrained(

    model_name="unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
    
)

==((====))==  Unsloth 2025.3.18: Fast Mllama patching. Transformers: 4.50.1.
   \\   /|    NVIDIA H100 NVL. Num GPUs = 1. Max memory: 93.003 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 32,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 32,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules=[
    #     "q_proj",
    #     "k_proj",
    #     "v_proj",
    #     "o_proj",
    #     "gate_proj",
    #     "up_proj",
    #     "down_proj",
    # ],
    #use_gradient_checkpointing=True,
    #target_modules = "all-linear", # Optional now! Can specify a list if needed
)

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


In [5]:
df = pd.read_csv(DATAFILES_DIR / "emocomics35_pg_images.csv", index_col=0)

In [6]:
df_train = df[df.Split == "TRAIN"].reset_index(drop=True)
df_test = df[df.Split == "TEST"].reset_index(drop=True)

In [7]:
train_files = df_train.SourceFile.unique().tolist()

In [8]:
eval_titles = random.sample(train_files, 4)
train_titles = [title for title in train_files if title not in eval_titles]

In [9]:
df_train_f = df_train[df_train.SourceFile.isin(train_titles)].reset_index(drop=True)
df_eval_f = df_train[df_train.SourceFile.isin(eval_titles)].reset_index(drop=True)

In [None]:
### Prompt

In [10]:
def generation_instruction():
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Multimodal Emotion Analysis for Comic Dialogue

Task: Perform comprehensive emotion analysis of comic dialogue using both textual and visual context.

INPUT:
- Comic page image
- List of dialogue utterances in sequential order

CONTEXT ANALYSIS GUIDELINES:
- Analyze emotions holistically using:
  1. Dialogue text
  2. Character facial expressions
  3. Body language
  4. Visual scene composition
  5. Narrative context

EMOTION ANNOTATION REQUIREMENTS:
- Emotions Must Be From: {formatted_classes}
- Annotation Constraints:
  1. Assign at least one emotion per utterance
  2. Multiple emotions per utterance allowed
  3. Emotions must reflect both textual and visual context
  4. Prioritize contextual nuance over surface-level interpretation

OUTPUT FORMAT:
- Strict JSON structure
- Single key "emotions"
- Array of emotion arrays matching utterance order
- Exact emotion spelling and case preserved

EXAMPLE OUTPUT:
{{"emotions": [
    ["joy", "surprise"],    // First utterance
    ["anger", "fear"],      // Second utterance
    ["neutral"],            // Third utterance
    ["sadness"]             // Fourth utterance
]}}

CRITICAL INSTRUCTIONS:
- NO explanations
- ONLY JSON output
- PRECISELY match input utterance count
- USE all provided contextual information
"""
    return instruction

In [11]:
def build_question(comics_title, page_utterances):
    
    question = f"""
    
Comic Information:
Title: {comics_title}

Utterances to Classify:
{page_utterances}
    
    
    """
    
    return question


In [12]:
def build_image_modality(image_path):
    
    return Image.open(image_path)

In [13]:
def convert_to_conversation(row):
  
    
    pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(eval(row.PageUtterances)))
    
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : generation_instruction() + build_question(row.ComicBookTitle, pg_utterances)},
            {"type" : "image", "image" : build_image_modality(row.image_path)} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : {"emotions": row.PageEmotions}} ]
        },
    ]
    return { "messages" : conversation }
pass

In [14]:
EC35V_dataset_train = [convert_to_conversation(row) for _, row in df_train_f.iterrows()]
EC35V_dataset_eval = [convert_to_conversation(row) for _, row in df_eval_f.iterrows()]

In [15]:
FastVisionModel.for_training(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-31): 32 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [16]:
OUTPUT_DIR = Path.cwd().parent / "multimodal_er" / "EmoComics35" / "model_outputs"

In [17]:
args = SFTConfig(
    
        do_train = True,
        do_eval=True,

        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 2,
        
        warmup_steps = 5,
        #max_steps = 30,
        num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        
        logging_steps = 10,
        eval_steps = 10,
        eval_strategy = "steps",
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = OUTPUT_DIR,
        report_to = "none",     # For Weights and Biases
        
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 4096,
    )

In [18]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = EC35V_dataset_train,
    eval_dataset = EC35V_dataset_eval,
    args = args,
)

In [19]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 NVL. Max memory = 93.003 GB.
7.223 GB of memory reserved.


In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 630 | Num Epochs = 1 | Total steps = 157
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 134,348,800/11,000,000,000 (1.22% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,1.7441,0.333053
20,0.2402,0.19215
30,0.1792,0.160264
40,0.1547,0.149028
50,0.1482,0.147502


Unsloth: Not an error, but MllamaForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
