In [1]:
import unsloth

import ast
import json
import torch
import random
import pandas as pd

from PIL import Image
from tqdm import tqdm
from pathlib import Path

from trl import SFTTrainer, SFTConfig
from unsloth.trainer import UnslothVisionDataCollator
from unsloth import FastVisionModel, is_bf16_supported

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
Path.cwd()

PosixPath('/Utilisateurs/umushtaq/scripts')

In [None]:
DATAFILES_DIR = Path.cwd().parent / "multimodal_er" / "EmoComics35" / "data_files"

### Tokenizer and Model

In [4]:
max_seq_length = 4096

model, tokenizer = FastVisionModel.from_pretrained(

    model_name="unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
    
)

==((====))==  Unsloth 2025.3.18: Fast Mllama patching. Transformers: 4.50.1.
   \\   /|    NVIDIA H100 NVL. Num GPUs = 1. Max memory: 93.003 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 32,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 32,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules=[
    #     "q_proj",
    #     "k_proj",
    #     "v_proj",
    #     "o_proj",
    #     "gate_proj",
    #     "up_proj",
    #     "down_proj",
    # ],
    #use_gradient_checkpointing=True,
    #target_modules = "all-linear", # Optional now! Can specify a list if needed
)

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


In [None]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-31): 32 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

### Data

In [None]:
df = pd.read_csv(DATAFILES_DIR / "emocomics35_pg_images.csv", index_col=0)

In [8]:
df

Unnamed: 0,SourceFile,Page,PageUtterances,PageSpeakers,PageEmotions,PagePanels,PageBalloons,FileNr,Split,ComicBookTitle,image_path
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,['THIS VILE THING ATTACKED THE SMALL BEASTS OF...,"['AQUANYX', 'AQUANYX', 'ID-1', 'ID-1', 'AQUANY...","[""['Anger']"", ""['Anger']"", ""['Fear']"", ""['Fear...","[1, 1, 1, 2, 3, 3, 3, 4, 5, 6]","[2, 3, 4, 1, 1, 2, 3, 1, 2, 1]",1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,"['NO-- #GKKK…#', '#CHOMP!', 'BY THE SKIN OF M...","['ID-1', 'BLACKMANTASAURUS', 'AQUANYX', 'AQUAN...","[""['Fear']"", ""['Anger']"", ""['Surprise']"", ""['A...","[1, 1, 2, 3, 3, 3, 3, 3, 3]","[1, 2, 1, 1, 2, 3, 5, 6, 7]",1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,"['COME ON, BEAST!', 'SHOW YOURSELF!', 'WHY DO ...","['AQUANYX', 'AQUANYX', 'AQUANYX', 'AQUANYX']","[""['Joy']"", ""['Joy']"", ""['Anger']"", ""['Anger']""]","[1, 1, 1, 1]","[1, 2, 5, 6]",1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,['#AARGH! '],['AQUANYX'],"[""['Fear', 'Surprise']""]",[2],[2],1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,"['I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","['GREEN TORCH', 'GREEN TORCH', 'ATROCITAURUS',...","[""['Anger']"", ""['Anger']"", ""['Fear']"", ""['Fear...","[1, 1, 1, 3, 4, 5]","[2, 3, 5, 1, 1, 2]",1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
...,...,...,...,...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","['LAURA', 'LAURA', 'LAURA', 'DANIEL', 'DANIEL'...","[""['Anger']"", ""['Anger']"", ""['Anger']"", ""['Ang...","[1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6]","[1, 2, 3, 1, 2, 2, 3, 4, 5, 1, 2, 1, 2, 1]",2200,TEST,Stillwater #13,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
870,QC copy - 2200 - Stillwater 13.xlsx,17,"['SO WHAT ARE WE GOING TO DO?', 'THE WAY I SEE...","['ID-6', 'GALEN', 'ID-7', 'GALEN', 'GALEN', 'G...","[""['Sadness', 'Surprise']"", ""['Anger']"", ""['An...","[3, 3, 3, 3, 4, 4, 5]","[1, 2, 3, 4, 1, 2, 1]",2200,TEST,Stillwater #13,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
871,QC copy - 2200 - Stillwater 13.xlsx,18,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","['TED', 'KREEGS', 'ID-8', 'ID-8', 'GALEN', 'GA...","[""['Anger', 'Sadness']"", ""['Anger']"", ""['Anger...","[1, 1, 1, 2, 3, 4, 5, 6, 6, 7, 7]","[1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 2]",2200,TEST,Stillwater #13,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
872,QC copy - 2200 - Stillwater 13.xlsx,19,"[""IT'S BEEN… PEACEFUL. ASIDE FROM SHIT LIKE TH...","['KREEGS', 'GALEN', 'GALEN', 'KREEGS', 'GALEN'...","[""['Anger']"", ""['Joy']"", ""['Joy']"", ""['Anger',...","[1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6]","[1, 2, 3, 1, 2, 1, 1, 2, 2, 3, 1, 2]",2200,TEST,Stillwater #13,/Utilisateurs/umushtaq/multimodal_er/EmoComics...


In [None]:
df_train = df[df.Split == "TRAIN"].reset_index(drop=True)
df_test = df[df.Split == "TEST"].reset_index(drop=True)

In [None]:
df_train.shape, df_test.shape

((718, 11), (156, 11))

In [None]:
train_files = df_train.SourceFile.unique().tolist()

In [None]:
len(train_files)

28

In [None]:
eval_titles = random.sample(train_files, 4)

In [None]:
train_titles = [title for title in train_files if title not in eval_titles]

In [None]:
df_train_f = df_train[df_train.SourceFile.isin(train_titles)].reset_index(drop=True)

In [None]:
df_eval_f = df_train[df_train.SourceFile.isin(eval_titles)].reset_index(drop=True)

In [None]:
len(df_train_f), len(df_eval_f)

(630, 88)

### Build Prompts

In [None]:
# def generation_instruction():
   
#     emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
#     formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
#     instruction = f"""### Emotion Analysis for Comics

# You are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.

# INPUT:
# - List of utterances from a page in a comic
# - An image of the comics page

# OUTPUT:
# - JSON with single key "emotions"
# - Value: array of emotion arrays matching utterance order
# - ONLY use these emotions: {formatted_classes}
# - NO OTHER EMOTION LABELS ARE ALLOWED

# RULES:
# 1. Each utterance must have at least one emotion from the list above
# 2. Multiple emotions per utterance are allowed
# 3. Keep emotions in arrays even for single emotions
# 4. Maintain exact emotion spelling and case
# 5. No explanations, only JSON output

# Example format:
# {{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}}

# """
#     return instruction

In [None]:
def generation_instruction():
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Multimodal Emotion Analysis for Comic Dialogue

Task: Perform comprehensive emotion analysis of comic dialogue using both textual and visual context.

INPUT:
- Comic page image
- List of dialogue utterances in sequential order

CONTEXT ANALYSIS GUIDELINES:
- Analyze emotions holistically using:
  1. Dialogue text
  2. Character facial expressions
  3. Body language
  4. Visual scene composition
  5. Narrative context

EMOTION ANNOTATION REQUIREMENTS:
- Emotions Must Be From: {formatted_classes}
- Annotation Constraints:
  1. Assign at least one emotion per utterance
  2. Multiple emotions per utterance allowed
  3. Emotions must reflect both textual and visual context
  4. Prioritize contextual nuance over surface-level interpretation

OUTPUT FORMAT:
- Strict JSON structure
- Single key "emotions"
- Array of emotion arrays matching utterance order
- Exact emotion spelling and case preserved

EXAMPLE OUTPUT:
{{"emotions": [
    ["joy", "surprise"],    // First utterance
    ["anger", "fear"],      // Second utterance
    ["neutral"],            // Third utterance
    ["sadness"]             // Fourth utterance
]}}

CRITICAL INSTRUCTIONS:
- NO explanations
- ONLY JSON output
- PRECISELY match input utterance count
- USE all provided contextual information
"""
    return instruction

In [None]:
def build_question(comics_title, page_utterances):
    
    question = f"""
    
Comic Information:
Title: {comics_title}

Utterances to Classify:
{page_utterances}
    
    
    """
    
    return question

In [None]:
def build_image_modality(image_path):
    
    return Image.open(image_path)

In [None]:
def convert_to_conversation(row):
  
    
    pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(eval(row.PageUtterances)))
    
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : generation_instruction() + build_question(row.ComicBookTitle, pg_utterances)},
            {"type" : "image", "image" : build_image_modality(row.image_path)} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : {"emotions": row.PageEmotions}} ]
        },
    ]
    return { "messages" : conversation }
pass

In [23]:
EC35V_dataset_train = [convert_to_conversation(row) for _, row in df_train_f.iterrows()]
EC35V_dataset_eval = [convert_to_conversation(row) for _, row in df_eval_f.iterrows()]

In [None]:
len(EC35V_dataset_train), len(EC35V_dataset_eval)

(630, 88)

In [None]:
EC35V_dataset_train[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Multimodal Emotion Analysis for Comic Dialogue\n\nTask: Perform comprehensive emotion analysis of comic dialogue using both textual and visual context.\n\nINPUT:\n- Comic page image\n- List of dialogue utterances in sequential order\n\nCONTEXT ANALYSIS GUIDELINES:\n- Analyze emotions holistically using:\n  1. Dialogue text\n  2. Character facial expressions\n  3. Body language\n  4. Visual scene composition\n  5. Narrative context\n\nEMOTION ANNOTATION REQUIREMENTS:\n- Emotions Must Be From: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- Annotation Constraints:\n  1. Assign at least one emotion per utterance\n  2. Multiple emotions per utterance allowed\n  3. Emotions must reflect both textual and visual context\n  4. Prioritize contextual nuance over surface-level interpretation\n\nOUTPUT FORMAT:\n- Strict JSON structure\n- Single key "emotions"\n- Array of emotion arrays matching 

In [None]:
EC35V_dataset_eval[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Multimodal Emotion Analysis for Comic Dialogue\n\nTask: Perform comprehensive emotion analysis of comic dialogue using both textual and visual context.\n\nINPUT:\n- Comic page image\n- List of dialogue utterances in sequential order\n\nCONTEXT ANALYSIS GUIDELINES:\n- Analyze emotions holistically using:\n  1. Dialogue text\n  2. Character facial expressions\n  3. Body language\n  4. Visual scene composition\n  5. Narrative context\n\nEMOTION ANNOTATION REQUIREMENTS:\n- Emotions Must Be From: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- Annotation Constraints:\n  1. Assign at least one emotion per utterance\n  2. Multiple emotions per utterance allowed\n  3. Emotions must reflect both textual and visual context\n  4. Prioritize contextual nuance over surface-level interpretation\n\nOUTPUT FORMAT:\n- Strict JSON structure\n- Single key "emotions"\n- Array of emotion arrays matching 

### Training

In [None]:
FastVisionModel.for_training(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-31): 32 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [None]:
OUTPUT_DIR = Path.cwd().parent / "multimodal_er" / "EmoComics35" / "model_outputs"

In [29]:
args = SFTConfig(
    
        do_train = True,
        do_eval=True,

        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2,
        
        warmup_steps = 5,
        #max_steps = 30,
        num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        
        logging_steps = 10,
        eval_steps = 10,
        eval_strategy = "steps",
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = OUTPUT_DIR,
        report_to = "none",     # For Weights and Biases
        
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 4096,
    )

In [30]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = EC35V_dataset_train,
    eval_dataset = EC35V_dataset_eval,
    args = args,
)

In [None]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 NVL. Max memory = 93.003 GB.
7.223 GB of memory reserved.


In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 630 | Num Epochs = 1 | Total steps = 79
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 134,348,800/11,000,000,000 (1.22% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,1.7295,0.317833
20,0.237,0.180208


Unsloth: Not an error, but MllamaForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


### Inference

In [None]:
FastVisionModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-31): 32 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [None]:
def convert_to_conversation_test(row):
  
    
    pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(eval(row.PageUtterances)))
    
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : generation_instruction() + build_question(row.ComicBookTitle, pg_utterances)},
            {"type" : "image", "image" : build_image_modality(row.image_path)} ]
        },
        { "role" : "assistant",
          "content" : ""
        },
    ]
    return { "messages" : conversation }
pass

In [None]:
EC35V_dataset_test = [convert_to_conversation_test(row) for _, row in df_test.iterrows()]

In [None]:
len(EC35V_dataset_test)

156

In [None]:
raw_outputs = []

for message in tqdm(EC35V_dataset_test):
    
    input_text = tokenizer.apply_chat_template(message['messages'], add_generation_prompt = True)
    image = message['messages'][0]['content'][1]['image']
    #print(input_text)
    #break
   
    inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")
    #print(inputs['input_ids'])
    #print(tokenizer.decode(inputs['input_ids'][0]))
    #print(inputs['input_ids'].shape)
    #break
    
    #output = model.generate(input_ids=inputs, max_new_tokens=128)[0]
    output = model.generate(**inputs, max_new_tokens=512)[0]
    
    input_length = inputs['input_ids'].shape[1]
    generated_tokens = output[input_length:]
    
    #decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)  
    decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    raw_outputs.append(decoded_output)

100%|██████████| 5/5 [02:59<00:00, 35.87s/it]


In [None]:
with open(OUTPUT_DIR / "raw_outputs.json", "w") as json_file:
    
    json.dump(raw_outputs, json_file)

5

### Post-Processing