In [1]:
import torch
import json
import json_repair
import pandas as pd

from tqdm import tqdm
from datasets import Dataset

from pathlib import Path
from PIL import Image

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, FastVisionModel, is_bfloat16_supported

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 2048
model, tokenizer = FastVisionModel.from_pretrained(
    #model_name="unsloth/Qwen2.5-32B-Instruct-bnb-4bit",
    #model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    #model_name="unsloth/Llama-3.2-11B-Vision-Instruct",
    
    model_name="meta-llama/Llama-3.2-90B-Vision-Instruct",
    #model_name="unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

FileNotFoundError: unsloth/llama-3.2-90b-vision-instruct-unsloth-bnb-4bit/*.json (repository not found)

In [47]:
#model

In [48]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    #target_modules = "all-linear", # Optional now! Can specify a list if needed
)

## data

In [49]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_dataset_pg.csv", index_col=0)

In [50]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f..."
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],..."
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]"
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]"
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur..."
...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [..."
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']..."
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [..."
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr..."


In [51]:
file_names_l = df.file_name.unique().tolist()

In [52]:
comics_titles_full = [
    
    "Jurassic League #4",      
    "Nightwing #95",    
    "Dark Crisis: Worlds Without a Justice League - Green Lantern #1", 
    "Dark Crisis: Worlds Without a Justice League - Green Lantern - Very Merry Multiverse",
    "Dark Crisis: The Flash #783",
    "Danger Street #1",
    "Tiny Titans #25",
    "Human Target #9",
    "DC vs. Vampires #11",
    "John Carpenter's Tales for a Halloweenight #3",
    "The Amazing Adventures of the Ninja Turtles #5",
    "Sonic The Hedgehog #44",
    "Love Everlasting #2",
    "Fantasmas vol. 1 - Ghosted #2",
    "Fantasmas vol. 1 - Ghosted #3",
    "Fantasmas vol. 1 - Ghosted #4",
    "Fantasmas vol. 1 - Ghosted #5",
    "Fantasmas vol. 1 - Ghosted #1",
    "Fantasmas vol. 1 - Ghosted #6",
    "Fantasmas vol. 1 - Ghosted #7",
    "Fantasmas vol. 1 - Ghosted #8",
    "Fantasmas vol. 1 - Ghosted #9",
    "Fantasmas vol. 1 - Ghosted #10",
    "American Vampire vol. 4 - #6",
    "American Vampire vol. 4 - #7",
    "American Vampire vol. 4 - #8",
    "Dragon Age vol. 3 Engano - Deception #1",
    "Dragon Age vol. 3 Engano - Deception #2",
    "Dragon Age vol. 3 Engano - Deception #3",
    "The Walking Dead vol 15 - #169",
    "The Walking Dead vol 15 - #170",
    "The Walking Dead vol 15 - #171",
    "The Walking Dead vol 15 - #173",
    "Thief Of Thieves 1 - #11",
    "Stillwater #13",

]


In [53]:
len(comics_titles_full)

35

In [54]:
df['comics_title'] = df['file_name'].apply(lambda x: comics_titles_full[file_names_l.index(x)] if x in file_names_l else None)

In [55]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4
...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater #13
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater #13
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater #13
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater #13


In [56]:
df.file_name.tolist()

['QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass

In [57]:
def get_comics_id(row):
    
    return row.file_name.split("-")[1].strip()

In [58]:
df['comics_id'] = df.apply(lambda row: get_comics_id(row), axis=1)

In [59]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title,comics_id
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4,1499
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4,1499
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4,1499
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4,1499
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4,1499
...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater #13,2200
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater #13,2200
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater #13,2200
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater #13,2200


In [60]:
comics_images_dir = Path("/Utilisateurs/umushtaq/emotion_analysis_comics/comics_dataset_images")

In [61]:
def get_image_file_path(row):
    
    page_nr = row.page_nr
    comics_dir = "00" + row.comics_id
    
    images_dir = Path(comics_images_dir) / comics_dir / "images"    
    file_name = "page" + f"{page_nr:05d}" + ".jpg"
        
    file_path = images_dir / file_name    
    
    return file_path

In [62]:
df['image_path'] = df.apply(lambda row: get_image_file_path(row), axis=1)

In [63]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title,comics_id,image_path
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
...,...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...


In [64]:
df_train = df[df.split == "TRAIN"].reset_index(drop=True)

In [65]:
df_train

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title,comics_id,image_path
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
...,...,...,...,...,...,...,...,...
713,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,16,TRAIN,"[""YOU BETTER HOPE I FORGET THAT SHIT JUST FELL...","[['anger', 'disgust'], ['anger'], ['anger'], [...",Thief Of Thieves 1 - #11,1910,/Utilisateurs/umushtaq/emotion_analysis_comics...
714,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,17,TRAIN,"[""IT'S NOTHING--"", ""YOU TOUCH IT --AND I TAKE ...","[['fear', 'surprise'], ['anger'], ['anger'], [...",Thief Of Thieves 1 - #11,1910,/Utilisateurs/umushtaq/emotion_analysis_comics...
715,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,18,TRAIN,"[""YOU BRING EVERYTHING ABOUT THIS TO HIM."", ""T...","[['neutral'], ['neutral'], ['neutral'], ['sadn...",Thief Of Thieves 1 - #11,1910,/Utilisateurs/umushtaq/emotion_analysis_comics...
716,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,19,TRAIN,"[""ASK YOU ONE MORE TIME, SON--"", ""THE FUCK DID...","[['anger'], ['anger'], ['sadness'], ['fear'], ...",Thief Of Thieves 1 - #11,1910,/Utilisateurs/umushtaq/emotion_analysis_comics...


In [66]:
df_train.iloc[451]

file_name       QC copy - 1567 - 36 Fantasmas vol. 1 - Ghosted...
page_nr                                                        11
split                                                       TRAIN
utterance       ["GET THE FUCK BACK, MAGICIAN.", "OKIE DOKIE T...
emotion_c       [['anger', 'disgust'], ['fear', 'sadness'], ['...
comics_title                        Fantasmas vol. 1 - Ghosted #6
comics_id                                                    1567
image_path      /Utilisateurs/umushtaq/emotion_analysis_comics...
Name: 451, dtype: object

In [67]:
df_train.iloc[451]['image_path']

PosixPath('/Utilisateurs/umushtaq/emotion_analysis_comics/comics_dataset_images/001567/images/page00011.jpg')

In [68]:
#df.to_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_pg_w_images.csv")

## Build vision dataset

In [69]:
def build_generation_instruction(comics_title, page_utterances):
   
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Emotion Analysis for Comics

You are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.

INPUT:
- List of utterances from a page in a comic
- An image of the comics page

OUTPUT:
- JSON with single key "emotions"
- Value: array of emotion arrays matching utterance order
- ONLY use these emotions: {formatted_classes}
- NO OTHER EMOTION LABELS ARE ALLOWED

RULES:
1. Each utterance must have at least one emotion from the list above
2. Multiple emotions per utterance are allowed
3. Keep emotions in arrays even for single emotions
4. Maintain exact emotion spelling and case
5. No explanations, only JSON output

Example format:
{{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}}

Comic Information:
Title: {comics_title}

Utterances to Classify:
{page_utterances}

"""
    return instruction

In [70]:
def build_image_modality(image_path):
    
    return Image.open(image_path)

In [71]:
def convert_to_conversation(row):
  
    comics_title = row.comics_title
    image_path = row.image_path
    labels = row.emotion_c
    
    utterances_l = eval(row['utterance'])
    pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
  
    instruction = build_generation_instruction(comics_title, pg_utterances)
    image = build_image_modality(image_path)
    
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : image} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : {"emotions": labels}} ]
        },
    ]
    return { "messages" : conversation }
pass

In [72]:
# Apply the function to each row and store the results in a list
comics_mm_dataset = [convert_to_conversation(row) for _, row in df_train.iterrows()]

In [73]:
len(comics_mm_dataset)

718

In [74]:
comics_mm_dataset[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Jurassic League #4\n\nUtterances to Classify:\n1. THIS VILE THING ATTA

In [75]:
# def split_dataset(dataset, train_ratio=0.8):
#     train_test = dataset.train_test_split(test_size=1 - train_ratio)
#     return train_test

# dataset_split = split_dataset(comics_mm_dataset)

In [76]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

In [77]:
FastVisionModel.for_training(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-12): 13 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [78]:
args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        #max_steps = 30,
        num_train_epochs = 2, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 25,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    )

In [79]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = comics_mm_dataset,
    args = args,
)

In [80]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 NVL. Max memory = 93.003 GB.
74.732 GB of memory reserved.


In [81]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 718 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 718
 "-____-"     Number of trainable parameters = 67,174,400
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
25,0.7715
50,0.1489
75,0.1191
100,0.1023
125,0.1036
150,0.0914
175,0.0954
200,0.0867
225,0.0882
250,0.0841


TrainOutput(global_step=718, training_loss=0.10199288380511291, metrics={'train_runtime': 1711.3995, 'train_samples_per_second': 0.839, 'train_steps_per_second': 0.42, 'total_flos': 3.661173318446328e+16, 'train_loss': 0.10199288380511291, 'epoch': 2.0})

In [82]:
FastVisionModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-12): 13 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [83]:
def convert_to_conversation_test(row):
  
    comics_title = row.comics_title
    image_path = row.image_path
    labels = row.emotion_c
    
    utterances_l = eval(row['utterance'])
    pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
  
    instruction = build_generation_instruction(comics_title, pg_utterances)
    image = build_image_modality(image_path)
    
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : image} ]
        },
        { "role" : "assistant",
          "content" : ""
        },
    ]
    return { "messages" : conversation }
pass

In [84]:
df_test = df[df.split == "TEST"].reset_index(drop=True)

In [85]:
comics_mm_dataset_test = [convert_to_conversation_test(row) for _, row in df_test.iterrows()]

In [86]:
len(comics_mm_dataset_test)

156

In [87]:
comics_mm_dataset_test[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Danger Street #1\n\nUtterances to Classify:\n1. HOW\'S IT GOING?\n2. H

In [88]:
#comics_mm_dataset_test[0]['messages'][0]['content'][1]['image']

In [89]:
raw_outputs = []

for message in tqdm(comics_mm_dataset_test):
    
    input_text = tokenizer.apply_chat_template(message['messages'], add_generation_prompt = True)
    image = message['messages'][0]['content'][1]['image']
    #print(input_text)
    #break
   
    inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")
    #print(inputs['input_ids'])
    #print(tokenizer.decode(inputs['input_ids'][0]))
    #break
    
    #output = model.generate(input_ids=inputs, max_new_tokens=128)[0]
    output = model.generate(**inputs, max_new_tokens=512)[0]
    
    #input_length = inputs.shape[1]
    #generated_tokens = output[input_length:]
    
    #decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)  
    decoded_output = tokenizer.decode(output, skip_special_tokens=True)
    raw_outputs.append(decoded_output)

  0%|          | 0/156 [00:00<?, ?it/s]

100%|██████████| 156/156 [14:32<00:00,  5.59s/it]


In [90]:
len(raw_outputs)

156

In [124]:
raw_outputs

['user\n\n### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Danger Street #1\n\nUtterances to Classify:\n1. HOW\'S IT GOING?\n2. HEY.\n3. CAN I GET YOU ANYTHING?\n4. JUST A COKE.\n5. OKAY. COMING U

In [125]:
op = []

for output in raw_outputs:
    op.append(output.split("assistant\n\nassistant\n\n")[1])

In [126]:
op

['{\'emotions\': "[[\'joy\'], [\'fear\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'neutral\'], [\'neutral\']]"}',
 '{\'emotions\': "[[\'sadness\'], [\'sadness\'], [\'anger\'], [\'anger\'], [\'sadness\'], [\'fear\'], [\'sadness\']]"}',
 '{\'emotions\': "[[\'anger\',\'sadness\'], [\'anger\',\'sadness\'], [\'anger\',\'sadness\'], [\'fear\',\'surprise\'], [\'anger\',\'surprise\'], [\'neutral\'], [\'surprise\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'sadness\'], [\'sadness\'], [\'anger\',\'surprise\']]"}',
 '{\'emotions\': "[[\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'surprise\', \'joy\'], [\'surprise\', \'joy\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'surprise\'], [\'anger\',\'surprise\'], [\'anger\',\'surprise\'], [\'anger\',\'surprise\'], [\'anger\',\'sadness\'], [\'anger\',\'sadness\'], [\'anger\',\'sadness\'], [\'anger\', \'joy\'], [\'anger\', \'joy\'], [\'anger\', \'joy\'

In [127]:
grounds = df_test.emotion_c.tolist()

In [128]:
import ast


In [129]:
grounds = [ast.literal_eval(x) for x in grounds]

In [130]:
bad_idx = []
predictions = []

for i, x in enumerate(op):
    try:
        predictions.append(json_repair.loads(x)['emotions'])
    except:
        print(i)
        bad_idx.append(i)

In [131]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    #del predictions[idx]

In [132]:
len(grounds), len(predictions)

(156, 156)

In [133]:
predictions

["[['joy'], ['fear'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['neutral'], ['neutral']]",
 "[['sadness'], ['sadness'], ['anger'], ['anger'], ['sadness'], ['fear'], ['sadness']]",
 "[['anger','sadness'], ['anger','sadness'], ['anger','sadness'], ['fear','surprise'], ['anger','surprise'], ['neutral'], ['surprise'], ['joy'], ['joy'], ['joy'], ['anger'], ['anger'], ['anger'], ['sadness'], ['sadness'], ['anger','surprise']]",
 "[['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['surprise', 'joy'], ['surprise', 'joy'], ['anger'], ['anger'], ['anger'], ['surprise'], ['anger','surprise'], ['anger','surprise'], ['anger','surprise'], ['anger','sadness'], ['anger','sadness'], ['anger','sadness'], ['anger', 'joy'], ['anger', 'joy'], ['anger', 'joy'], ['joy'], ['anger', 'disgust','surprise'], ['anger', 'disgust','surprise']]",
 "[['neutral']], ['joy'], ['joy']]",
 "[['anger'], ['anger', 'fear'], ['anger'], ['anger', 'fear'], ['joy'], ['anger'], ['fear','sadne

In [134]:
bad_idx = []
predictions_l = []

for i, x in enumerate(predictions):
    try:
        predictions_l.append(ast.literal_eval(x))
    except:
        print(i)
        bad_idx.append(i)

4
14


In [135]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    #del predictions[idx]

In [136]:
len(grounds), len(predictions_l)

(154, 154)

In [137]:
bad_idx = []

for idx, (i,j) in enumerate(zip(grounds, predictions_l)):
    if len(i) != len(j):
        print(idx, len(i), len(j))
        bad_idx.append(idx)

4 20 19
13 23 24
25 17 16
34 17 16
119 19 20
149 14 13


In [138]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del predictions_l[idx]

In [139]:
grounds_l = [item for sublist in grounds for item in sublist]
predictions_l = [item for sublist in predictions_l for item in sublist]

In [140]:
len(grounds_l), len(predictions_l)

(1190, 1190)

In [141]:
grounds_l

[['surprise', 'joy'],
 ['joy'],
 ['surprise', 'joy'],
 ['joy'],
 ['joy'],
 ['joy'],
 ['surprise'],
 ['joy'],
 ['joy'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['anger', 'disgust'],
 ['anger', 'disgust'],
 ['neutral'],
 ['sadness'],
 ['sadness'],
 ['anger', 'sadness'],
 ['anger', 'sadness'],
 ['anger', 'sadness'],
 ['fear', 'surprise'],
 ['surprise'],
 ['joy'],
 ['anger', 'surprise'],
 ['joy'],
 ['joy'],
 ['joy'],
 ['anger'],
 ['anger'],
 ['surprise', 'joy'],
 ['fear', 'sadness'],
 ['fear', 'sadness'],
 ['fear', 'surprise'],
 ['anger', 'disgust'],
 ['anger', 'disgust'],
 ['anger', 'disgust'],
 ['fear', 'sadness'],
 ['fear', 'sadness', 'surprise'],
 ['sadness'],
 ['sadness'],
 ['fear', 'sadness'],
 ['sadness', 'surprise'],
 ['sadness', 'surprise'],
 ['joy'],
 ['anger'],
 ['anger'],
 ['anger'],
 ['anger', 'disgust'],
 ['joy'],
 ['joy'],
 ['surprise', 'joy'],
 ['surprise', 'joy'],
 ['anger', 'surprise'],
 ['anger', 'surprise'],
 ['anger', 'sadness'],
 ['anger'],
 ['sadness

In [142]:
set([type(item) for sublist in grounds_l for item in sublist])

{str}

In [143]:
mlb = MultiLabelBinarizer()

In [144]:
y_true_mhot = mlb.fit_transform(grounds_l)
y_pred_mhot = mlb.transform(predictions_l)

In [145]:
y_pred_mhot.shape

(1190, 7)

In [146]:
print(classification_report(y_true_mhot, y_pred_mhot, target_names=mlb.classes_, digits=3))

              precision    recall  f1-score   support

       anger      0.584     0.539     0.561       419
     disgust      0.233     0.227     0.230        44
        fear      0.432     0.483     0.456       269
         joy      0.506     0.525     0.515       261
     neutral      0.400     0.311     0.350        90
     sadness      0.596     0.538     0.566       305
    surprise      0.545     0.648     0.592       327

   micro avg      0.522     0.529     0.526      1715
   macro avg      0.471     0.467     0.467      1715
weighted avg      0.524     0.529     0.525      1715
 samples avg      0.548     0.541     0.517      1715



In [78]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
#print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
#print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

Peak reserved memory = 15.172 GB.
Peak reserved memory for training = 7.533 GB.
Peak reserved memory % of max memory = 16.313 %.
Peak reserved memory for training % of max memory = 8.1 %.


In [None]:
# image1 = Image.open("/Utilisateurs/umushtaq/emotion_analysis_comics/comics_dataset_images/001499/images/page00001.jpg")
# #image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
# #display(image1)
# #display(image2)

In [38]:
#instruction = "You are an expert in comics. Explain the emotional content of this comics page."

In [39]:
# messages = [
#     {"role": "user", "content": [
#         {"type": "image"},
#         {"type": "text", "text": instruction}
#     ]}
# ]

In [40]:
#input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)

In [41]:
# inputs = tokenizer(
#     image1,
#     input_text,
#     add_special_tokens = False,
#     return_tensors = "pt",
# ).to("cuda")

In [42]:
# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer, skip_prompt = True)

In [157]:
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
#                    use_cache = True, temperature = 1.5, min_p = 0.1)