In [2]:
import torch
import json
import json_repair
import pandas as pd

from tqdm import tqdm
from datasets import Dataset

from pathlib import Path
from PIL import Image

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, FastVisionModel, is_bfloat16_supported

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
max_seq_length = 4096

model, tokenizer = FastVisionModel.from_pretrained(
    #model_name="unsloth/Qwen2.5-32B-Instruct-bnb-4bit",
    #model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    model_name="unsloth/Llama-3.2-11B-Vision-Instruct",
    
    #model_name="unsloth/Pixtral-12B-2409-bnb-4bit",
    #model_name="unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

==((====))==  Unsloth 2024.12.4: Fast Mllama vision patching. Transformers: 4.47.0.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.003 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### model

In [4]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 64,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 64,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    use_gradient_checkpointing=True,
    #target_modules = "all-linear", # Optional now! Can specify a list if needed
)

## data

In [5]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_dataset_pg.csv", index_col=0)

In [6]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f..."
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],..."
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]"
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]"
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur..."
...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [..."
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']..."
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [..."
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr..."


In [7]:
file_names_l = df.file_name.unique().tolist()

In [8]:
comics_titles_full = [
    
    "Jurassic League #4",      
    "Nightwing #95",    
    "Dark Crisis: Worlds Without a Justice League - Green Lantern #1", 
    "Dark Crisis: Worlds Without a Justice League - Green Lantern - Very Merry Multiverse",
    "Dark Crisis: The Flash #783",
    "Danger Street #1",
    "Tiny Titans #25",
    "Human Target #9",
    "DC vs. Vampires #11",
    "John Carpenter's Tales for a Halloweenight #3",
    "The Amazing Adventures of the Ninja Turtles #5",
    "Sonic The Hedgehog #44",
    "Love Everlasting #2",
    "Fantasmas vol. 1 - Ghosted #2",
    "Fantasmas vol. 1 - Ghosted #3",
    "Fantasmas vol. 1 - Ghosted #4",
    "Fantasmas vol. 1 - Ghosted #5",
    "Fantasmas vol. 1 - Ghosted #1",
    "Fantasmas vol. 1 - Ghosted #6",
    "Fantasmas vol. 1 - Ghosted #7",
    "Fantasmas vol. 1 - Ghosted #8",
    "Fantasmas vol. 1 - Ghosted #9",
    "Fantasmas vol. 1 - Ghosted #10",
    "American Vampire vol. 4 - #6",
    "American Vampire vol. 4 - #7",
    "American Vampire vol. 4 - #8",
    "Dragon Age vol. 3 Engano - Deception #1",
    "Dragon Age vol. 3 Engano - Deception #2",
    "Dragon Age vol. 3 Engano - Deception #3",
    "The Walking Dead vol 15 - #169",
    "The Walking Dead vol 15 - #170",
    "The Walking Dead vol 15 - #171",
    "The Walking Dead vol 15 - #173",
    "Thief Of Thieves 1 - #11",
    "Stillwater #13",

]


In [9]:
len(comics_titles_full)

35

In [10]:
df['comics_title'] = df['file_name'].apply(lambda x: comics_titles_full[file_names_l.index(x)] if x in file_names_l else None)

In [11]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4
...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater #13
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater #13
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater #13
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater #13


In [12]:
df.file_name.tolist()

['QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
 'QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass

In [13]:
def get_comics_id(row):
    
    return row.file_name.split("-")[1].strip()

In [14]:
df['comics_id'] = df.apply(lambda row: get_comics_id(row), axis=1)

In [15]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title,comics_id
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4,1499
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4,1499
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4,1499
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4,1499
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4,1499
...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater #13,2200
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater #13,2200
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater #13,2200
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater #13,2200


In [16]:
comics_images_dir = Path("/Utilisateurs/umushtaq/emotion_analysis_comics/comics_dataset_images")

In [17]:
def get_image_file_path(row):
    
    page_nr = row.page_nr
    comics_dir = "00" + row.comics_id
    
    images_dir = Path(comics_images_dir) / comics_dir / "images"    
    file_name = "page" + f"{page_nr:05d}" + ".jpg"
        
    file_path = images_dir / file_name    
    
    return file_path

In [18]:
df['image_path'] = df.apply(lambda row: get_image_file_path(row), axis=1)

In [19]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title,comics_id,image_path
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
...,...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...


In [20]:
df_train = df[df.split == "TRAIN"].reset_index(drop=True)

In [21]:
df_train

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title,comics_id,image_path
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
...,...,...,...,...,...,...,...,...
713,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,16,TRAIN,"[""YOU BETTER HOPE I FORGET THAT SHIT JUST FELL...","[['anger', 'disgust'], ['anger'], ['anger'], [...",Thief Of Thieves 1 - #11,1910,/Utilisateurs/umushtaq/emotion_analysis_comics...
714,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,17,TRAIN,"[""IT'S NOTHING--"", ""YOU TOUCH IT --AND I TAKE ...","[['fear', 'surprise'], ['anger'], ['anger'], [...",Thief Of Thieves 1 - #11,1910,/Utilisateurs/umushtaq/emotion_analysis_comics...
715,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,18,TRAIN,"[""YOU BRING EVERYTHING ABOUT THIS TO HIM."", ""T...","[['neutral'], ['neutral'], ['neutral'], ['sadn...",Thief Of Thieves 1 - #11,1910,/Utilisateurs/umushtaq/emotion_analysis_comics...
716,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,19,TRAIN,"[""ASK YOU ONE MORE TIME, SON--"", ""THE FUCK DID...","[['anger'], ['anger'], ['sadness'], ['fear'], ...",Thief Of Thieves 1 - #11,1910,/Utilisateurs/umushtaq/emotion_analysis_comics...


In [22]:
df_train.iloc[451]

file_name       QC copy - 1567 - 36 Fantasmas vol. 1 - Ghosted...
page_nr                                                        11
split                                                       TRAIN
utterance       ["GET THE FUCK BACK, MAGICIAN.", "OKIE DOKIE T...
emotion_c       [['anger', 'disgust'], ['fear', 'sadness'], ['...
comics_title                        Fantasmas vol. 1 - Ghosted #6
comics_id                                                    1567
image_path      /Utilisateurs/umushtaq/emotion_analysis_comics...
Name: 451, dtype: object

In [23]:
df_train.iloc[451]['image_path']

PosixPath('/Utilisateurs/umushtaq/emotion_analysis_comics/comics_dataset_images/001567/images/page00011.jpg')

In [24]:
import random

In [25]:
train_titles_l = df_train.comics_title.unique().tolist()

In [26]:
eval_titles = random.sample(train_titles_l, 5)

In [27]:
train_titles = [title for title in train_titles_l if title not in eval_titles]

In [28]:
len(train_titles), len(eval_titles)

(23, 5)

In [29]:
df_train_f = df_train[df_train.comics_title.isin(train_titles)]

In [30]:
df_eval_f = df_train[df_train.comics_title.isin(eval_titles)]

In [31]:
len(df_train_f), len(df_eval_f)

(602, 116)

In [32]:
#df.to_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_pg_w_images.csv")

## Build vision dataset

In [33]:
def build_generation_instruction(comics_title, page_utterances):
   
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Emotion Analysis for Comics

You are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.

INPUT:
- List of utterances from a page in a comic
- An image of the comics page

OUTPUT:
- JSON with single key "emotions"
- Value: array of emotion arrays matching utterance order
- ONLY use these emotions: {formatted_classes}
- NO OTHER EMOTION LABELS ARE ALLOWED

RULES:
1. Each utterance must have at least one emotion from the list above
2. Multiple emotions per utterance are allowed
3. Keep emotions in arrays even for single emotions
4. Maintain exact emotion spelling and case
5. No explanations, only JSON output

Example format:
{{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}}

Comic Information:
Title: {comics_title}

Utterances to Classify:
{page_utterances}

"""
    return instruction

In [34]:
def build_image_modality(image_path):
    
    return Image.open(image_path)

In [35]:
def convert_to_conversation(row):
  
    comics_title = row.comics_title
    image_path = row.image_path
    labels = row.emotion_c
    
    utterances_l = eval(row['utterance'])
    pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
  
    instruction = build_generation_instruction(comics_title, pg_utterances)
    image = build_image_modality(image_path)
    
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : image} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : {"emotions": labels}} ]
        },
    ]
    return { "messages" : conversation }
pass

In [36]:
# Apply the function to each row and store the results in a list
comics_mm_dataset_train = [convert_to_conversation(row) for _, row in df_train_f.iterrows()]
comics_mm_dataset_eval = [convert_to_conversation(row) for _, row in df_eval_f.iterrows()]

In [37]:
len(comics_mm_dataset_eval)

116

In [38]:
comics_mm_dataset_eval[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Nightwing #95\n\nUtterances to Classify:\n1. DID YOU HAVE TO ELECTROCU

In [39]:
# def split_dataset(dataset, train_ratio=0.8):
#     train_test = dataset.train_test_split(test_size=1 - train_ratio)
#     return train_test

# dataset_split = split_dataset(comics_mm_dataset)

In [40]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

In [41]:
FastVisionModel.for_training(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-12): 13 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [46]:
args = SFTConfig(
    
        do_train = True,
        do_eval=True,

        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        #max_steps = 30,
        num_train_epochs = 3, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 25,
        eval_steps = 25,
        eval_strategy = "steps",
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "/Utilisateurs/umushtaq/emotion_analysis_comics/outputs_dir_tmp",
        report_to = "none",     # For Weights and Biases
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    )

In [47]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = comics_mm_dataset_train,
    eval_dataset = comics_mm_dataset_eval,
    args = args,
)

In [48]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 NVL. Max memory = 93.003 GB.
30.836 GB of memory reserved.


In [49]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 602 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 225
 "-____-"     Number of trainable parameters = 235,929,600
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss,Validation Loss
25,0.0996,0.140835
50,0.0943,0.143142
75,0.0907,0.138553
100,0.0597,0.15481
125,0.0601,0.147008
150,0.0499,0.155361
175,0.0387,0.179822
200,0.0326,0.179585
225,0.0335,0.177836


Could not locate the best model at outputs/checkpoint-75/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=225, training_loss=0.0621332229508294, metrics={'train_runtime': 2230.8594, 'train_samples_per_second': 0.81, 'train_steps_per_second': 0.101, 'total_flos': 6.04819658094937e+16, 'train_loss': 0.0621332229508294, 'epoch': 2.966887417218543})

In [50]:
FastVisionModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-12): 13 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [51]:
def convert_to_conversation_test(row):
  
    comics_title = row.comics_title
    image_path = row.image_path
    labels = row.emotion_c
    
    utterances_l = eval(row['utterance'])
    pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
  
    instruction = build_generation_instruction(comics_title, pg_utterances)
    image = build_image_modality(image_path)
    
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : image} ]
        },
        { "role" : "assistant",
          "content" : ""
        },
    ]
    return { "messages" : conversation }
pass

In [52]:
df_test = df[df.split == "TEST"].reset_index(drop=True)

In [53]:
comics_mm_dataset_test = [convert_to_conversation_test(row) for _, row in df_test.iterrows()]

In [54]:
len(comics_mm_dataset_test)

156

In [55]:
comics_mm_dataset_test[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Danger Street #1\n\nUtterances to Classify:\n1. HOW\'S IT GOING?\n2. H

In [56]:
#comics_mm_dataset_test[0]['messages'][0]['content'][1]['image']

In [57]:
raw_outputs = []

for message in tqdm(comics_mm_dataset_test):
    
    input_text = tokenizer.apply_chat_template(message['messages'], add_generation_prompt = True)
    image = message['messages'][0]['content'][1]['image']
    #print(input_text)
    #break
   
    inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")
    #print(inputs['input_ids'])
    #print(tokenizer.decode(inputs['input_ids'][0]))
    #break
    
    #output = model.generate(input_ids=inputs, max_new_tokens=128)[0]
    output = model.generate(**inputs, max_new_tokens=512)[0]
    
    #input_length = inputs.shape[1]
    #generated_tokens = output[input_length:]
    
    #decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)  
    decoded_output = tokenizer.decode(output, skip_special_tokens=True)
    raw_outputs.append(decoded_output)

100%|██████████| 156/156 [12:28<00:00,  4.80s/it]


In [58]:
len(raw_outputs)

156

In [59]:
raw_outputs

['user\n\n### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Danger Street #1\n\nUtterances to Classify:\n1. HOW\'S IT GOING?\n2. HEY.\n3. CAN I GET YOU ANYTHING?\n4. JUST A COKE.\n5. OKAY. COMING U

In [60]:
op = []

for output in raw_outputs:
    op.append(output.split("assistant\n\nassistant\n\n")[1])

In [61]:
op

['{\'emotions\': "[[\'joy\'], [\'surprise\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'neutral\'], [\'neutral\']]"}',
 '{\'emotions\': "[[\'sadness\'], [\'sadness\'], [\'anger\'], [\'anger\'], [\'sadness\'], [\'anger\',\'sadness\'], [\'anger\',\'sadness\']]"}',
 '{\'emotions\': "[[\'anger\', \'disgust\',\'sadness\'], [\'anger\', \'disgust\',\'sadness\'], [\'anger\', \'disgust\',\'sadness\'], [\'fear\',\'surprise\'], [\'surprise\'], [\'neutral\'], [\'anger\',\'surprise\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'anger\', \'disgust\'], [\'anger\'], [\'anger\',\'surprise\'], [\'joy\'], [\'joy\'], [\'joy\']]"}',
 '{\'emotions\': "[[\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'fear\',\'sadness\'], [\'surprise\'], [\'neutral\'], [\'neutral\'], [\'neutral\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\',

In [62]:
grounds = df_test.emotion_c.tolist()

In [63]:
import ast


In [64]:
grounds = [ast.literal_eval(x) for x in grounds]

In [65]:
bad_idx = []
predictions = []

for i, x in enumerate(op):
    try:
        predictions.append(json_repair.loads(x)['emotions'])
    except:
        print(i)
        bad_idx.append(i)

In [66]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    #del predictions[idx]

In [67]:
len(grounds), len(predictions)

(156, 156)

In [68]:
predictions

["[['joy'], ['surprise'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['neutral'], ['neutral']]",
 "[['sadness'], ['sadness'], ['anger'], ['anger'], ['sadness'], ['anger','sadness'], ['anger','sadness']]",
 "[['anger', 'disgust','sadness'], ['anger', 'disgust','sadness'], ['anger', 'disgust','sadness'], ['fear','surprise'], ['surprise'], ['neutral'], ['anger','surprise'], ['joy'], ['joy'], ['joy'], ['anger', 'disgust'], ['anger'], ['anger','surprise'], ['joy'], ['joy'], ['joy']]",
 "[['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['fear','sadness'], ['surprise'], ['neutral'], ['neutral'], ['neutral'], ['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['neutral'], ['neutral'], ['anger'], ['anger'], ['anger']]",
 "[['neutral'], ['joy'], ['joy']]",
 "[['anger','sadness'], ['anger', 'fear'], ['anger','sadness'], ['anger', 'fear','sadness'], ['joy'], 

In [69]:
bad_idx = []
predictions_l = []

for i, x in enumerate(predictions):
    try:
        predictions_l.append(ast.literal_eval(x))
    except:
        print(i)
        bad_idx.append(i)

In [70]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    #del predictions[idx]

In [71]:
len(grounds), len(predictions_l)

(156, 156)

In [72]:
bad_idx = []

for idx, (i,j) in enumerate(zip(grounds, predictions_l)):
    if len(i) != len(j):
        print(idx, len(i), len(j))
        bad_idx.append(idx)

3 21 20
5 20 18
27 17 16
36 17 16


In [73]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del predictions_l[idx]

In [74]:
grounds_l = [item for sublist in grounds for item in sublist]
predictions_l = [item for sublist in predictions_l for item in sublist]

In [75]:
len(grounds_l), len(predictions_l)

(1251, 1251)

In [76]:
grounds_l

[['surprise', 'joy'],
 ['joy'],
 ['surprise', 'joy'],
 ['joy'],
 ['joy'],
 ['joy'],
 ['surprise'],
 ['joy'],
 ['joy'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['anger', 'disgust'],
 ['anger', 'disgust'],
 ['neutral'],
 ['sadness'],
 ['sadness'],
 ['anger', 'sadness'],
 ['anger', 'sadness'],
 ['anger', 'sadness'],
 ['fear', 'surprise'],
 ['surprise'],
 ['joy'],
 ['anger', 'surprise'],
 ['joy'],
 ['joy'],
 ['joy'],
 ['anger'],
 ['anger'],
 ['surprise', 'joy'],
 ['fear', 'sadness'],
 ['fear', 'sadness'],
 ['fear', 'surprise'],
 ['neutral'],
 ['joy'],
 ['joy'],
 ['anger', 'sadness'],
 ['anger'],
 ['sadness'],
 ['surprise'],
 ['anger'],
 ['neutral'],
 ['joy'],
 ['joy'],
 ['joy'],
 ['anger', 'surprise'],
 ['anger', 'surprise'],
 ['anger', 'surprise'],
 ['joy'],
 ['joy'],
 ['joy'],
 ['neutral'],
 ['fear'],
 ['anger'],
 ['neutral'],
 ['fear'],
 ['anger'],
 ['anger', 'surprise'],
 ['anger', 'surprise'],
 ['anger'],
 ['anger', 'disgust'],
 ['anger', 'surprise'],
 ['anger'],
 ['a

In [77]:
set([type(item) for sublist in grounds_l for item in sublist])

{str}

In [78]:
mlb = MultiLabelBinarizer()

In [79]:
y_true_mhot = mlb.fit_transform(grounds_l)
y_pred_mhot = mlb.transform(predictions_l)

In [80]:
y_pred_mhot.shape

(1251, 7)

In [81]:
print(classification_report(y_true_mhot, y_pred_mhot, target_names=mlb.classes_, digits=3))

              precision    recall  f1-score   support

       anger      0.572     0.630     0.600       441
     disgust      0.031     0.022     0.026        46
        fear      0.444     0.470     0.456       279
         joy      0.596     0.543     0.568       280
     neutral      0.460     0.404     0.430        99
     sadness      0.517     0.502     0.509       325
    surprise      0.614     0.582     0.598       328

   micro avg      0.537     0.532     0.534      1798
   macro avg      0.462     0.450     0.455      1798
weighted avg      0.534     0.532     0.532      1798
 samples avg      0.570     0.553     0.534      1798



In [82]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
#print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
#print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

Peak reserved memory = 31.213 GB.
Peak reserved memory for training = 0.377 GB.
Peak reserved memory % of max memory = 33.561 %.
Peak reserved memory for training % of max memory = 0.405 %.


In [None]:
# image1 = Image.open("/Utilisateurs/umushtaq/emotion_analysis_comics/comics_dataset_images/001499/images/page00001.jpg")
# #image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
# #display(image1)
# #display(image2)

In [38]:
#instruction = "You are an expert in comics. Explain the emotional content of this comics page."

In [39]:
# messages = [
#     {"role": "user", "content": [
#         {"type": "image"},
#         {"type": "text", "text": instruction}
#     ]}
# ]

In [40]:
#input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)

In [41]:
# inputs = tokenizer(
#     image1,
#     input_text,
#     add_special_tokens = False,
#     return_tensors = "pt",
# ).to("cuda")

In [42]:
# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer, skip_prompt = True)

In [157]:
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
#                    use_cache = True, temperature = 1.5, min_p = 0.1)