In [1]:
import torch
import json
import json_repair
import pandas as pd

from tqdm import tqdm
from datasets import Dataset

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-32B-Instruct-bnb-4bit",
    #model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

==((====))==  Unsloth 2024.12.4: Fast Qwen2 patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.003 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
model = FastLanguageModel.get_peft_model( 
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], 
    use_rslora=True,
    use_gradient_checkpointing=True
)

Unsloth 2024.12.4 patched 64 layers with 64 QKV layers, 64 O layers and 64 MLP layers.


## data

In [4]:
#df_u = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_dataset.csv")

In [5]:
#df_u

In [6]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_dataset_pg.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,file_name,page_nr,split,utterance,emotion_c
0,0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f..."
1,1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],..."
2,2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]"
3,3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]"
4,4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur..."
...,...,...,...,...,...,...
869,869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [..."
870,870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']..."
871,871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [..."
872,872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr..."


In [8]:
def get_nr_emotions(row):

    return len(row.emotion_c)

In [9]:
df['nr_emotions'] = df.apply(lambda row: get_nr_emotions(row), axis=1)

In [10]:
df

Unnamed: 0.1,Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,nr_emotions
0,0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",137
1,1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",99
2,2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",40
3,3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",22
4,4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",83
...,...,...,...,...,...,...,...
869,869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",166
870,870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",98
871,871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",143
872,872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",144


In [11]:
df.file_name.unique()

array(['QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
       'QC copy - 1500 - 04 Nightwing 19 _Nightwing 95_.xlsx',
       'QC copy - 1501 - 09 Mundos sin Liga de la Justicia Green Lantern 1 - DCWWJL GL 1.xlsx',
       'QC copy - 1502 - 09 Mundos sin Liga de la Justicia Green Lantern 1 - DCVMM Lobo.xlsx',
       'QC copy - 1503 - 10 Crisis Oscura Flash - FLS 783.xlsx',
       'QC copy - 1507 - 22 Calle Peligro 1.xlsx',
       'QC copy - 1508 - 48 Pequen_os Titanes 25.xlsx',
       'QC copy - 1513 - 21 Blanco Humano 9.xlsx',
       'QC copy - 1514 - 15 DC contra Vampiros 11.xlsx',
       'QC copy - 1517 - 37 John Carpenter Historias para una noche de Halloween 3.xlsx',
       'QC copy - 1518 - 50 Las asombrosas aventuras de las Tortugas Ninja 5.xlsx',
       'QC copy - 1520 - 51 Sonic The Hedgehog 44.xlsx',
       'QC copy - 1521 - 40 Amor eterno 2.xlsx',
       'QC copy - 1559 - 36 Fantasmas vol. 1 - Ghosted 2.xlsx',
       'QC copy - 1560 - 36 Fantasmas vol. 1 - G

In [12]:
df.nr_emotions.value_counts()

nr_emotions
116    11
44     10
156    10
123     9
109     9
       ..
313     1
236     1
256     1
187     1
233     1
Name: count, Length: 252, dtype: int64

In [13]:
file_names_l = df.file_name.unique().tolist()

In [14]:
comics_titles_full = [
    
    "Jurassic League #4",      
    "Nightwing #95",    
    "Dark Crisis: Worlds Without a Justice League - Green Lantern #1", 
    "Dark Crisis: Worlds Without a Justice League - Green Lantern - Very Merry Multiverse",
    "Dark Crisis: The Flash #783",
    "Danger Street #1",
    "Tiny Titans #25",
    "Human Target #9",
    "DC vs. Vampires #11",
    "John Carpenter's Tales for a Halloweenight #3",
    "The Amazing Adventures of the Ninja Turtles #5",
    "Sonic The Hedgehog #44",
    "Love Everlasting #2",
    "Fantasmas vol. 1 - Ghosted #2",
    "Fantasmas vol. 1 - Ghosted #3",
    "Fantasmas vol. 1 - Ghosted #4",
    "Fantasmas vol. 1 - Ghosted #5",
    "Fantasmas vol. 1 - Ghosted #1",
    "Fantasmas vol. 1 - Ghosted #6",
    "Fantasmas vol. 1 - Ghosted #7",
    "Fantasmas vol. 1 - Ghosted #8",
    "Fantasmas vol. 1 - Ghosted #9",
    "Fantasmas vol. 1 - Ghosted #10",
    "American Vampire vol. 4 - #6",
    "American Vampire vol. 4 - #7",
    "American Vampire vol. 4 - #8",
    "Dragon Age vol. 3 Engano - Deception #1",
    "Dragon Age vol. 3 Engano - Deception #2",
    "Dragon Age vol. 3 Engano - Deception #3",
    "The Walking Dead vol 15 - #169",
    "The Walking Dead vol 15 - #170",
    "The Walking Dead vol 15 - #171",
    "The Walking Dead vol 15 - #173",
    "Thief Of Thieves 1 - #11",
    "Stillwater #13",

]


In [15]:
len(comics_titles_full)

35

In [16]:
df['comics_title'] = df['file_name'].apply(lambda x: comics_titles_full[file_names_l.index(x)] if x in file_names_l else None)

In [17]:
df

Unnamed: 0.1,Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,nr_emotions,comics_title
0,0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",137,Jurassic League #4
1,1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",99,Jurassic League #4
2,2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",40,Jurassic League #4
3,3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",22,Jurassic League #4
4,4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",83,Jurassic League #4
...,...,...,...,...,...,...,...,...
869,869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",166,Stillwater #13
870,870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",98,Stillwater #13
871,871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",143,Stillwater #13
872,872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",144,Stillwater #13


### Get summary

In [18]:
comics_titles_full = [
    
    "Jurassic League #4",      
    "Nightwing #95",    
    "Dark Crisis: Worlds Without a Justice League - Green Lantern #1", 
    "Dark Crisis: Worlds Without a Justice League - Green Lantern - Very Merry Multiverse",
    "Dark Crisis: The Flash #783",
    "Danger Street #1",
    "Tiny Titans #25",
    "Human Target #9",
    "DC vs. Vampires #11",
    "John Carpenter's Tales for a Halloweenight #3",
    "The Amazing Adventures of the Ninja Turtles #5",
    "Sonic The Hedgehog #44",
    "Love Everlasting #2",
    "Fantasmas vol. 1 - Ghosted #2",
    "Fantasmas vol. 1 - Ghosted #3",
    "Fantasmas vol. 1 - Ghosted #4",
    "Fantasmas vol. 1 - Ghosted #5",
    "Fantasmas vol. 1 - Ghosted #1",
    "Fantasmas vol. 1 - Ghosted #6",
    "Fantasmas vol. 1 - Ghosted #7",
    "Fantasmas vol. 1 - Ghosted #8",
    "Fantasmas vol. 1 - Ghosted #9",
    "Fantasmas vol. 1 - Ghosted #10",
    "American Vampire vol. 4 - #6",
    "American Vampire vol. 4 - #7",
    "American Vampire vol. 4 - #8",
    "Dragon Age vol. 3 Engano - Deception #1",
    "Dragon Age vol. 3 Engano - Deception #2",
    "Dragon Age vol. 3 Engano - Deception #3",
    "The Walking Dead vol 15 - #169",
    "The Walking Dead vol 15 - #170",
    "The Walking Dead vol 15 - #171",
    "The Walking Dead vol 15 - #173",
    "Thief Of Thieves 1 - #11",
    "Stillwater #13",

]


In [19]:
model = FastLanguageModel.for_inference(model)

In [20]:
# title_summaries_d = {}

# for title in tqdm(comics_titles_full):
    
#     summary_instruction = f"""You are an expert in comics with deep insight into both narrative and emotional storytelling. When given a comics title, provide a thorough yet concise emotional journey analysis that includes:

#     1. The core emotional arc of the story from beginning to end
#     2. Key character transformations and their emotional evolution
#     3. Pivotal emotional moments that drive the narrative
#     4. The emotional resonance and impact of major themes
#     5. How the story's emotional stakes build and resolve

# Important guidelines:
# - Keep the summary focused and under 200 words while ensuring it has a proper conclusion. Do not leave any sentence unfinished.
# - Focus on emotional depth rather than just plot points
# - Highlight character relationships and their emotional dynamics
# - Describe how the story makes readers feel at critical moments
# - Connect emotional beats to show the story's psychological progression
# - Ensure proper resolution of both plot and emotional arcs

# Please create a cohesive summary that reveals both the story's external journey and its deeper emotional currents, showing how they interweave to create meaning."""
    
#     sys_msg = {'role': 'system', 'content': summary_instruction}
#     user_msg = {'role': 'user', 'content': "Here is the comics title: \n" + title}
#     assistant_msg = {'role': 'assistant', 'content': ""}
    
#     messages = [sys_msg, user_msg, assistant_msg]
    
#     input_text = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
    
#     outputs = model.generate(
#         input_ids=input_text,
#         max_new_tokens=256,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.9,
#         repetition_penalty=1.1
#     )
    
#     input_length = input_text.shape[1]
#     generated_tokens = outputs[0][input_length:]
#     #print(outputs)
#     decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
#     title_summaries_d[title] = decoded_output


In [21]:
#title_summaries_d

In [22]:
with open('summaries.json', 'r') as json_file:
    title_summaries_d = json.load(json_file)

In [23]:
df['summary'] = df['comics_title'].map(title_summaries_d)

In [24]:
df_train = df[df.split == "TRAIN"].reset_index(drop=True)

In [25]:
len(df_train)

718

In [26]:
# with open('/Utilisateurs/umushtaq/emotion_analysis_comics/data/summaries.json', 'w') as json_file:
#     json.dump(title_summaries_d, json_file)

### Comics dataset

In [27]:
def build_generation_instruction():
   
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Emotion Analysis for Comics

You are an emotion analysis expert for comic dialogue. Analyze utterances based on both the provided emotional summary and immediate context.

INPUT:
- List of utterances from a comic page
- Comic's emotional summary describing themes and character arcs

OUTPUT:
- JSON with single key "page_utterance_emotions"
- Value: array of emotion arrays matching utterance order
- ONLY use these emotions: {formatted_classes}
- NO OTHER EMOTION LABELS ARE ALLOWED

RULES:
1. Each utterance must have at least one emotion from the list above
2. Multiple emotions per utterance are allowed
3. Keep emotions in arrays even for single emotions
4. Maintain exact emotion spelling and case
5. No explanations, only JSON output

Example format:
{{"page_utterance_emotions": [["joy"], ["anger", "fear"], ["neutral"]]}}
"""
    return instruction

In [28]:
def build_user_prompt(comics_title, emotional_summary, pg_utterances):
    usr_prompt = f"""Comic Information:
Title: {comics_title}
Emotional Summary: {emotional_summary}

Utterances to Classify:
{pg_utterances}"""
    return usr_prompt

In [29]:
generation_instruction = build_generation_instruction()

In [30]:
sys_msg_l = []
user_msg_l = []
assistant_msg_l = []

for _, row in df_train.iterrows():
        
        sys_msg = {'role': 'system', 'content': generation_instruction}
        
        comics_title = row['comics_title']
        comics_summary = row['summary']
        
        utterances_l = eval(row['utterance'])
        pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
        
        # usr_prompt = f"Comics title: {comics_title}\n" + f"Comics summary: {comics_summary}\n" + "Here is the list of utterances that you will classify: \n" + pg_utterances
        usr_prompt = build_user_prompt(comics_title, comics_summary, pg_utterances)
        
        user_msg = {'role': 'user', 'content': usr_prompt}        
        #assistant_msg = {'role': 'assistant', 'content': ""}
        assistant_msg = {'role': 'assistant', 'content': f'{{"emotions": {row["emotion_c"]}}}'}


        sys_msg_l.append(sys_msg)
        user_msg_l.append(user_msg)
        assistant_msg_l.append(assistant_msg)
        

In [31]:
comics_dataset = []

for i in range(len(sys_msg_l)):

    comics_dataset.append([sys_msg_l[i], user_msg_l[i], assistant_msg_l[i]])

In [32]:
len(comics_dataset)

718

In [33]:
comics_dataset[0]

[{'role': 'system',
  'content': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Analyze utterances based on both the provided emotional summary and immediate context.\n\nINPUT:\n- List of utterances from a comic page\n- Comic\'s emotional summary describing themes and character arcs\n\nOUTPUT:\n- JSON with single key "page_utterance_emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"page_utterance_emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n'},
 {'role': 'user',
  'content': 'Comic Information:\nTitle

In [34]:
def fix_comics_dataset(comics_dataset):
    fixed_comics_dataset = []
    for conversation in comics_dataset:
        fixed_conversation = []
        for message in conversation:
            if isinstance(message['content'], list):  # If the 'value' is a list of emotions
                message['content'] = ', '.join(message['content'])  # Join the list into a string
            fixed_conversation.append(message)
        fixed_comics_dataset.append(fixed_conversation)
    return fixed_comics_dataset

In [35]:
fixed_comics_dataset = fix_comics_dataset(comics_dataset)

In [36]:
dataset = Dataset.from_dict({
    'conversations': fixed_comics_dataset
})

In [37]:
dataset

Dataset({
    features: ['conversations'],
    num_rows: 718
})

In [38]:
dataset[0]['conversations']

[{'content': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Analyze utterances based on both the provided emotional summary and immediate context.\n\nINPUT:\n- List of utterances from a comic page\n- Comic\'s emotional summary describing themes and character arcs\n\nOUTPUT:\n- JSON with single key "page_utterance_emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"page_utterance_emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n',
  'role': 'system'},
 {'content': 'Comic Information:\nTitle: Jurassic League 

In [39]:
tokenizer = get_chat_template(
    tokenizer,
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    chat_template="chatml",
)

def apply_template_comics(examples):
    messages = examples["conversations"]
    #messages = examples['input'] + examples['output']
    text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]
    return {"text": text}

Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


In [40]:
comics_dataset = dataset.map(apply_template_comics, batched=True)

Map:   0%|          | 0/718 [00:00<?, ? examples/s]

In [41]:
comics_dataset

Dataset({
    features: ['conversations', 'text'],
    num_rows: 718
})

In [42]:
def split_dataset(dataset, train_ratio=0.8):
    train_test = dataset.train_test_split(test_size=1 - train_ratio)
    return train_test

dataset_split = split_dataset(comics_dataset)

In [43]:
train_dataset = dataset_split['train']
eval_dataset = dataset_split['test']

In [44]:
train_dataset

Dataset({
    features: ['conversations', 'text'],
    num_rows: 574
})

In [45]:
print(train_dataset[452]['text'])

<|im_start|>system
### Emotion Analysis for Comics

You are an emotion analysis expert for comic dialogue. Analyze utterances based on both the provided emotional summary and immediate context.

INPUT:
- List of utterances from a comic page
- Comic's emotional summary describing themes and character arcs

OUTPUT:
- JSON with single key "page_utterance_emotions"
- Value: array of emotion arrays matching utterance order
- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"
- NO OTHER EMOTION LABELS ARE ALLOWED

RULES:
1. Each utterance must have at least one emotion from the list above
2. Multiple emotions per utterance are allowed
3. Keep emotions in arrays even for single emotions
4. Maintain exact emotion spelling and case
5. No explanations, only JSON output

Example format:
{"page_utterance_emotions": [["joy"], ["anger", "fear"], ["neutral"]]}
<|im_end|>
<|im_start|>user
Comic Information:
Title: Fantasmas vol. 1 - Ghosted #6
Emotional Summar

In [46]:
OUTPUT_DIR = "/Utilisateurs/umushtaq/emotion_analysis_comics/outputs_dir_tmp"

In [47]:
args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="cosine",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        num_train_epochs=5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=25,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        
        eval_strategy="steps",  # Run evaluation during training (can also use "epoch")
        eval_steps=25,  # Perform evaluation every 50 steps
        save_strategy="steps",  # Save the model every few steps
        save_steps=25,  # Save every 200 steps
        load_best_model_at_end=True,
    
        output_dir=OUTPUT_DIR,
        seed=0,
    )

In [48]:
trainer=SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Replace with your train dataset
    eval_dataset=eval_dataset, 
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=args,
)

Map (num_proc=2):   0%|          | 0/574 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/144 [00:00<?, ? examples/s]

In [49]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 574 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 360
 "-____-"     Number of trainable parameters = 134,217,728


Step,Training Loss,Validation Loss
25,2.3849,0.545329
50,0.9678,0.458902
75,0.9553,0.445368
100,0.802,0.458331
125,0.799,0.449259
150,0.6929,0.474376
175,0.4971,0.488668
200,0.5325,0.477409
225,0.4247,0.541745
250,0.2462,0.542545


TrainOutput(global_step=360, training_loss=0.6195371331440078, metrics={'train_runtime': 1553.5465, 'train_samples_per_second': 1.847, 'train_steps_per_second': 0.232, 'total_flos': 4.509897562699776e+17, 'train_loss': 0.6195371331440078, 'epoch': 5.0})

In [50]:
model = FastLanguageModel.for_inference(model)

In [51]:
df_test = df[df.split == "TEST"].reset_index(drop=True)

In [94]:
sys_msg_l = []
user_msg_l = []
assistant_msg_l = []

for _, row in df_test.iterrows():
        
        sys_msg = {'role': 'system', 'content': generation_instruction}
        
        comics_title = row['comics_title']
        comics_summary = row['summary']
        
        utterances_l = eval(row['utterance'])
        pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
        
        # usr_prompt = f"Comics title: {comics_title}\n" + f"Comics summary: {comics_summary}\n" + "Here is the list of utterances that you will classify: \n" + pg_utterances
        usr_prompt = build_user_prompt(comics_title, comics_summary, pg_utterances)
        
        user_msg = {'role': 'user', 'content': usr_prompt}        
        #assistant_msg = {'role': 'assistant', 'content': ""}
        assistant_msg = {'role': 'assistant', 'content': ""}


        sys_msg_l.append(sys_msg)
        user_msg_l.append(user_msg)
        assistant_msg_l.append(assistant_msg)
        

In [95]:
test_messages = []

for i in range(len(sys_msg_l)):
    
    #obj = {"list_emotion_classes": ["Anger", "Fear"]}

    #comics_dataset.append([human_msg_l[i], assistant_msg_l[i]])
    test_messages.append([sys_msg_l[i], user_msg_l[i], assistant_msg_l[i]])

In [96]:
# human_msg_l = []
# assistant_msg_l = []

# for _, row in df_test.iterrows():
        
#         prompt = instruction.replace("<comic_title>", row['comics_title']).replace("<speaker_id>", row['speaker_id']).replace("<utterance>", row['utterance'])
        
#         human_msg = {'role': 'user', 'content': prompt}
        
#         #obj = {"list_emotion_classes": row['emotion_u']}
#         obj = row['emotion_u']
#         assistant_msg = {'role': 'assistant', 'content': ""}
        
#         human_msg_l.append(human_msg)
#         assistant_msg_l.append(assistant_msg)
        
        

In [97]:
# test_messages = []

# for i in range(len(human_msg_l)):
    
#     #obj = {"list_emotion_classes": ["Anger", "Fear"]}

#     test_messages.append([human_msg_l[i], assistant_msg_l[i]])

In [98]:
len(test_messages)

156

In [99]:
# test_messages = test_messages[:100]

In [100]:
raw_outputs = []

for message in tqdm(test_messages):
    
    inputs = tokenizer.apply_chat_template(
    message,
    tokenize=True,
    add_generation_prompt=True,
    #return_dict=True,
    return_tensors="pt",
).to("cuda")
    #print(inputs)
    #break
    
    output = model.generate(input_ids=inputs, max_new_tokens=128)[0]
    #output = model.generate(**inputs, max_new_tokens=128)[0]
    
    input_length = inputs.shape[1]
    generated_tokens = output[input_length:]
    
    decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)  
    #decoded_output = tokenizer.decode(output, skip_special_tokens=True)
    raw_outputs.append(decoded_output)
    #break

100%|██████████| 156/156 [11:43<00:00,  4.51s/it]


In [101]:
#print(tokenizer.decode(inputs[0]))

In [102]:
len(raw_outputs)

156

In [103]:
raw_outputs

['{"emotions": [[\'surprise\', \'joy\'], [\'surprise\', \'joy\'], [\'surprise\', \'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'neutral\'], [\'neutral\']]}',
 '{"emotions": [[\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\']]}',
 '{"emotions": [[\'anger\'], [\'anger\', \'disgust\'], [\'anger\'], [\'anger\', \'surprise\'], [\'anger\', \'surprise\'], [\'anger\', \'surprise\'], [\'anger\'], [\'anger\', \'joy\'], [\'anger\', \'joy\'], [\'joy\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'anger\', \'fear\'], [\'anger\', \'fear\'], [\'anger\', \'fear\']]}',
 '{"emotions": [[\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'fear\'], [\'anger\', \'surprise\'], [\'anger\', \'surprise\'], [\'anger\', \'surprise\'], [\'anger\', \'surprise\'], [\'anger\', \'surprise\'], [\'anger\', \'surprise\'], [\'anger\', \'surprise\'],

In [104]:
grounds = df_test.emotion_c.tolist()

In [105]:
len(grounds)

156

In [106]:
import json

In [107]:
predictions = [json_repair.loads(e) for e in raw_outputs]

In [108]:
predictions

[{'emotions': [['surprise', 'joy'],
   ['surprise', 'joy'],
   ['surprise', 'joy'],
   ['joy'],
   ['joy'],
   ['joy'],
   ['joy'],
   ['joy'],
   ['joy'],
   ['neutral'],
   ['neutral']]},
 {'emotions': [['anger', 'disgust'],
   ['anger', 'disgust'],
   ['anger', 'disgust'],
   ['anger', 'disgust'],
   ['anger', 'disgust'],
   ['anger', 'disgust'],
   ['anger']]},
 {'emotions': [['anger'],
   ['anger', 'disgust'],
   ['anger'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   ['anger'],
   ['anger', 'joy'],
   ['anger', 'joy'],
   ['joy'],
   ['anger'],
   ['anger'],
   ['anger'],
   ['anger', 'fear'],
   ['anger', 'fear'],
   ['anger', 'fear']]},
 {'emotions': [['anger', 'disgust'],
   ['anger', 'disgust'],
   ['anger', 'disgust'],
   ['fear'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   ['anger', 'surprise'],
   

In [109]:
# #predictions = [e.split('\n\n')[0] for e in raw_outputs]
# bad_idx = []
# predictions = []

# for idx, e in enumerate(raw_outputs):
#     try:
#         predictions.append(json.loads(e))
#     except:
#         print(idx)
#         bad_idx.append(idx)
        


In [110]:
#len(predictions)

In [111]:
#predictions

In [112]:
#predictions = [json_repair.loads(e) for e in predictions]

In [113]:
#predictions

In [114]:
preds_l = []
bad_idx = []

for i, pred in enumerate(predictions):
    try:        
        preds_l.append(pred['emotions'])
    except:
        print(i)
        bad_idx.append(i)

In [115]:
preds_l

[[['surprise', 'joy'],
  ['surprise', 'joy'],
  ['surprise', 'joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['neutral'],
  ['neutral']],
 [['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger']],
 [['anger'],
  ['anger', 'disgust'],
  ['anger'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger'],
  ['anger', 'joy'],
  ['anger', 'joy'],
  ['joy'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['anger', 'fear'],
  ['anger', 'fear'],
  ['anger', 'fear']],
 [['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['fear'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['

In [116]:
grounds = [item for i, item in enumerate(grounds) if i not in bad_idx]

In [117]:
len(grounds), len(preds_l)

(156, 156)

In [118]:
preds_l

[[['surprise', 'joy'],
  ['surprise', 'joy'],
  ['surprise', 'joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['neutral'],
  ['neutral']],
 [['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger']],
 [['anger'],
  ['anger', 'disgust'],
  ['anger'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger'],
  ['anger', 'joy'],
  ['anger', 'joy'],
  ['joy'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['anger', 'fear'],
  ['anger', 'fear'],
  ['anger', 'fear']],
 [['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['fear'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'surprise'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['

In [119]:
grounds

["[['surprise', 'joy'], ['joy'], ['surprise', 'joy'], ['joy'], ['joy'], ['joy'], ['surprise'], ['joy'], ['joy'], ['neutral'], ['neutral']]",
 "[['neutral'], ['neutral'], ['anger', 'disgust'], ['anger', 'disgust'], ['neutral'], ['sadness'], ['sadness']]",
 "[['anger', 'sadness'], ['anger', 'sadness'], ['anger', 'sadness'], ['fear', 'surprise'], ['surprise'], ['joy'], ['anger', 'surprise'], ['joy'], ['joy'], ['joy'], ['anger'], ['anger'], ['surprise', 'joy'], ['fear', 'sadness'], ['fear', 'sadness'], ['fear', 'surprise']]",
 "[['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['fear', 'sadness'], ['fear', 'sadness', 'surprise'], ['sadness'], ['sadness'], ['fear', 'sadness'], ['sadness', 'surprise'], ['sadness', 'surprise'], ['joy'], ['anger'], ['anger'], ['anger'], ['anger', 'disgust'], ['joy'], ['joy'], ['surprise', 'joy'], ['surprise', 'joy'], ['anger', 'surprise'], ['anger', 'surprise']]",
 "[['neutral'], ['joy'], ['joy']]",
 "[['neutral'], ['fear'], ['neutral'], ['ang

In [120]:
import ast

grounds = [ast.literal_eval(x) for x in grounds]

In [121]:
grounds

[[['surprise', 'joy'],
  ['joy'],
  ['surprise', 'joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['surprise'],
  ['joy'],
  ['joy'],
  ['neutral'],
  ['neutral']],
 [['neutral'],
  ['neutral'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['neutral'],
  ['sadness'],
  ['sadness']],
 [['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['fear', 'surprise'],
  ['surprise'],
  ['joy'],
  ['anger', 'surprise'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['anger'],
  ['anger'],
  ['surprise', 'joy'],
  ['fear', 'sadness'],
  ['fear', 'sadness'],
  ['fear', 'surprise']],
 [['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['fear', 'sadness'],
  ['fear', 'sadness', 'surprise'],
  ['sadness'],
  ['sadness'],
  ['fear', 'sadness'],
  ['sadness', 'surprise'],
  ['sadness', 'surprise'],
  ['joy'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['anger', 'disgust'],
  ['joy'],
  ['joy'],
  ['surprise', 'joy'],
  ['surprise', 'joy'],
  ['anger', 'surprise'],
  ['anger', 

In [122]:
bad_idx = []

for idx, (i,j) in enumerate(zip(grounds, preds_l)):
    if len(i) != len(j):
        print(idx, len(i), len(j))
        bad_idx.append(idx)

3 21 17
14 23 21
15 23 22
120 17 15


In [123]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del preds_l[idx]

In [124]:
grounds = [item for sublist in grounds for item in sublist]
predictions = [item for sublist in preds_l for item in sublist]

In [125]:
len(grounds), len(predictions)

(1242, 1242)

In [126]:
mlb = MultiLabelBinarizer()

In [127]:
y_true_mhot = mlb.fit_transform(grounds)
y_pred_mhot = mlb.transform(predictions)



In [128]:
y_pred_mhot.shape

(1242, 7)

In [129]:
y_pred_mhot.shape

(1242, 7)

In [130]:
print(classification_report(y_true_mhot, y_pred_mhot, target_names=mlb.classes_, digits=3))

              precision    recall  f1-score   support

       anger      0.465     0.707     0.561       423
     disgust      0.167     0.500     0.250        40
        fear      0.541     0.444     0.487       284
         joy      0.571     0.448     0.502       270
     neutral      0.667     0.059     0.108       102
     sadness      0.574     0.524     0.548       317
    surprise      0.536     0.574     0.554       340

   micro avg      0.499     0.525     0.512      1776
   macro avg      0.503     0.465     0.430      1776
weighted avg      0.531     0.525     0.504      1776
 samples avg      0.521     0.538     0.503      1776



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
