In [1]:
import torch
import json_repair
import pandas as pd

from tqdm import tqdm
from datasets import Dataset

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    #model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    #model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.003 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], 
    use_rslora=True,
    use_gradient_checkpointing=True
)

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Data

In [4]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_dataset.csv")

In [5]:
df

Unnamed: 0,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split
0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,1,DID YOU HAVE TO ELECTROCUTE HER SO HARD?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE3-SA0-SU5-JO0,ID-1,TRAIN
1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,2,IT'S NOT LIKE I HAVE DIFFERENT SETTINGS.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU5-JO0,ID-2,TRAIN
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,3,YOU'RE ELECTROCUTIONER. IT'S YOUR WHOLE THING....,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE2-SA0-SU0-JO0,ID-1,TRAIN
3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,3,1,"OH, HEY. I THINK SHE'S AWAKE.",2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU4-JO0,ID-2,TRAIN
4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,4,1,"WELCOME BACK, MADAM MAYOR. BLOCKBUSTER IS PRET...",2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN3-DI0-FE0-SA0-SU0-JO0,ID-1,TRAIN
...,...,...,...,...,...,...,...,...,...,...,...
7124,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,2,SHE WOULDN'T DO THAT TO US. WE TALKED FOR A LO...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Eugene,AN0-DI0-FE1-SA3-SU0-JO0,Eugene,TRAIN
7125,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,3,… I KNOW HER.,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Eugene,AN0-DI0-FE1-SA3-SU0-JO0,Eugene,TRAIN
7126,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,4,1,"UH, GUYS…",2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:JUANITA...,AN0-DI0-FE3-SA0-SU4-JO0,JUANITA SANCHEZ,TRAIN
7127,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,22,1,1,PUT YOUR WEAPONS DOWN AND PUT YOUR HANDS IN TH...,2024-09-06 - SyimykRasulov\nFeeling:AN4-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN4-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:ID- 2,AN4-DI0-FE0-SA0-SU0-JO0,ID- 2,TRAIN


In [6]:
df.speaker_id.unique()

array(['ID-1', 'ID-2', 'NIGHTWING', 'MELINDA', 'BLOCKBUSTER', 'AUDRE',
       'MACLEAN', 'ID-3', 'ID-3 ', 'MAGGIE', 'COMMISSIONER', 'BATWOMAN',
       'ROBIN', 'ID-4', 'ID-5', 'STARGIRL', 'MARONI', 'FLASH', 'ID-6',
       'ID-7', 'ID-8', 'ID-9', 'ELLIOT', 'Eleanor', 'Momma',
       'no annotation', 'Natasha', 'John', 'BRIGHT REVENANT', 'HAWKGIRL',
       'CHRISTOPHER', 'ICE', 'JESUS', 'unknown', 'ID- 1', 'ID- 2',
       'ID- 3', 'ID- 4', 'ID- 5', 'ID- 6', 'Ms.Jones', 'Felicia Book',
       'ID- 7', 'ID- 8', 'ID- 9', 'ID- 10', 'ID- 11', 'ID- 12', 'Skinner',
       'Cal', 'Homo Abominus', 'Agent Bixby', 'Lucia', 'Trapp', 'Kill',
       'ID- 14', 'OLIVIA', 'MAGISTER PAVUS', 'AARON', 'CALIX ', 'CALIX',
       'FRANCESCA', 'FLORIAN', 'VAEA', 'AGOSTO', 'ELF', 'AUTUMN', 'Carl',
       'Rick', 'Maggie', 'Eugene', 'STEPHANIE', 'Dwight', 'Negan', 'Lobo',
       'ID-2 ', 'Director', 'ID-1 ', 'Flash', 'Black Mask', 'Jay',
       'Linda', 'Jai', 'Max', 'Irey', 'Ace', 'Jesse', 'Mr.Allen',
       'Mr

In [7]:
filtered_df = df[~df['speaker_id'].str.contains('ID-', na=False)].reset_index(drop=True)

In [136]:
#filtered_df

In [137]:
#filtered_df.speaker_id.unique()

In [8]:
filtered_df = filtered_df[~filtered_df['speaker_id'].str.contains('no annotation', na=False)].reset_index(drop=True)

In [139]:
#filtered_df.shape

In [9]:
filtered_df = filtered_df[~filtered_df['speaker_id'].str.contains('unknown_speaker', na=False)].reset_index(drop=True)

In [141]:
#filtered_df.shape

In [10]:
filtered_df = filtered_df[~filtered_df['speaker_id'].str.contains('unknown', na=False)].reset_index(drop=True)

In [143]:
#filtered_df.shape

In [144]:
#filtered_df.speaker_id.unique()

In [145]:
#filtered_df[filtered_df.speaker_id == 'violet ring']

In [146]:
#filtered_df[filtered_df.file_name == 'QC copy - 1508 - 48 Pequen_os Titanes 25.xlsx'].speaker_id.unique()

In [11]:
filtered_df

Unnamed: 0,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split
0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,3,1,CAN YOU WALK?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:NIGHTWING,AN0-DI0-FE5-SA0-SU0-JO0,NIGHTWING,TRAIN
1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,3,2,HOW DID YOU FIND ME?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:MELINDA,AN0-DI0-FE0-SA0-SU5-JO0,MELINDA,TRAIN
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,3,3,ORACLE HAD THE LAST PING OFF YOUR PHONE. AND A...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:NIGHTWING,AN0-DI0-FE0-SA0-SU0-JO5,NIGHTWING,TRAIN
3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,4,1,AUDRE'S HERE?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:MELINDA,AN0-DI0-FE3-SA0-SU5-JO0,MELINDA,TRAIN
4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,4,2,IN A CAR OUTSIDE.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:NIGHTWING,AN0-DI0-FE0-SA0-SU0-JO3,NIGHTWING,TRAIN
...,...,...,...,...,...,...,...,...,...,...,...
5692,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,1,MAYBE THIS STEPHANIE PERSON WAS JUST MESSING W...,2024-09-06 - SyimykRasulov\nFeeling:AN1-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN1-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Siddiq,AN1-DI0-FE3-SA0-SU0-JO0,Siddiq,TRAIN
5693,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,2,SHE WOULDN'T DO THAT TO US. WE TALKED FOR A LO...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Eugene,AN0-DI0-FE1-SA3-SU0-JO0,Eugene,TRAIN
5694,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,3,… I KNOW HER.,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Eugene,AN0-DI0-FE1-SA3-SU0-JO0,Eugene,TRAIN
5695,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,4,1,"UH, GUYS…",2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:JUANITA...,AN0-DI0-FE3-SA0-SU4-JO0,JUANITA SANCHEZ,TRAIN


In [148]:
#len(filtered_df.file_name.unique())

In [12]:
def get_unique_emotion(row):

    utterance_emotions = row.emotion

    utterance_emotions_l = []
    emotion_class_labels = ["anger", "disgust", "fear", "sadness", "surprise", "joy"]

    if utterance_emotions == 'Neutral':
        
        utterance_emotions_l.append('neutral')
    
    else:
        
        utterance_emotions = utterance_emotions.split("-")

        for idx, emotion_annotation in enumerate(utterance_emotions):

            if '0' not in emotion_annotation:
         
                utterance_emotions_l.append(emotion_class_labels[idx])

    return utterance_emotions_l

In [13]:
filtered_df['emotion_c'] = filtered_df.apply(lambda row: get_unique_emotion(row), axis=1)

In [14]:
# page_df = df.groupby(['file_name', 'page_nr', 'split']).agg({
#     'utterance': list,
#     'speaker_id': list,
#     'emotion_c': list
# }).reset_index()

In [15]:
#page_df

In [16]:
file_names_l = filtered_df.file_name.unique().tolist()

In [17]:
comics_titles_full = [
    
    "Jurassic League #4",      
    "Nightwing #95",    
    "Dark Crisis: Worlds Without a Justice League - Green Lantern #1", 
    "Dark Crisis: Worlds Without a Justice League - Green Lantern - Very Merry Multiverse",
    "Dark Crisis: The Flash #783",
    "Danger Street #1",
    "Tiny Titans #25",
    "Human Target #9",
    "DC vs. Vampires #11",
    "John Carpenter's Tales for a Halloweenight #3",
    "The Amazing Adventures of the Ninja Turtles #5",
    "Sonic The Hedgehog #44",
    "Love Everlasting #2",
    "Fantasmas vol. 1 - Ghosted #2",
    "Fantasmas vol. 1 - Ghosted #3",
    "Fantasmas vol. 1 - Ghosted #4",
    "Fantasmas vol. 1 - Ghosted #5",
    "Fantasmas vol. 1 - Ghosted #1",
    "Fantasmas vol. 1 - Ghosted #6",
    "Fantasmas vol. 1 - Ghosted #7",
    "Fantasmas vol. 1 - Ghosted #8",
    "Fantasmas vol. 1 - Ghosted #9",
    "Fantasmas vol. 1 - Ghosted #10",
    "American Vampire vol. 4 - #6",
    "American Vampire vol. 4 - #7",
    "American Vampire vol. 4 - #8",
    "Dragon Age vol. 3 Engano - Deception #1",
    "Dragon Age vol. 3 Engano - Deception #2",
    "Dragon Age vol. 3 Engano - Deception #3",
    "The Walking Dead vol 15 - #169",
    "The Walking Dead vol 15 - #170",
    "The Walking Dead vol 15 - #171",
    "The Walking Dead vol 15 - #173",
    "Thief Of Thieves 1 - #11",
    "Stillwater #13",

]


In [18]:
filtered_df['comics_title'] = filtered_df['file_name'].apply(lambda x: comics_titles_full[file_names_l.index(x)] if x in file_names_l else None)

In [19]:
# def get_stats(row):
    
#     return [len(row.utterance), len(row.speaker_id), len(row.emotion_c)]

In [20]:
#page_df['lens'] = page_df.apply(lambda row: get_stats(row), axis=1)

In [21]:
filtered_df

Unnamed: 0,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split,emotion_c,comics_title
0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,3,1,CAN YOU WALK?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:NIGHTWING,AN0-DI0-FE5-SA0-SU0-JO0,NIGHTWING,TRAIN,[fear],Jurassic League #4
1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,3,2,HOW DID YOU FIND ME?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:MELINDA,AN0-DI0-FE0-SA0-SU5-JO0,MELINDA,TRAIN,[surprise],Jurassic League #4
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,3,3,ORACLE HAD THE LAST PING OFF YOUR PHONE. AND A...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:NIGHTWING,AN0-DI0-FE0-SA0-SU0-JO5,NIGHTWING,TRAIN,[joy],Jurassic League #4
3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,4,1,AUDRE'S HERE?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:MELINDA,AN0-DI0-FE3-SA0-SU5-JO0,MELINDA,TRAIN,"[fear, surprise]",Jurassic League #4
4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,3,4,2,IN A CAR OUTSIDE.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:NIGHTWING,AN0-DI0-FE0-SA0-SU0-JO3,NIGHTWING,TRAIN,[joy],Jurassic League #4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5692,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,1,MAYBE THIS STEPHANIE PERSON WAS JUST MESSING W...,2024-09-06 - SyimykRasulov\nFeeling:AN1-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN1-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Siddiq,AN1-DI0-FE3-SA0-SU0-JO0,Siddiq,TRAIN,"[anger, fear]",Stillwater #13
5693,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,2,SHE WOULDN'T DO THAT TO US. WE TALKED FOR A LO...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Eugene,AN0-DI0-FE1-SA3-SU0-JO0,Eugene,TRAIN,"[fear, sadness]",Stillwater #13
5694,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,3,… I KNOW HER.,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Eugene,AN0-DI0-FE1-SA3-SU0-JO0,Eugene,TRAIN,"[fear, sadness]",Stillwater #13
5695,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,4,1,"UH, GUYS…",2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:JUANITA...,AN0-DI0-FE3-SA0-SU4-JO0,JUANITA SANCHEZ,TRAIN,"[fear, surprise]",Stillwater #13


In [22]:
len(filtered_df.speaker_id.unique().tolist())

247

### prepare prompts

In [202]:
def build_generation_instruction():
   
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Emotion Analysis for Comics

You are an emotion analysis expert for comic dialogue. Analyze utterances based on both the provided emotional summary and immediate context.

INPUT:
- List of utterances from a comic page
- Comic's emotional summary describing themes and character arcs

OUTPUT:
- JSON with single key "page_utterance_emotions"
- Value: array of emotion arrays matching utterance order
- ONLY use these emotions: {formatted_classes}
- NO OTHER EMOTION LABELS ARE ALLOWED

RULES:
1. Each utterance must have at least one emotion from the list above
2. Multiple emotions per utterance are allowed
3. Keep emotions in arrays even for single emotions
4. Maintain exact emotion spelling and case
5. No explanations, only JSON output

Example format:
{{"page_utterance_emotions": [["joy"], ["anger", "fear"], ["neutral"]]}}
"""
    return instruction

In [203]:
def build_user_prompt(comics_title, emotional_summary, pg_utterances):
    usr_prompt = f"""Comic Information:
Title: {comics_title}
Emotional Summary: {emotional_summary}

Utterances to Classify:
{pg_utterances}"""
    return usr_prompt

In [204]:
generation_instruction = build_generation_instruction()

In [205]:
sys_msg_l = []
user_msg_l = []
assistant_msg_l = []

for _, row in page_df.iterrows():
        
        sys_msg = {'role': 'system', 'content': generation_instruction}
        
        comics_title = row['comics_title']
        comics_summary = row['summary']
        
        utterances_l = row['utterance']
        pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
        
        # usr_prompt = f"Comics title: {comics_title}\n" + f"Comics summary: {comics_summary}\n" + "Here is the list of utterances that you will classify: \n" + pg_utterances
        usr_prompt = build_user_prompt(comics_title, comics_summary, pg_utterances)
        
        user_msg = {'role': 'user', 'content': usr_prompt}        
        assistant_msg = {'role': 'assistant', 'content': ""}


        sys_msg_l.append(sys_msg)
        user_msg_l.append(user_msg)
        assistant_msg_l.append(assistant_msg)
        

In [206]:
comics_dataset = []

for i in range(len(sys_msg_l)):

    comics_dataset.append([sys_msg_l[i], user_msg_l[i], assistant_msg_l[i]])

In [207]:
comics_dataset[10]

[{'role': 'system',
  'content': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Analyze utterances based on both the provided emotional summary and immediate context.\n\nINPUT:\n- List of utterances from a comic page\n- Comic\'s emotional summary describing themes and character arcs\n\nOUTPUT:\n- JSON with single key "page_utterance_emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"page_utterance_emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n'},
 {'role': 'user',
  'content': "Comic Information:\nTitle

In [208]:
raw_outputs = []

for message in tqdm(comics_dataset):
    
    inputs = tokenizer.apply_chat_template(
    message,
    tokenize=True,
    add_generation_prompt=True,
    #return_dict=True,
    return_tensors="pt",
).to("cuda")
    #print(inputs)
    #break
    
    output = model.generate(input_ids=inputs, max_new_tokens=128)[0]
    #output = model.generate(**inputs, max_new_tokens=128)[0]
    
    input_length = inputs.shape[1]
    generated_tokens = output[input_length:]
    
    decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)  
    #decoded_output = tokenizer.decode(output, skip_special_tokens=True)
    raw_outputs.append(decoded_output)
    #break

100%|██████████| 874/874 [31:22<00:00,  2.15s/it]


In [268]:
raw_outputs

['{\n  "page_utterance_emotions": [\n    ["disgust", "anger"],\n    ["anger", "disgust"],\n    ["neutral"],\n    ["sadness"],\n    ["anger"],\n    ["sadness"],\n    ["anger"],\n    ["surprise"],\n    ["surprise"],\n    ["fear"]\n  ]\n}',
 '```json\n{\n  "page_utterance_emotions": [\n    ["fear"],\n    ["anger", "fear"],\n    ["anger", "fear"],\n    ["disgust", "fear"],\n    ["disgust", "fear"],\n    ["surprise"],\n    ["neutral"],\n    ["anger", "fear"],\n    ["anger", "fear"]\n  ]\n}\n```',
 '{\n  "page_utterance_emotions": [\n    ["anger"],\n    ["fear", "anger"],\n    ["disgust", "anger"],\n    ["anger"]\n  ]\n}',
 '{"page_utterance_emotions": [["anger"]]}',
 '{"page_utterance_emotions": [["neutral"], ["anger", "fear"], ["disgust"], ["disgust"], ["joy"], ["neutral"]]}',
 '{"page_utterance_emotions": [["anger"], ["fear"], ["threat"], ["neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral"]]}',
 '{\n  "page_utterance_emotions": [\n    ["fear"],\n    ["

In [269]:
grounds = page_df.emotion_c.tolist()

In [270]:
len(grounds)

874

In [271]:
predictions = [json_repair.loads(e) for e in raw_outputs]

In [272]:
len(predictions)

874

In [273]:
predictions

[{'page_utterance_emotions': [['disgust', 'anger'],
   ['anger', 'disgust'],
   ['neutral'],
   ['sadness'],
   ['anger'],
   ['sadness'],
   ['anger'],
   ['surprise'],
   ['surprise'],
   ['fear']]},
 {'page_utterance_emotions': [['fear'],
   ['anger', 'fear'],
   ['anger', 'fear'],
   ['disgust', 'fear'],
   ['disgust', 'fear'],
   ['surprise'],
   ['neutral'],
   ['anger', 'fear'],
   ['anger', 'fear']]},
 {'page_utterance_emotions': [['anger'],
   ['fear', 'anger'],
   ['disgust', 'anger'],
   ['anger']]},
 {'page_utterance_emotions': [['anger']]},
 {'page_utterance_emotions': [['neutral'],
   ['anger', 'fear'],
   ['disgust'],
   ['disgust'],
   ['joy'],
   ['neutral']]},
 {'page_utterance_emotions': [['anger'],
   ['fear'],
   ['threat'],
   ['neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral']]},
 {'page_utterance_emotions': [['fear'],
   ['tension', 'disgust'],
   ['insight', 'surprise'],
   ['surprise'],
   ['confidence']]

In [274]:
preds_l = []
bad_idx = []

for i, pred in enumerate(predictions):
    try:        
        preds_l.append(pred['page_utterance_emotions'])
    except:
        print(i)
        bad_idx.append(i)

In [275]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    #del preds_l[idx]

In [276]:
len(grounds), len(preds_l)

(874, 874)

In [277]:
bad_idx = []

for idx, (i,j) in enumerate(zip(grounds, preds_l)):
    if len(i) != len(j):
        print(idx, len(i), len(j))
        bad_idx.append(idx)

23 12 11
26 14 12
33 11 10
42 13 12
43 8 7
66 8 7
73 15 14
75 18 16
76 16 14
77 16 14
82 13 12
85 9 8
88 10 9
101 16 13
102 21 20
104 20 16
105 15 14
106 16 15
107 12 11
108 11 10
109 14 13
113 23 16
114 23 21
117 13 12
118 14 13
120 11 10
124 13 12
125 13 14
126 17 15
128 12 11
131 12 11
148 12 11
161 18 16
166 12 11
167 13 12
173 14 13
183 17 15
192 13 12
193 11 10
198 18 16
200 18 16
202 12 11
205 22 19
206 11 10
213 9 8
216 15 14
230 15 14
232 17 16
233 13 12
235 9 8
241 11 10
245 13 12
247 16 15
248 15 13
249 17 16
250 14 13
252 9 8
253 16 15
254 17 16
255 14 13
270 1 2
272 19 16
273 19 17
274 23 22
275 13 12
276 15 14
278 15 14
282 11 10
283 25 21
288 15 14
291 17 16
293 11 10
299 9 8
301 31 24
306 13 12
307 9 8
309 11 10
312 16 15
316 9 8
317 15 14
319 12 11
320 19 16
321 17 14
322 11 10
325 14 13
329 11 10
335 1 3
341 4 5
343 12 11
347 11 10
351 14 13
360 16 15
364 12 11
366 11 10
368 15 13
371 7 6
375 8 7
405 14 13
409 8 7
411 15 13
413 11 10
414 15 14
416 11 10
452 9 8
455 9 

In [278]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del preds_l[idx]

In [279]:
len(grounds), len(preds_l)

(696, 696)

In [280]:
grounds = [item for sublist in grounds for item in sublist]
predictions = [item for sublist in preds_l for item in sublist]

In [281]:
len(grounds), len(predictions)

(4820, 4820)

In [282]:
grounds

[['anger'],
 ['anger'],
 ['fear'],
 ['fear'],
 ['fear', 'sadness'],
 ['sadness'],
 ['anger'],
 ['surprise'],
 ['surprise'],
 ['fear', 'surprise'],
 ['fear'],
 ['anger'],
 ['surprise'],
 ['anger'],
 ['joy'],
 ['anger'],
 ['anger'],
 ['anger'],
 ['anger'],
 ['joy'],
 ['joy'],
 ['anger'],
 ['anger'],
 ['fear', 'surprise'],
 ['anger'],
 ['anger'],
 ['fear'],
 ['fear', 'surprise'],
 ['anger', 'joy'],
 ['anger'],
 ['sadness'],
 ['fear'],
 ['joy'],
 ['neutral'],
 ['surprise'],
 ['fear'],
 ['surprise'],
 ['joy'],
 ['surprise'],
 ['sadness', 'surprise'],
 ['surprise'],
 ['fear', 'sadness'],
 ['surprise'],
 ['surprise'],
 ['joy'],
 ['fear'],
 ['fear'],
 ['anger'],
 ['fear'],
 ['anger', 'surprise'],
 ['fear'],
 ['fear'],
 ['fear', 'sadness'],
 ['surprise'],
 ['surprise'],
 ['anger'],
 ['anger'],
 ['fear'],
 ['fear'],
 ['anger'],
 ['anger', 'fear', 'surprise'],
 ['fear', 'sadness'],
 ['fear', 'sadness'],
 ['anger'],
 ['anger'],
 ['surprise'],
 ['fear'],
 ['anger', 'surprise'],
 ['anger'],
 ['anger

In [283]:
set([type(item) for sublist in grounds for item in sublist])

{str}

In [284]:
set([type(item) for sublist in predictions for item in sublist])

{list, str}

In [285]:
list_indices = [
    (outer_idx, inner_idx)
    for outer_idx, sublist in enumerate(predictions)
    if isinstance(sublist, list)
    for inner_idx, item in enumerate(sublist)
    if isinstance(item, list)
]

In [286]:
len(list_indices)

10

In [287]:
predictions[1263]

[['neutral']]

In [288]:
list_indices = [i[0] for i in list_indices]

In [289]:
list_indices

[1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263]

In [290]:
list_indices.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in list_indices:
    
    del grounds[idx]
    del predictions[idx]

In [291]:
predictions

[['disgust', 'anger'],
 ['anger', 'disgust'],
 ['neutral'],
 ['sadness'],
 ['anger'],
 ['sadness'],
 ['anger'],
 ['surprise'],
 ['surprise'],
 ['fear'],
 ['fear'],
 ['anger', 'fear'],
 ['anger', 'fear'],
 ['disgust', 'fear'],
 ['disgust', 'fear'],
 ['surprise'],
 ['neutral'],
 ['anger', 'fear'],
 ['anger', 'fear'],
 ['anger'],
 ['fear', 'anger'],
 ['disgust', 'anger'],
 ['anger'],
 ['anger'],
 ['neutral'],
 ['anger', 'fear'],
 ['disgust'],
 ['disgust'],
 ['joy'],
 ['neutral'],
 ['anger'],
 ['fear'],
 ['threat'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['fear'],
 ['tension', 'disgust'],
 ['insight', 'surprise'],
 ['surprise'],
 ['confidence'],
 ['fear'],
 ['fear'],
 ['fear'],
 ['fear'],
 ['anger', 'fear'],
 ['fear'],
 ['fear', 'disgust'],
 ['fear', 'disgust'],
 ['anger'],
 ['anger'],
 ['anger', 'disgust'],
 ['anger'],
 ['neutral'],
 ['fear'],
 ['anger', 'disgust'],
 ['anger', 'disgust'],
 ['fear'],
 ['fear'],
 ['joy', 'hope'],
 

In [292]:
mlb = MultiLabelBinarizer()

In [293]:
y_true_mhot = mlb.fit_transform(grounds)
y_pred_mhot = mlb.transform(predictions)

In [294]:
y_true_mhot.shape

(4810, 7)

In [295]:
print(classification_report(y_true_mhot, y_pred_mhot, target_names=mlb.classes_, digits=3))

              precision    recall  f1-score   support

       anger      0.597     0.384     0.468      1605
     disgust      0.113     0.367     0.172       237
        fear      0.464     0.289     0.356      1264
         joy      0.574     0.231     0.329      1075
     neutral      0.109     0.576     0.183       297
     sadness      0.529     0.311     0.391      1214
    surprise      0.658     0.122     0.206      1281

   micro avg      0.364     0.290     0.323      6973
   macro avg      0.435     0.326     0.301      6973
weighted avg      0.531     0.290     0.342      6973
 samples avg      0.350     0.303     0.311      6973



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
