In [1]:
import torch
import json_repair
import pandas as pd

from tqdm import tqdm
from datasets import Dataset

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    #model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastLanguageModel.get_peft_model( 
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], 
    use_rslora=True,
    use_gradient_checkpointing=True
)

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## data

In [191]:
df_u = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_dataset.csv")

In [192]:
df_u

Unnamed: 0,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split
0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,1,DID YOU HAVE TO ELECTROCUTE HER SO HARD?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE3-SA0-SU5-JO0,ID-1,TRAIN
1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,2,IT'S NOT LIKE I HAVE DIFFERENT SETTINGS.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU5-JO0,ID-2,TRAIN
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,3,YOU'RE ELECTROCUTIONER. IT'S YOUR WHOLE THING....,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE2-SA0-SU0-JO0,ID-1,TRAIN
3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,3,1,"OH, HEY. I THINK SHE'S AWAKE.",2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU4-JO0,ID-2,TRAIN
4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,4,1,"WELCOME BACK, MADAM MAYOR. BLOCKBUSTER IS PRET...",2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN3-DI0-FE0-SA0-SU0-JO0,ID-1,TRAIN
...,...,...,...,...,...,...,...,...,...,...,...
7124,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,2,SHE WOULDN'T DO THAT TO US. WE TALKED FOR A LO...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Eugene,AN0-DI0-FE1-SA3-SU0-JO0,Eugene,TRAIN
7125,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,3,3,… I KNOW HER.,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:Eugene,AN0-DI0-FE1-SA3-SU0-JO0,Eugene,TRAIN
7126,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,21,4,1,"UH, GUYS…",2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:JUANITA...,AN0-DI0-FE3-SA0-SU4-JO0,JUANITA SANCHEZ,TRAIN
7127,QC copy - 1737 - 34 The Walking Dead vol 15 - ...,22,1,1,PUT YOUR WEAPONS DOWN AND PUT YOUR HANDS IN TH...,2024-09-06 - SyimykRasulov\nFeeling:AN4-DI0-FE...,2024-09-06 - SyimykRasulov\nFeeling:AN4-DI0-FE...,\n2024-09-06 - SyimykRasulov\nSpokenBy:ID- 2,AN4-DI0-FE0-SA0-SU0-JO0,ID- 2,TRAIN


In [101]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_dataset_pg.csv")

In [102]:
df

Unnamed: 0.1,Unnamed: 0,file_name,page_nr,split,utterance,emotion_c
0,0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f..."
1,1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],..."
2,2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]"
3,3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]"
4,4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur..."
...,...,...,...,...,...,...
869,869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [..."
870,870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']..."
871,871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [..."
872,872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr..."


In [103]:
def get_nr_emotions(row):

    return len(row.emotion_c)

In [104]:
df['nr_emotions'] = df.apply(lambda row: get_nr_emotions(row), axis=1)

In [105]:
df

Unnamed: 0.1,Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,nr_emotions
0,0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",137
1,1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",99
2,2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",40
3,3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",22
4,4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",83
...,...,...,...,...,...,...,...
869,869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",166
870,870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",98
871,871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",143
872,872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",144


In [106]:
df.file_name.unique()

array(['QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
       'QC copy - 1500 - 04 Nightwing 19 _Nightwing 95_.xlsx',
       'QC copy - 1501 - 09 Mundos sin Liga de la Justicia Green Lantern 1 - DCWWJL GL 1.xlsx',
       'QC copy - 1502 - 09 Mundos sin Liga de la Justicia Green Lantern 1 - DCVMM Lobo.xlsx',
       'QC copy - 1503 - 10 Crisis Oscura Flash - FLS 783.xlsx',
       'QC copy - 1507 - 22 Calle Peligro 1.xlsx',
       'QC copy - 1508 - 48 Pequen_os Titanes 25.xlsx',
       'QC copy - 1513 - 21 Blanco Humano 9.xlsx',
       'QC copy - 1514 - 15 DC contra Vampiros 11.xlsx',
       'QC copy - 1517 - 37 John Carpenter Historias para una noche de Halloween 3.xlsx',
       'QC copy - 1518 - 50 Las asombrosas aventuras de las Tortugas Ninja 5.xlsx',
       'QC copy - 1520 - 51 Sonic The Hedgehog 44.xlsx',
       'QC copy - 1521 - 40 Amor eterno 2.xlsx',
       'QC copy - 1559 - 36 Fantasmas vol. 1 - Ghosted 2.xlsx',
       'QC copy - 1560 - 36 Fantasmas vol. 1 - G

In [107]:
df.nr_emotions.value_counts()

nr_emotions
116    11
44     10
156    10
123     9
109     9
       ..
313     1
236     1
256     1
187     1
233     1
Name: count, Length: 252, dtype: int64

In [108]:
file_names_l = df.file_name.unique().tolist()

In [109]:
comics_titles_full = [
    
    "Jurassic League #4",      
    "Nightwing #95",    
    "Dark Crisis: Worlds Without a Justice League - Green Lantern #1", 
    "Dark Crisis: Worlds Without a Justice League - Green Lantern - Very Merry Multiverse",
    "Dark Crisis: The Flash #783",
    "Danger Street #1",
    "Tiny Titans #25",
    "Human Target #9",
    "DC vs. Vampires #11",
    "John Carpenter's Tales for a Halloweenight #3",
    "The Amazing Adventures of the Ninja Turtles #5",
    "Sonic The Hedgehog #44",
    "Love Everlasting #2",
    "Fantasmas vol. 1 - Ghosted #2",
    "Fantasmas vol. 1 - Ghosted #3",
    "Fantasmas vol. 1 - Ghosted #4",
    "Fantasmas vol. 1 - Ghosted #5",
    "Fantasmas vol. 1 - Ghosted #1",
    "Fantasmas vol. 1 - Ghosted #6",
    "Fantasmas vol. 1 - Ghosted #7",
    "Fantasmas vol. 1 - Ghosted #8",
    "Fantasmas vol. 1 - Ghosted #9",
    "Fantasmas vol. 1 - Ghosted #10",
    "American Vampire vol. 4 - #6",
    "American Vampire vol. 4 - #7",
    "American Vampire vol. 4 - #8",
    "Dragon Age vol. 3 Engano - Deception #1",
    "Dragon Age vol. 3 Engano - Deception #2",
    "Dragon Age vol. 3 Engano - Deception #3",
    "The Walking Dead vol 15 - #169",
    "The Walking Dead vol 15 - #170",
    "The Walking Dead vol 15 - #171",
    "The Walking Dead vol 15 - #173",
    "Thief Of Thieves 1 - #11",
    "Stillwater #13",

]


In [110]:
len(comics_titles_full)

35

In [111]:
# comics_titles = [
    
#     "Nightwing",
#     "Worlds Without a Justice League - Green Lantern",
#     "Human Target",
#     "American Vampire",
#     "Dragon Age",
#     "Dragon Age",
#     "The Walking Dead",
#     "Worlds Without a Justice League - Green Lantern",
#     "Dark Crisis: The Flash",
#     "Danger Street",
#     "Tiny Titans",
#     "The Amazing Adventures of the Ninja Turtles",
#     "Sonic The Hedgehog",
#     "Love Everlasting",
#     "Fantasmas",
#     "Fantasmas",
#     "Fantasmas",
#     "Fantasmas",
#     "Fantasmas",
#     "Fantasmas",
#     "Fantasmas",
#     "Fantasmas",
#     "Fantasmas",
#     "Fantasmas",
#     "American Vampire",
#     "American Vampire",
#     "Dragon Age",
#     "Stillwater",
#     "Jurassic League",
#     "John Carpenter's Tales for a Halloweenight",
#     "DC vs. Vampires",
#     "Thief Of Thieves",
#     "The Walking Dead",
#     "The Walking Dead",
#     "The Walking Dead",

# ]


In [112]:
#len(comics_titles)

In [113]:
df['comics_title'] = df['file_name'].apply(lambda x: comics_titles_full[file_names_l.index(x)] if x in file_names_l else None)

In [114]:
df

Unnamed: 0.1,Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,nr_emotions,comics_title
0,0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",137,Jurassic League #4
1,1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",99,Jurassic League #4
2,2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",40,Jurassic League #4
3,3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",22,Jurassic League #4
4,4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",83,Jurassic League #4
...,...,...,...,...,...,...,...,...
869,869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",166,Stillwater #13
870,870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",98,Stillwater #13
871,871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",143,Stillwater #13
872,872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",144,Stillwater #13


In [115]:
### Get summary

In [116]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [117]:
max_seq_length = 2048
summary_model, summary_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/SmolLM2-1.7B-Instruct",
    #model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [118]:
summary_model = FastLanguageModel.for_inference(summary_model)

In [119]:
title_summaries_d = {}

for title in tqdm(comics_titles_full):
    
    summary_instruction = f"""You are an expert in comics with comprehensive knowledge of storylines, characters, and narrative arcs. When given a comics title, provide a thorough but concise summary that includes:

    1. The main plot from beginning to end
    2. Major story arcs and their resolutions
    3. Any significant themes or messages

    Important: Keep the summary focused and under 200 words while ensuring it has a proper conclusion. Do not leave any sentence unfinished.
    
    Please ensure the summary is cohesive and complete, with proper narrative flow from start to finish.
    Provide a well-structured summary that connects all plot points and gives closure to the story."""
    
    sys_msg = {'role': 'system', 'content': summary_instruction}
    user_msg = {'role': 'user', 'content': "Here is the comics title: \n" + title}
    assistant_msg = {'role': 'assistant', 'content': ""}
    
    messages = [sys_msg, user_msg, assistant_msg]
    
    input_text = summary_tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
    
    outputs = summary_model.generate(
        input_ids=input_text,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1
    )
    
    input_length = input_text.shape[1]
    generated_tokens = outputs[0][input_length:]
    #print(outputs)
    decoded_output = summary_tokenizer.decode(generated_tokens, skip_special_tokens=True)
    title_summaries_d[title] = decoded_output
    
    #break

100%|██████████| 35/35 [02:43<00:00,  4.67s/it]


In [120]:
title_summaries_d

{'Jurassic League #4': 'In "Jurassic League #4," the main plot begins when a team of adventurers, consisting of paleontologist Dr. Sophia Patel, her colleague Dr. Henry Wong, and tech expert Jack "Sentry" Sullivan, embark on a perilous journey through the Jurassic Coast, following the trail of clues left behind by the enigmatic treasure hunter, Marcus Blackwood. Along the way, they face numerous challenges, including treacherous terrain, relentless storms, and ferocious prehistoric creatures.\n\nAs they progress deeper into the heart of the Jurassic Coast, the team discovers ancient ruins containing cryptic messages and mysterious artifacts that hint at the existence of a long-lost civilization. Their investigation leads them to a hidden underground bunker where they uncover a shocking revelation about Blackwood\'s true intentions and his connection to Blackwood Island.\n\nMajor Story Arcs and Resolutions:\n\n1. Blackwood\'s sinister plans for the island\'s treasures and its inhabitant

In [121]:
len(title_summaries_d)

35

In [122]:
df['comics_summary'] = df['comics_title'].map(title_summaries_d)

In [123]:
df

Unnamed: 0.1,Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,nr_emotions,comics_title,comics_summary
0,0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",137,Jurassic League #4,"In ""Jurassic League #4,"" the main plot begins ..."
1,1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",99,Jurassic League #4,"In ""Jurassic League #4,"" the main plot begins ..."
2,2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",40,Jurassic League #4,"In ""Jurassic League #4,"" the main plot begins ..."
3,3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",22,Jurassic League #4,"In ""Jurassic League #4,"" the main plot begins ..."
4,4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",83,Jurassic League #4,"In ""Jurassic League #4,"" the main plot begins ..."
...,...,...,...,...,...,...,...,...,...
869,869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",166,Stillwater #13,"In Stillwater #13, the main plot follows the j..."
870,870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",98,Stillwater #13,"In Stillwater #13, the main plot follows the j..."
871,871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",143,Stillwater #13,"In Stillwater #13, the main plot follows the j..."
872,872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",144,Stillwater #13,"In Stillwater #13, the main plot follows the j..."


In [124]:
df_train = df[df.split == "TRAIN"].reset_index(drop=True)

In [125]:
len(df_train)

718

### Comics dataset

In [126]:
def build_instruction():
   
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Emotion Analysis Expert Role

You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

INPUT:
- You will receive a list of utterances from a page in a comic book
- The utterance may express one or multiple emotions
- The name and summary of the comic book

TASK:
1. Carefully analyze the emotional context and tone of each utterance in the page
2. Identify applicable emotions from the following classes:
   {formatted_classes}
3. For each utterance in a comic page, identify all emotions present and return an array of emotion arrays in order.

RULES:
1. Use ONLY the labels listed above
2. Output must be a JSON with single key "page_utterance_emotions"
3. Value must be an array where:
   - Each element is an array of emotions for one utterance
   - Order matches the input utterances order
   - Multiple emotions are allowed per utterance
4. No explanations, only JSON output

IMPORTANT:
- Each array element corresponds to one utterance
- One utterance can have multiple emotions
- Maintain exact spelling and case of emotion labels
- Keep emotions in arrays even for single emotions

"""
    return instruction

In [127]:
# def build_instruction():
  
#     emotion_classes = ["Anger", "Disgust", "Fear", "Sadness", "Surprise", "Joy", "Neutral"]
#     formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
#     instruction = f"""### Emotion Analysis Expert Role

# You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content while considering conversational context.

# INPUT:
# - You will receive a list of 6 consecutive utterances from a comic book:
#   * The first 5 utterances provide conversational context
#   * The 6th (last) utterance is the one to be classified
# - Each utterance may express one or multiple emotions

# TASK:
# 1. Read through the context utterances to understand the emotional flow
# 2. Carefully analyze the emotional context, tone, and potential emotional shifts in the final utterance
# 3. Identify applicable emotions for the final utterance only from the following classes:
#    {formatted_classes}

# OUTPUT REQUIREMENTS:
# - Format: JSON object with a single key "list_emotion_classes"
# - Value: Array of one or more emotion classes as strings
# - The classification should be for the final utterance only, using previous utterances as context

# IMPORTANT NOTES:
# - Do not include any explanations in the output, only the JSON object
# - Use the context to better understand emotional transitions and current emotional state
# - Context utterances may reveal emotional buildup or shifts that influence the final utterance

# """
#     return instruction

In [128]:
instruction = build_instruction()

In [130]:
sys_msg_l = []
user_msg_l = []
assistant_msg_l = []

for _, row in df_train.iterrows():
        
        sys_msg = {'role': 'system', 'content': instruction}
        
        comics_title = row['comics_title']
        comics_summary = row['comics_summary']
        
        utterances_l = eval(row['utterance'])
        pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
        
        usr_prompt = f"Comics title: {comics_title}\n" + f"Comics summary: {comics_summary}\n" + "Here is the list of utterances that you will classify: \n" + pg_utterances
        
        user_msg = {'role': 'user', 'content': usr_prompt}

        #obj = {"list_emotion_classes": row['emotion_u']}
        #obj = row['emotion_u']
        #assistant_msg = {'from': 'gpt', 'value': obj}
        assistant_msg = {'role': 'assistant', 'content': f'{{"list_emotion_classes": {row["emotion_c"]}}}'}


        sys_msg_l.append(sys_msg)
        user_msg_l.append(user_msg)
        assistant_msg_l.append(assistant_msg)
        

In [131]:
comics_dataset = []

for i in range(len(sys_msg_l)):

    comics_dataset.append([sys_msg_l[i], user_msg_l[i], assistant_msg_l[i]])

In [132]:
len(comics_dataset)

718

In [133]:
comics_dataset[0]

[{'role': 'system',
  'content': '### Emotion Analysis Expert Role\n\nYou are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.\n\nINPUT:\n- You will receive a list of utterances from a page in a comic book\n- The utterance may express one or multiple emotions\n- The name and summary of the comic book\n\nTASK:\n1. Carefully analyze the emotional context and tone of each utterance in the page\n2. Identify applicable emotions from the following classes:\n   "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n3. For each utterance in a comic page, identify all emotions present and return an array of emotion arrays in order.\n\nRULES:\n1. Use ONLY the labels listed above\n2. Output must be a JSON with single key "page_utterance_emotions"\n3. Value must be an array where:\n   - Each element is an array of emotions for one utterance\n   - Order matches the input utt

In [134]:
def fix_comics_dataset(comics_dataset):
    fixed_comics_dataset = []
    for conversation in comics_dataset:
        fixed_conversation = []
        for message in conversation:
            if isinstance(message['content'], list):  # If the 'value' is a list of emotions
                message['content'] = ', '.join(message['content'])  # Join the list into a string
            fixed_conversation.append(message)
        fixed_comics_dataset.append(fixed_conversation)
    return fixed_comics_dataset

In [135]:
fixed_comics_dataset = fix_comics_dataset(comics_dataset)

In [136]:
dataset = Dataset.from_dict({
    'conversations': fixed_comics_dataset
})

In [137]:
dataset

Dataset({
    features: ['conversations'],
    num_rows: 718
})

In [138]:
dataset[0]['conversations']

[{'content': '### Emotion Analysis Expert Role\n\nYou are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.\n\nINPUT:\n- You will receive a list of utterances from a page in a comic book\n- The utterance may express one or multiple emotions\n- The name and summary of the comic book\n\nTASK:\n1. Carefully analyze the emotional context and tone of each utterance in the page\n2. Identify applicable emotions from the following classes:\n   "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n3. For each utterance in a comic page, identify all emotions present and return an array of emotion arrays in order.\n\nRULES:\n1. Use ONLY the labels listed above\n2. Output must be a JSON with single key "page_utterance_emotions"\n3. Value must be an array where:\n   - Each element is an array of emotions for one utterance\n   - Order matches the input utterances order\n   - 

In [139]:
tokenizer = get_chat_template(
    tokenizer,
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    chat_template="chatml",
)

def apply_template_comics(examples):
    messages = examples["conversations"]
    #messages = examples['input'] + examples['output']
    text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]
    return {"text": text}

Unsloth: Will map <|im_end|> to EOS = <|eot_id|>.


In [140]:
comics_dataset = dataset.map(apply_template_comics, batched=True)

Map:   0%|          | 0/718 [00:00<?, ? examples/s]

In [141]:
comics_dataset

Dataset({
    features: ['conversations', 'text'],
    num_rows: 718
})

In [142]:
def split_dataset(dataset, train_ratio=0.8):
    train_test = dataset.train_test_split(test_size=1 - train_ratio)
    return train_test

dataset_split = split_dataset(comics_dataset)

In [143]:
train_dataset = dataset_split['train']
eval_dataset = dataset_split['test']

In [144]:
train_dataset

Dataset({
    features: ['conversations', 'text'],
    num_rows: 574
})

In [145]:
print(train_dataset[452]['text'])

<|im_start|>system
### Emotion Analysis Expert Role

You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

INPUT:
- You will receive a list of utterances from a page in a comic book
- The utterance may express one or multiple emotions
- The name and summary of the comic book

TASK:
1. Carefully analyze the emotional context and tone of each utterance in the page
2. Identify applicable emotions from the following classes:
   "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"
3. For each utterance in a comic page, identify all emotions present and return an array of emotion arrays in order.

RULES:
1. Use ONLY the labels listed above
2. Output must be a JSON with single key "page_utterance_emotions"
3. Value must be an array where:
   - Each element is an array of emotions for one utterance
   - Order matches the input utterances order
   - Multiple emotion

In [146]:
OUTPUT_DIR = "/Utilisateurs/umushtaq/emotion_analysis_comics/outputs_dir_tmp"

In [149]:
args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="cosine",
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        num_train_epochs=10,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=25,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        
        eval_strategy="steps",  # Run evaluation during training (can also use "epoch")
        eval_steps=25,  # Perform evaluation every 50 steps
        save_strategy="steps",  # Save the model every few steps
        save_steps=25,  # Save every 200 steps
        load_best_model_at_end=True,
    
        output_dir=OUTPUT_DIR,
        seed=0,
    )

In [150]:
trainer=SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Replace with your train dataset
    eval_dataset=eval_dataset, 
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=args,
)

Map (num_proc=2):   0%|          | 0/574 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/144 [00:00<?, ? examples/s]

In [151]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 574 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 2
\        /    Total batch size = 32 | Total steps = 180
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
25,2.0529,0.443779
50,0.7921,0.423043
75,0.5836,0.51333
100,0.3272,0.558803
125,0.1553,0.631203
150,0.0574,0.711663
175,0.0328,0.76536


TrainOutput(global_step=180, training_loss=0.5565659624834856, metrics={'train_runtime': 3038.0313, 'train_samples_per_second': 1.889, 'train_steps_per_second': 0.059, 'total_flos': 2.564785483190108e+17, 'train_loss': 0.5565659624834856, 'epoch': 10.0})

In [152]:
model = FastLanguageModel.for_inference(model)

In [153]:
df_test = df[df.split == "TEST"].reset_index(drop=True)

In [154]:
sys_msg_l = []
user_msg_l = []
assistant_msg_l = []

for _, row in df_test.iterrows():
        
        sys_msg = {'role': 'system', 'content': instruction}
        
        
        comics_title = row['comics_title']
        comics_summary = row['comics_summary']
        
        utterances_l = eval(row['utterance'])
        pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
        
        usr_prompt = f"Comics title: {comics_title}\n" + f"Comics summary: {comics_summary}\n" + "Here is the list of utterances that you will classify: \n" + pg_utterances
        
        user_msg = {'role': 'user', 'content': usr_prompt}

        #obj = {"list_emotion_classes": row['emotion_u']}
        #obj = row['emotion_u']
        #assistant_msg = {'from': 'gpt', 'value': obj}
        assistant_msg = {'role': 'assistant', 'content': ""}


        sys_msg_l.append(sys_msg)
        user_msg_l.append(user_msg)
        assistant_msg_l.append(assistant_msg)
        

In [155]:
test_messages = []

for i in range(len(sys_msg_l)):
    
    #obj = {"list_emotion_classes": ["Anger", "Fear"]}

    #comics_dataset.append([human_msg_l[i], assistant_msg_l[i]])
    test_messages.append([sys_msg_l[i], user_msg_l[i], assistant_msg_l[i]])

In [156]:
# human_msg_l = []
# assistant_msg_l = []

# for _, row in df_test.iterrows():
        
#         prompt = instruction.replace("<comic_title>", row['comics_title']).replace("<speaker_id>", row['speaker_id']).replace("<utterance>", row['utterance'])
        
#         human_msg = {'role': 'user', 'content': prompt}
        
#         #obj = {"list_emotion_classes": row['emotion_u']}
#         obj = row['emotion_u']
#         assistant_msg = {'role': 'assistant', 'content': ""}
        
#         human_msg_l.append(human_msg)
#         assistant_msg_l.append(assistant_msg)
        
        

In [157]:
# test_messages = []

# for i in range(len(human_msg_l)):
    
#     #obj = {"list_emotion_classes": ["Anger", "Fear"]}

#     test_messages.append([human_msg_l[i], assistant_msg_l[i]])

In [158]:
len(test_messages)

156

In [159]:
# test_messages = test_messages[:100]

In [160]:
raw_outputs = []

for message in tqdm(test_messages):
    
    inputs = tokenizer.apply_chat_template(
    message,
    tokenize=True,
    add_generation_prompt=True,
    #return_dict=True,
    return_tensors="pt",
).to("cuda")
    #print(inputs)
    #break
    
    output = model.generate(input_ids=inputs, max_new_tokens=128)[0]
    #output = model.generate(**inputs, max_new_tokens=128)[0]
    
    input_length = inputs.shape[1]
    generated_tokens = output[input_length:]
    
    decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)  
    #decoded_output = tokenizer.decode(output, skip_special_tokens=True)
    raw_outputs.append(decoded_output)
    #break

100%|██████████| 156/156 [05:54<00:00,  2.27s/it]


In [161]:
#print(tokenizer.decode(inputs[0]))

In [162]:
len(raw_outputs)

156

In [163]:
raw_outputs

['{"list_emotion_classes": [[\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'surprise\', \'joy\'], [\'joy\'], [\'joy\'], [\'sadness\'], [\'sadness\']]}',
 '{"list_emotion_classes": [[\'joy\'], [\'joy\'], [\'anger\', \'fear\'], [\'anger\', \'fear\'], [\'joy\'], [\'joy\'], [\'anger\']]}',
 '{"list_emotion_classes": [[\'anger\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\', \'surprise\'], [\'anger\'], [\'sadness\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'joy\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'anger\']]}',
 '{"list_emotion_classes": [[\'anger\'], [\'anger\'], [\'anger\'], [\'sadness\'], [\'anger\', \'surprise\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'anger\', \'sadness\'], [\'an

In [164]:
grounds = df_test.emotion_c.tolist()

In [165]:
len(grounds)

156

In [166]:
import json

In [167]:
predictions = [json_repair.loads(e) for e in raw_outputs]

In [168]:
predictions

[{'list_emotion_classes': [['joy'],
   ['joy'],
   ['joy'],
   ['joy'],
   ['joy'],
   ['joy'],
   ['surprise', 'joy'],
   ['joy'],
   ['joy'],
   ['sadness'],
   ['sadness']]},
 {'list_emotion_classes': [['joy'],
   ['joy'],
   ['anger', 'fear'],
   ['anger', 'fear'],
   ['joy'],
   ['joy'],
   ['anger']]},
 {'list_emotion_classes': [['anger'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger', 'surprise'],
   ['anger'],
   ['sadness'],
   ['anger'],
   ['anger'],
   ['anger'],
   ['joy'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger'],
   ['anger'],
   ['anger'],
   ['anger']]},
 {'list_emotion_classes': [['anger'],
   ['anger'],
   ['anger'],
   ['sadness'],
   ['anger', 'surprise'],
   ['anger'],
   ['anger'],
   ['anger'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['anger', 'sadness'],
   ['ange

In [169]:
# #predictions = [e.split('\n\n')[0] for e in raw_outputs]
# bad_idx = []
# predictions = []

# for idx, e in enumerate(raw_outputs):
#     try:
#         predictions.append(json.loads(e))
#     except:
#         print(idx)
#         bad_idx.append(idx)
        


In [170]:
#len(predictions)

In [171]:
#predictions

In [172]:
#predictions = [json_repair.loads(e) for e in predictions]

In [173]:
#predictions

In [174]:
preds_l = []
bad_idx = []

for i, pred in enumerate(predictions):
    try:        
        preds_l.append(pred['list_emotion_classes'])
    except:
        print(i)
        bad_idx.append(i)

In [175]:
preds_l

[[['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['surprise', 'joy'],
  ['joy'],
  ['joy'],
  ['sadness'],
  ['sadness']],
 [['joy'],
  ['joy'],
  ['anger', 'fear'],
  ['anger', 'fear'],
  ['joy'],
  ['joy'],
  ['anger']],
 [['anger'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'surprise'],
  ['anger'],
  ['sadness'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['joy'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['anger']],
 [['anger'],
  ['anger'],
  ['anger'],
  ['sadness'],
  ['anger', 'surprise'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger'],
  ['anger'],
  ['anger']],
 [['anger'], ['joy'], ['joy']],
 [['anger', 'fear'],
  ['fear', 'surprise'],
  ['anger'],
  ['fear', 'sadness', 'surpri

In [176]:
grounds = [item for i, item in enumerate(grounds) if i not in bad_idx]

In [177]:
len(grounds), len(preds_l)

(156, 156)

In [178]:
preds_l

[[['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['surprise', 'joy'],
  ['joy'],
  ['joy'],
  ['sadness'],
  ['sadness']],
 [['joy'],
  ['joy'],
  ['anger', 'fear'],
  ['anger', 'fear'],
  ['joy'],
  ['joy'],
  ['anger']],
 [['anger'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'surprise'],
  ['anger'],
  ['sadness'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['joy'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['anger']],
 [['anger'],
  ['anger'],
  ['anger'],
  ['sadness'],
  ['anger', 'surprise'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger'],
  ['anger'],
  ['anger']],
 [['anger'], ['joy'], ['joy']],
 [['anger', 'fear'],
  ['fear', 'surprise'],
  ['anger'],
  ['fear', 'sadness', 'surpri

In [179]:
grounds

["[['surprise', 'joy'], ['joy'], ['surprise', 'joy'], ['joy'], ['joy'], ['joy'], ['surprise'], ['joy'], ['joy'], ['neutral'], ['neutral']]",
 "[['neutral'], ['neutral'], ['anger', 'disgust'], ['anger', 'disgust'], ['neutral'], ['sadness'], ['sadness']]",
 "[['anger', 'sadness'], ['anger', 'sadness'], ['anger', 'sadness'], ['fear', 'surprise'], ['surprise'], ['joy'], ['anger', 'surprise'], ['joy'], ['joy'], ['joy'], ['anger'], ['anger'], ['surprise', 'joy'], ['fear', 'sadness'], ['fear', 'sadness'], ['fear', 'surprise']]",
 "[['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['fear', 'sadness'], ['fear', 'sadness', 'surprise'], ['sadness'], ['sadness'], ['fear', 'sadness'], ['sadness', 'surprise'], ['sadness', 'surprise'], ['joy'], ['anger'], ['anger'], ['anger'], ['anger', 'disgust'], ['joy'], ['joy'], ['surprise', 'joy'], ['surprise', 'joy'], ['anger', 'surprise'], ['anger', 'surprise']]",
 "[['neutral'], ['joy'], ['joy']]",
 "[['neutral'], ['fear'], ['neutral'], ['ang

In [180]:
import ast

grounds = [ast.literal_eval(x) for x in grounds]

In [181]:
grounds

[[['surprise', 'joy'],
  ['joy'],
  ['surprise', 'joy'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['surprise'],
  ['joy'],
  ['joy'],
  ['neutral'],
  ['neutral']],
 [['neutral'],
  ['neutral'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['neutral'],
  ['sadness'],
  ['sadness']],
 [['anger', 'sadness'],
  ['anger', 'sadness'],
  ['anger', 'sadness'],
  ['fear', 'surprise'],
  ['surprise'],
  ['joy'],
  ['anger', 'surprise'],
  ['joy'],
  ['joy'],
  ['joy'],
  ['anger'],
  ['anger'],
  ['surprise', 'joy'],
  ['fear', 'sadness'],
  ['fear', 'sadness'],
  ['fear', 'surprise']],
 [['anger', 'disgust'],
  ['anger', 'disgust'],
  ['anger', 'disgust'],
  ['fear', 'sadness'],
  ['fear', 'sadness', 'surprise'],
  ['sadness'],
  ['sadness'],
  ['fear', 'sadness'],
  ['sadness', 'surprise'],
  ['sadness', 'surprise'],
  ['joy'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['anger', 'disgust'],
  ['joy'],
  ['joy'],
  ['surprise', 'joy'],
  ['surprise', 'joy'],
  ['anger', 'surprise'],
  ['anger', 

In [182]:
bad_idx = []

for idx, (i,j) in enumerate(zip(grounds, preds_l)):
    if len(i) != len(j):
        print(idx, len(i), len(j))
        bad_idx.append(idx)

3 21 20
6 15 14
36 17 16
58 6 7


In [183]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del preds_l[idx]

In [184]:
grounds = [item for sublist in grounds for item in sublist]
predictions = [item for sublist in preds_l for item in sublist]

In [185]:
len(grounds), len(predictions)

(1267, 1267)

In [186]:
mlb = MultiLabelBinarizer()

In [187]:
y_true_mhot = mlb.fit_transform(grounds)
y_pred_mhot = mlb.transform(predictions)

In [188]:
y_pred_mhot.shape

(1267, 7)

In [189]:
y_pred_mhot.shape

(1267, 7)

In [190]:
print(classification_report(y_true_mhot, y_pred_mhot, target_names=mlb.classes_, digits=3))

              precision    recall  f1-score   support

       anger      0.538     0.635     0.582       438
     disgust      0.238     0.109     0.149        46
        fear      0.440     0.563     0.494       293
         joy      0.491     0.509     0.500       279
     neutral      0.250     0.037     0.065       108
     sadness      0.484     0.519     0.501       324
    surprise      0.547     0.471     0.506       331

   micro avg      0.496     0.505     0.500      1819
   macro avg      0.427     0.406     0.400      1819
weighted avg      0.482     0.505     0.485      1819
 samples avg      0.519     0.524     0.493      1819

