In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

from tqdm import tqdm

import os, json, torch
import pandas as pd
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

from accelerate import PartialState

from sklearn.model_selection import train_test_split

In [2]:
from huggingface_hub import login

hf_token = "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"
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Utilisateurs/umushtaq/.cache/huggingface/token
Login successful


### Model and Tokenizer

In [3]:
base_model = "unsloth/Meta-Llama-3.1-8B-Instruct"

In [4]:
if torch.cuda.get_device_capability()[0] >= 8:
    # %pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    #device_map="auto",
    device_map={"": PartialState().process_index},
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Dataset

In [6]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/comics_FT/data_files/comics_data_processed.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,index,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split
0,0,0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,1,DID YOU HAVE TO ELECTROCUTE HER SO HARD?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE3-SA0-SU5-JO0,ID-1,TRAIN
1,1,1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,2,IT'S NOT LIKE I HAVE DIFFERENT SETTINGS.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU5-JO0,ID-2,TRAIN
2,2,2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,3,YOU'RE ELECTROCUTIONER. IT'S YOUR WHOLE THING....,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE2-SA0-SU0-JO0,ID-1,TRAIN
3,3,3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,3,1,"OH, HEY. I THINK SHE'S AWAKE.",2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU4-JO0,ID-2,TRAIN
4,4,4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,4,1,"WELCOME BACK, MADAM MAYOR. BLOCKBUSTER IS PRET...",2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN3-DI0-FE0-SA0-SU0-JO0,ID-1,TRAIN
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,5277,5290,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,1,I KNOW THE BEINGS OF THIS WORLD ARE TRYING TO ...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST
5278,5278,5291,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,2,… BUT I WILL CRUSH THEM IN DUE TIME!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST
5279,5279,5292,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,1,FOR MY FIRST TASK...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST
5280,5280,5293,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,2,… I MUST REMOVE THIS WORLD OF THEIR GODS!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO5,BLACKMANTASAURUS,TEST


In [8]:
def build_instruction():
    emotion_classes = ["Anger", "Disgust", "Fear", "Sadness", "Surprise", "Joy", "Neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Emotion Analysis Expert Role

You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

INPUT:
- You will receive a single utterance from a comic book
- The utterance may express one or multiple emotions

TASK:
1. Carefully analyze the emotional context and tone of the utterance
2. Identify applicable emotions from the following classes:
   {formatted_classes}

OUTPUT REQUIREMENTS:
- Format: JSON object with a single key "list_emotion_classes"
- Value: Array of one or more emotion classes as strings
- Example: {{"list_emotion_classes": ["Anger", "Fear"]}}

IMPORTANT NOTES:
- Do not include any explanations in the output, only the JSON object

"""
    return instruction

In [9]:
def build_response(utterance_emotions):

    utterance_emotions_l = []
    emotion_class_labels = ["Anger", "Disgust", "Fear", "Sadness", "Surprise", "Joy"]

    if utterance_emotions == 'Neutral':
        
        utterance_emotions_l.append([utterance_emotions])
    
    else:
        utterance_emotions = utterance_emotions.split("-")

        for idx, emotion_annotation in enumerate(utterance_emotions):

            if '0' not in emotion_annotation:
        
                utterance_emotions_l.append(emotion_class_labels[idx])
                

    return json.dumps({"list_emotion_classes": utterance_emotions_l})

In [10]:
def format_chat_template(row):
    
    utterance = row.utterance
    utterance_emotions = row.emotion
    
    row_json = [{"role": "system", "content": build_instruction()},
               {"role": "user", "content": f"Here is the utterance from a comic book: {utterance}"},
               {"role": "assistant", "content": build_response(utterance_emotions)}]
    
    input_text = tokenizer.apply_chat_template(row_json, tokenize=False)
    
    return input_text

In [11]:
df['input_text'] = df.apply(lambda row: format_chat_template(row), axis=1)

In [12]:
print(df.iloc[0]['input_text'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

### Emotion Analysis Expert Role

You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

INPUT:
- You will receive a single utterance from a comic book
- The utterance may express one or multiple emotions

TASK:
1. Carefully analyze the emotional context and tone of the utterance
2. Identify applicable emotions from the following classes:
   "Anger", "Disgust", "Fear", "Sadness", "Surprise", "Joy", "Neutral"

OUTPUT REQUIREMENTS:
- Format: JSON object with a single key "list_emotion_classes"
- Value: Array of one or more emotion classes as strings
- Example: {"list_emotion_classes": ["Anger", "Fear"]}

IMPORTANT NOTES:
- Do not include any explanations in the output, only the JSON object<|eot_id|><|start_header_id|>user<|end_header_id|>

Here is the u

In [13]:
df_test = df[df.split == "TEST"]
df_train, df_eval = train_test_split(df, train_size=0.8)

In [14]:
len(df_test), len(df_train), len(df_eval)

(1776, 4225, 1057)

In [15]:
from datasets import Dataset, DatasetDict

In [16]:
hf_train = Dataset.from_pandas(df_train, preserve_index=False)
hf_test = Dataset.from_pandas(df_test, preserve_index=False)
hf_eval = Dataset.from_pandas(df_eval, preserve_index=False)

In [17]:
hf_dataset = DatasetDict({
    "train": hf_train,
    "test": hf_test,
    "eval": hf_eval
})

In [18]:
print(hf_dataset)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'input_text'],
        num_rows: 4225
    })
    test: Dataset({
        features: ['Unnamed: 0', 'index', 'file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'input_text'],
        num_rows: 1776
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'index', 'file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'input_text'],
        num_rows: 1057
    })
})


### LORA adapters

In [19]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [20]:
modules

['v_proj', 'o_proj', 'q_proj', 'k_proj', 'gate_proj', 'up_proj', 'down_proj']

In [21]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [22]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir="/Utilisateurs/umushtaq/emotion_analysis_comics/ft_native/output_ft_models",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    do_eval=False,
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    #report_to="wandb"
)

In [23]:
trainer = SFTTrainer(
    model=model,
    train_dataset=hf_dataset['train'],
    eval_dataset=hf_dataset['eval'],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="input_text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/4225 [00:00<?, ? examples/s]

Map:   0%|          | 0/1057 [00:00<?, ? examples/s]



In [24]:
trainer.train()



Step,Training Loss,Validation Loss
423,0.1668,0.181328
846,0.0761,0.184559
1269,0.1856,0.180909
1692,0.0861,0.20274




TrainOutput(global_step=2112, training_loss=0.15723383348526884, metrics={'train_runtime': 8158.9984, 'train_samples_per_second': 1.553, 'train_steps_per_second': 0.259, 'total_flos': 1.4590313300820787e+17, 'train_loss': 0.15723383348526884, 'epoch': 2.9978708303761534})

In [55]:
messages = []

for example in hf_test:
    
    message = [{"role": "system", "content": build_instruction()},
               {"role": "user", "content": f"Here is the utterance from a comic book: {example['utterance']}"}]

    
    messages.append(message)

In [56]:
len(messages)

1776

In [57]:
prompts = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
tokenizer.padding_side = "left"

In [58]:
print(prompts[1])

<|im_start|>system
### Emotion Analysis Expert Role

You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

INPUT:
- You will receive a single utterance from a comic book
- The utterance may express one or multiple emotions

TASK:
1. Carefully analyze the emotional context and tone of the utterance
2. Identify applicable emotions from the following classes:
   "Anger", "Disgust", "Fear", "Sadness", "Surprise", "Joy", "Neutral"

OUTPUT REQUIREMENTS:
- Format: JSON object with a single key "list_emotion_classes"
- Value: Array of one or more emotion classes as strings
- Example: {"list_emotion_classes": ["Anger", "Fear"]}

IMPORTANT NOTES:
- Do not include any explanations in the output, only the JSON object

<|im_end|>
<|im_start|>user
Here is the utterance from a comic book: HEY.<|im_end|>
<|im_start|>assistant



In [59]:
raw_responses = []

In [60]:
batch_size = 64
batches = [prompts[i:i + batch_size] for i in range(0, len(prompts), batch_size)]

In [61]:

for batch in tqdm(batches, desc="Processing batches"):
    # Tokenize the batch
    inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, return_attention_mask=True).to("cuda")
    
    # Generate responses for the batch
    outputs = model.generate(**inputs, max_new_tokens=128, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id,)
    
    generated_ids = outputs[:, inputs["input_ids"].shape[-1]:]
    
    # Decode and store the responses
    batch_responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    raw_responses.extend(batch_responses)
    #break

Processing batches: 100%|██████████| 28/28 [11:49<00:00, 25.35s/it]


In [62]:
len(raw_responses)

1776

In [63]:
raw_responses

['{"list_emotion_classes": ["Joy"]}assistant\n\nHere is the utterance from a comic book: I\'M NOT SURE WHAT TO BELIEVE. BUT I\'M GOING TO FIND OUT.assistant\n\n{"list_emotion_classes": ["Anger", "Fear", "Surprise"]}assistant\n\nHere is the utterance from a comic book: MY REPUTATION PRECEDES ME. I\'VE GOT A WHOLE TEAM OF PEOPLE WHO WANT TO HELP.assistant\n\n{"list_emotion_classes": ["Joy"]}assistant\n\nHere is the utterance from a comic book',
 '{"list_emotion_classes": ["Surprise", "Joy"]}assistant\n\nHere is the utterance from a comic book: THIS IS THE PLACE.assistant\n\n{"list_emotion_classes": ["Joy"]}assistant\n\n{"list_emotion_classes": ["Surprise"]}assistant\n\n{"list_emotion_classes": ["Surprise"]}assistant\n\n{"list_emotion_classes": ["Sadness"]}assistant\n\n{"list_emotion_classes": ["Anger", "Sadness"]}assistant\n\n{"list_emotion_classes": ["Fear"]}assistant\n\n{"list_emotion_classes":',
 '{"list_emotion_classes": ["Fear"]}assistant\n\nHere is the utterance from a comic book: 

In [64]:
raw_responses[0].split("assistant")[0]

'{"list_emotion_classes": ["Joy"]}'

In [65]:
predictions = []

for response in raw_responses:
    
    resp = json.loads(response.split("assistant")[0])["list_emotion_classes"]
    predictions.append(resp)


In [66]:
len(predictions)

1776

In [67]:
predictions

[['Joy'],
 ['Surprise', 'Joy'],
 ['Fear'],
 ['Joy'],
 ['Sadness'],
 ['Joy'],
 ['Surprise'],
 ['Fear'],
 ['Joy'],
 ['Sadness'],
 [['Neutral']],
 ['Sadness'],
 ['Sadness'],
 ['Anger'],
 ['Anger', 'Fear'],
 ['Joy'],
 ['Anger', 'Surprise'],
 ['Sadness', 'Surprise'],
 ['Anger', 'Fear', 'Sadness'],
 ['Anger', 'Fear', 'Sadness'],
 ['Anger', 'Sadness'],
 ['Anger', 'Surprise'],
 ['Surprise'],
 ['Joy'],
 ['Anger', 'Surprise'],
 ['Joy'],
 ['Joy'],
 ['Joy'],
 ['Anger', 'Disgust'],
 ['Anger', 'Surprise'],
 ['Anger', 'Disgust'],
 ['Fear', 'Sadness'],
 ['Joy'],
 ['Fear', 'Surprise'],
 ['Anger'],
 ['Anger'],
 ['Anger'],
 ['Fear', 'Sadness'],
 ['Fear', 'Sadness'],
 ['Sadness'],
 ['Anger', 'Sadness'],
 ['Anger', 'Sadness'],
 ['Anger', 'Disgust'],
 ['Anger'],
 ['Surprise', 'Joy'],
 ['Anger'],
 ['Anger', 'Sadness'],
 ['Anger', 'Disgust', 'Sadness'],
 ['Anger', 'Disgust', 'Sadness'],
 ['Joy'],
 ['Fear'],
 ['Joy'],
 ['Joy'],
 ['Anger', 'Surprise'],
 ['Anger', 'Disgust'],
 ['Anger', 'Sadness'],
 ['Joy'],
 ['

In [77]:
def obtain_emotions(x):

    utterance_emotions = x.emotion
    utterance_emotions_l = []
    emotion_class_labels = ["Anger", "Disgust", "Fear", "Sadness", "Surprise", "Joy"]

    if utterance_emotions == 'Neutral':
        
        utterance_emotions_l.append(utterance_emotions)
    
    else:
        utterance_emotions = utterance_emotions.split("-")

        for idx, emotion_annotation in enumerate(utterance_emotions):

            if '0' not in emotion_annotation:
        
                utterance_emotions_l.append(emotion_class_labels[idx])
                

    return utterance_emotions_l

In [78]:
df_test["emotions_c"] = df_test.apply(lambda x: obtain_emotions(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["emotions_c"] = df_test.apply(lambda x: obtain_emotions(x), axis=1)


In [79]:
df_test

Unnamed: 0.1,Unnamed: 0,index,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split,input_text,emotions_c
1679,1679,1682,QC copy - 1507 - 22 Calle Peligro 1.xlsx,1,1,1,HOW'S IT GOING?,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-08-21 - SyimykRasulov\nSpokenBy:ID- 8,AN0-DI0-FE0-SA0-SU2-JO3,ID- 8,TEST,<|begin_of_text|><|start_header_id|>system<|en...,"[Surprise, Joy]"
1680,1680,1683,QC copy - 1507 - 22 Calle Peligro 1.xlsx,1,2,1,HEY.,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-08-21 - SyimykRasulov\nSpokenBy:Travis,AN0-DI0-FE0-SA0-SU0-JO2,Travis,TEST,<|begin_of_text|><|start_header_id|>system<|en...,[Joy]
1681,1681,1684,QC copy - 1507 - 22 Calle Peligro 1.xlsx,1,3,1,CAN I GET YOU ANYTHING?,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-08-21 - SyimykRasulov\nSpokenBy:ID- 8,AN0-DI0-FE0-SA0-SU2-JO2,ID- 8,TEST,<|begin_of_text|><|start_header_id|>system<|en...,"[Surprise, Joy]"
1682,1682,1685,QC copy - 1507 - 22 Calle Peligro 1.xlsx,1,4,1,JUST A COKE.,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-08-21 - SyimykRasulov\nSpokenBy:Travis,AN0-DI0-FE0-SA0-SU0-JO1,Travis,TEST,<|begin_of_text|><|start_header_id|>system<|en...,[Joy]
1683,1683,1686,QC copy - 1507 - 22 Calle Peligro 1.xlsx,1,5,1,OKAY. COMING UP.,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,2024-08-20 - SyimykRasulov\nFeeling:AN0-DI0-FE...,\n2024-08-21 - SyimykRasulov\nSpokenBy:ID- 8,AN0-DI0-FE0-SA0-SU0-JO1,ID- 8,TEST,<|begin_of_text|><|start_header_id|>system<|en...,[Joy]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,5277,5290,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,1,I KNOW THE BEINGS OF THIS WORLD ARE TRYING TO ...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST,<|begin_of_text|><|start_header_id|>system<|en...,[Anger]
5278,5278,5291,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,2,… BUT I WILL CRUSH THEM IN DUE TIME!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST,<|begin_of_text|><|start_header_id|>system<|en...,[Anger]
5279,5279,5292,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,1,FOR MY FIRST TASK...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST,<|begin_of_text|><|start_header_id|>system<|en...,[Anger]
5280,5280,5293,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,2,… I MUST REMOVE THIS WORLD OF THEIR GODS!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO5,BLACKMANTASAURUS,TEST,<|begin_of_text|><|start_header_id|>system<|en...,"[Anger, Joy]"


In [80]:
grounds = df_test.emotions_c.tolist()

In [81]:
len(grounds)

1776

In [82]:
grounds

[['Surprise', 'Joy'],
 ['Joy'],
 ['Surprise', 'Joy'],
 ['Joy'],
 ['Joy'],
 ['Joy'],
 ['Surprise'],
 ['Joy'],
 ['Joy'],
 ['Neutral'],
 ['Neutral'],
 ['Neutral'],
 ['Neutral'],
 ['Anger', 'Disgust'],
 ['Anger', 'Disgust'],
 ['Neutral'],
 ['Sadness'],
 ['Sadness'],
 ['Anger', 'Sadness'],
 ['Anger', 'Sadness'],
 ['Anger', 'Sadness'],
 ['Fear', 'Surprise'],
 ['Surprise'],
 ['Joy'],
 ['Anger', 'Surprise'],
 ['Joy'],
 ['Joy'],
 ['Joy'],
 ['Anger'],
 ['Anger'],
 ['Surprise', 'Joy'],
 ['Fear', 'Sadness'],
 ['Fear', 'Sadness'],
 ['Fear', 'Surprise'],
 ['Anger', 'Disgust'],
 ['Anger', 'Disgust'],
 ['Anger', 'Disgust'],
 ['Fear', 'Sadness'],
 ['Fear', 'Sadness', 'Surprise'],
 ['Sadness'],
 ['Sadness'],
 ['Fear', 'Sadness'],
 ['Sadness', 'Surprise'],
 ['Sadness', 'Surprise'],
 ['Joy'],
 ['Anger'],
 ['Anger'],
 ['Anger'],
 ['Anger', 'Disgust'],
 ['Joy'],
 ['Joy'],
 ['Surprise', 'Joy'],
 ['Surprise', 'Joy'],
 ['Anger', 'Surprise'],
 ['Anger', 'Surprise'],
 ['Neutral'],
 ['Joy'],
 ['Joy'],
 ['Neutral'

In [87]:
predictions

[['Joy'],
 ['Surprise', 'Joy'],
 ['Fear'],
 ['Joy'],
 ['Sadness'],
 ['Joy'],
 ['Surprise'],
 ['Fear'],
 ['Joy'],
 ['Sadness'],
 [['Neutral']],
 ['Sadness'],
 ['Sadness'],
 ['Anger'],
 ['Anger', 'Fear'],
 ['Joy'],
 ['Anger', 'Surprise'],
 ['Sadness', 'Surprise'],
 ['Anger', 'Fear', 'Sadness'],
 ['Anger', 'Fear', 'Sadness'],
 ['Anger', 'Sadness'],
 ['Anger', 'Surprise'],
 ['Surprise'],
 ['Joy'],
 ['Anger', 'Surprise'],
 ['Joy'],
 ['Joy'],
 ['Joy'],
 ['Anger', 'Disgust'],
 ['Anger', 'Surprise'],
 ['Anger', 'Disgust'],
 ['Fear', 'Sadness'],
 ['Joy'],
 ['Fear', 'Surprise'],
 ['Anger'],
 ['Anger'],
 ['Anger'],
 ['Fear', 'Sadness'],
 ['Fear', 'Sadness'],
 ['Sadness'],
 ['Anger', 'Sadness'],
 ['Anger', 'Sadness'],
 ['Anger', 'Disgust'],
 ['Anger'],
 ['Surprise', 'Joy'],
 ['Anger'],
 ['Anger', 'Sadness'],
 ['Anger', 'Disgust', 'Sadness'],
 ['Anger', 'Disgust', 'Sadness'],
 ['Joy'],
 ['Fear'],
 ['Joy'],
 ['Joy'],
 ['Anger', 'Surprise'],
 ['Anger', 'Disgust'],
 ['Anger', 'Sadness'],
 ['Joy'],
 ['

In [90]:
predictions = [x if not isinstance(x, list) or not any(isinstance(i, list) for i in x) else ['Neutral'] for x in predictions]

In [91]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [92]:
y_true_mhot = mlb.fit_transform(grounds)
y_pred_mhot = mlb.transform(predictions)

In [93]:
y_true_mhot.shape

(1776, 7)

In [95]:
y_pred_mhot.shape

(1776, 7)

In [83]:
from sklearn.metrics import classification_report

In [96]:
print(classification_report(y_true_mhot, y_pred_mhot, digits=3))

              precision    recall  f1-score   support

           0      0.699     0.697     0.698       614
           1      0.319     0.435     0.368        85
           2      0.526     0.762     0.622       407
           3      0.712     0.646     0.677       429
           4      0.800     0.093     0.167       129
           5      0.438     0.697     0.538       347
           6      0.604     0.628     0.616       486

   micro avg      0.580     0.645     0.611      2497
   macro avg      0.585     0.565     0.527      2497
weighted avg      0.611     0.645     0.605      2497
 samples avg      0.610     0.660     0.607      2497

