In [1]:
import sys
import torch
import pickle
import argparse
import pandas as pd

import random
from operator import itemgetter
import torch.nn.functional as F

# sys.path.append('../')

from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

In [2]:
model_id = "microsoft/Phi-3.5-mini-instruct"
k = 5

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embedding_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
embedding_model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)



In [4]:
#inference_tokenizer = AutoTokenizer.from_pretrained(model_id, padding='left', padding_side='left')
inference_tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
inference_tokenizer.pad_token = inference_tokenizer.eos_token
#terminators = [inference_tokenizer.eos_token_id, inference_tokenizer.convert_tokens_to_ids("<|eot_id|>")]

generation_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
CURRENT_DIR = Path.cwd()
EAC_DIR = Path(CURRENT_DIR) / "emotion_analysis_comics"
ICL_DIR = Path(EAC_DIR) / "incontext_learning"
DATASET_DIR = Path(EAC_DIR) / "dataset_files"
OUTPUT_DIR = Path(EAC_DIR) / "incontext_learning" / "results" / f"comics35_pg_icl_{model_id.split('/')[1]}"

In [6]:
df = pd.read_csv(DATASET_DIR / "comics_dataset_pg.csv", index_col=False)
df = df.drop(df.columns[0], axis=1)

In [7]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f..."
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],..."
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]"
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]"
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur..."
...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [..."
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']..."
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [..."
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr..."


In [8]:
file_names = df.file_name.unique().tolist()

In [9]:
book_titles = ['Jurassic League',
'Nightwing',
'Worlds Without a Justice League - Green Lantern',
'Worlds Without a Justice League - Green Lantern',
'Dark Crisis: The Flash',
'Danger Street',
'Tiny Titans',
'Human Target',
'DC vs. Vampires',
'Tales For a Halloween Night',
'The Amazing Adventures of the Ninja Turtles',
'Sonic The Hedgehog',
'Love Everlasting',
'Fantasmas',
'Fantasmas',
'Fantasmas',
'Fantasmas',
'Fantasmas',
'Fantasmas',
'Fantasmas',
'Fantasmas',
'Fantasmas',
'Fantasmas',
'American Vampire',
'American Vampire',
'American Vampire',
'Dragon Age',
'Dragon Age',
'Dragon Age',
'The Walking Dead',
'The Walking Dead',
'The Walking Dead',
'The Walking Dead',
'Thief Of Thieves',
'Stillwater']

In [10]:
def get_book_title(row):
    
    file_name = row.file_name
    
    return book_titles[file_names.index(file_name)]

In [11]:
df['book_title'] = df.apply(lambda row: get_book_title(row), axis=1)

In [12]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,book_title
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League
...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater


In [13]:
def get_book_title_embeddings(df):    
    
    
    title_embed_d = {}

    for title in tqdm(df.book_title):
        # print(utterance)
        while True:
            try:
                inputs = embedding_tokenizer(title, return_tensors="pt").to(device)
                output = embedding_model(**inputs)
                embedding = output[1][0].squeeze().cpu()
                title_embed_d[title] = embedding.detach().numpy()
                break
            except Exception as e:
                print(e)
                
    return title_embed_d

In [14]:
title_embed_d = get_book_title_embeddings(df)

  0%|          | 0/874 [00:00<?, ?it/s]

In [15]:
df['title_embedding'] = df.book_title.apply(lambda x: title_embed_d[x])

In [16]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,book_title,title_embedding
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League,"[-0.8664565, -0.3687145, -0.18329039, 0.621039..."
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League,"[-0.8664565, -0.3687145, -0.18329039, 0.621039..."
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League,"[-0.8664565, -0.3687145, -0.18329039, 0.621039..."
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League,"[-0.8664565, -0.3687145, -0.18329039, 0.621039..."
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League,"[-0.8664565, -0.3687145, -0.18329039, 0.621039..."
...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater,"[-0.9474294, -0.42099634, -0.85804147, 0.90005..."
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater,"[-0.9474294, -0.42099634, -0.85804147, 0.90005..."
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater,"[-0.9474294, -0.42099634, -0.85804147, 0.90005..."
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater,"[-0.9474294, -0.42099634, -0.85804147, 0.90005..."


In [17]:
def get_k_neighbours(k, title, train_df, test_df):
    
    
    test_title_embedding = test_df[test_df.book_title == title]["title_embedding"].values[0]

    title_embed_d = {}
    
    for e in train_df.iterrows():
        
        embedding_key = tuple(e[1].title_embedding)
        if embedding_key not in title_embed_d:
            
            title_embed_d[e[1].book_title] = e[1].title_embedding

    train_title = set(train_df.book_title)

    dist_l = []
    
    for t, v in title_embed_d.items():
        if t in train_title:
            
            d = F.cosine_similarity(torch.tensor(test_title_embedding), torch.tensor(v), dim=0)
            dist_l.append((t, d.item()))

    sorted_dist_l = sorted(dist_l, key=itemgetter(1), reverse=True)
    
    return sorted_dist_l[0: k]

In [18]:
train_df = df[df.split == "TRAIN"].reset_index(drop=True)
test_df = df[df.split == "TEST"].reset_index(drop=True)

In [19]:
get_k_neighbours(3, test_df.iloc[3]['book_title'], train_df, test_df)

[('Tales For a Halloween Night', 0.9812239408493042),
 ('Jurassic League', 0.9766192436218262),
 ('Dragon Age', 0.9764137864112854)]

In [20]:
import ast
import json

In [21]:
def prepare_similar_example_prompts(title, k, train_df, test_df, seed=33):
    """
    Create a part of the prompt made of k examples in the train set,
    whose topic is most similar to a given title.
    """

    random.seed(seed)

    # Get 2*k neighbors and sample k randomly
    #neighbours_l = get_k_neighbours(2 * k, title, train_df=train_df, test_df=test_df)
    #sampled_neighbours_l = random.sample(neighbours_l, k)
    
    neighbours_l = get_k_neighbours(k, title, train_df=train_df, test_df=test_df)
    sampled_neighbours_l = random.sample(neighbours_l, 1)

    prompt = ''
    
    for i, (title, dist) in enumerate(sampled_neighbours_l):
        # Filter rows by title and randomly select 3 rows
        example_df = train_df[train_df.book_title == title]
        sampled_rows = example_df.sample(k, random_state=42)

        
        cnt = 0
        for _, row in sampled_rows.iterrows():
            
            prompt += f'EXAMPLE {cnt + 1}\n\n'
            # Convert utterance string to a list
            utterance_list = json.loads(row.utterance) if row.utterance.strip().startswith('[') else ast.literal_eval(row.utterance)

            # Add numbered utterances to the prompt
            for idx, utt in enumerate(utterance_list, start=1):
                prompt += f'{idx}. {utt}\n'

            # Add emotion class for this row
            #prompt += f'\nEmotion class: {row.emotion_c}\n\n'
            prompt += f'\nOutput: {{"page_emotion_classes": "{row.emotion_c}"}}\n\n'
            cnt += 1

        
        prompt += '\n'
    return prompt


In [22]:
print(prepare_similar_example_prompts(test_df.iloc[0]['book_title'], k, train_df, test_df, seed=33))

EXAMPLE 1

1. #AHHH.
2. WHAT THE HELL? WHERE AM I?
3. NO, NO, NO… WHAT IS THAT?
4. OH MY GOD!
5. NO!
6. #AAAHHHH--

Output: {"page_emotion_classes": "[['fear'], ['fear', 'surprise'], ['fear', 'surprise'], ['fear', 'surprise'], ['fear', 'surprise'], ['fear', 'surprise']]"}

EXAMPLE 2

1. " HAUNTED. "
2. IT'S REALLY A SHAME, JACKSON. YOUR UNCANNY ABILITY TO ATTRACT THE NEFARIOUS IS STRONG, AND YET YOU'VE NEVER BEEN ABLE TO PUT IT TO USE FOR YOUR OWN GAIN.
3. INSTEAD YOU CONTINUE TO BRING OTHERS DOWN WITH YOU.
4. WHAT'RE YOU TALKING ABOUT…?
5. JACKSON! GODDAMN, BOY, IT'S GOOD TO SEE YOU. WENONA IS HERE AND--
6. YEAH, YEAH, I CAUGHT THE GIST OF IT ALREADY. WENONA MADE A DEAL WITH THE MAESTRO AND… SPEAK OF THE DEVIL.
7. AH, JACKSON. IT'S SO EXQUISITE TO SEE YOU.
8. I KNEW OUR DARK ELDER GODS WOULD ANSWER MY PRAYERS AND BRING YOU BACK TO US.
9. WELL, YOU KNOW WHAT THEY SAY, " IF YOU LOVE IT, SET IT FREE. "
10. ALWAYS WITH THE FOOLHARDY COMMENTS, JACKSON. EVEN IN THE FACE OF YOUR OWN DEMISE Y

In [23]:
# def build_sys_instruction():
    
#     emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
#     formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
#     instruction = f"""### Page-Level Emotion Analysis Expert Role

# You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze all utterances on a comic book page.

# INPUT:
# - You will receive a list of utterances from a single page in a comic book
# - Each utterance may express one or multiple emotions
# - You will receive lists of utterances from {k} pages and their respective emotion classifications as example
# - Each example will contain multiple utterances and their respective emotion classifications
# - Given these examples, analyze the list of utterances following the same pattern

# TASK:
# - Carefully analyze the emotional context and tone of each utterance on the page
# - Identify applicable emotions from the following classes:
#    {formatted_classes}

# OUTPUT REQUIREMENTS:
# - Format: JSON object with a single key "page_emotion_classes"
# - Value: Array of one or more emotion classes as strings
# - Example: {{"page_emotion_classes": ["anger", "fear"], ["neutral"], ["joy"]}}
# - You must absolutely not include any explanations in the output, only the JSON object

# """
#     return instruction

In [24]:
def build_sys_instruction():
    
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Page-Level Emotion Analysis Expert Role

You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze all utterances on a comic book page.

INPUT:
- You will receive a list of utterances from a single page in a comic book
- Each utterance may express one or multiple emotions
- You will receive lists of utterances from {k} pages and their respective emotion classifications as example
- Each example will contain multiple utterances and their respective emotion classifications
- Given these examples, analyze the list of utterances following the same pattern

TASK:
- Carefully analyze the emotional context and tone of each utterance on the page
- Identify applicable emotions from the following classes:
   {formatted_classes}

OUTPUT REQUIREMENTS:
- STRICT JSON FORMAT IS MANDATORY
- NO ADDITIONAL TEXT OR EXPLANATION IS ALLOWED
- Example valid output: {{"page_emotion_classes": [["anger"], ["fear", "sadness"], ["neutral"]]}}
- Invalid formats will result in immediate rejection

CRITICAL CONSTRAINTS:
- Wrap EACH emotion in square brackets
- Ensure valid JSON syntax at all times
- Do NOT deviate from the specified JSON structure under any circumstances
- Do NOT generate any additional text or explanation except the JSON object

"""
    return instruction

In [25]:
# For LLaMA

sys_msg_l = []
user_msg_l = []
task_msg_l = []

for _,row in tqdm(test_df.iterrows(), total=len(test_df)):
    
    utterance_list = json.loads(row.utterance) if row.utterance.strip().startswith('[') else ast.literal_eval(row.utterance)
    utterances = '\n'.join([f'{i+1}. {utt}' for i, utt in enumerate(utterance_list)])
    
    sys_msg = {"role": "system", "content": build_sys_instruction()}
    user_msg = {"role":"user", "content": "EXAMPLES:\n\n" + prepare_similar_example_prompts(row.book_title, k, train_df=train_df, test_df=test_df) + f"INPUT:\n\nNow classify these utterances from a page:\n\n{utterances}\n"}
    task_msg = {"role": "assistant", "content": f"\nOutput: "}
    
    sys_msg_l.append(sys_msg)
    user_msg_l.append(user_msg)
    task_msg_l.append(task_msg)
    
    #break

  0%|          | 0/156 [00:00<?, ?it/s]

In [26]:
# # For Qwen

# sys_msg_l = []
# user_msg_l = []
# task_msg_l = []

# for _,row in tqdm(test_df.iterrows(), total=len(test_df)):
    
#     utterance_list = json.loads(row.utterance) if row.utterance.strip().startswith('[') else ast.literal_eval(row.utterance)
#     utterances = '\n'.join([f'{i+1}. {utt}' for i, utt in enumerate(utterance_list)])
    
#     #sys_msg = {"role": "system", "content": build_sys_instruction()}
#     user_msg = {"role":"user", "content": build_sys_instruction() + "EXAMPLES:\n\n" + prepare_similar_example_prompts(row.book_title, k, train_df=train_df, test_df=test_df) + f"INPUT:\n\nNow classify these utterances from a page:\n\n{utterances}\n"}
#     task_msg = {"role": "assistant", "content": f"\nOutput: "}
    
#     #sys_msg_l.append(sys_msg)
#     user_msg_l.append(user_msg)
#     task_msg_l.append(task_msg)

In [27]:
#For LLaMA

prepared_sys_task_msg_l = []

for i in range(len(sys_msg_l)):
    prepared_sys_task_msg_l.append([sys_msg_l[i], user_msg_l[i], task_msg_l[i]])

In [28]:
# # For Qwen

# prepared_sys_task_msg_l = []

# for i in range(len(user_msg_l)):
#     prepared_sys_task_msg_l.append([user_msg_l[i], task_msg_l[i]])

In [29]:
len(prepared_sys_task_msg_l)

156

In [30]:
def batch_tensor(tensor, batch_size):
    return [tensor[i:i+batch_size] for i in range(0, tensor.size(0), batch_size)]

In [31]:
inputs = inference_tokenizer.apply_chat_template(
            prepared_sys_task_msg_l,
            #pad_token = inference_tokenizer.bos_token,
            padding=True,
            truncation=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
)

In [32]:
BATCH_SIZE = 1

input_ids_batches = batch_tensor(inputs['input_ids'], BATCH_SIZE) # type: ignore
attention_mask_batches = batch_tensor(inputs['attention_mask'], BATCH_SIZE) # type: ignore

In [33]:
generated_outputs = []

for i, (input_ids_batch, attention_mask_batch) in tqdm(enumerate(zip(input_ids_batches, attention_mask_batches)), total=len(input_ids_batches)):
    
    print(f"\n\n ***** Processing batch {i + 1} *****\n\n")
    
    if torch.any(torch.isnan(input_ids_batch)) or torch.any(torch.isinf(input_ids_batch)): # type: ignore
        print("Invalid input_ids detected")

    if torch.any(torch.isnan(attention_mask_batch)) or torch.any(torch.isinf(attention_mask_batch)): # type: ignore
        print("Invalid attention_mask detected")

    
    inputs = {
        'input_ids': input_ids_batch.to(generation_model.device), # type: ignore
        'attention_mask': attention_mask_batch.to(generation_model.device) # type: ignore
    }

    outputs = generation_model.generate(
    **inputs,
    max_new_tokens=512,
    pad_token_id=inference_tokenizer.eos_token_id,
    #eos_token_id=terminators,
    do_sample=True,
    temperature=0.1,
    top_p=1.0,
    )
    
    #generated_outputs.append(outputs)
    
    #generated_outputs.append(inference_tokenizer.decode(outputs[inputs['input_ids'].shape[1]:]))
    generated_outputs.append(inference_tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True))
    #break

  0%|          | 0/156 [00:00<?, ?it/s]



 ***** Processing batch 1 *****




Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)




 ***** Processing batch 2 *****




 ***** Processing batch 3 *****




 ***** Processing batch 4 *****




 ***** Processing batch 5 *****




 ***** Processing batch 6 *****




 ***** Processing batch 7 *****




 ***** Processing batch 8 *****




 ***** Processing batch 9 *****




 ***** Processing batch 10 *****




 ***** Processing batch 11 *****




 ***** Processing batch 12 *****




 ***** Processing batch 13 *****




 ***** Processing batch 14 *****




 ***** Processing batch 15 *****




 ***** Processing batch 16 *****




 ***** Processing batch 17 *****




 ***** Processing batch 18 *****




 ***** Processing batch 19 *****




 ***** Processing batch 20 *****




 ***** Processing batch 21 *****




 ***** Processing batch 22 *****




 ***** Processing batch 23 *****




 ***** Processing batch 24 *****




 ***** Processing batch 25 *****




 ***** Processing batch 26 *****




 ***** Processing batch 27 *****




 ***** Processing batch 28 *****




 ***** 

In [34]:
len(generated_outputs)

156

In [35]:
generated_outputs

['{"page_emotion_classes": [["neutral", "neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral", "neutral"], ["neutral"], ["neutral"], ["neutral", "neutral"]]}',
 '{"page_emotion_classes": [["neutral", "anger"], ["neutral", "anger"], ["anger"], ["anger", "surprise"], ["joy", "surprise"]]}',
 '{"page_emotion_classes": [["anger", "frustration"], ["anger", "frustration"], ["anger"], ["confusion", "anger"], ["anger"], ["joy"], ["anger"], ["anger"], ["anger"], ["sarcasm", "anger"], ["anger"], ["anger"], ["anger"], ["neutral"], ["joy", "neutral"]]}',
 '{"page_emotion_classes": [["anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "joy", "anger", "anger"], ["anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "joy", "anger", "anger"]]}',
 '{"page_emotion_classes": [["surprise"], ["

In [36]:
generated_outputs = [x.split("\n\n")[0] for x in generated_outputs]

In [37]:
len(generated_outputs)

156

In [38]:
generated_outputs

['{"page_emotion_classes": [["neutral", "neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral"], ["neutral", "neutral"], ["neutral"], ["neutral"], ["neutral", "neutral"]]}',
 '{"page_emotion_classes": [["neutral", "anger"], ["neutral", "anger"], ["anger"], ["anger", "surprise"], ["joy", "surprise"]]}',
 '{"page_emotion_classes": [["anger", "frustration"], ["anger", "frustration"], ["anger"], ["confusion", "anger"], ["anger"], ["joy"], ["anger"], ["anger"], ["anger"], ["sarcasm", "anger"], ["anger"], ["anger"], ["anger"], ["neutral"], ["joy", "neutral"]]}',
 '{"page_emotion_classes": [["anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "joy", "anger", "anger"], ["anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "anger", "joy", "anger", "anger"]]}',
 '{"page_emotion_classes": [["surprise"], ["

In [39]:
import json_repair


In [40]:
outputs_processed = [json_repair.loads(output) for output in generated_outputs]

In [41]:
len(outputs_processed)

156

In [42]:
outputs_processed

[{'page_emotion_classes': [['neutral', 'neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral', 'neutral'],
   ['neutral'],
   ['neutral'],
   ['neutral', 'neutral']]},
 {'page_emotion_classes': [['neutral', 'anger'],
   ['neutral', 'anger'],
   ['anger'],
   ['anger', 'surprise'],
   ['joy', 'surprise']]},
 {'page_emotion_classes': [['anger', 'frustration'],
   ['anger', 'frustration'],
   ['anger'],
   ['confusion', 'anger'],
   ['anger'],
   ['joy'],
   ['anger'],
   ['anger'],
   ['anger'],
   ['sarcasm', 'anger'],
   ['anger'],
   ['anger'],
   ['anger'],
   ['neutral'],
   ['joy', 'neutral']]},
 {'page_emotion_classes': [['anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'joy',
    'anger',
    'anger'],
   ['anger',
    'anger',
    'anger',
    'anger',
    'anger',
    'an

In [43]:
predictions = []

for obj in outputs_processed:
    if isinstance(obj, dict):  # Ensure it's a dictionary
        predictions.extend(obj.values())

In [44]:
# predictions = [x['page_emotion_classes'] for x in outputs_processed]

In [45]:
len(predictions)

156

In [46]:
predictions

[[['neutral', 'neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral', 'neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral', 'neutral']],
 [['neutral', 'anger'],
  ['neutral', 'anger'],
  ['anger'],
  ['anger', 'surprise'],
  ['joy', 'surprise']],
 [['anger', 'frustration'],
  ['anger', 'frustration'],
  ['anger'],
  ['confusion', 'anger'],
  ['anger'],
  ['joy'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['sarcasm', 'anger'],
  ['anger'],
  ['anger'],
  ['anger'],
  ['neutral'],
  ['joy', 'neutral']],
 [['anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'joy',
   'anger',
   'anger'],
  ['anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'anger',
   'joy',
   'anger',
   'an

In [47]:
grounds = test_df.emotion_c.tolist()

In [48]:
len(grounds)

156

In [49]:
grounds

["[['surprise', 'joy'], ['joy'], ['surprise', 'joy'], ['joy'], ['joy'], ['joy'], ['surprise'], ['joy'], ['joy'], ['neutral'], ['neutral']]",
 "[['neutral'], ['neutral'], ['anger', 'disgust'], ['anger', 'disgust'], ['neutral'], ['sadness'], ['sadness']]",
 "[['anger', 'sadness'], ['anger', 'sadness'], ['anger', 'sadness'], ['fear', 'surprise'], ['surprise'], ['joy'], ['anger', 'surprise'], ['joy'], ['joy'], ['joy'], ['anger'], ['anger'], ['surprise', 'joy'], ['fear', 'sadness'], ['fear', 'sadness'], ['fear', 'surprise']]",
 "[['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['fear', 'sadness'], ['fear', 'sadness', 'surprise'], ['sadness'], ['sadness'], ['fear', 'sadness'], ['sadness', 'surprise'], ['sadness', 'surprise'], ['joy'], ['anger'], ['anger'], ['anger'], ['anger', 'disgust'], ['joy'], ['joy'], ['surprise', 'joy'], ['surprise', 'joy'], ['anger', 'surprise'], ['anger', 'surprise']]",
 "[['neutral'], ['joy'], ['joy']]",
 "[['neutral'], ['fear'], ['neutral'], ['ang

In [50]:
grounds = [ast.literal_eval(item) for item in grounds]

In [51]:
results_file = Path(OUTPUT_DIR) / f"results_{k}.pickle"
results_file.parent.mkdir(parents=True, exist_ok=True)

results_d = {"grounds": grounds,
             "predictions": predictions}

with results_file.open('wb') as fh:
  
    pickle.dump(results_d, fh)

In [52]:
import ast

final_preds = []
bad_idx = []

for i, pred in enumerate(predictions):
    try:
        if isinstance(pred, list):  # Check if pred is already a list
            final_preds.append(pred)
        else:
            final_preds.append(ast.literal_eval(pred))
    except Exception as e:  # Optional: Capture the specific exception
        print(f"Error at index {i}: {e}")
        bad_idx.append(i)


In [53]:
#bad_idx = []

In [54]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    #del final_preds[idx]

In [55]:
len(grounds), len(final_preds)

(156, 156)

In [56]:
bad_idx = []

for idx, (i,j) in enumerate(zip(grounds, final_preds)):
    if len(i) != len(j):
        print(idx, len(i), len(j))
        bad_idx.append(idx)

0 11 10
1 7 5
2 16 15
3 21 2
4 3 2
5 20 19
6 15 12
8 12 10
10 14 2
12 13 12
13 9 8
14 23 24
15 23 22
16 13 12
17 12 11
18 13 12
19 14 13
21 11 10
23 10 9
24 13 11
25 13 11
26 13 11
28 6 5
30 6 5
32 9 8
33 10 9
35 9 8
36 17 16
45 13 12
46 11 10
51 8 7
58 6 5
59 6 5
70 2 1
76 7 8
78 2 1
79 8 9
84 15 13
87 15 14
90 2 1
91 8 9
92 8 9
98 8 6
99 13 12
103 13 16
105 6 5
107 8 7
115 13 12
117 9 7
119 13 11
124 14 12
126 13 12
130 7 1
131 8 7
132 11 10
133 8 7
145 12 11
149 11 10
150 11 10
153 11 10


In [57]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del final_preds[idx]

In [58]:
len(grounds), len(final_preds)

(96, 96)

In [59]:
grounds = [item for sublist in grounds for item in sublist]
predictions = [item for sublist in final_preds for item in sublist]

In [60]:
len(grounds), len(predictions)

(670, 670)

In [61]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

In [62]:
mlb = MultiLabelBinarizer()

In [63]:
y_true_mhot = mlb.fit_transform(grounds)
y_pred_mhot = mlb.transform(predictions)



In [64]:
y_true_mhot.shape, y_pred_mhot.shape

((670, 7), (670, 7))

In [65]:
class_labels = mlb.classes_

In [66]:
print(classification_report(y_true_mhot, y_pred_mhot, target_names=class_labels, digits=3))

              precision    recall  f1-score   support

       anger      0.639     0.457     0.533       267
     disgust      0.200     0.037     0.062        27
        fear      0.455     0.417     0.435       168
         joy      0.471     0.266     0.340       124
     neutral      0.139     0.651     0.230        43
     sadness      0.565     0.268     0.364       179
    surprise      0.419     0.368     0.392       182

   micro avg      0.426     0.373     0.398       990
   macro avg      0.412     0.352     0.336       990
weighted avg      0.499     0.373     0.410       990
 samples avg      0.400     0.383     0.372       990



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [67]:
classification_file = Path(OUTPUT_DIR) / f"classification_report_{k}.pickle"

with classification_file.open('wb') as fh:
    
    pickle.dump(classification_report(y_true_mhot, y_pred_mhot, target_names=class_labels, output_dict=True), fh)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#len(decoded_outputs)

156

In [None]:
#decoded_outputs

['system\n\nCutting Knowledge Date: December 2023\nToday Date: 25 Nov 2024\n\n### Page-Level Emotion Analysis Expert Role\n\nYou are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze all utterances on a comic book page.\n\nINPUT:\n- You will receive a list of utterances from a single page in a comic book\n- Each utterance may express one or multiple emotions\n- You will receive lists of utterances from 3 pages and their respective emotion classifications as example\n- Each example will contain multiple utterances and their respective emotion classifications\n- Given these examples, analyze the list of utterances following the same pattern\n\nTASK:\n- Carefully analyze the emotional context and tone of each utterance on the page\n- Identify applicable emotions from the following classes:\n   "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n\nOUTPUT REQUIREMENTS:\n- Format: JSON object with a single key "pa

In [None]:
# def get_utterance_embeddings(df):    
    
    
#     utterance_embed_d = {}

#     for utterance in tqdm(df.utterance):
#         # print(utterance)
#         while True:
#             try:
#                 inputs = embedding_tokenizer(utterance, return_tensors="pt").to(device)
#                 output = embedding_model(**inputs)
#                 embedding = output[1][0].squeeze().cpu()
#                 utterance_embed_d[utterance] = embedding.detach().numpy()
#                 break
#             except Exception as e:
#                 print(e)
                
#     return utterance_embed_d

In [None]:
# emotion_map = {
#     'AN': 'anger',
#     'DI': 'disgust',
#     'FE': 'fear',
#     'SA': 'sadness',
#     'SU': 'surprise',
#     'JO': 'joy'
# }

# def extract_emotions(row):

#     emotion_str = row.emotion

#     if emotion_str == 'Neutral':
#         return ['neutral']

#     emotions = emotion_str.split('-')
#     tags = []

#     for emotion in emotions:
#         abbrev = emotion[:2]  # Get the abbreviation
#         value_part = emotion[2:]  # Get the value part
        
#         # Ensure that the value part is a valid integer and abbrev is in the emotion_map
#         if abbrev in emotion_map and value_part.isdigit():
#             value = int(value_part)
#             if value > 0:
#                 tags.append(emotion_map[abbrev].lower())
#         else:
#             print(f"Warning: Skipping invalid emotion entry: '{emotion}'")
#     return tags  

In [None]:
# CURRENT_DIR = Path.cwd()
# EAC_DIR = Path(CURRENT_DIR) / "emotion_analysis_comics"
# ICL_DIR = Path(CURRENT_DIR) / "incontext_learning"
# DATASET_DIR = Path(EAC_DIR) / "dataset_files"
# OUTPUT_DIR = Path(EAC_DIR) / "incontext_learning" / "results" / f"comics35_icl_{model_id.split('/')[1]}"

In [None]:
# OUTPUT_DIR

PosixPath('/Utilisateurs/umushtaq/emotion_analysis_comics/incontext_learning/results/comics35_icl_gemma-2-27b-it-bnb-4bit')

In [None]:
# df = pd.read_csv(DATASET_DIR / "comics_dataset.csv")
# df['emotions_list'] = df.apply(lambda row: extract_emotions(row), axis=1)

In [None]:
# utterance_embed_d = get_utterance_embeddings(df)
# df['utterance_embedding'] = df.utterance.apply(lambda x: utterance_embed_d[x])

# train_df = df[df.split == "TRAIN"].reset_index(drop=True)
# test_df = df[df.split == "TEST"].reset_index(drop=True)

  0%|          | 0/7129 [00:00<?, ?it/s]

In [None]:
# import torch
# import random
# from operator import itemgetter
# import torch.nn.functional as F

# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# def get_k_neighbours(k, utterance, train_df, test_df):
    
    
#     test_utterance_embedding = test_df[test_df.utterance == utterance]["utterance_embedding"].values[0]
#     #test_utterance_embedding = torch.tensor(test_utterance_embedding)#.to(device)

#     utterance_embed_d = {}
#     for e in train_df.iterrows():
#         if e[1].utterance not in utterance_embed_d:
#             #utterance_embed_d[e[1].utterance] = e[1].utterance_embedding
#             utterance_embed_d[e[1].utterance] = e[1].utterance_embedding#.to(device)

#     # train_titles = set(df[df.split == 'TRAIN'].title.unique())
#     train_utterances = set(train_df.utterance)

#     dist_l = []
#     for t, v in utterance_embed_d.items():
#         if t in train_utterances:
#             # d = cos_sim(title_embed_d[title], v)
#             d = F.cosine_similarity(torch.tensor(test_utterance_embedding), torch.tensor(v), dim=0)
#             dist_l.append((t, d.item()))

#     sorted_dist_l = sorted(dist_l, key=itemgetter(1), reverse=True)
    
#     return sorted_dist_l[0: k]

# def prepare_similar_example_prompts(utterance, k, train_df, test_df, seed=33):
#     """
#     Create a part of prompt made of k examples in the train set, whose topic is most similar to a given title.
#     """

#     random.seed(seed)

#     neighbours_l = get_k_neighbours(2*k, utterance, train_df=train_df, test_df=test_df) # Fetch the 2*k closest neighbors
#     # print(neighbours_l)
#     sampled_neighbours_l = random.sample(neighbours_l, k) # Only keep k of them
#     # bprint(sampled_neighbours_l)

#     prompt = ''
#     cnt = 0
#     for i, (utterance, dist) in enumerate(sampled_neighbours_l):
#         prompt += f'EXAMPLE {i+1}\n'

#         example_df = train_df[train_df.utterance == utterance]
#         # example_df = example_df[example_df.aty != 'none'].reset_index()
        
#         class_l = []
#         for k in example_df.iterrows():
            
#             # if k[0] == 0:

#             #     prompt += f'# Abstract:\n{example_df.iloc[0].utterance}\n\n# Arguments:\n'
#             #     cnt = 0
                
#             # prompt += f'Argument {cnt + 1}={k[1].text} - Class={k[1].aty}\n'
#             prompt += f'Input: {k[1].utterance}'
#             class_l.append(k[1].emotions_list)
#             cnt += 1
            
#         prompt += '\nOutput: '
#         prompt += '{' + ', '.join([f'"list_emotion_classes": "{class_l[i]}"' for i in range(len(class_l))]) + '}'
#         prompt += '\n\n'

#     return prompt

In [None]:
# def build_instruction():
    
#     emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
#     formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
#     instruction = f"""### Emotion Analysis Expert Role

# You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

# INPUT:
# - You will receive a single utterance from a comic book
# - The utterance may express one or multiple emotions
# - You will receive {k} example utterances and their emotion classifications
# - Given example utterances and their emotion classifications, analyze the new utterance following the same pattern

# TASK:
# 1. Carefully analyze the emotional context and tone of the utterance
# 2. Identify applicable emotions from the following classes:
#    {formatted_classes}

# OUTPUT REQUIREMENTS:
# - Format: JSON object with a single key "list_emotion_classes"
# - Value: Array of one or more emotion classes as strings
# - Example: {{"list_emotion_classes": ["anger", "fear"]}}

# IMPORTANT NOTES:
# - Do not include any explanations in the output, only the JSON object

# """
#     return instruction

In [None]:
#instruction = build_instruction()

In [None]:
# sys_msg_l = []
# task_msg_l = []

In [None]:
# test_df = test_df[:10]

In [None]:
#test_df.shape

(1326, 13)

In [None]:
# for row in tqdm(test_df.iterrows(), total=len(test_df)):
    
#     sys_msg = {"role": "user", "content": instruction + "EXAMPLES:\n\n" + prepare_similar_example_prompts(row[1].utterance, k, train_df=train_df, test_df=test_df)}
#     #sys_msg = {"role":"system", "content": "### Task description: You are an expert biomedical assistant that takes 1) an abstract text, 2) the list of all arguments from this abstract text, and must classify all arguments into one of two classes: Claim or Premise. " + proportion_desc + " You must absolutely not generate any text or explanation other than the following JSON format {\"Argument 1\": <predicted class for Argument 1 (str)>, ..., \"Argument n\": <predicted class for Argument n (str)>}\n\n### Class definitions:" + " Claim = " + claim_fulldesc + " Premise = " + premise_fulldesc + "\n\n### Examples:\n\n" + prepare_similar_example_prompts(title_l[i], experiment_df, k=3, seed=seed)}  # Sample by similar title
#     task_msg = {"role": "assistant", "content": f"Now classify this utternace:\nInput: {row[1].utterance}\nOutput: "}
    
#     sys_msg_l.append(sys_msg)
#     task_msg_l.append(task_msg)

  0%|          | 0/1326 [00:00<?, ?it/s]

In [None]:
#print(sys_msg_l[0]['content'])

### Emotion Analysis Expert Role

You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

INPUT:
- You will receive a single utterance from a comic book
- The utterance may express one or multiple emotions
- You will receive 20 example utterances and their emotion classifications
- Given example utterances and their emotion classifications, analyze the new utterance following the same pattern

TASK:
1. Carefully analyze the emotional context and tone of the utterance
2. Identify applicable emotions from the following classes:
   "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"

OUTPUT REQUIREMENTS:
- Format: JSON object with a single key "list_emotion_classes"
- Value: Array of one or more emotion classes as strings
- Example: {"list_emotion_classes": ["anger", "fear"]}

IMPORTANT NOTES:
- Do not include any explanations in the output, only the JSON objec

In [None]:
#print(task_msg_l[0]['content'])

Now classify this utternace:
Input: TIME TO FACE OUR FEARS, PEOPLE…
Output: 


In [None]:
# prepared_sys_task_msg_l = []

# for i in range(len(sys_msg_l)):
#     prepared_sys_task_msg_l.append([sys_msg_l[i], task_msg_l[i]])

In [None]:
# def batch_tensor(tensor, batch_size):
#     return [tensor[i:i+Abatch_size] for i in range(0, tensor.size(0), batch_size)]

In [None]:
# inputs = inference_tokenizer.apply_chat_template(
#             prepared_sys_task_msg_l,
#             #pad_token = inference_tokenizer.bos_token,
#             padding=True,
#             truncation=True,
#             add_generation_prompt=True,
#             return_dict=True,
#             return_tensors="pt",
# )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
#inputs

{'input_ids': tensor([[   1,    1,    1,  ...,  106, 2516,  108],
         [   1,    1,    1,  ...,  106, 2516,  108],
         [   1,    1,    1,  ...,  106, 2516,  108],
         ...,
         [   1,    1,    1,  ...,  106, 2516,  108],
         [   1,    1,    1,  ...,  106, 2516,  108],
         [   1,    1,    1,  ...,  106, 2516,  108]], device='cuda:0'),
 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]], device='cuda:0')}

In [None]:
# BATCH_SIZE = 16

# input_ids_batches = batch_tensor(inputs['input_ids'], BATCH_SIZE) # type: ignore
# attention_mask_batches = batch_tensor(inputs['attention_mask'], BATCH_SIZE) # type: ignore

In [None]:
# generated_outputs = []

# for i, (input_ids_batch, attention_mask_batch) in tqdm(enumerate(zip(input_ids_batches, attention_mask_batches)), total=len(input_ids_batches)):
    
#     print(f"\n\n ***** Processing batch {i + 1} *****\n\n")
    
#     if torch.any(torch.isnan(input_ids_batch)) or torch.any(torch.isinf(input_ids_batch)): # type: ignore
#         print("Invalid input_ids detected")

#     if torch.any(torch.isnan(attention_mask_batch)) or torch.any(torch.isinf(attention_mask_batch)): # type: ignore
#         print("Invalid attention_mask detected")

    
#     inputs = {
#         'input_ids': input_ids_batch.to(generation_model.device), # type: ignore
#         'attention_mask': attention_mask_batch.to(generation_model.device) # type: ignore
#     }

#     outputs = generation_model.generate(
#     **inputs,
#     max_new_tokens=64,
#     pad_token_id=inference_tokenizer.eos_token_id,
#     eos_token_id=terminators,
#     do_sample=False,
#     temperature=0.1,
#     top_p=1.0,
#     )
    
#     generated_outputs.append(outputs)

  0%|          | 0/83 [00:00<?, ?it/s]



 ***** Processing batch 1 *****








 ***** Processing batch 2 *****




 ***** Processing batch 3 *****




 ***** Processing batch 4 *****




 ***** Processing batch 5 *****




 ***** Processing batch 6 *****




 ***** Processing batch 7 *****




 ***** Processing batch 8 *****




 ***** Processing batch 9 *****




 ***** Processing batch 10 *****




 ***** Processing batch 11 *****




 ***** Processing batch 12 *****




 ***** Processing batch 13 *****




 ***** Processing batch 14 *****




 ***** Processing batch 15 *****




 ***** Processing batch 16 *****




 ***** Processing batch 17 *****




 ***** Processing batch 18 *****




 ***** Processing batch 19 *****




 ***** Processing batch 20 *****




 ***** Processing batch 21 *****




 ***** Processing batch 22 *****




 ***** Processing batch 23 *****




 ***** Processing batch 24 *****




 ***** Processing batch 25 *****




 ***** Processing batch 26 *****




 ***** Processing batch 27 *****




 ***** Processing batch 28 *****




 ***** 

In [None]:
# decoded_outputs = []

# for batch in generated_outputs:

#     for prediction in batch:

#         decoded_outputs.append(inference_tokenizer.decode(prediction, skip_special_tokens=True)) # type: ignore

In [None]:
#decoded_outputs

['user\n### Emotion Analysis Expert Role\n\nYou are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.\n\nINPUT:\n- You will receive a single utterance from a comic book\n- The utterance may express one or multiple emotions\n- You will receive 20 example utterances and their emotion classifications\n- Given example utterances and their emotion classifications, analyze the new utterance following the same pattern\n\nTASK:\n1. Carefully analyze the emotional context and tone of the utterance\n2. Identify applicable emotions from the following classes:\n   "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n\nOUTPUT REQUIREMENTS:\n- Format: JSON object with a single key "list_emotion_classes"\n- Value: Array of one or more emotion classes as strings\n- Example: {"list_emotion_classes": ["anger", "fear"]}\n\nIMPORTANT NOTES:\n- Do not include any explanations in th

In [None]:
#grounds = test_df.emotions_list.tolist()   

In [None]:
# results_file = Path(OUTPUT_DIR) / f"results_{k}.pickle"
# results_file.parent.mkdir(parents=True, exist_ok=True)

In [None]:
# results_d = {"grounds": grounds,
#             "predictions": decoded_outputs    
        
#     }

In [None]:
# with results_file.open('wb') as fh:
  
#     pickle.dump(results_d, fh)

In [None]:
## Post process

In [None]:
#import json

In [None]:
# predictions_l = []

# for i, prediction in enumerate(decoded_outputs):
#         try:
#             # Use json.loads to safely parse the JSON-like string
#             parsed_prediction = json.loads(prediction)
#             # Append the values of the parsed prediction to preds
#             predictions_l.append(parsed_prediction["list_emotion_classes"])
            
#         except json.JSONDecodeError as e:
#             print(f"Error decoding prediction: {i}")

Error decoding prediction: 0
Error decoding prediction: 1
Error decoding prediction: 2
Error decoding prediction: 3
Error decoding prediction: 4
Error decoding prediction: 5
Error decoding prediction: 6
Error decoding prediction: 7
Error decoding prediction: 8
Error decoding prediction: 9
Error decoding prediction: 10
Error decoding prediction: 11
Error decoding prediction: 12
Error decoding prediction: 13
Error decoding prediction: 14
Error decoding prediction: 15
Error decoding prediction: 16
Error decoding prediction: 17
Error decoding prediction: 18
Error decoding prediction: 19
Error decoding prediction: 20
Error decoding prediction: 21
Error decoding prediction: 22
Error decoding prediction: 23
Error decoding prediction: 24
Error decoding prediction: 25
Error decoding prediction: 26
Error decoding prediction: 27
Error decoding prediction: 28
Error decoding prediction: 29
Error decoding prediction: 30
Error decoding prediction: 31
Error decoding prediction: 32
Error decoding predi

In [None]:
#grounds

[['neutral'],
 ['joy'],
 ['surprise', 'joy'],
 ['surprise', 'joy'],
 ['fear', 'surprise'],
 ['anger', 'surprise'],
 ['anger', 'surprise'],
 ['fear', 'surprise'],
 ['fear', 'sadness', 'surprise'],
 ['joy'],
 ['disgust', 'fear', 'sadness'],
 ['fear', 'sadness', 'surprise'],
 ['sadness'],
 ['anger', 'sadness', 'surprise'],
 ['sadness'],
 ['neutral'],
 ['surprise'],
 ['neutral'],
 ['anger'],
 ['neutral'],
 ['surprise'],
 ['fear'],
 ['disgust', 'surprise'],
 ['fear', 'surprise'],
 ['neutral'],
 ['surprise'],
 ['neutral'],
 ['sadness'],
 ['fear', 'sadness'],
 ['fear'],
 ['disgust', 'surprise'],
 ['surprise'],
 ['disgust', 'sadness'],
 ['sadness'],
 ['sadness'],
 ['sadness'],
 ['sadness', 'joy'],
 ['sadness', 'joy'],
 ['joy'],
 ['anger'],
 ['fear', 'surprise'],
 ['anger'],
 ['sadness', 'surprise'],
 ['fear', 'surprise'],
 ['fear', 'surprise'],
 ['surprise'],
 ['fear', 'surprise'],
 ['fear'],
 ['surprise'],
 ['surprise'],
 ['fear', 'surprise'],
 ['fear', 'surprise'],
 ['fear', 'surprise'],
 ['

In [None]:
#predictions_l

[]

In [None]:
#len(grounds), len(predictions_l)

(1326, 0)

In [None]:
#import ast

In [None]:
# predictions = []

# for item in predictions_l:
#     if isinstance(item, str):
#         # Convert the string to a list using ast.literal_eval
#         predictions.append(ast.literal_eval(item))
#     else:
#         # If the item is already a list, append as is
#         predictions.append(item)

In [None]:
#len(grounds), len(predictions_l)

(1326, 0)

In [None]:
#from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# def get_mlb(grounds, predictions):
    
#     mlb = MultiLabelBinarizer()
#     grounds_mhot = mlb.fit_transform(grounds)
#     predictions_mhot = mlb.transform(predictions)
    
#     return grounds_mhot, predictions_mhot, mlb.classes_

In [None]:
#grounds_matrix, predictions_matrix, classes = get_mlb(grounds, predictions)

In [None]:
#print(classification_report(grounds_matrix, predictions_matrix, target_names=classes, digits=3))

ValueError: Found input variables with inconsistent numbers of samples: [1326, 0]

In [None]:
classification_file = Path(OUTPUT_DIR) / f"classification_report_{k}.pickle"

with classification_file.open('wb') as fh:
    
    pickle.dump(classification_report(grounds_matrix, predictions_matrix, target_names=classes, output_dict=True), fh)