In [163]:
import json
import torch
import pandas as pd

from PIL import Image
from tqdm import tqdm
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoProcessor, LlavaForConditionalGeneration

In [2]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

In [3]:
model_id = "llava-hf/llava-1.5-7b-hf"

In [4]:
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Data

In [5]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_pg_w_images.csv", index_col=0)

In [28]:
df.shape

(874, 8)

### Prompts

In [95]:
images = [Image.open(image) for image in df.image_path]

In [97]:
comic_titles = df.comics_title.tolist()

'/Utilisateurs/umushtaq/emotion_analysis_comics/comics_dataset_images/001499/images/page00001.jpg'

In [99]:
# conversation = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "text",
#                 "text": """Describe this comic book page with attention to:
# - The visual composition (page layout, perspective, framing)
# - Character poses, expressions, and positioning
# - Environment and setting details
# - Action and movement
# - Visual effects and artistic style
# - Text elements (speech bubbles, captions, sound effects)
# - Color palette and shading
# - Mood and atmosphere conveyed

# Please structure your description to flow naturally from the key focal point to supporting details, as if explaining the page to someone who can't see it."""
#             },
#             {"type": "image"},
#         ],
#     },
# ]

In [129]:
# conversation = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "text",
#                 "text": """Describe this comic book page taking into account:
# - The visual composition (page layout, perspective, framing)
# - Character poses, expressions, and positioning
# - Environment and setting details
# - Action and movement
# - Visual effects and artistic style
# - Text elements (speech bubbles, captions, sound effects)
# - Color palette and shading
# - Mood and atmosphere conveyed

# Please structure your description in form of a narrative, as if explaining the page to someone who can't see it."""
#             },
#             {"type": "image"},
#         ],
#     },
# ]

In [153]:
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": """Tell me what happens in this comic book page as a flowing narrative, weaving in these visual elements naturally:

- How the story unfolds across the page
- What the characters are doing, saying, and expressing
- How the environment and setting frame the action
- The way movement and action flow from panel to panel
- How text and dialogue integrate with the visuals

Describe it as if you're telling an engaging story to someone who can't see the page"""
            },
            {"type": "image"},
        ],
    },
]

In [154]:
#conversation[0]['content'][0]['text'] = conversation[0]['content'][0]['text'].replace("<title>", "Jurrasic League")

In [155]:
#prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

In [156]:
#inputs = processor(images=images[0], text=prompt, return_tensors='pt').to(0, torch.float16)
#output = model.generate(**inputs, max_new_tokens=256, do_sample=True)

In [157]:
#print(processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))

## Generate scene descriptions

In [158]:
scene_descriptions = {}

for image in tqdm(images, desc="Generating scene descriptions ..."):
    
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    
    inputs = processor(images=image, text=prompt, return_tensors='pt').to(0, torch.float16)
    output = model.generate(**inputs, max_new_tokens=256, do_sample=False)
    scene_descriptions[image.filename] = processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

Generating scene descriptions ...: 100%|██████████| 874/874 [1:34:57<00:00,  6.52s/it]  


In [159]:
with open("/Utilisateurs/umushtaq/emotion_analysis_comics/data/scene_descriptions.json", "w") as file:
    json.dump(scene_descriptions, file)

In [166]:
df["scene_description"] = df["image_path"].map(scene_descriptions)

In [167]:
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title,comics_id,image_path,scene_description
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...,The comic book page is divided into three dist...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...,The comic book page features a dynamic scene w...
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...,"In this comic book page, the story unfolds wit..."
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...,"In this comic book page, the story unfolds acr..."
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...,The comic book page features a dynamic scene w...
...,...,...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...,"In this comic book page, there is a series of ..."
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...,The comic book page features a group of young ...
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...,The comic book page features a group of people...
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...,"In this comic book page, the story unfolds acr..."


## Load generation models

In [161]:
model_id = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

In [164]:
inference_tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')

In [165]:
generation_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


### build messages for inference

In [172]:
# def build_instruction():
#     emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
#     formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
#     instruction = f"""### Emotion Analysis Expert Role

# You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

# INPUT:
# - You will receive a numbered list of utterances from a page in a comic book
# - You will receive a narrative description of the scene in the comic book page

# TASK:
# 1. Carefully analyze the emotional context and tone of each utterance in the page
# 2. Identify applicable emotions from the following classes:
#    {formatted_classes}
# 3. For each utterance in a comic page, identify all emotions present and return an array of emotion arrays in order.

# RULES:
# 1. Use ONLY the labels listed above
# 2. Output MUST BE a one-line compact JSON with single key "page_utterance_emotions"
# 3. Respond ONLY with the JSON object. No additional text before or after.
# 4. Value must be an array where:
#    - Each element is an array of emotions for one utterance
#    - Order matches the input utterances order
#    - Multiple emotions are allowed per utterance
# 5. No explanations, only JSON output

# IMPORTANT:
# - Respond with a ONE-LINE JSON
# - Each array element corresponds to one utterance
# - One utterance can have multiple emotions
# - Maintain exact spelling and case of emotion labels
# - Keep emotions in arrays even for single emotions

# """
#     return instruction

In [299]:
def build_instruction():
    emotion_classes = ["anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])
    
    instruction = f"""### Emotion Analysis for Comic Dialogue

You are an emotion analysis expert for comic book dialogue. Analyze utterances to identify their emotional content.

INPUT:
- Numbered list of comic book utterances
- Scene description

TASK:
Analyze and classify each utterance's emotional tone using these emotions only: {formatted_classes}

OUTPUT RULES:
1. Single-line JSON with key "emotions"
2. Value must be array of emotion arrays, one per utterance
3. Output array length MUST EXACTLY match number of input utterances
4. Multiple emotions allowed per utterance
5. Return ONLY the JSON object, no other text

Example format:
{{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}}
"""
    return instruction

In [300]:
instruction = build_instruction()

In [301]:
# def build_tagged_text(utterances):

#     result = '\n'.join(f'{i+1}: {line}' for i, line in enumerate(utterances))
    
#     question = f"""Now analyze these utterances in a page:\n{result}"""

#     return question

In [302]:
sys_msg_l = []
user_msg_l = []
assistant_msg_l = []

for _, row in df.iterrows():
        
        sys_msg = {'role': 'system', 'content': instruction}
        
        comics_title = row['comics_title']
        page_description = row['scene_description']
        
        utterances_l = eval(row['utterance'])
        pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
        
        usr_prompt = f"Comics title: {comics_title}\n\n" + f"Page scene description: {page_description}\n\n" + "Now classify this list of utterances in the page: \n" + pg_utterances
        
        user_msg = {'role': 'user', 'content': usr_prompt}        
        assistant_msg = {'role': 'assistant', 'content': ""}


        sys_msg_l.append(sys_msg)
        user_msg_l.append(user_msg)
        assistant_msg_l.append(assistant_msg)
        

In [303]:
comics_dataset = []

for i in range(len(sys_msg_l)):

    comics_dataset.append([sys_msg_l[i], user_msg_l[i], assistant_msg_l[i]])

In [304]:
comics_dataset[0]

[{'role': 'system',
  'content': '### Emotion Analysis for Comic Dialogue\n\nYou are an emotion analysis expert for comic book dialogue. Analyze utterances to identify their emotional content.\n\nINPUT:\n- Numbered list of comic book utterances\n- Scene description\n\nTASK:\nAnalyze and classify each utterance\'s emotional tone using these emotions only: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n\nOUTPUT RULES:\n1. Single-line JSON with key "emotions"\n2. Value must be array of emotion arrays, one per utterance\n3. Output array length MUST EXACTLY match number of input utterances\n4. Multiple emotions allowed per utterance\n5. Return ONLY the JSON object, no other text\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n'},
 {'role': 'user',
  'content': 'Comics title: Jurassic League #4\n\nPage scene description: The comic book page is divided into three distinct sections, each with its own visual elements and narrative. The first se

In [305]:
raw_outputs = []

for message in tqdm(comics_dataset):
    
    inputs = inference_tokenizer.apply_chat_template(
    message,
    tokenize=True,
    add_generation_prompt=True,
    #return_dict=True,
    return_tensors="pt",
).to("cuda") # type: ignore
    #print(inputs)
    #break
    
    output = generation_model.generate(input_ids=inputs, max_new_tokens=128)[0]
    #output = model.generate(**inputs, max_new_tokens=128)[0]
    
    input_length = inputs.shape[1]
    generated_tokens = output[input_length:]
    
    decoded_output = inference_tokenizer.decode(generated_tokens, skip_special_tokens=True)  
    #decoded_output = tokenizer.decode(output, skip_special_tokens=True)
    raw_outputs.append(decoded_output)
    #break

100%|██████████| 874/874 [29:28<00:00,  2.02s/it]


In [306]:
## these are results with old prompt. and with scene description.

In [307]:
len(raw_outputs)

874

In [308]:
raw_outputs

['{"emotions": [["anger"], ["anger", "fear"], ["fear"], ["anger"], ["fear", "joy"], ["anger", "disgust"], ["anger", "fear"], ["fear", "surprise"], ["fear", "surprise"], ["fear", "sadness"]]}',
 '{"emotions": [["anger", "disgust"], ["surprise"], ["anger"], ["anger", "fear"], ["surprise"], ["disgust"], ["neutral"], ["anger"], ["anger", "surprise"]]}',
 '{"emotions": [["anger", "surprise"], ["anger"], ["anger", "fear", "disgust"], ["anger", "joy"]]}',
 '{"emotions": [["anger"]]}',
 '{"emotions": [["neutral"], ["joy", "anger"], ["anger", "fear"], ["anger"], ["surprise"], ["joy", "anger"]]}',
 '{"emotions": [["anger"], ["surprise", "fear"], ["anger"], ["neutral"], ["surprise", "joy"], ["anger"], ["anger", "fear"], ["anger"], ["neutral", "joy"], ["joy"]]}',
 '{"emotions": [["joy"], ["anger"], ["surprise"], ["disgust", "fear"], ["joy"]]}',
 '{"emotions": [["anger"], ["fear", "surprise"], ["surprise"], ["joy"], ["anger", "fear"], ["fear", "surprise"], ["fear", "anger"], ["anger", "disgust"], [

In [309]:
import json_repair

predictions = [json_repair.loads(e) for e in raw_outputs]

In [287]:
def process_list(mixed_list):
    result = []
    for element in mixed_list:
        if isinstance(element, list) and element:  # Check if it's a non-empty list
            result.append(element[0])
        else:
            result.append(element)
    return result

In [288]:
predictions = process_list(predictions)

In [310]:
preds_l = []
bad_idx = []

for i, pred in enumerate(predictions):
    try:        
        preds_l.append(pred['emotions'])
    except:
        print(i)
        bad_idx.append(i)

In [311]:
len(bad_idx)

0

In [312]:
grounds = df.emotion_c.tolist()

In [313]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    #del preds_l[idx]

In [314]:
len(grounds), len(preds_l)

(874, 874)

In [315]:
import ast

grounds = [ast.literal_eval(x) for x in grounds]

In [316]:
bad_idx = []

for idx, (i,j) in enumerate(zip(grounds, preds_l)):
    if len(i) != len(j):
        print(idx, len(i), len(j))
        bad_idx.append(idx)

8 14 16
9 10 11
14 11 10
26 14 15
37 8 7
44 8 7
46 11 10
47 11 12
48 10 9
60 8 7
66 8 7
75 18 17
76 16 19
82 13 14
90 10 9
99 11 12
101 16 17
102 21 16
104 20 17
106 16 18
107 12 10
108 11 12
113 23 20
114 23 25
117 13 16
119 10 9
123 13 12
126 17 16
130 9 10
131 12 13
135 7 6
137 12 11
161 18 19
173 14 17
183 17 19
193 11 12
198 18 22
200 18 17
206 11 10
213 9 8
214 11 12
216 15 14
220 6 7
229 12 11
234 12 11
240 10 9
241 11 10
242 12 11
247 16 15
249 17 20
252 9 10
253 16 15
254 17 12
255 14 13
263 5 6
272 19 20
273 19 18
274 23 22
277 10 9
278 15 14
282 11 10
283 25 23
284 9 10
288 15 17
293 11 10
297 10 9
301 31 18
302 11 10
315 6 7
317 15 16
318 13 12
319 12 13
320 19 18
321 17 15
322 11 12
323 14 13
325 14 13
326 12 14
329 11 14
331 2 5
334 5 10
335 1 2
336 2 7
339 4 5
341 4 7
347 11 12
356 9 16
359 11 12
360 16 17
363 12 13
371 7 6
372 10 12
394 8 18
406 13 14
411 15 16
412 10 9
414 15 17
421 10 9
425 9 10
435 14 15
451 10 9
462 10 12
465 14 15
466 15 16
469 8 7
470 9 10
471 13 

In [317]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del preds_l[idx]

In [318]:
len(grounds), len(preds_l)

(691, 691)

In [319]:
grounds = [item for sublist in grounds for item in sublist]
predictions_l = [item for sublist in preds_l for item in sublist]

In [320]:
len(grounds), len(predictions_l)

(4912, 4912)

In [321]:
predictions_l

[['anger'],
 ['anger', 'fear'],
 ['fear'],
 ['anger'],
 ['fear', 'joy'],
 ['anger', 'disgust'],
 ['anger', 'fear'],
 ['fear', 'surprise'],
 ['fear', 'surprise'],
 ['fear', 'sadness'],
 ['anger', 'disgust'],
 ['surprise'],
 ['anger'],
 ['anger', 'fear'],
 ['surprise'],
 ['disgust'],
 ['neutral'],
 ['anger'],
 ['anger', 'surprise'],
 ['anger', 'surprise'],
 ['anger'],
 ['anger', 'fear', 'disgust'],
 ['anger', 'joy'],
 ['anger'],
 ['neutral'],
 ['joy', 'anger'],
 ['anger', 'fear'],
 ['anger'],
 ['surprise'],
 ['joy', 'anger'],
 ['anger'],
 ['surprise', 'fear'],
 ['anger'],
 ['neutral'],
 ['surprise', 'joy'],
 ['anger'],
 ['anger', 'fear'],
 ['anger'],
 ['neutral', 'joy'],
 ['joy'],
 ['joy'],
 ['anger'],
 ['surprise'],
 ['disgust', 'fear'],
 ['joy'],
 ['anger'],
 ['fear', 'surprise'],
 ['surprise'],
 ['joy'],
 ['anger', 'fear'],
 ['fear', 'surprise'],
 ['fear', 'anger'],
 ['anger', 'disgust'],
 ['anger', 'fear'],
 ['surprise', 'anger'],
 ['anger', 'fear'],
 ['anger', 'fear'],
 ['neutral'],

In [277]:
bad_idx = []

def process_list(data):
    for index, element in enumerate(data):
        if isinstance(element, list):
            if not element:
                bad_idx.append(index)
                print(f"Empty list at index: {index}")
            else:
                data[index] = element[0]  # Replace with zeroth element
    return data

In [278]:
processed_list = process_list(predictions_l)

Empty list at index: 385
Empty list at index: 1733
Empty list at index: 1904
Empty list at index: 1991
Empty list at index: 2088
Empty list at index: 3912


In [281]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del processed_list[idx]

In [282]:
len(grounds), len(processed_list)

(4766, 4766)

In [283]:
new_preds = process_list(processed_list)

In [284]:
new_preds

['anger',
 'anger',
 'fear',
 'fear',
 'anger',
 'fear',
 'anger',
 'surprise',
 'surprise',
 'fear',
 'anger',
 'anger',
 'anger',
 'fear',
 'anger',
 'anger',
 'anger',
 'anger',
 'anger',
 'anger',
 'anger',
 'anger',
 'anger',
 'anger',
 'surprise',
 'fear',
 'anger',
 'neutral',
 'joy',
 'fear',
 'fear',
 'anger',
 'anger',
 'joy',
 'surprise',
 'anger',
 'surprise',
 'surprise',
 'neutral',
 'anger',
 'neutral',
 'fear',
 'neutral',
 'anger',
 'fear',
 'fear',
 'fear',
 'anger',
 'neutral',
 'anger',
 'anger',
 'neutral',
 'neutral',
 'anger',
 'sadness',
 'joy',
 'anger',
 'anger',
 'sadness',
 'sadness',
 'fear',
 'neutral',
 'neutral',
 'joy',
 'joy',
 'anger',
 'fear',
 'disgust',
 'sadness',
 'fear',
 'joy',
 'fear',
 'fear',
 'anger',
 'surprise',
 'anger',
 'anger',
 'surprise',
 'joy',
 'joy',
 'fear',
 'neutral',
 'surprise',
 'fear',
 'disgust',
 'disgust',
 'joy',
 'joy',
 'surprise',
 'anger',
 'joy',
 'anger',
 'anger',
 'anger',
 'anger',
 'joy',
 'anger',
 'anger',

In [None]:
bad_idx.sort(reverse=True)

# Remove elements from 'grounds' at the specified indices
for idx in bad_idx:
    
    del grounds[idx]
    del predictions_l[idx]

In [322]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

In [323]:
mlb = MultiLabelBinarizer()

In [324]:
y_true_mhot = mlb.fit_transform(grounds)
y_pred_mhot = mlb.transform(predictions_l)



In [325]:
print(classification_report(y_true_mhot, y_pred_mhot, target_names=mlb.classes_, digits=3))

              precision    recall  f1-score   support

       anger      0.479     0.600     0.532      1671
     disgust      0.102     0.318     0.155       214
        fear      0.369     0.537     0.437      1274
         joy      0.475     0.405     0.438      1088
     neutral      0.135     0.323     0.191       300
     sadness      0.503     0.237     0.322      1212
    surprise      0.399     0.313     0.351      1343

   micro avg      0.381     0.422     0.400      7102
   macro avg      0.352     0.390     0.346      7102
weighted avg      0.422     0.422     0.405      7102
 samples avg      0.384     0.426     0.383      7102



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
