In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

In [2]:
# Load the vision-language model from the "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4"
# checkpoint; dtype selection and device placement are both left on "auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4",
    device_map="auto",
    torch_dtype="auto",
)

  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Loading checkpoint shards:   0%|          | 0/11 [00:00<?, ?it/s]

In [3]:
# Processor belonging to the same checkpoint as the model; used below for
# chat templating, batching of text/images and decoding.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4"
)

### Data

In [4]:
import pandas as pd
from pathlib import Path
from PIL import Image

In [5]:
# Load the annotations; the first CSV column is used as the index.
df = pd.read_csv(
    "/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_pg_w_images.csv",
    index_col=0,
)

In [6]:
# Preview the loaded annotation table.
df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c,comics_title,comics_id,image_path
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,"[""THIS VILE THING ATTACKED THE SMALL BEASTS OF...","[['anger'], ['anger'], ['fear'], ['fear'], ['f...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[""NO-- #GKKK\u2026#"", ""#CHOMP!"", ""BY THE SKIN...","[['fear'], ['anger'], ['surprise'], ['anger'],...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[""COME ON, BEAST!"", ""SHOW YOURSELF!"", ""WHY DO ...","[['joy'], ['joy'], ['anger'], ['anger']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,"[""#AARGH! ""]","[['fear', 'surprise']]",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[""I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","[['anger'], ['anger'], ['fear'], ['fear', 'sur...",Jurassic League #4,1499,/Utilisateurs/umushtaq/emotion_analysis_comics...
...,...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","[['anger'], ['anger'], ['anger'], ['anger'], [...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[""SO WHAT ARE WE GOING TO DO?"", ""THE WAY I SEE...","[['sadness', 'surprise'], ['anger'], ['anger']...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","[['anger', 'sadness'], ['anger'], ['anger'], [...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,"[""IT'S BEEN\u2026 PEACEFUL. ASIDE FROM SHIT LI...","[['anger'], ['joy'], ['joy'], ['anger', 'surpr...",Stillwater #13,2200,/Utilisateurs/umushtaq/emotion_analysis_comics...


In [7]:
def build_generation_instruction(comics_title, page_utterances):
    """Assemble the text prompt asking for emotion labels for one comic page.

    Args:
        comics_title: Title of the comic, inserted after ``Title:``.
        page_utterances: Already formatted utterance text, inserted verbatim
            under ``Utterances to Classify:``.

    Returns:
        The complete instruction string. It lists the seven allowed emotion
        labels, the output rules and a JSON example, followed by the title
        and the utterances.
    """
    allowed_labels = ("anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral")
    # Each label is wrapped in double quotes and the labels are comma-separated.
    quoted_labels = ", ".join('"' + label + '"' for label in allowed_labels)

    # The doubled braces in the example line are f-string escapes for literal
    # "{" and "}".
    return f"""### Emotion Analysis for Comics

You are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.

INPUT:
- List of utterances from a page in a comic
- An image of the comics page

OUTPUT:
- JSON with single key "emotions"
- Value: array of emotion arrays matching utterance order
- ONLY use these emotions: {quoted_labels}
- NO OTHER EMOTION LABELS ARE ALLOWED

RULES:
1. Each utterance must have at least one emotion from the list above
2. Multiple emotions per utterance are allowed
3. Keep emotions in arrays even for single emotions
4. Maintain exact emotion spelling and case
5. No explanations, only JSON output

Example format:
{{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}}

Comic Information:
Title: {comics_title}

Utterances to Classify:
{page_utterances}

"""

In [8]:
def build_image_modality(image_path):
    """Open the image stored at ``image_path`` and return it.

    Args:
        image_path: Location of the page image, passed straight to
            ``Image.open``.

    Returns:
        Whatever ``Image.open`` returns for that path.
    """
    page_image = Image.open(image_path)
    return page_image

In [9]:
def convert_to_conversation(row):
    """Turn one dataframe row into a chat-style sample for the processor.

    Args:
        row: A row as yielded by ``df.iterrows()``. It must provide
            ``comics_title``, ``image_path`` and ``utterance``; the latter is
            the string form of a list of utterance strings.

    Returns:
        ``{"messages": [...]}`` holding a user turn (instruction text plus the
        page image) followed by an assistant turn with empty text.

    Raises:
        ValueError, SyntaxError: if ``row['utterance']`` is not a plain
            Python literal.
    """
    # Imported here because the notebook only imports `ast` in a later cell.
    import ast

    # literal_eval only accepts literals, so text coming from the CSV can no
    # longer execute arbitrary code the way eval() would.
    utterances = ast.literal_eval(row['utterance'])
    numbered_utterances = "\n".join(
        f"{position + 1}. {utterance}" for position, utterance in enumerate(utterances)
    )

    instruction = build_generation_instruction(row.comics_title, numbered_utterances)
    image = build_image_modality(row.image_path)

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": image},
        ],
    }
    # Empty assistant turn kept on purpose: it is part of the conversation
    # that gets rendered into the prompt, so removing it would change what
    # the model sees.
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": ""}],
    }
    return {"messages": [user_turn, assistant_turn]}

In [10]:
# Convert every dataframe row into a chat-style sample, keeping row order.
comics_mm_dataset = []
for _, page_row in df.iterrows():
    comics_mm_dataset.append(convert_to_conversation(page_row))

In [11]:
# Number of converted samples (one per dataframe row).
len(comics_mm_dataset)

874

In [12]:
#comics_mm_dataset = comics_mm_dataset[:len(comics_mm_dataset)/2]

In [13]:
# Inspect the first converted sample.
comics_mm_dataset[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Jurassic League #4\n\nUtterances to Classify:\n1. THIS VILE THING ATTA

In [14]:
from PIL import Image

def resize_images(images, max_size=256, keep_aspect_ratio=False):
    """Resize every PIL image in ``images``; other items pass through as-is.

    Args:
        images: Iterable of items. Only instances of ``Image.Image`` are
            resized.
        max_size: Target edge length in pixels.
        keep_aspect_ratio: When False (the default, and the original
            behaviour) each image is forced to ``max_size x max_size``, which
            distorts any page that is not square. When True the longer side
            becomes ``max_size`` and the shorter side is scaled by the same
            factor (never below 1 pixel).

    Returns:
        A new list with the same length and order as ``images``.
    """
    resized_images = []
    for img in images:
        if isinstance(img, Image.Image):  # leave non-PIL items untouched
            if keep_aspect_ratio:
                width, height = img.size
                longest = max(width, height)
                target = (
                    max(1, round(width * max_size / longest)),
                    max(1, round(height * max_size / longest)),
                )
            else:
                target = (max_size, max_size)
            img = img.resize(target)
        resized_images.append(img)
    return resized_images


In [15]:
#comics_mm_dataset[0]['messages'][0]['content'][1]['image']

In [16]:
from tqdm import tqdm

In [17]:
# Number of samples sent through the processor and model.generate per step.
BATCH_SIZE = 32

In [18]:
# Generated sequences, accumulated across all batches in dataset order.
raw_outputs = []

# Walk the dataset in consecutive slices of BATCH_SIZE samples.
for batch_start in tqdm(range(0, len(comics_mm_dataset), BATCH_SIZE)):
    batch = comics_mm_dataset[batch_start:batch_start + BATCH_SIZE]

    # Two parallel lists: one prompt string and one image list per sample.
    batch_texts, batch_images = [], []
    for sample in batch:
        messages = sample['messages']

        # Render the conversation to a prompt string with the generation
        # prompt appended.
        batch_texts.append(
            processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        )

        # Extract the images of the conversation (second return value is
        # unused here) and resize them before they reach the processor.
        page_images, _ = process_vision_info(messages)
        batch_images.append(resize_images(page_images, max_size=256))

    # Encode the whole batch with padding and move the result to the GPU.
    inputs = processor(
        text=batch_texts,
        images=batch_images,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=256)

    # Iterating over the result stores one sequence per sample.
    for sequence in generated_ids:
        raw_outputs.append(sequence)


100%|██████████| 28/28 [1:12:05<00:00, 154.49s/it]


In [19]:
# Sanity check: number of stored output sequences.
len(raw_outputs)

874

In [41]:
# Shape of the first stored output sequence.
raw_outputs[0].shape

torch.Size([721])

In [21]:
# Markers the chat template puts around an assistant turn.
ASSISTANT_HEADER = "<|im_start|>assistant\n"
END_OF_TURN = "<|im_end|>"


def extract_assistant_reply(decoded):
    """Return the text of the last assistant turn in a decoded sequence.

    The reply is whatever follows the final assistant header, cut at the
    first end-of-turn marker. Keying on the last header (rather than on the
    exact doubled header produced by the empty assistant turn in the prompt)
    avoids an IndexError when the prompt layout differs.

    Args:
        decoded: Full decoded sequence, special tokens included.

    Returns:
        The reply text. If no assistant header is present, the text up to the
        first end-of-turn marker of the whole sequence is returned.
    """
    reply_and_rest = decoded.rpartition(ASSISTANT_HEADER)[2]
    return reply_and_rest.split(END_OF_TURN)[0]


# Decode every stored sequence and keep only the generated reply.
op = [extract_assistant_reply(processor.decode(x)) for x in raw_outputs]

In [22]:
# Inspect the extracted reply strings.
op

['```json\n{\n  "emotions": [\n    ["anger", "disgust"],\n    ["anger", "fear"],\n    ["neutral"],\n    ["fear"],\n    ["anger"],\n    ["fear"],\n    ["anger"],\n    ["surprise"],\n    ["surprise"],\n    ["fear"]\n  ]\n}\n```',
 '```json\n{\n  "emotions": [\n    ["neutral"],\n    ["neutral"],\n    ["anger"],\n    ["anger", "disgust"],\n    ["neutral"],\n    ["neutral"],\n    ["neutral"],\n    ["anger"],\n    ["anger"]\n  ]\n}\n```',
 '```json\n{\n  "emotions": [\n    ["anger"],\n    ["anger"],\n    ["anger", "disgust"],\n    ["anger"]\n  ]\n}\n```',
 '```json\n{"emotions": [["anger", "fear"]]}\n```',
 '```json\n{\n  "emotions": [\n    ["neutral"],\n    ["anger", "determination"],\n    ["fear"],\n    ["pain"],\n    ["joy", "excitement"],\n    ["neutral"]\n  ]\n}\n```',
 '```json\n{\n  "emotions": [\n    ["anger"],\n    ["fear"],\n    ["anger", "fear"],\n    [],\n    ["neutral"],\n    ["neutral"],\n    ["neutral"],\n    ["neutral"],\n    ["neutral"],\n    ["neutral"]\n  ]\n}\n```',
 '```

In [23]:
import json_repair

In [24]:
# Parse every reply string with json_repair.
preds = list(map(json_repair.loads, op))

In [25]:
# Keep only the value stored under the 'emotions' key of each parsed reply.
# NOTE: `preds` is rebound here, so this cell must run exactly once after the
# parsing cell above.
preds = list(map(lambda parsed: parsed['emotions'], preds))

In [26]:
# Inspect the predicted emotion lists.
preds

[[['anger', 'disgust'],
  ['anger', 'fear'],
  ['neutral'],
  ['fear'],
  ['anger'],
  ['fear'],
  ['anger'],
  ['surprise'],
  ['surprise'],
  ['fear']],
 [['neutral'],
  ['neutral'],
  ['anger'],
  ['anger', 'disgust'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['anger'],
  ['anger']],
 [['anger'], ['anger'], ['anger', 'disgust'], ['anger']],
 [['anger', 'fear']],
 [['neutral'],
  ['anger', 'determination'],
  ['fear'],
  ['pain'],
  ['joy', 'excitement'],
  ['neutral']],
 [['anger'],
  ['fear'],
  ['anger', 'fear'],
  [],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral']],
 [['surprise'], ['fear'], ['neutral'], ['joy'], ['neutral']],
 [['fear'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['anger', 'determination'],
  ['fear'],
  ['fear', 'hesitation'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral'],
  ['neutral']],
 [['neutral'],
  ['neutral'],
  ['anger', 'disgust'],
  ['fear', 'sadness'],
  ['fear'],
  ['fear'],
  ['deter

In [27]:
# Gold annotations as stored in the dataframe (still strings at this point;
# they are parsed in a later cell).
grounds = df["emotion_c"].tolist()

In [28]:
# Number of gold annotation entries.
len(grounds)

874

In [29]:
# Number of prediction entries; should match the gold count above.
len(preds)

874

In [30]:
import ast

# Parse each gold annotation string into the Python literal it spells out.
# NOTE: `grounds` is rebound here, so this cell must run exactly once.
grounds = list(map(ast.literal_eval, grounds))

In [31]:
# Positions whose prediction does not have the same number of entries as the
# gold annotation. Each one is reported as "index gold_count predicted_count".
bad_idx = []

for idx in range(min(len(grounds), len(preds))):
    n_gold, n_pred = len(grounds[idx]), len(preds[idx])
    if n_gold == n_pred:
        continue
    print(idx, n_gold, n_pred)
    bad_idx.append(idx)

95 8 9
101 16 15
123 13 12
153 13 14
160 13 14
283 25 24
318 13 14
335 1 2
365 11 12
495 8 3
525 9 10
581 11 12
664 13 14
767 13 12
774 8 7
813 5 4
824 8 7
836 13 14


In [32]:
# Remove the mismatched positions from both lists. Indices are visited from
# the highest down so that removing one entry never shifts a position that
# still has to be removed.
# NOTE: the lists are modified in place; running this cell a second time
# would remove further entries.
bad_idx[:] = sorted(bad_idx, reverse=True)

for idx in bad_idx:
    grounds.pop(idx)
    preds.pop(idx)

In [33]:
# Both lists should have the same length after the removal above.
len(grounds), len(preds)

(856, 856)

In [34]:
# Flatten the nested lists by one level so that each element is the label
# list of a single utterance.
flat_grounds, predictions = [], []
for page_gold in grounds:
    flat_grounds.extend(page_gold)
for page_pred in preds:
    predictions.extend(page_pred)
grounds = flat_grounds

In [35]:
# Flattened gold and predicted lists must have the same length.
len(grounds), len(predictions)

(6928, 6928)

In [36]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

In [37]:
# Binarizer used to turn the label lists into multi-hot matrices.
mlb = MultiLabelBinarizer()

In [38]:
# The binarizer is fitted on the gold labels only; the predictions are then
# encoded with that same fitted binarizer.
y_true_mhot = mlb.fit_transform(grounds)
y_pred_mhot = mlb.transform(predictions)



In [39]:
# Per-label and averaged precision / recall / F1 on the multi-hot matrices.
# zero_division=0 scores an undefined metric as 0, which is the value the
# default "warn" setting already reports, but without emitting a warning.
print(
    classification_report(
        y_true_mhot,
        y_pred_mhot,
        target_names=mlb.classes_,
        digits=3,
        zero_division=0,
    )
)

              precision    recall  f1-score   support

       anger      0.747     0.296     0.425      2324
     disgust      0.207     0.241     0.223       323
        fear      0.576     0.294     0.389      1686
         joy      0.775     0.156     0.260      1652
     neutral      0.081     0.886     0.148       414
     sadness      0.726     0.116     0.201      1726
    surprise      0.644     0.121     0.203      1781

   micro avg      0.301     0.233     0.262      9906
   macro avg      0.536     0.302     0.264      9906
weighted avg      0.655     0.233     0.294      9906
 samples avg      0.281     0.247     0.253      9906



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# raw_outputs = []

# for message in tqdm(comics_mm_dataset):
    
#     text = processor.apply_chat_template(
#     message['messages'], tokenize=False, add_generation_prompt=True)

#     image_inputs, _ = process_vision_info(message['messages'])
#     image_inputs = resize_images(image_inputs, max_size=1024)
    
#     inputs = processor(
#     text=text,
#     images=image_inputs,
#     padding=True,
#     return_tensors="pt",)

#     inputs = inputs.to("cuda")

#     generated_ids = model.generate(**inputs, max_new_tokens=256)
#     raw_outputs.append(generated_ids)

In [42]:
# raw_outputs = []

# for message in tqdm(comics_mm_dataset):
    
#     #print(len(message))
#     #print(message['messages'])
    
#     text = processor.apply_chat_template(
#     message['messages'], tokenize=False, add_generation_prompt=True
# )
#     #print(text)
#     #break
#     #image = comics_mm_dataset[0]['messages'][0]['content'][1]['image']
#     image_inputs, _ = process_vision_info(message['messages'])
#     image_inputs = resize_images(image_inputs, max_size=1024)
#     #image_inputs, _ = process_vision_info(message['messages'])
#     #print(image_inputs)
    
#     inputs = processor(
#     #text=[text],
#     text=text,
#     images=image_inputs,
#     #videos=video_inputs,
#     padding=True,
#     return_tensors="pt",
# )
#     #print(inputs)
#     #break
#     inputs = inputs.to("cuda")
#     #print(inputs)
#     num_elements = inputs.pixel_values.numel()

#     # Get the size of each element in bytes
#     bytes_per_element = inputs.pixel_values.element_size()

#     # Calculate total memory in bytes
#     memory_size = num_elements * bytes_per_element
#     print(f"Total memory size: {memory_size / (1024 ** 2):.4f} MB")

#     #break
#     generated_ids = model.generate(**inputs, max_new_tokens=256)
#     raw_outputs.append(generated_ids)
    
    
    
# #     input_text = tokenizer.apply_chat_template(message['messages'], add_generation_prompt = True)
# #     image = message['messages'][0]['content'][1]['image']
# #     #print(input_text)
# #     #break
   
# #     inputs = tokenizer(
# #     image,
# #     input_text,
# #     add_special_tokens = False,
# #     return_tensors = "pt",
# # ).to("cuda")
# #     #print(inputs['input_ids'])
# #     #print(tokenizer.decode(inputs['input_ids'][0]))
# #     #break
    
# #     #output = model.generate(input_ids=inputs, max_new_tokens=128)[0]
# #     output = model.generate(**inputs, max_new_tokens=512)[0]
    
# #     #input_length = inputs.shape[1]
# #     #generated_tokens = output[input_length:]
    
# #     #decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)  
# #     decoded_output = tokenizer.decode(output, skip_special_tokens=True)
# #     raw_outputs.append(decoded_output)

  0%|          | 0/874 [00:00<?, ?it/s]

Total memory size: 24.5658 MB


  0%|          | 1/874 [00:09<2:18:44,  9.54s/it]

Total memory size: 24.5658 MB


  0%|          | 2/874 [00:15<1:49:25,  7.53s/it]

Total memory size: 24.5658 MB


  0%|          | 3/874 [00:19<1:27:50,  6.05s/it]

Total memory size: 24.5658 MB


  0%|          | 4/874 [00:23<1:12:48,  5.02s/it]

Total memory size: 24.5658 MB


  1%|          | 5/874 [00:28<1:15:03,  5.18s/it]

Total memory size: 24.5658 MB


  1%|          | 6/874 [00:33<1:12:32,  5.01s/it]

Total memory size: 24.5658 MB


  1%|          | 7/874 [00:37<1:06:06,  4.57s/it]

Total memory size: 24.5658 MB


  1%|          | 8/874 [00:46<1:29:44,  6.22s/it]

Total memory size: 24.5658 MB


  1%|          | 9/874 [00:59<1:58:52,  8.25s/it]

Total memory size: 24.5658 MB


  1%|          | 10/874 [01:09<2:04:58,  8.68s/it]

Total memory size: 24.5658 MB


  1%|▏         | 11/874 [01:12<1:40:54,  7.02s/it]

Total memory size: 24.5658 MB


  1%|▏         | 12/874 [01:22<1:53:07,  7.87s/it]

Total memory size: 24.5658 MB


  1%|▏         | 13/874 [01:31<1:59:39,  8.34s/it]

Total memory size: 24.5658 MB


  1%|▏         | 13/874 [01:34<1:44:37,  7.29s/it]


KeyboardInterrupt: 

In [25]:
#inputs.pixel_values.shape

In [26]:
#!nvidia-smi