In [1]:
import json
import pandas as pd

from PIL import Image
from tqdm import tqdm

from unsloth import FastVisionModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 4096

model, tokenizer = FastVisionModel.from_pretrained(

    model_name="unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

==((====))==  Unsloth 2024.12.4: Fast Mllama vision patching. Transformers: 4.47.0.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.003 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### model

In [3]:
# model = FastVisionModel.get_peft_model(
#     model,
#     finetune_vision_layers     = True, # False if not finetuning vision layers
#     finetune_language_layers   = True, # False if not finetuning language layers
#     finetune_attention_modules = True, # False if not finetuning attention layers
#     finetune_mlp_modules       = True, # False if not finetuning MLP layers

#     r = 32,           # The larger, the higher the accuracy, but might overfit
#     lora_alpha = 32,  # Recommended alpha == r at least
#     lora_dropout = 0,
#     bias = "none",
#     random_state = 3407,
#     use_rslora = False,  # We support rank stabilized LoRA
#     loftq_config = None, # And LoftQ
#     # target_modules=[
#     #     "q_proj",
#     #     "k_proj",
#     #     "v_proj",
#     #     "o_proj",
#     #     "gate_proj",
#     #     "up_proj",
#     #     "down_proj",
#     # ],
#     # use_gradient_checkpointing=True,
#     #target_modules = "all-linear", # Optional now! Can specify a list if needed
# )

## data

In [4]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/dataset_files/comics_pg_w_images.csv", index_col=0)

In [608]:
x = df[df.file_name == 'QC copy - 1500 - 04 Nightwing 19 _Nightwing 95_.xlsx']

In [609]:
x.page_nr == 13

0      False
1      False
2      False
3      False
4      False
       ...  
134    False
135    False
136    False
137    False
138    False
Name: page_nr, Length: 139, dtype: bool

## Build vision dataset

In [6]:
def generation_instruction():
    
    instruction = f"""Describe this comics page with focus on the characters' emotional states. Include:
1. The facial expressions, body language, and micro-expressions of each character
2. The emotional atmosphere of the scene (tense, joyful, melancholic, etc.)
3. Any emotional subtext or contrast between characters
4. How the emotional state relates to the narrative context

Incorporate all text elements present in the panel:
- Analyze dialogue and captions to understand character emotions
- Analyze how typography (size, style, coloring of text) emphasizes emotional states
- Include how narrative text provides emotional context
- Analyze how spoken/thought text relate to the visual emotional cues


IMPORTANT: Your complete description MUST fit within a strict 256-token limit. Plan your response to conclude naturally and completely without being cut off abruptly.
"""
    return instruction

In [7]:
def build_image_modality(image_path):
    
    return Image.open(image_path)

In [8]:
def convert_to_conversation_test(row):
  
    image_path = row.image_path
  
    instruction = generation_instruction()
    image = build_image_modality(image_path)
    
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : image} ]
        },
        { "role" : "assistant",
          "content" : ""
        },
    ]
    return { "messages" : conversation }
pass

In [9]:
comics_mm_dataset = [convert_to_conversation_test(row) for _, row in tqdm(df.iterrows())]

874it [00:01, 806.73it/s]


In [14]:
#comics_mm_dataset_t = comics_mm_dataset[0:16]
model = FastVisionModel.for_inference(model)

In [11]:
#BATCH_SIZE = 8
raw_outputs = []

In [12]:
for message in tqdm(comics_mm_dataset[0:8]):
    
    #batch = comics_mm_dataset[i:i + BATCH_SIZE]

    #for message in tqdm(comics_mm_dataset):
        
    #texts = [message['messages'] for message in batch]
    image = message['messages'][0]['content'][1]['image']
    #print(images[0])
    #break
    input_text = tokenizer.apply_chat_template(message['messages'], add_generation_prompt = True)
    #image = message['messages'][0]['content'][1]['image']
    #print(input_text)
    #break
    inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")
    #break

    output = model.generate(**inputs, max_new_tokens=512)[0]
    input_length = inputs.input_ids.shape[1]
    generated_tokens = output[input_length:]
    decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    raw_outputs.append(decoded_output)
    
    # for i in range(len(outputs)):
    #     #input_length = inputs.input_ids.shape[1]
    #     #generated_tokens = outputs[i][input_length:]
    #     decoded_output = tokenizer.decode(outputs[i], skip_special_tokens=True)
    #     raw_outputs.append(decoded_output)
        
    #break
    

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [03:01<00:00, 22.73s/it]


In [13]:
raw_outputs

['The comic page depicts a tense scene with a mix of emotions. The title "HIDDEN BENEATH THE EARTH..." sets the tone, while the characters\' facial expressions and body language convey their emotional states.\n\n*   **Character 1 (Purple Creature):** A large, purple creature with sharp teeth and a menacing expression dominates the top panel. Its speech bubble reads, "THIS VILE THING ATTACKED THE SMALL BEASTS OF MY SHORES... MY MASTER!" The creature\'s body language is aggressive, with its arms raised and teeth bared. The text emphasizes its anger and frustration.\n*   **Character 2 (Green Creature):** A green creature with a similar appearance to the purple one is shown in the second panel, cowering in fear. Its speech bubble says, "I PUNCHED MY BEAUTIFUL MATILDA... AND NOW IT BEGS FOR LIFE." The creature\'s body language is submissive, with its head bowed and hands raised in surrender. The text highlights its regret and remorse.\n*   **Character 3 (Blue Creature):** A blue creature wi

In [None]:
# for i in tqdm(range(0, len(comics_mm_dataset), BATCH_SIZE)):
    
#     batch = comics_mm_dataset[i:i + BATCH_SIZE]

#     #for message in tqdm(comics_mm_dataset):
        
#     texts = [message['messages'] for message in batch]
#     images = [message['messages'][0]['content'][1]['image'] for message in batch]
#     #print(images[0])
#     #break
#     input_text = tokenizer.apply_chat_template(texts, add_generation_prompt = True)
#     #image = message['messages'][0]['content'][1]['image']
#     #print(input_text)
#     #break
#     inputs = tokenizer(
#     images,
#     input_text,
#     #add_special_tokens = False,
#     return_tensors = "pt",
# ).to("cuda")
#     #break

#     outputs = model.generate(**inputs, max_new_tokens=512)
#     raw_outputs.append(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    
#     # for i in range(len(outputs)):
#     #     #input_length = inputs.input_ids.shape[1]
#     #     #generated_tokens = outputs[i][input_length:]
#     #     decoded_output = tokenizer.decode(outputs[i], skip_special_tokens=True)
#     #     raw_outputs.append(decoded_output)
        
#     break
    

  0%|          | 0/110 [01:08<?, ?it/s]


In [58]:
raw_outputs

[['user\n\nDescribe this comics page with focus on the characters\' emotional states. Include:\n1. The facial expressions, body language, and micro-expressions of each character\n2. The emotional atmosphere of the scene (tense, joyful, melancholic, etc.)\n3. Any emotional subtext or contrast between characters\n4. How the emotional state relates to the narrative context\n\nIncorporate all text elements present in the panel:\n- Analyze dialogue and captions to understand character emotions\n- Analyze how typography (size, style, coloring of text) emphasizes emotional states\n- Include how narrative text provides emotional context\n- Analyze how spoken/thought text relate to the visual emotional cues\n\n\nIMPORTANT: Your complete description MUST fit within a strict 256-token limit. Plan your response to conclude naturally and completely without being cut off abruptly.\nassistant\n\nassistant\n\nThe comic page depicts a dramatic scene with a mix of intense and emotional expressions. The 

In [None]:
#import json

#data = ["item1", "item2", "item3"]  # Your list

with open("scene_discriptions_vision.json", "w") as file:
    json.dump(raw_outputs, file, indent=4)  # Save as JSON with indentation

In [None]:
# from torch.utils.data import DataLoader
# from tqdm import tqdm
# from PIL import Image
# import torch

# def collate_fn(batch):
#     input_texts = []
#     images = []
    
#     for message in batch:
#         input_texts.append(tokenizer.apply_chat_template(message['messages'], add_generation_prompt=True))
#         image = message['messages'][0]['content'][1]['image']
#         if isinstance(image, Image.Image):
#             image = torch.tensor(np.array(image)).permute(2, 0, 1)  # Convert PIL image to tensor
#         images.append(image)
    
#     return images, input_texts

# batch_size = 4  # Adjust batch size as needed
# data_loader = DataLoader(comics_mm_dataset_t, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# raw_outputs = []

# for images, input_texts in tqdm(data_loader):
#     inputs = tokenizer(
#         images,
#         input_texts,
#         add_special_tokens=False,
#         return_tensors="pt",
#         padding=True,
#         truncation=True
#     ).to("cuda")
    
#     outputs = model.generate(**inputs, max_new_tokens=512)
    
#     for i in range(len(outputs)):
#         input_length = inputs.input_ids.shape[1]
#         generated_tokens = outputs[i][input_length:]
#         decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
#         raw_outputs.append(decoded_output)


100%|██████████| 4/4 [03:52<00:00, 58.24s/it]


In [None]:
# def process_batch(batch_messages, tokenizer, model, batch_size=8):
#     raw_outputs = []
    
#     # Process data in batches
#     for i in range(0, len(batch_messages), batch_size):
#         current_batch = batch_messages[i:i+batch_size]
#         batch_inputs = []
        
#         # Prepare inputs for each item in the batch
#         for message in current_batch:
#             input_text = tokenizer.apply_chat_template(message['messages'], add_generation_prompt=True)
#             image = message['messages'][0]['content'][1]['image']
            
#             encoded_input = tokenizer(
#                 image,
#                 input_text,
#                 add_special_tokens=False,
#                 return_tensors="pt",
#             )
            
#             batch_inputs.append(encoded_input)
        
#         # Batch processing
#         with torch.no_grad():
#             # Move all inputs to device and track input lengths
#             input_lengths = []
#             batch_input_ids = []
#             batch_attention_mask = []
#             batch_pixel_values = []
#             batch_aspect_ratios = []
            
#             for inp in batch_inputs:
#                 input_lengths.append(inp.input_ids.shape[1])
#                 batch_input_ids.append(inp.input_ids)
#                 batch_attention_mask.append(inp.attention_mask)
#                 batch_pixel_values.append(inp.pixel_values)
#                 batch_aspect_ratios.append(inp.aspect_ratio_ids.to)
            
#             # Pad inputs to the same length
#             max_length = max(input_lengths)
#             padded_input_ids = []
#             padded_attention_mask = []
#             padded_pixel_values = []
#             padded_aspect_ratios = []
            
            
            
#             for i, (ids, mask, pixel, aspect_ratio) in enumerate(zip(batch_input_ids, batch_attention_mask, batch_pixel_values, batch_aspect_ratios)):
#                 padding_length = max_length - ids.shape[1]
#                 padded_ids = torch.cat([ids, torch.zeros((1, padding_length), dtype=torch.long, device=ids.device)], dim=1)
#                 padded_mask = torch.cat([mask, torch.zeros((1, padding_length), dtype=torch.long, device=mask.device)], dim=1)
#                 #padded_pixel = torch.cat([pixel, torch.zeros((1, padding_length), dtype=torch.long, device=pixel.device)], dim=1)
                
#                 padded_input_ids.append(padded_ids)
#                 padded_attention_mask.append(padded_mask)
#                 padded_pixel_values.append(pixel)
#                 padded_aspect_ratios.append(aspect_ratio)
            
#             # Stack tensors into batches
#             batched_input_ids = torch.cat(padded_input_ids, dim=0).to("cuda")
#             batched_attention_mask = torch.cat(padded_attention_mask, dim=0).to("cuda")
#             batched_pixel_values = torch.cat(padded_pixel_values, dim=0).to("cuda")
#             batched_aspect_ratios = torch.cat(padded_aspect_ratios, dim=0).to("cuda")
            
#             # Generate outputs
#             outputs = model.generate(
#                 input_ids=batched_input_ids,
#                 attention_mask=batched_attention_mask,
#                 pixel_values=batched_pixel_values,
#                 aspect_ratio_ids=batched_aspect_ratios,
#                 max_new_tokens=512
#             )
            
#             # Process each output in the batch
#             for j, (output, input_length) in enumerate(zip(outputs, input_lengths)):
#                 generated_tokens = output[input_length:]
#                 decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
#                 raw_outputs.append(decoded_output)
    
#     return raw_outputs

In [None]:
# raw_outputs = []
# batch_size = 8  # Adjust based on your GPU memory

In [None]:
#FastVisionModel.for_inference(model)

In [None]:
# for i in tqdm(range(0, len(comics_mm_dataset), batch_size)):
#     batch_messages = comics_mm_dataset[i:i+batch_size]
#     #print(batch_messages)
#     batch_outputs = process_batch(batch_messages, tokenizer, model, batch_size)
#     raw_outputs.extend(batch_outputs)
#     break

In [None]:
# raw_outputs

In [None]:
# def build_image_modality(image_path):
    
#     return Image.open(image_path)

In [None]:
# def convert_to_conversation(row):
  
#     comics_title = row.comics_title
#     image_path = row.image_path
#     labels = row.emotion_c
    
#     utterances_l = eval(row['utterance'])
#     pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
  
#     instruction = build_generation_instruction(comics_title, pg_utterances)
#     image = build_image_modality(image_path)
    
#     conversation = [
#         { "role": "user",
#           "content" : [
#             {"type" : "text",  "text"  : instruction},
#             {"type" : "image", "image" : image} ]
#         },
#         { "role" : "assistant",
#           "content" : [
#             {"type" : "text",  "text"  : {"emotions": labels}} ]
#         },
#     ]
#     return { "messages" : conversation }
# pass

In [None]:
# # Apply the function to each row and store the results in a list
# comics_mm_dataset_train = [convert_to_conversation(row) for _, row in df_train_f.iterrows()]
# comics_mm_dataset_eval = [convert_to_conversation(row) for _, row in df_eval_f.iterrows()]

In [None]:
#len(comics_mm_dataset_eval)

116

In [None]:
#comics_mm_dataset_eval[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Nightwing #95\n\nUtterances to Classify:\n1. DID YOU HAVE TO ELECTROCU

In [None]:
#FastVisionModel.for_training(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-12): 13 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [None]:
# args = SFTConfig(
    
#         do_train = True,
#         do_eval=True,

#         per_device_train_batch_size = 4,
#         gradient_accumulation_steps = 2,
#         warmup_steps = 5,
#         #max_steps = 30,
#         num_train_epochs = 3, # Set this instead of max_steps for full training runs
#         learning_rate = 2e-4,
#         fp16 = not is_bf16_supported(),
#         bf16 = is_bf16_supported(),
#         logging_steps = 25,
#         eval_steps = 25,
#         eval_strategy = "steps",
#         optim = "adamw_8bit",
#         weight_decay = 0.01,
#         lr_scheduler_type = "linear",
#         seed = 3407,
#         output_dir = "/Utilisateurs/umushtaq/emotion_analysis_comics/outputs_dir_tmp",
#         report_to = "none",     # For Weights and Biases
#         load_best_model_at_end=True,
#         metric_for_best_model="eval_loss",

#         # You MUST put the below items for vision finetuning:
#         remove_unused_columns = False,
#         dataset_text_field = "",
#         dataset_kwargs = {"skip_prepare_dataset": True},
#         dataset_num_proc = 4,
#         max_seq_length = 4096,
#     )

In [None]:
# trainer = SFTTrainer(
#     model = model,
#     tokenizer = tokenizer,
#     data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
#     train_dataset = comics_mm_dataset_train,
#     eval_dataset = comics_mm_dataset_eval,
#     args = args,
# )

In [None]:
# gpu_stats = torch.cuda.get_device_properties(0)
# start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
# print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
# print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 NVL. Max memory = 93.003 GB.
7.818 GB of memory reserved.


In [None]:
#trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 602 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 225
 "-____-"     Number of trainable parameters = 117,964,800
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss,Validation Loss
25,0.6094,0.155453
50,0.1046,0.142057
75,0.095,0.14569
100,0.0772,0.153259
125,0.0771,0.149886
150,0.0666,0.148824
175,0.0572,0.155657
200,0.0517,0.156782
225,0.0517,0.155092


TrainOutput(global_step=225, training_loss=0.13227260377671984, metrics={'train_runtime': 2371.5509, 'train_samples_per_second': 0.762, 'train_steps_per_second': 0.095, 'total_flos': 5.97897370384729e+16, 'train_loss': 0.13227260377671984, 'epoch': 2.966887417218543})

In [None]:
#FastVisionModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MllamaForConditionalGeneration(
      (vision_model): MllamaVisionModel(
        (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
        (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
          (tile_embedding): Embedding(9, 8197120)
        )
        (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
          (embedding): Embedding(9, 5120)
        )
        (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (transformer): MllamaVisionEncoder(
          (layers): ModuleList(
            (0-12): 13 x MllamaVisionEncoderLayer(
              (self_attn): MllamaVisionSdpaAttention(
               

In [None]:
# def convert_to_conversation_test(row):
  
#     comics_title = row.comics_title
#     image_path = row.image_path
#     # labels = row.emotion_c
    
#     utterances_l = eval(row['utterance'])
#     pg_utterances = "\n".join(f"{i+1}. {title}" for i, title in enumerate(utterances_l))
  
#     instruction = build_generation_instruction(comics_title, pg_utterances)
#     image = build_image_modality(image_path)
    
#     conversation = [
#         { "role": "user",
#           "content" : [
#             {"type" : "text",  "text"  : instruction},
#             {"type" : "image", "image" : image} ]
#         },
#         { "role" : "assistant",
#           "content" : ""
#         },
#     ]
#     return { "messages" : conversation }
# pass

In [None]:
#df_test = df[df.split == "TEST"].reset_index(drop=True)

In [None]:
#comics_mm_dataset_test = [convert_to_conversation_test(row) for _, row in df_test.iterrows()]

In [None]:
#len(comics_mm_dataset_test)

156

In [None]:
#comics_mm_dataset_test[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Danger Street #1\n\nUtterances to Classify:\n1. HOW\'S IT GOING?\n2. H

In [None]:
#comics_mm_dataset_test[0]['messages'][0]['content'][1]['image']

In [None]:
# raw_outputs = []

# for message in tqdm(comics_mm_dataset_test):
    
#     input_text = tokenizer.apply_chat_template(message['messages'], add_generation_prompt = True)
#     image = message['messages'][0]['content'][1]['image']
#     #print(input_text)
#     #break
   
#     inputs = tokenizer(
#     image,
#     input_text,
#     add_special_tokens = False,
#     return_tensors = "pt",
# ).to("cuda")
#     #print(inputs['input_ids'])
#     #print(tokenizer.decode(inputs['input_ids'][0]))
#     #break
    
#     #output = model.generate(input_ids=inputs, max_new_tokens=128)[0]
#     output = model.generate(**inputs, max_new_tokens=512)[0]
    
#     #input_length = inputs.shape[1]
#     #generated_tokens = output[input_length:]
    
#     #decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)  
#     decoded_output = tokenizer.decode(output, skip_special_tokens=True)
#     raw_outputs.append(decoded_output)

100%|██████████| 156/156 [12:50<00:00,  4.94s/it]


In [None]:
#len(raw_outputs)

156

In [None]:
#raw_outputs

['user\n\n### Emotion Analysis for Comics\n\nYou are an emotion analysis expert for comic dialogue. Your task is to analyze utterances based on the immediate context.\n\nINPUT:\n- List of utterances from a page in a comic\n- An image of the comics page\n\nOUTPUT:\n- JSON with single key "emotions"\n- Value: array of emotion arrays matching utterance order\n- ONLY use these emotions: "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n- NO OTHER EMOTION LABELS ARE ALLOWED\n\nRULES:\n1. Each utterance must have at least one emotion from the list above\n2. Multiple emotions per utterance are allowed\n3. Keep emotions in arrays even for single emotions\n4. Maintain exact emotion spelling and case\n5. No explanations, only JSON output\n\nExample format:\n{"emotions": [["joy"], ["anger", "fear"], ["neutral"]]}\n\nComic Information:\nTitle: Danger Street #1\n\nUtterances to Classify:\n1. HOW\'S IT GOING?\n2. HEY.\n3. CAN I GET YOU ANYTHING?\n4. JUST A COKE.\n5. OKAY. COMING U

In [None]:
# op = []

# for output in raw_outputs:
#     op.append(output.split("assistant\n\nassistant\n\n")[1])

In [None]:
#op

['{\'emotions\': "[[\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'neutral\'], [\'neutral\']]"}',
 '{\'emotions\': "[[\'neutral\'], [\'neutral\'], [\'anger\', \'fear\'], [\'anger\', \'fear\'], [\'anger\'], [\'anger\', \'fear\'], [\'anger\', \'fear\']]"}',
 '{\'emotions\': "[[\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'joy\'], [\'surprise\', \'joy\'], [\'joy\'], [\'joy\'], [\'joy\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'anger\', \'disgust\'], [\'joy\'], [\'joy\'], [\'anger\',\'surprise\']]"}',
 '{\'emotions\': "[[\'anger\'], [\'anger\'], [\'anger\'], [\'fear\',\'sadness\'], [\'sadness\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'anger\',\'surprise\'], [\'anger\',\'surprise\'], [\'anger\',\'surprise\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'anger\'], [\'anger\', \'joy\'], [\'anger\', \'joy\'], [\'joy\'], [\'anger\',\'surprise

In [None]:
#grounds = df_test.emotion_c.tolist()

In [None]:
#import json_repair


In [None]:
#grounds = [ast.literal_eval(x) for x in grounds]

ValueError: malformed node or string: [['surprise', 'joy'], ['joy'], ['surprise', 'joy'], ['joy'], ['joy'], ['joy'], ['surprise'], ['joy'], ['joy'], ['neutral'], ['neutral']]

In [None]:
# bad_idx = []
# predictions = []

# for i, x in enumerate(op):
#     try:
#         predictions.append(json_repair.loads(x)['emotions'])
#     except:
#         print(i)
#         bad_idx.append(i)

In [None]:
# bad_idx.sort(reverse=True)

# # Remove elements from 'grounds' at the specified indices
# for idx in bad_idx:
    
#     del grounds[idx]
#     #del predictions[idx]

In [None]:
#len(grounds), len(predictions)

(156, 156)

In [None]:
#predictions

["[['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['neutral'], ['neutral']]",
 "[['neutral'], ['neutral'], ['anger', 'fear'], ['anger', 'fear'], ['anger'], ['anger', 'fear'], ['anger', 'fear']]",
 "[['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['joy'], ['surprise', 'joy'], ['joy'], ['joy'], ['joy'], ['anger', 'disgust'], ['anger', 'disgust'], ['anger', 'disgust'], ['joy'], ['joy'], ['anger','surprise']]",
 "[['anger'], ['anger'], ['anger'], ['fear','sadness'], ['sadness'], ['anger'], ['anger'], ['anger'], ['anger','surprise'], ['anger','surprise'], ['anger','surprise'], ['anger'], ['anger'], ['anger'], ['anger'], ['anger', 'joy'], ['anger', 'joy'], ['joy'], ['anger','surprise'], ['anger','surprise']]",
 "[['joy'], ['joy'], ['joy']]",
 "[['joy'], ['anger','surprise'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'], ['joy'

In [None]:
# bad_idx = []
# predictions_l = []

# for i, x in enumerate(predictions):
#     try:
#         predictions_l.append(ast.literal_eval(x))
#     except:
#         print(i)
#         bad_idx.append(i)

7
22


In [None]:
# bad_idx.sort(reverse=True)

# # Remove elements from 'grounds' at the specified indices
# for idx in bad_idx:
    
#     del grounds[idx]
#     #del predictions[idx]

In [None]:
#len(grounds), len(predictions_l)

(154, 154)

In [None]:
# bad_idx = []

# for idx, (i,j) in enumerate(zip(grounds, predictions_l)):
#     if len(i) != len(j):
#         print(idx, len(i), len(j))
#         bad_idx.append(idx)

3 21 20
13 23 22
14 23 22
25 17 16
118 17 16


In [None]:
# bad_idx.sort(reverse=True)

# # Remove elements from 'grounds' at the specified indices
# for idx in bad_idx:
    
#     del grounds[idx]
#     del predictions_l[idx]

In [None]:
# grounds_l = [item for sublist in grounds for item in sublist]
# predictions_l = [item for sublist in predictions_l for item in sublist]

In [None]:
#len(grounds_l), len(predictions_l)

(1198, 1198)

In [None]:
#grounds_l

[['surprise', 'joy'],
 ['joy'],
 ['surprise', 'joy'],
 ['joy'],
 ['joy'],
 ['joy'],
 ['surprise'],
 ['joy'],
 ['joy'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['neutral'],
 ['anger', 'disgust'],
 ['anger', 'disgust'],
 ['neutral'],
 ['sadness'],
 ['sadness'],
 ['anger', 'sadness'],
 ['anger', 'sadness'],
 ['anger', 'sadness'],
 ['fear', 'surprise'],
 ['surprise'],
 ['joy'],
 ['anger', 'surprise'],
 ['joy'],
 ['joy'],
 ['joy'],
 ['anger'],
 ['anger'],
 ['surprise', 'joy'],
 ['fear', 'sadness'],
 ['fear', 'sadness'],
 ['fear', 'surprise'],
 ['neutral'],
 ['joy'],
 ['joy'],
 ['neutral'],
 ['fear'],
 ['neutral'],
 ['anger', 'fear', 'sadness'],
 ['joy'],
 ['sadness'],
 ['fear', 'surprise'],
 ['anger', 'fear', 'sadness'],
 ['joy'],
 ['neutral'],
 ['fear'],
 ['joy'],
 ['neutral'],
 ['joy'],
 ['joy'],
 ['neutral'],
 ['joy'],
 ['anger'],
 ['anger'],
 ['surprise', 'joy'],
 ['anger', 'sadness'],
 ['anger'],
 ['sadness'],
 ['surprise'],
 ['anger'],
 ['neutral'],
 ['joy'],
 ['joy'],
 ['joy'],
 [

In [None]:
#set([type(item) for sublist in grounds_l for item in sublist])

{str}

In [None]:
#mlb = MultiLabelBinarizer()

In [None]:
# y_true_mhot = mlb.fit_transform(grounds_l)
# y_pred_mhot = mlb.transform(predictions_l)

In [None]:
# y_pred_mhot.shape

(1198, 7)

In [None]:
# print(classification_report(y_true_mhot, y_pred_mhot, target_names=mlb.classes_, digits=3))

              precision    recall  f1-score   support

       anger      0.551     0.566     0.558       412
     disgust      0.270     0.256     0.263        39
        fear      0.452     0.465     0.458       271
         joy      0.461     0.649     0.539       265
     neutral      0.429     0.293     0.348        92
     sadness      0.540     0.486     0.512       317
    surprise      0.588     0.485     0.531       324

   micro avg      0.509     0.511     0.510      1720
   macro avg      0.470     0.457     0.459      1720
weighted avg      0.514     0.511     0.508      1720
 samples avg      0.532     0.530     0.507      1720



In [None]:
# used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# used_percentage = round(used_memory         /max_memory*100, 3)
# lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
# #print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
# #print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
# print(f"Peak reserved memory = {used_memory} GB.")
# print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
# print(f"Peak reserved memory % of max memory = {used_percentage} %.")
# print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

Peak reserved memory = 31.213 GB.
Peak reserved memory for training = 0.377 GB.
Peak reserved memory % of max memory = 33.561 %.
Peak reserved memory for training % of max memory = 0.405 %.


In [None]:
# image1 = Image.open("/Utilisateurs/umushtaq/emotion_analysis_comics/comics_dataset_images/001499/images/page00001.jpg")
# #image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
# #display(image1)
# #display(image2)

In [None]:
#instruction = "You are an expert in comics. Explain the emotional content of this comics page."

In [None]:
# messages = [
#     {"role": "user", "content": [
#         {"type": "image"},
#         {"type": "text", "text": instruction}
#     ]}
# ]

In [None]:
#input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)

In [None]:
# inputs = tokenizer(
#     image1,
#     input_text,
#     add_special_tokens = False,
#     return_tensors = "pt",
# ).to("cuda")

In [None]:
# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer, skip_prompt = True)

In [None]:
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
#                    use_cache = True, temperature = 1.5, min_p = 0.1)