In [1]:
import os
import re
import json
import torch

import pickle

import numpy as np
import pandas as pd


from tqdm.notebook import tqdm
from pathlib import Path
from sklearn.metrics import classification_report
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
from huggingface_hub import login

In [4]:
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [40]:
tokenizer.pad_token = tokenizer.eos_token

In [29]:
def make_prompt(utterance):

    conversation = [
        {"role": "system", "content": "### Task description: You are an expert sentiment analysis assistant that takes an utterance from a comic book and must classify the utterance into appropriate emotion class(s): anger, surprise, fear, disgust, sadness, joy, neutral. You must absolutely not generate any text or explanation other than the following JSON format: {\"utterance_emotion\": \"<predicted emotion classes for the utterance (str)>}\"\n\n"},
        {"role":"user", "content": f"# Utterance:\n{utterance}\n\n# Result:\n"}
    ]

    return conversation

### Read data files

In [30]:
df = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/zeroshot/datasets/comics_data_processed.csv")
df = df.drop(columns=[df.columns[0], df.columns[1]])

In [31]:
emotion_map = {
    'AN': 'anger',
    'DI': 'disgust',
    'FE': 'fear',
    'SA': 'sadness',
    'SU': 'surprise',
    'JO': 'joy'
}
labels = ["anger", "surprise", "fear", "disgust", "sadness", "joy", "neutral"]

In [32]:
def extract_emotions(row):

    emotion_str = row.emotion

    if emotion_str == 'Neutral':
        return ['neutral']

    emotions = emotion_str.split('-')
    tags = []

    for emotion in emotions:
        abbrev = emotion[:2]  # Get the abbreviation
        value_part = emotion[2:]  # Get the value part
        
        if abbrev in emotion_map and value_part.isdigit():
            value = int(value_part)
            if value > 0:
                tags.append(emotion_map[abbrev].lower())
        else:
            print(f"Warning: Skipping invalid emotion entry: '{emotion}'")
    return tags  

In [33]:
df['emotions_list'] = df.apply(lambda row: extract_emotions(row), axis=1)

In [34]:
texts = df.utterance.tolist()
texts = [make_prompt(text) for text in texts]

In [72]:
inputs = tokenizer.apply_chat_template(
            texts,
            #tools=tools,
            # pad_token = tokenizer.eos_token,
            padding=True,
            truncation=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
)

In [47]:
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [75]:
def batch_tensor(tensor, batch_size):
    return [tensor[i:i+batch_size] for i in range(0, tensor.size(0), batch_size)]

In [76]:
BATCH_SIZE = 128

In [77]:
input_ids_batches = batch_tensor(inputs['input_ids'], BATCH_SIZE)
attention_mask_batches = batch_tensor(inputs['attention_mask'], BATCH_SIZE)

In [78]:
generated_outputs = []

In [80]:
for i, (input_ids_batch, attention_mask_batch) in enumerate(zip(input_ids_batches, attention_mask_batches)):
    
    print(f"Processing batch {i + 1}")
    
    # Move tensors to model device
    inputs = {
        'input_ids': input_ids_batch.to(model.device),
        'attention_mask': attention_mask_batch.to(model.device)
    }
    
    # Generate output using model.generate
    generated = model.generate(**inputs, max_new_tokens=32)
    
    # Store the generated output
    generated_outputs.append(generated)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [95]:
decoded_outputs = []

for batch in generated_outputs:

    for prediction in batch:

        decoded_outputs.append(tokenizer.decode(prediction, skip_special_tokens=True))

In [96]:
len(decoded_outputs)

5282

In [98]:
x = []

for decoded_ouput in decoded_outputs:
    x.append(decoded_ouput.split("Result:\n")[1])

In [101]:
z = []

for y in x:

    match = re.search(r'(\{.*?\})', y)

    if match:

        json_str = match.group(1)  # Extract the JSON object part
        try:
            # Parse the JSON string
            parsed_json = json.loads(json_str)
            
            # Extract the 'utterance_emotion' value
            utterance_emotion = parsed_json.get('utterance_emotion')
            z.append(utterance_emotion)
            #print("Extracted utterance_emotion:", utterance_emotion)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")

In [104]:
preds_l = [[emotion] for emotion in z]

In [109]:
grounds = df.emotions_list.tolist()

In [111]:
all_labels = ["anger", "surprise", "fear", "disgust", "sadness", "joy", "neutral"]

def labels_to_binary_matrix(label_list, all_labels):
    binary_matrix = np.zeros((len(label_list), len(all_labels)))
    for i, labels in enumerate(label_list):
        for label in labels:
            if label in all_labels:
                binary_matrix[i][all_labels.index(label)] = 1
    return binary_matrix

def opposite(component_type):

    if component_type == "anger":
        return "surprise"
    elif component_type == "disgust":
        return "joy"
    elif component_type == "fear":
        return "sadness"
    elif component_type == "sadness":
        return "anger"
    elif component_type == "surprise":
        return "disgust"
    elif component_type == "joy":
        return "fear"
    elif component_type == "Neutral":
        return "sadness"
    

def harmonize_preds(grounds, preds):

    l1, l2 = len(preds), len(grounds)
    if l1 < l2:
        diff = l2 - l1
        preds = preds + [opposite(x) for x in grounds[l1:]]
    else:
        preds = preds[:l2]
        
    return preds 

def post_process_zs(grounds, preds):

    for i,(x,y) in enumerate(zip(grounds, preds)):
        
        if len(x) != len(y):
            
            preds[i] = harmonize_preds(x, y)

    true_matrix = labels_to_binary_matrix(grounds, all_labels)
    predicted_matrix = labels_to_binary_matrix(preds, all_labels)

    return true_matrix, predicted_matrix

In [112]:
grounds_matrix, preds_matrix = post_process_zs(grounds, preds_l)

In [117]:
classification_report(grounds_matrix, preds_matrix, target_names=all_labels, digits=3)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


'              precision    recall  f1-score   support\n\n       anger      0.562     0.524     0.542      1791\n    surprise      0.710     0.200     0.312      1590\n        fear      0.240     0.101     0.142      1373\n     disgust      0.058     0.180     0.088       311\n     sadness      0.382     0.183     0.247      1238\n         joy      0.427     0.228     0.298      1104\n     neutral      0.116     0.761     0.201       343\n\n   micro avg      0.309     0.283     0.295      7750\n   macro avg      0.357     0.311     0.261      7750\nweighted avg      0.447     0.283     0.309      7750\n samples avg      0.309     0.296     0.300      7750\n'

In [115]:
results_file = Path("/Utilisateurs/umushtaq/emotion_analysis_comics/zeroshot/results/zs_Mistral-7B-Instruct-v0.3") / "results.pickle"
results_file.parent.mkdir(parents=True, exist_ok=True)

with results_file.open('wb') as fh:
    results_d = {"ground_truths": grounds,
                 "predictions": preds_l    
        
    }
    pickle.dump(results_d, fh)

In [116]:
classification_file = Path("/Utilisateurs/umushtaq/emotion_analysis_comics/zeroshot/results/zs_Mistral-7B-Instruct-v0.3") / "classification_report.pickle"

with classification_file.open('wb') as fh:
    
    pickle.dump(classification_report(grounds_matrix, preds_matrix, target_names=all_labels, output_dict=True), fh)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
