# Data Pre-Processing

In [1]:
import json
import pandas as pd

from pathlib import Path

In [6]:
# All project paths are resolved relative to the kernel's working directory.
CURRENT_DIR = Path.cwd()
# Disabled alternative location of the base directory, kept for reference:
#EAC_DIR = CURRENT_DIR.parent.parent
# Base directory: the "emotion_analysis_comics" folder under the working directory.
EAC_DIR = CURRENT_DIR.joinpath("emotion_analysis_comics")

In [7]:
# CSV that is loaded into the utterance-level DataFrame.
DATA_FILE = Path(EAC_DIR).joinpath("dataset_files", "comics_dataset.csv")
# Directory that receives the generated JSON dataset files.
DATASET_DIR = Path(EAC_DIR).joinpath("finetuning", "datasets")

In [8]:
# Load the utterance-level data; index_col=False tells pandas not to use the
# first CSV column as the row index.
df = pd.read_csv(DATA_FILE, index_col=False)

In [9]:
def get_unique_emotion(row):
    """Convert a row's raw emotion annotation into a list of emotion labels.

    Args:
        row: Object exposing an ``emotion`` attribute (e.g. a DataFrame row).
            The value is either the literal string ``'Neutral'`` or a
            ``-``-separated string with one field per emotion class.
            NOTE(review): the fields are mapped by position onto
            anger, disgust, fear, sadness, surprise, joy -- confirm this
            order against the annotation scheme.

    Returns:
        ``['neutral']`` for a ``'Neutral'`` annotation; otherwise the labels
        whose positional field contains no ``'0'`` character, in class order
        (possibly an empty list).
    """
    class_labels = ("anger", "disgust", "fear", "sadness", "surprise", "joy")
    annotation = row.emotion

    # Neutral is stored as a plain keyword rather than as per-class fields.
    if annotation == 'Neutral':
        return ['neutral']

    # A field containing the character '0' is treated as "emotion absent".
    return [
        class_labels[position]
        for position, field in enumerate(annotation.split("-"))
        if '0' not in field
    ]

In [10]:
# Row-wise conversion of the raw annotation into a list of emotion labels.
df['emotion_c'] = df.apply(get_unique_emotion, axis=1)

In [11]:
# Page level: group the utterances and their emotion labels by comic page

In [12]:
# One output row per (file, page, split) group; the utterances and their
# emotion labels are collected into two parallel lists.
page_keys = ['file_name', 'page_nr', 'split']
page_aggregations = {'utterance': list, 'emotion_c': list}
grouped_df = df.groupby(page_keys).agg(page_aggregations).reset_index()

In [13]:
# Number of page-level rows in each dataset split.
grouped_df['split'].value_counts()

split
TRAIN    718
TEST     156
Name: count, dtype: int64

In [14]:
# Display the page-level table (one row per file/page/split group).
grouped_df

Unnamed: 0,file_name,page_nr,split,utterance,emotion_c
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,TRAIN,[THIS VILE THING ATTACKED THE SMALL BEASTS OF ...,"[[anger], [anger], [fear], [fear], [fear, sadn..."
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,TRAIN,"[NO-- #GKKK…#, #CHOMP!, BY THE SKIN OF MATILD...","[[fear], [anger], [surprise], [anger], [joy], ..."
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,TRAIN,"[COME ON, BEAST!, SHOW YOURSELF!, WHY DO YOU H...","[[joy], [joy], [anger], [anger]]"
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,TRAIN,[#AARGH! ],"[[fear, surprise]]"
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,TRAIN,"[I, THE GREEN TORCH, HAVE BEEN TASKED WITH PRO...","[[anger], [anger], [fear], [fear, surprise], [..."
...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,TEST,[WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO ...,"[[anger], [anger], [anger], [anger], [anger, s..."
870,QC copy - 2200 - Stillwater 13.xlsx,17,TEST,"[SO WHAT ARE WE GOING TO DO?, THE WAY I SEE IT...","[[sadness, surprise], [anger], [anger], [anger..."
871,QC copy - 2200 - Stillwater 13.xlsx,18,TEST,"[KIDDIE COUNCIL'S BEEN GOING A LONG TIME... , ...","[[anger, sadness], [anger], [anger], [anger], ..."
872,QC copy - 2200 - Stillwater 13.xlsx,19,TEST,[IT'S BEEN… PEACEFUL. ASIDE FROM SHIT LIKE THI...,"[[anger], [joy], [joy], [anger, surprise], [an..."


In [15]:
# Distinct source file names present in the page-level table.
grouped_df.file_name.unique()

array(['QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx',
       'QC copy - 1500 - 04 Nightwing 19 _Nightwing 95_.xlsx',
       'QC copy - 1501 - 09 Mundos sin Liga de la Justicia Green Lantern 1 - DCWWJL GL 1.xlsx',
       'QC copy - 1502 - 09 Mundos sin Liga de la Justicia Green Lantern 1 - DCVMM Lobo.xlsx',
       'QC copy - 1503 - 10 Crisis Oscura Flash - FLS 783.xlsx',
       'QC copy - 1507 - 22 Calle Peligro 1.xlsx',
       'QC copy - 1508 - 48 Pequen_os Titanes 25.xlsx',
       'QC copy - 1513 - 21 Blanco Humano 9.xlsx',
       'QC copy - 1514 - 15 DC contra Vampiros 11.xlsx',
       'QC copy - 1517 - 37 John Carpenter Historias para una noche de Halloween 3.xlsx',
       'QC copy - 1518 - 50 Las asombrosas aventuras de las Tortugas Ninja 5.xlsx',
       'QC copy - 1520 - 51 Sonic The Hedgehog 44.xlsx',
       'QC copy - 1521 - 40 Amor eterno 2.xlsx',
       'QC copy - 1559 - 36 Fantasmas vol. 1 - Ghosted 2.xlsx',
       'QC copy - 1560 - 36 Fantasmas vol. 1 - G

### Prepare prompts

In [14]:
# Formatting function
# Build question
# Build answer

In [16]:
def formatting_fct(instruction="", input="", output=""):
    """Bundle the three prompt parts into a single record dictionary.

    Each argument is rendered through an f-string, so non-string values are
    stored as their formatted text.

    Args:
        instruction: Text stored under the "instruction" key.
        input: Text stored under the "input" key.
        output: Text stored under the "output" key.

    Returns:
        dict with the keys "instruction", "input" and "output", in that order.
    """
    return dict(
        instruction=f"{instruction}",
        input=f"{input}",
        output=f"{output}",
    )

In [17]:
def build_instruction():
    """Return the fixed instruction prompt for page-level emotion labelling.

    The prompt lists the seven allowed labels (each wrapped in double quotes,
    comma-separated) and spells out the required JSON output format. The
    function takes no arguments and produces the same text on every call.

    Returns:
        The prompt as a single string ending with a blank line.
    """
    labels = ("anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral")
    quoted_labels = ", ".join('"' + label + '"' for label in labels)

    prompt_lines = (
        "### Emotion Analysis Expert Role",
        "",
        "You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.",
        "",
        "INPUT:",
        "- You will receive a list of utterances from a page in a comic book",
        "- The utterance may express one or multiple emotions",
        "",
        "TASK:",
        "1. Carefully analyze the emotional context and tone of each utterance in the page",
        "2. Identify applicable emotions from the following classes:",
        "   " + quoted_labels,
        "3. For each utterance in a comic page, identify all emotions present and return an array of emotion arrays in order.",
        "",
        "RULES:",
        "1. Use ONLY the labels listed above",
        '2. Output must be a JSON with single key "page_utterance_emotions"',
        "3. Value must be an array where:",
        "   - Each element is an array of emotions for one utterance",
        "   - Order matches the input utterances order",
        "   - Multiple emotions are allowed per utterance",
        "4. No explanations, only JSON output",
        "",
        "IMPORTANT:",
        "- Each array element corresponds to one utterance",
        "- One utterance can have multiple emotions",
        "- Maintain exact spelling and case of emotion labels",
        "- Keep emotions in arrays even for single emotions",
        # The two empty entries below make the prompt end with "\n\n".
        "",
        "",
    )
    return "\n".join(prompt_lines)

In [18]:
def build_tagged_text(utterances):
    """Render a page's utterances as a numbered list behind a fixed lead-in line.

    Args:
        utterances: Iterable of utterance texts, in the order they should be
            numbered.

    Returns:
        The lead-in sentence, a newline, then one "<n>. <utterance>" line per
        utterance, numbered from 1 and separated by newlines.
    """
    numbered_lines = [
        f"{number}. {utterance}"
        for number, utterance in enumerate(utterances, start=1)
    ]
    return "Now analyze these utterances in a page:\n" + "\n".join(numbered_lines)

In [19]:
def build_answer(utterance_emotions):
    """Serialise the per-utterance emotion lists as the expected JSON answer.

    Args:
        utterance_emotions: JSON-serialisable value (here a list of label
            lists, one per utterance).

    Returns:
        A JSON string with the single key "page_utterance_emotions".
    """
    payload = {"page_utterance_emotions": utterance_emotions}
    return json.dumps(payload)

### Build Data Files

In [20]:
# Keep only the TRAIN pages. reset_index() renumbers the subset from 0 and
# keeps the previous index as an extra "index" column.
df_train = grouped_df[grouped_df['split'] == 'TRAIN'].reset_index()

# The instruction text is identical for every record, so build it once
# instead of once per page.
instruction = build_instruction()

# Walk the two columns in parallel rather than using iterrows() only to get
# a position for a second iloc lookup on every row.
data_file_train = [
    formatting_fct(instruction, build_tagged_text(utterances), build_answer(emotions))
    for utterances, emotions in zip(df_train['utterance'], df_train['emotion_c'])
]

In [21]:
# Number of training records built (one per TRAIN page).
len(data_file_train)

718

In [22]:
# Inspect the instruction text of the first training record.
print(data_file_train[0]['instruction'])

### Emotion Analysis Expert Role

You are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.

INPUT:
- You will receive a list of utterances from a page in a comic book
- The utterance may express one or multiple emotions

TASK:
1. Carefully analyze the emotional context and tone of each utterance in the page
2. Identify applicable emotions from the following classes:
   "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"
3. For each utterance in a comic page, identify all emotions present and return an array of emotion arrays in order.

RULES:
1. Use ONLY the labels listed above
2. Output must be a JSON with single key "page_utterance_emotions"
3. Value must be an array where:
   - Each element is an array of emotions for one utterance
   - Order matches the input utterances order
   - Multiple emotions are allowed per utterance
4. No explanations, only JSON ou

In [23]:
# Inspect the numbered utterance list of the first training record.
print(data_file_train[0]['input'])

Now analyze these utterances in a page:
1. THIS VILE THING ATTACKED THE SMALL BEASTS OF MY SHORES… 
2. … IT PUNCHED MY BEAUTIFUL MATILDA… AND NOW IT BEGS FOR LIFE.
3. MY MASTER!
4. PLEASE!
5. BUT I HAVE NOT CHASED THIS MONSTER ALL THIS WAY TO LET IT GROVEL!
6. HEAL MEEE!
7. I HAVE COME TO CONQ--!
8. WHAT--
9. --IS THAT?!
10. NO! NO!


In [24]:
# Inspect the JSON answer of the first training record.
print(data_file_train[0]['output'])

{"page_utterance_emotions": [["anger"], ["anger"], ["fear"], ["fear"], ["fear", "sadness"], ["sadness"], ["anger"], ["surprise"], ["surprise"], ["fear", "surprise"]]}


In [25]:
# Keep only the TEST pages. reset_index() renumbers the subset from 0 and
# keeps the previous index as an extra "index" column.
df_test = grouped_df[grouped_df['split'] == 'TEST'].reset_index()

# The instruction text is identical for every record, so build it once
# instead of once per page.
instruction = build_instruction()

# Walk the two columns in parallel rather than using iterrows() only to get
# a position for a second iloc lookup on every row.
data_file_test = [
    formatting_fct(instruction, build_tagged_text(utterances), build_answer(emotions))
    for utterances, emotions in zip(df_test['utterance'], df_test['emotion_c'])
]

In [26]:
# Number of test records built; checking the record list (rather than the
# source DataFrame) verifies the output that is actually written to disk.
len(data_file_test)

156

In [27]:
# Preview up to the first ten test records; slicing avoids an IndexError
# when fewer than ten records exist.
for record in data_file_test[:10]:
    print(record)

{'instruction': '### Emotion Analysis Expert Role\n\nYou are an advanced emotion analysis expert specializing in comic book dialogue interpretation. Your task is to analyze utterances and identify their emotional content.\n\nINPUT:\n- You will receive a list of utterances from a page in a comic book\n- The utterance may express one or multiple emotions\n\nTASK:\n1. Carefully analyze the emotional context and tone of each utterance in the page\n2. Identify applicable emotions from the following classes:\n   "anger", "disgust", "fear", "sadness", "surprise", "joy", "neutral"\n3. For each utterance in a comic page, identify all emotions present and return an array of emotion arrays in order.\n\nRULES:\n1. Use ONLY the labels listed above\n2. Output must be a JSON with single key "page_utterance_emotions"\n3. Value must be an array where:\n   - Each element is an array of emotions for one utterance\n   - Order matches the input utterances order\n   - Multiple emotions are allowed per utter

### Create and save JSON files

In [28]:
file_path = Path(DATASET_DIR) / "comics35_utterance_pg_train.json"

# Create the output directory if it is missing so the write cannot fail
# with FileNotFoundError.
file_path.parent.mkdir(parents=True, exist_ok=True)

# An explicit encoding keeps the file independent of the platform default.
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(data_file_train, file)

In [29]:
file_path = Path(DATASET_DIR) / "comics35_utterance_pg_test.json"

# Create the output directory if it is missing so the write cannot fail
# with FileNotFoundError.
file_path.parent.mkdir(parents=True, exist_ok=True)

# An explicit encoding keeps the file independent of the platform default.
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(data_file_test, file)