# Data Pre-Processing

In [1]:
import os
import glob
import json
import random
import pandas as pd # type: ignore

In [2]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "data_files")

### Read files into a dataframe

In [3]:
file_paths = glob.glob(os.path.join(data_dir, '*.xlsx'))

In [4]:
df_list = []

for file_path in file_paths:

    df = pd.read_excel(file_path)
    df['file_name'] = os.path.basename(file_path)
    df_list.append(df)

In [5]:
df = pd.concat(df_list, ignore_index=True)

### Process dataframe

In [6]:
df = df[df['Annotations'].notna()].reset_index()

In [7]:
df[['raw_emotion', 'raw_speaker_id']] = df['Annotations'].str.split('\n\n', expand=True, n=1)

In [8]:
def process_emotion(x):

    raw_emotion = x.raw_emotion
    try:
        emotion = raw_emotion.split("\n")[1].split(":")[1]
    except:
        emotion = "no annotation"

    return emotion

In [9]:
df['emotion'] = df.apply(lambda x: process_emotion(x), axis=1)

In [10]:
def process_speaker(x):

    raw_speaker_id = x.raw_speaker_id
    
    try:
        if 'SpokenBy:' in raw_speaker_id.split("\n")[0]:
            speaker_id = raw_speaker_id.split("\n")[0].split(":")[1]
            
        elif 'SpokenBy:' in raw_speaker_id.split("\n")[1]:            
            speaker_id = raw_speaker_id.split("\n")[1].split(":")[1]
            
        elif 'SpokenBy:' in raw_speaker_id.split("\n")[2]:            
            speaker_id = raw_speaker_id.split("\n")[2].split(":")[1]

        return speaker_id
            
    except:
        
        speaker_id = "no annotation"

        return speaker_id

In [11]:
df['speaker_id'] = df.apply(lambda x: process_speaker(x), axis=1)

In [12]:
df = df.drop(columns=['index', 'Translated text in Spanish; Castilian', 'Number of words.1', 'Signs with spaces.1', 'Signs without spaces.1'])

In [13]:
df = df.rename(columns={'Source text in English': 'utterance', 'Annotations': 'raw_annotation'})

In [14]:
df = df.drop(columns=['Number of words', 'Signs with spaces', 'Signs without spaces'])

In [15]:
df = df.rename(columns={'Page': 'page_nr', 'Panel': 'panel_nr', 'Balloon': 'balloon_nr'})

In [16]:
df = df[['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id']]

### train-test-split

In [17]:
titles_list = list(df.file_name.unique())
num_train = int(len(titles_list) * 0.7)

In [18]:
train_titles = random.sample(titles_list, num_train)
test_titles = [title for title in titles_list if title not in train_titles]

In [19]:
df['split'] = df['file_name'].apply(lambda x: 'TRAIN' if x in train_titles else ('TEST' if x in test_titles else 'UNKNOWN'))

#### clean dataframe

In [20]:
wrong_idx = [1093, 26, 1447]

In [21]:
df = df.drop(df.index[wrong_idx]).reset_index(drop=True)

In [22]:
speaker = df.iloc[1129]['raw_annotation'].split("\n\n")[0].split(":")[1]
emotion = df.iloc[1129]['raw_annotation'].split("\n\n")[1].split(":")[1]

In [23]:
df.iloc[1129, 8] = emotion
df.iloc[1129, 9] = speaker

In [24]:
speaker = df.iloc[1411]['raw_annotation'].split("\n\n")[0].split(":")[1]
emotion = df.iloc[1411]['raw_annotation'].split("\n\n")[1].split(":")[1]

In [25]:
df.iloc[1411, 8] = emotion
df.iloc[1411, 9] = speaker

In [26]:
df = df[df.emotion != "no annotation"].reset_index()

In [27]:
df.to_csv("comics_data_processed.csv")

### dataframe grouped by titles

In [28]:
df_by_title = df.groupby('file_name').agg({'utterance': list, 'emotion': list, 'split': set}).reset_index()

In [29]:
def get_full_title_text(x):

    list_utterances = x.utterance
    full_title_text = ' '.join(list_utterances)

    return full_title_text

In [30]:
df_by_title['full_title_text'] = df_by_title.apply(lambda x: get_full_title_text(x), axis=1)

In [31]:
df_by_title = df_by_title.rename(columns={'utterance': 'utterances_l', 'emotion': 'emotions_l'})

In [32]:
def process_split(x):

    return list(x.split)[0]

In [33]:
df_by_title['split'] = df_by_title.apply(lambda x: process_split(x), axis=1)

In [34]:
def find_utterance_indices(row):

    title_text = row.full_title_text
    utterances_l = row.utterances_l

    start_indices = []
    end_indices = []
    
    for utterance in utterances_l:
        start = title_text.find(utterance)
        if start != -1:
            # Append the start index and calculate the end index
            start_indices.append(start)
            end_indices.append(start + len(utterance))
        else:
            # If substring not found, append -1 to both lists
            start_indices.append(-1)
            end_indices.append(-1)
    
    return start_indices, end_indices

In [35]:
df_by_title['start_indices', 'end_indices'] = df_by_title.apply(lambda row: find_utterance_indices(row), axis=1)

In [36]:
df_by_title[['start_indices', 'end_indices']] = pd.DataFrame(df_by_title[('start_indices', 'end_indices')].tolist(), index=df_by_title.index)

In [37]:
df_by_title.to_csv("comics_data_by_title.csv")

### Prepare prompts

In [38]:
# Formatting Fx
# Build questoin
# Build answer

In [39]:
def formatting_fct(instruction="", input="", output=""):
    
    prompt_d ={
            
        "instruction": f"""{instruction}""",
        "input": f"""{input}""",
        "output": f"""{output}"""
            
        }
    
    return prompt_d

In [40]:
def build_instruction(nr_utterances):

    results = json.dumps(["emotion_class (str)"] * nr_utterances)

    instruction = f"""### You are an expert in Emotion Analysis. You are given the transcript of a comic book which contains numbered character utterances enclosed by <UT></UT> tags. Your task is to classify each utterance in the comic transcript as on the following emotion classes: "Anger" (AN), "Disgust" (DI), "Fear" (FE), "Sadness" (SA), "Surprise" (SU) or "Joy" (JO). You must return a list of emotion classes, strictly of length {nr_utterances}, in following JSON format: {{"list_emotion_classes": [["emotion_classes (str)"], ["emotion_classes (str)"] ... ["emotion_classes (str)"]]}} where each element "emotion_classes (str)" is replaced by one ore more of the following abbreviated emotion class labels: "AN", "DI", "FE", "SA", "SU" or "JO". 
"""    
    return instruction

In [41]:
# def build_tagged_text(text, start_indices, end_indices):

#     offset = 0

#     for i, (start_i, end_i) in enumerate(zip(start_indices, end_indices)):
            
#         start_tag = "<UT" + str(i+1) + ">"
#         end_tag = "</UT" + str(i+1) + ">"
        
#         start_idx = start_i + offset
#         end_idx = end_i + offset

#         offset = offset + (len(start_tag)  + len(end_tag))
        
#         text_r = text[start_idx:end_idx]
#         new_text = start_tag + text_r + end_tag
#         text = text.replace(text_r, new_text)

#         question = f"""### Here is the comic transcript: {text}"""

#     return question

In [42]:
def build_tagged_text(utterances_l):

    tagged_utterances_l = []

    for idx, utterance in enumerate(utterances_l):
        
        start_tag = "<UT" + str(idx+1) + ">"
        end_tag = "</UT" + str(idx+1) + ">"
        tagged_utterance = start_tag + utterance + end_tag
        tagged_utterances_l.append(tagged_utterance)
        
    tagged_title_text = ''.join(tagged_utterances_l)
    question = f"""### Here is the comic transcript: {tagged_title_text}"""

    return question

In [43]:
def build_answer(title_emotions):

    title_emotions_l = []
    emotion_class_labels = ["Anger", "Disgust", "Fear", "Sadness", "Surprise", "Joy"]

    for emotions_l in title_emotions:

            if emotions_l == 'Neutral':
                title_emotions_l.append([emotions_l])
            
            else:
                emotions_l = emotions_l.split("-")
               
                emotion_annotation_l = []

                for idx, emotion_annotation in enumerate(emotions_l):

                    if '0' not in emotion_annotation:
                 
                        #emotion_annotation_l.append(emotion_class_labels[idx])
                        emotion_annotation_l.append(emotion_annotation[:-1])
                    
                title_emotions_l.append(emotion_annotation_l)
                

    return json.dumps({"list_emotion_classes": title_emotions_l})

### Build Data Files

In [44]:
df_train = df_by_title[df_by_title.split == 'TRAIN'].reset_index()

data_file_train = []

for index, _ in df_train.iterrows():
    
    i = index

    instruction = build_instruction(len(df_train.iloc[i].utterances_l))
    question = build_tagged_text(df_train.iloc[i].utterances_l)
    answer = build_answer(df_train.iloc[i].emotions_l)
    
    data_file_train.append( formatting_fct(instruction, question, answer) )

In [45]:
len(data_file_train)

22

In [46]:
for i in range(3):
    print(data_file_train[i])

{'instruction': '### You are an expert in Emotion Analysis. You are given the transcript of a comic book which contains numbered character utterances enclosed by <UT></UT> tags. Your task is to classify each utterance in the comic transcript as on the following emotion classes: "Anger" (AN), "Disgust" (DI), "Fear" (FE), "Sadness" (SA), "Surprise" (SU) or "Joy" (JO). You must return a list of emotion classes, strictly of length 34, in following JSON format: {"list_emotion_classes": [["emotion_classes (str)"], ["emotion_classes (str)"] ... ["emotion_classes (str)"]]} where each element "emotion_classes (str)" is replaced by one ore more of the following abbreviated emotion class labels: "AN", "DI", "FE", "SA", "SU" or "JO". \n', 'input': '### Here is the comic transcript: <UT1>In Rio de Janeiro my brakes were cut.</UT1><UT2>In Mexico l was slipped psilocybin while fighting a pack of jaguars.</UT2><UT3>And now, in SHANGHAI…</UT3><UT4>< I\'VE GOT YOU! GO LIMP! > *</UT4><UT5>< THIS DOESN\'T

In [47]:
df_test = df_by_title[df_by_title.split == 'TEST'].reset_index()

data_file_test = []

for index, _ in df_test.iterrows():
    
    i = index

    instruction = build_instruction(len(df_test.iloc[i].utterances_l))
    question = build_tagged_text(df_test.iloc[i].utterances_l)
    answer = build_answer(df_test.iloc[i].emotions_l)
    
    data_file_test.append( formatting_fct(instruction, question, answer) )

### Create and save JSON files

In [48]:
file_path = os.path.join(os.getcwd(), "../datasets/comics_train.json")

with open(file_path, 'w') as file:
    
    json.dump(data_file_train, file)

In [49]:
file_path = os.path.join(os.getcwd(), "../datasets/comics_test.json")

with open(file_path, 'w') as file:
    
    json.dump(data_file_test, file)

### Data Process by a Single Title

In [50]:
df

Unnamed: 0,index,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split
0,0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,1,DID YOU HAVE TO ELECTROCUTE HER SO HARD?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE3-SA0-SU5-JO0,ID-1,TRAIN
1,1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,2,IT'S NOT LIKE I HAVE DIFFERENT SETTINGS.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU5-JO0,ID-2,TRAIN
2,2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,3,YOU'RE ELECTROCUTIONER. IT'S YOUR WHOLE THING....,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE2-SA0-SU0-JO0,ID-1,TRAIN
3,3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,3,1,"OH, HEY. I THINK SHE'S AWAKE.",2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU4-JO0,ID-2,TRAIN
4,4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,4,1,"WELCOME BACK, MADAM MAYOR. BLOCKBUSTER IS PRET...",2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN3-DI0-FE0-SA0-SU0-JO0,ID-1,TRAIN
...,...,...,...,...,...,...,...,...,...,...,...,...
5277,5290,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,1,I KNOW THE BEINGS OF THIS WORLD ARE TRYING TO ...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST
5278,5291,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,2,… BUT I WILL CRUSH THEM IN DUE TIME!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST
5279,5292,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,1,FOR MY FIRST TASK...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TEST
5280,5293,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,2,… I MUST REMOVE THIS WORLD OF THEIR GODS!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO5,BLACKMANTASAURUS,TEST


In [51]:
df_by_title

Unnamed: 0,file_name,utterances_l,emotions_l,split,full_title_text,"(start_indices, end_indices)",start_indices,end_indices
0,QC copy - 1492 - 03 Batman El Caballero 8.xlsx,"[In Rio de Janeiro my brakes were cut., In Mex...","[AN3-DI0-FE3-SA0-SU3-JO0, AN3-DI0-FE3-SA2-SU0-...",TRAIN,In Rio de Janeiro my brakes were cut. In Mexic...,"([0, 38, 107, 129, 158, 224, 248, 261, 271, 34...","[0, 38, 107, 129, 158, 224, 248, 261, 271, 348...","[37, 106, 128, 157, 223, 247, 260, 270, 347, 3..."
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,[THIS VILE THING ATTACKED THE SMALL BEASTS OF ...,"[AN5-DI0-FE0-SA0-SU0-JO0, AN5-DI0-FE0-SA0-SU0-...",TEST,THIS VILE THING ATTACKED THE SMALL BEASTS OF M...,"([0, 57, 118, 129, 137, 203, 214, 237, 244, 25...","[0, 57, 118, 129, 137, 203, 214, 237, 244, 256...","[56, 117, 128, 136, 202, 213, 236, 243, 255, 2..."
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,"[DID YOU HAVE TO ELECTROCUTE HER SO HARD?, IT'...","[AN0-DI0-FE3-SA0-SU5-JO0, AN0-DI0-FE0-SA0-SU5-...",TRAIN,DID YOU HAVE TO ELECTROCUTE HER SO HARD? IT'S ...,"([0, 41, 82, 176, 206, 272, 315, 358, 383, 397...","[0, 41, 82, 176, 206, 272, 315, 358, 383, 397,...","[40, 81, 175, 205, 271, 314, 357, 382, 396, 41..."
3,QC copy - 1501 - 09 Mundos sin Liga de la Just...,"["" Tell me another story, Momma. "", "" What kin...","[AN0-DI0-FE0-SA0-SU0-JO3, AN0-DI0-FE0-SA0-SU3-...",TRAIN,""" Tell me another story, Momma. "" "" What kind ...","([0, 34, 67, 85, 125, 197, 209, 242, 334, 338,...","[0, 34, 67, 85, 125, 197, 209, 242, 334, 338, ...","[33, 66, 84, 124, 196, 208, 241, 333, 337, 343..."
4,QC copy - 1502 - 09 Mundos sin Liga de la Just...,"[MOTHERFRAGGER., "" AND WHEN HE SAW THE BLASPHE...","[AN5-DI0-FE0-SA0-SU0-JO0, AN3-DI0-FE0-SA0-SU5-...",TRAIN,"MOTHERFRAGGER. "" AND WHEN HE SAW THE BLASPHEMI...","([0, 15, 93, 117, 227, 321, 326, 446, 563, 575...","[0, 15, 93, 117, 227, 321, 326, 446, 563, 575,...","[14, 92, 116, 226, 320, 325, 445, 562, 574, 67..."
5,QC copy - 1503 - 10 Crisis Oscura Flash - FLS ...,"[I'M DONE LETTING YOU TORTURE ME!, YOU'RE NEVE...","[AN5-DI0-FE0-SA0-SU0-JO0, AN5-DI0-FE0-SA2-SU0-...",TRAIN,I'M DONE LETTING YOU TORTURE ME! YOU'RE NEVER ...,"([0, 33, 77, 141, 374, 394, 591, 622, 741, 800...","[0, 33, 77, 141, 374, 394, 591, 622, 741, 800,...","[32, 76, 140, 373, 393, 590, 621, 740, 799, 90..."
6,QC copy - 1507 - 22 Calle Peligro 1.xlsx,"[HOW'S IT GOING?, HEY., CAN I GET YOU ANYTHING...","[AN0-DI0-FE0-SA0-SU2-JO3, AN0-DI0-FE0-SA0-SU0-...",TEST,HOW'S IT GOING? HEY. CAN I GET YOU ANYTHING? J...,"([0, 16, 21, 45, 58, 75, 83, 117, 133, 150, 16...","[0, 16, 21, 45, 58, 75, 83, 117, 133, 150, 169...","[15, 20, 44, 57, 74, 82, 116, 132, 149, 168, 1..."
7,QC copy - 1508 - 48 Pequen_os Titanes 25.xlsx,"[ SO, LET ME GET THIS STRAIGHT… THERE'S A SUPE...","[AN0-DI0-FE0-SA0-SU3-JO0, AN0-DI0-FE0-SA0-SU5-...",TEST,"SO, LET ME GET THIS STRAIGHT… THERE'S A SUPER...","([0, 110, 135, 154, 170, 186, 186, 208, 220, 2...","[0, 110, 135, 154, 170, 186, 186, 208, 220, 22...","[109, 134, 153, 169, 185, 196, 196, 219, 225, ..."
8,QC copy - 1513 - 21 Blanco Humano 9.xlsx,"[I'M ALREADY TOO LATE., $ % # @., I'M ALSO TOO...","[AN0-DI0-FE3-SA3-SU0-JO0, AN5-DI0-FE0-SA0-SU0-...",TRAIN,I'M ALREADY TOO LATE. $ % # @. I'M ALSO TOO EA...,"([0, 22, 31, 70, 96, 102, 115, 184, 208, 214, ...","[0, 22, 31, 70, 96, 102, 115, 184, 208, 214, 2...","[21, 30, 69, 95, 101, 114, 183, 207, 213, 250,..."
9,QC copy - 1514 - 15 DC contra Vampiros 11 (1)....,"[STILL WARM., WHOEVER WAS HERE, THEY LEFT QUIC...","[AN0-DI0-FE2-SA0-SU3-JO0, AN0-DI0-FE3-SA0-SU3-...",TRAIN,"STILL WARM. WHOEVER WAS HERE, THEY LEFT QUICKL...","([0, 12, 49, 118, 146, 269, 309, 319, 371, 398...","[0, 12, 49, 118, 146, 269, 309, 319, 371, 398,...","[11, 48, 117, 145, 268, 308, 318, 370, 397, 43..."
