# Data Pre-Processing

In [4]:
import os
import glob
import json
import random
import pandas as pd # type: ignore

In [2]:
# Locate the input folder: a "data_files" directory inside the parent of the
# current working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "data_files")

### Read files into a dataframe

In [4]:
# Collect every Excel workbook in the data folder.
# NOTE(review): glob.glob returns paths in arbitrary (filesystem-dependent)
# order, while later cells drop/patch rows by hard-coded position; a different
# listing order would make those positions point at other rows -- confirm.
file_paths = glob.glob(os.path.join(data_dir, '*.xlsx'))

In [5]:
def _load_workbook(path):
    """Read one workbook and record which file its rows came from."""
    frame = pd.read_excel(path)
    frame['file_name'] = os.path.basename(path)
    return frame


# One frame per workbook, in the order of `file_paths`.
df_list = [_load_workbook(file_path) for file_path in file_paths]

In [6]:
# Stack the per-file frames into a single frame with a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)

### Process dataframe

In [7]:
# Keep only rows that carry an annotation.
# reset_index() without drop=True stores the previous row labels in a new
# 'index' column; that column is removed again by a later drop() cell.
df = df[df['Annotations'].notna()].reset_index()

In [8]:
# Split each annotation at its first blank line (n=1) into two parts.
# NOTE(review): the column names assume the emotion part comes first and the
# speaker part second; two rows are later re-read in the opposite order by
# hand, so this assumption presumably does not hold everywhere -- confirm.
df[['raw_emotion', 'raw_speaker_id']] = df['Annotations'].str.split('\n\n', expand=True, n=1)

In [9]:
def process_emotion(x):
    """Extract the emotion label from a row's ``raw_emotion`` text.

    The label is read from the second line of ``x.raw_emotion``: the text
    between the first and the second ":" on that line (e.g. what follows
    ``Feeling:``).

    Args:
        x: Row-like object exposing a ``raw_emotion`` attribute (a DataFrame
            row when used with ``df.apply(..., axis=1)``).

    Returns:
        The extracted label, or ``"no annotation"`` when ``raw_emotion`` is not
        a string, has no second line, or has no ":" on its second line.
    """
    raw_emotion = x.raw_emotion

    try:
        return raw_emotion.split("\n")[1].split(":")[1]
    except (AttributeError, IndexError):
        # AttributeError: the value is not a string (NaN/None for empty cells).
        # IndexError: there is no second line, or no ":" on it.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return "no annotation"

In [10]:
# Derive the emotion label row by row; the function is passed directly,
# no wrapping lambda is needed.
df['emotion'] = df.apply(process_emotion, axis=1)

In [11]:
def process_speaker(x):
    """Extract the speaker id from a row's ``raw_speaker_id`` text.

    Inspects the first three lines of ``x.raw_speaker_id`` and, for the first
    one that contains ``"SpokenBy:"``, returns the text between the first and
    the second ":" on that line.

    Args:
        x: Row-like object exposing a ``raw_speaker_id`` attribute (a DataFrame
            row when used with ``df.apply(..., axis=1)``).

    Returns:
        The speaker id, or ``"no annotation"`` when the value is not a string
        or none of its first three lines contains ``"SpokenBy:"``.
    """
    raw_speaker_id = x.raw_speaker_id

    try:
        lines = raw_speaker_id.split("\n")
    except AttributeError:
        # Empty cells arrive as NaN/None, which have no .split().
        return "no annotation"

    # Only the first three lines are considered.
    for line in lines[:3]:
        if 'SpokenBy:' in line:
            return line.split(":")[1]

    # Explicit fallback instead of relying on a swallowed UnboundLocalError.
    return "no annotation"

In [12]:
# Derive the speaker id row by row; the function is passed directly,
# no wrapping lambda is needed.
df['speaker_id'] = df.apply(process_speaker, axis=1)

In [13]:
# Remove the leftover 'index' helper column, the Spanish text column and the
# '.1'-suffixed count columns.
df = df.drop(
    columns=[
        'index',
        'Translated text in Spanish; Castilian',
        'Number of words.1',
        'Signs with spaces.1',
        'Signs without spaces.1',
    ]
)

In [14]:
# Give the text and annotation columns short, code-friendly names.
df = df.rename(
    columns={
        'Source text in English': 'utterance',
        'Annotations': 'raw_annotation',
    }
)

In [15]:
# Remove the remaining word/character count columns.
df = df.drop(
    columns=[
        'Number of words',
        'Signs with spaces',
        'Signs without spaces',
    ]
)

In [16]:
# Normalise the position columns to snake_case names with an '_nr' suffix.
df = df.rename(
    columns={
        'Page': 'page_nr',
        'Panel': 'panel_nr',
        'Balloon': 'balloon_nr',
    }
)

In [17]:
# Fix the column selection and order of the processed table. Later cells
# address 'emotion' and 'speaker_id' by position, so this order matters.
df = df[
    [
        'file_name',
        'page_nr',
        'panel_nr',
        'balloon_nr',
        'utterance',
        'raw_annotation',
        'raw_emotion',
        'raw_speaker_id',
        'emotion',
        'speaker_id',
    ]
]

In [18]:
# Preview the processed utterance-level frame.
df

Unnamed: 0,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id
0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,1,DID YOU HAVE TO ELECTROCUTE HER SO HARD?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE3-SA0-SU5-JO0,ID-1
1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,2,IT'S NOT LIKE I HAVE DIFFERENT SETTINGS.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU5-JO0,ID-2
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,3,YOU'RE ELECTROCUTIONER. IT'S YOUR WHOLE THING....,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE2-SA0-SU0-JO0,ID-1
3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,3,1,"OH, HEY. I THINK SHE'S AWAKE.",2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU4-JO0,ID-2
4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,4,1,"WELCOME BACK, MADAM MAYOR. BLOCKBUSTER IS PRET...",2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN3-DI0-FE0-SA0-SU0-JO0,ID-1
...,...,...,...,...,...,...,...,...,...,...
5293,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,1,I KNOW THE BEINGS OF THIS WORLD ARE TRYING TO ...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS
5294,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,2,… BUT I WILL CRUSH THEM IN DUE TIME!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS
5295,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,1,FOR MY FIRST TASK...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS
5296,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,2,… I MUST REMOVE THIS WORLD OF THEIR GODS!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO5,BLACKMANTASAURUS


### train-test-split

In [19]:
# Distinct source files in order of first appearance, and the number of them
# (70%, rounded down) that go into the training split.
titles_list = list(df['file_name'].unique())
num_train = int(0.7 * len(titles_list))

In [20]:
# Split at file level so that all rows of one file land on the same side.
# A dedicated, seeded generator makes the assignment reproducible across
# kernel restarts without touching the global `random` state; sampling from
# the sorted titles additionally makes it independent of file listing order.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)
train_titles = split_rng.sample(sorted(titles_list), num_train)

# Everything that was not drawn for training is used for testing.
test_titles = [title for title in titles_list if title not in train_titles]

In [21]:
# Label every row with the split its source file was assigned to.
df['split'] = df['file_name'].apply(
    lambda name: 'TRAIN' if name in train_titles
    else 'TEST' if name in test_titles
    else 'UNKNOWN'
)

#### clean dataframe

In [22]:
# Preview the frame including the new 'split' column.
df

Unnamed: 0,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split
0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,1,DID YOU HAVE TO ELECTROCUTE HER SO HARD?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE3-SA0-SU5-JO0,ID-1,TRAIN
1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,2,IT'S NOT LIKE I HAVE DIFFERENT SETTINGS.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU5-JO0,ID-2,TRAIN
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,3,YOU'RE ELECTROCUTIONER. IT'S YOUR WHOLE THING....,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE2-SA0-SU0-JO0,ID-1,TRAIN
3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,3,1,"OH, HEY. I THINK SHE'S AWAKE.",2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU4-JO0,ID-2,TRAIN
4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,4,1,"WELCOME BACK, MADAM MAYOR. BLOCKBUSTER IS PRET...",2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN3-DI0-FE0-SA0-SU0-JO0,ID-1,TRAIN
...,...,...,...,...,...,...,...,...,...,...,...
5293,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,1,I KNOW THE BEINGS OF THIS WORLD ARE TRYING TO ...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TRAIN
5294,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,1,2,… BUT I WILL CRUSH THEM IN DUE TIME!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TRAIN
5295,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,1,FOR MY FIRST TASK...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO0,BLACKMANTASAURUS,TRAIN
5296,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,20,2,2,… I MUST REMOVE THIS WORLD OF THEIR GODS!,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN5-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:BLACKMA...,AN5-DI0-FE0-SA0-SU0-JO5,BLACKMANTASAURUS,TRAIN


In [23]:
# Row positions that are removed in the next cell.
# NOTE(review): hard-coded positions; they only hit the intended rows while the
# frame has exactly the row order it had when they were picked -- confirm.
wrong_idx = [1093, 26, 1447]

In [24]:
# Translate the positions into index labels, drop those rows and renumber.
df = df.drop(index=df.index[wrong_idx]).reset_index(drop=True)

In [25]:
# For row 1129 the speaker is read from the first blank-line-separated part of
# the raw annotation and the emotion from the second part (in each case the
# text between the first and the second ":").
annotation_parts = df.iloc[1129]['raw_annotation'].split("\n\n")
speaker = annotation_parts[0].split(":")[1]
emotion = annotation_parts[1].split(":")[1]

In [26]:
# Write the re-read values back. Columns are resolved by name instead of the
# hard-coded positions 8 and 9, which would silently target other columns if
# the column order ever changed.
df.iloc[1129, df.columns.get_loc('emotion')] = emotion
df.iloc[1129, df.columns.get_loc('speaker_id')] = speaker

In [27]:
# For row 1411 the speaker is read from the first blank-line-separated part of
# the raw annotation and the emotion from the second part (in each case the
# text between the first and the second ":").
annotation_parts = df.iloc[1411]['raw_annotation'].split("\n\n")
speaker = annotation_parts[0].split(":")[1]
emotion = annotation_parts[1].split(":")[1]

In [28]:
# Write the re-read values back. Columns are resolved by name instead of the
# hard-coded positions 8 and 9, which would silently target other columns if
# the column order ever changed.
df.iloc[1411, df.columns.get_loc('emotion')] = emotion
df.iloc[1411, df.columns.get_loc('speaker_id')] = speaker

In [29]:
# Discard rows for which no emotion label could be extracted.
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# extra 'index' column. That column ends up in the CSV written afterwards, and
# re-running this cell fails because 'index' already exists -- confirm whether
# drop=True was intended.
df = df[df.emotion != "no annotation"].reset_index()

In [30]:
# Persist the utterance-level table. The path is relative, so the file is
# created in the current working directory; with the default settings the
# frame's index is written as the first column.
df.to_csv("comic_titles_processed.csv")

### dataframe grouped by titles

In [33]:
# One row per source file: its utterances and emotions collected into lists,
# plus the set of split labels that occur for that file.
df_by_title = (
    df.groupby('file_name')
    .agg({'utterance': list, 'emotion': list, 'split': set})
    .reset_index()
)

In [34]:
def get_full_title_text(x):
    """Join all utterances of one title into a single string.

    Args:
        x: Row-like object whose ``utterance`` attribute is an iterable of
            strings.

    Returns:
        The utterances in their given order, separated by single spaces.
    """
    return ' '.join(x.utterance)

In [35]:
# Build the running text of every title; the function is passed directly,
# no wrapping lambda is needed.
df_by_title['full_title_text'] = df_by_title.apply(get_full_title_text, axis=1)

In [36]:
# Mark the two list-valued columns with an '_l' suffix.
df_by_title = df_by_title.rename(
    columns={
        'utterance': 'utterances_l',
        'emotion': 'emotions_l',
    }
)

In [37]:
# Preview the title-level frame.
df_by_title

Unnamed: 0,file_name,utterances_l,emotions_l,split,full_title_text
0,QC copy - 1492 - 03 Batman El Caballero 8.xlsx,"[In Rio de Janeiro my brakes were cut., In Mex...","[AN3-DI0-FE3-SA0-SU3-JO0, AN3-DI0-FE3-SA2-SU0-...",{TRAIN},In Rio de Janeiro my brakes were cut. In Mexic...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,[THIS VILE THING ATTACKED THE SMALL BEASTS OF ...,"[AN5-DI0-FE0-SA0-SU0-JO0, AN5-DI0-FE0-SA0-SU0-...",{TRAIN},THIS VILE THING ATTACKED THE SMALL BEASTS OF M...
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,"[DID YOU HAVE TO ELECTROCUTE HER SO HARD?, IT'...","[AN0-DI0-FE3-SA0-SU5-JO0, AN0-DI0-FE0-SA0-SU5-...",{TRAIN},DID YOU HAVE TO ELECTROCUTE HER SO HARD? IT'S ...
3,QC copy - 1501 - 09 Mundos sin Liga de la Just...,"["" Tell me another story, Momma. "", "" What kin...","[AN0-DI0-FE0-SA0-SU0-JO3, AN0-DI0-FE0-SA0-SU3-...",{TEST},""" Tell me another story, Momma. "" "" What kind ..."
4,QC copy - 1502 - 09 Mundos sin Liga de la Just...,"[MOTHERFRAGGER., "" AND WHEN HE SAW THE BLASPHE...","[AN5-DI0-FE0-SA0-SU0-JO0, AN3-DI0-FE0-SA0-SU5-...",{TEST},"MOTHERFRAGGER. "" AND WHEN HE SAW THE BLASPHEMI..."
5,QC copy - 1503 - 10 Crisis Oscura Flash - FLS ...,"[I'M DONE LETTING YOU TORTURE ME!, YOU'RE NEVE...","[AN5-DI0-FE0-SA0-SU0-JO0, AN5-DI0-FE0-SA2-SU0-...",{TRAIN},I'M DONE LETTING YOU TORTURE ME! YOU'RE NEVER ...
6,QC copy - 1507 - 22 Calle Peligro 1.xlsx,"[HOW'S IT GOING?, HEY., CAN I GET YOU ANYTHING...","[AN0-DI0-FE0-SA0-SU2-JO3, AN0-DI0-FE0-SA0-SU0-...",{TRAIN},HOW'S IT GOING? HEY. CAN I GET YOU ANYTHING? J...
7,QC copy - 1508 - 48 Pequen_os Titanes 25.xlsx,"[ SO, LET ME GET THIS STRAIGHT… THERE'S A SUPE...","[AN0-DI0-FE0-SA0-SU3-JO0, AN0-DI0-FE0-SA0-SU5-...",{TRAIN},"SO, LET ME GET THIS STRAIGHT… THERE'S A SUPER..."
8,QC copy - 1513 - 21 Blanco Humano 9.xlsx,"[I'M ALREADY TOO LATE., $ % # @., I'M ALSO TOO...","[AN0-DI0-FE3-SA3-SU0-JO0, AN5-DI0-FE0-SA0-SU0-...",{TRAIN},I'M ALREADY TOO LATE. $ % # @. I'M ALSO TOO EA...
9,QC copy - 1514 - 15 DC contra Vampiros 11 (1)....,"[STILL WARM., WHOEVER WAS HERE, THEY LEFT QUIC...","[AN0-DI0-FE2-SA0-SU3-JO0, AN0-DI0-FE3-SA0-SU3-...",{TEST},"STILL WARM. WHOEVER WAS HERE, THEY LEFT QUICKL..."


### Prepare prompts

In [None]:
# Formatting Fx
# Build question
# Build answer

In [1]:
def formatting_fct(instruction="", input="", output=""):
    """Bundle the three parts of a prompt record into a dict of strings.

    Args:
        instruction: Task description.
        input: Text the task is applied to.
        output: Expected answer.

    Returns:
        A dict with the keys "instruction", "input" and "output" (in that
        order); every value is the string formatting of the matching argument.
    """
    return dict(
        instruction=f"{instruction}",
        input=f"{input}",
        output=f"{output}",
    )

In [6]:
def build_instruction(nr_utterances):
    """Build the instruction part of the prompt.

    NOTE(review): the wording still describes an argument-mining task
    ("argument component", labels "fact"/"policy"/"reference"/"testimony"/
    "value", placeholder "component_type (str)"), while the JSON template is
    filled with "emotion_type (str)" entries. The text is kept unchanged
    here; it presumably needs to be adapted to the emotion labels -- confirm.

    Args:
        nr_utterances: Number of placeholder entries in the example JSON list;
            also inserted into the text as the required list length.

    Returns:
        The instruction text, ending with a single newline.
    """
    template_list = json.dumps(["emotion_type (str)"] * nr_utterances)

    return f"""### You are an expert in Sentiment Analysis. You are given the transcript of a comic book which contains numbered character utterances enclosed by <AC></AC> tags. Your task is to classify each argument component in the text as either "fact", "policy", "reference", "testimony" or "value". You must return a list of argument component types, strictly of length {nr_utterances}, in following JSON format: {{"component_types": {template_list}}} where each element "component_type (str)" is replaced by either "fact", "policy", "reference", "testimony" or "value". \n"""

In [8]:
# NOTE(review): the output saved with this cell begins "You are an expert in
# Argument Mining ...", which differs from the text build_instruction currently
# produces ("... Sentiment Analysis ..."); the cell appears not to have been
# re-run after the function was edited.
print(build_instruction(3))

### You are an expert in Argument Mining. You are given a text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument component in the text as either "fact", "policy", "reference", "testimony" or "value". You must return a list of argument component types, strictly of length 3, in following JSON format: {"component_types": ["emotion_type (str)", "emotion_type (str)", "emotion_type (str)"]} where each element "component_type (str)" is replaced by either "fact", "policy", "reference", "testimony" or "value". 



In [None]:
def insert_tags(text, start_indices, end_indices):
    """Wrap character spans of ``text`` in numbered ``<ACi>...</ACi>`` tags.

    Args:
        text: The untagged text.
        start_indices: Start offsets (inclusive) of the spans, relative to the
            original, untagged ``text``.
        end_indices: End offsets (exclusive), paired position-wise with
            ``start_indices``. Spans must be given in ascending order and must
            not overlap, because every inserted tag pair shifts all offsets
            that follow it.

    Returns:
        ``"### Here is the text: "`` followed by the tagged text. Without any
        spans the text follows the prefix unchanged.
    """
    offset = 0  # number of characters inserted so far by earlier tag pairs

    for i, (start_i, end_i) in enumerate(zip(start_indices, end_indices), start=1):

        start_tag = f"<AC{i}>"
        end_tag = f"</AC{i}>"

        start_idx = start_i + offset
        end_idx = end_i + offset

        # Splice by position. str.replace() would tag every occurrence of the
        # span's text (and, for an empty span, every gap between characters),
        # which also invalidates the offsets of all following spans.
        text = (
            text[:start_idx]
            + start_tag
            + text[start_idx:end_idx]
            + end_tag
            + text[end_idx:]
        )

        offset += len(start_tag) + len(end_tag)

    # Built after the loop so that an empty span list yields a result instead
    # of an unbound local variable.
    return f"""### Here is the text: {text}"""

In [None]:
def get_ac_types(raw_labels):
    """Translate integer class ids into the JSON answer string.

    NOTE(review): the label names are argument-component types, not emotion
    labels; presumably carried over from another task -- confirm.

    Args:
        raw_labels: Iterable of integer indices into the fixed label list
            ["fact", "policy", "reference", "testimony", "value"].

    Returns:
        JSON text of the form ``{"component_types": [...]}`` holding one label
        name per input id, in input order.
    """
    class_labels = ["fact", "policy", "reference", "testimony", "value"]

    names = []
    for label_id in raw_labels:
        names.append(class_labels[label_id])

    return json.dumps({"component_types": names})