# Data Pre-Processing

In [1]:
import os
import glob
import json
import random
import pandas as pd # type: ignore

In [2]:
# Locate the input folder: a "data_files" directory inside the parent of the
# current working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "data_files")

### Read files into a dataframe

In [3]:
# Collect every Excel workbook in the data folder. glob returns paths in an
# arbitrary, filesystem-dependent order, and later cells address rows by
# hard-coded position, so sort the paths to make the row order reproducible.
file_paths = sorted(glob.glob(os.path.join(data_dir, '*.xlsx')))

In [4]:
# Load each workbook and tag its rows with the originating file name so the
# title can still be identified after all sheets are concatenated.
df_list = [
    pd.read_excel(path).assign(file_name=os.path.basename(path))
    for path in file_paths
]

In [5]:
# Stack all per-file frames into one frame with a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)

### Process dataframe

In [6]:
# Discard rows without an annotation. reset_index() (no drop) keeps the old
# row numbers in a new 'index' column.
df = df.dropna(subset=['Annotations']).reset_index()

In [7]:
# Split each annotation at its first blank line only (n=1): the text before it
# goes to 'raw_emotion', everything after it to 'raw_speaker_id'.
df[['raw_emotion', 'raw_speaker_id']] = df['Annotations'].str.split('\n\n', expand=True, n=1)

In [8]:
def process_emotion(x):
    """Extract the emotion annotation from a row's ``raw_emotion`` text.

    The value returned is the text between the first and second ``:`` on the
    second line of ``x.raw_emotion``.

    Args:
        x: Row-like object exposing a ``raw_emotion`` attribute.

    Returns:
        The extracted string, or ``"no annotation"`` when ``raw_emotion`` is
        not a string (e.g. ``None``/NaN), has no second line, or that line
        contains no ``:``.
    """
    raw_emotion = x.raw_emotion
    try:
        return raw_emotion.split("\n")[1].split(":")[1]
    except (AttributeError, IndexError):
        # AttributeError: value is not a string; IndexError: the second line
        # or the ':' separator is missing.
        return "no annotation"

In [9]:
# Parse the emotion annotation row by row.
df['emotion'] = df.apply(process_emotion, axis=1)

In [10]:
def process_speaker(x):
    """Extract the speaker id from a row's ``raw_speaker_id`` text.

    Only the first three lines are inspected. The first of them that contains
    ``SpokenBy:`` supplies the result: the text between its first and second
    ``:``.

    Args:
        x: Row-like object exposing a ``raw_speaker_id`` attribute.

    Returns:
        The extracted string, or ``"no annotation"`` when ``raw_speaker_id``
        is not a string (e.g. ``None``/NaN) or none of its first three lines
        contains ``SpokenBy:``.
    """
    raw_speaker_id = x.raw_speaker_id

    try:
        lines = raw_speaker_id.split("\n")
    except AttributeError:
        # Not a string: the annotation had no speaker part.
        return "no annotation"

    for line in lines[:3]:
        if 'SpokenBy:' in line:
            return line.split(":")[1]

    return "no annotation"

In [11]:
# Parse the speaker id row by row.
df['speaker_id'] = df.apply(process_speaker, axis=1)

In [12]:
# Remove the saved pre-filter 'index' column, the Spanish translation and the
# '.1'-suffixed count columns (presumably the translation's statistics).
df = df.drop(['index', 'Translated text in Spanish; Castilian', 'Number of words.1', 'Signs with spaces.1', 'Signs without spaces.1'], axis=1)

In [13]:
# Give the English source text and the raw annotation short working names.
df = df.rename({'Source text in English': 'utterance', 'Annotations': 'raw_annotation'}, axis='columns')

In [14]:
# Remove the remaining word/sign count columns.
df = df.drop(['Number of words', 'Signs with spaces', 'Signs without spaces'], axis='columns')

In [15]:
# Rename the page / panel / balloon columns to snake_case "_nr" names.
df = df.rename({'Page': 'page_nr', 'Panel': 'panel_nr', 'Balloon': 'balloon_nr'}, axis='columns')

In [16]:
# Keep exactly these columns, in this order. Later cells write to 'emotion'
# and 'speaker_id' by position (8 and 9), so the order matters.
df = df[['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id']]

### train-test-split

In [18]:
# Distinct comic files, in order of first appearance in df.
titles_list = df['file_name'].unique().tolist()
# 70 % of the titles (rounded down) are assigned to the training split.
num_train = int(0.7 * len(titles_list))

In [19]:
# Draw the training titles with a dedicated, seeded generator so the
# title-level split is identical on every run. Sorting first makes the draw
# independent of the order in which the files were read.
SPLIT_SEED = 42
train_titles = random.Random(SPLIT_SEED).sample(sorted(titles_list), num_train)

# Everything not drawn for training is test data; the set gives O(1) lookups.
train_title_set = set(train_titles)
test_titles = [title for title in titles_list if title not in train_title_set]

In [20]:
def _split_label(title):
    """Return 'TRAIN', 'TEST' or 'UNKNOWN' for a file name."""
    if title in train_titles:
        return 'TRAIN'
    if title in test_titles:
        return 'TEST'
    return 'UNKNOWN'


# Label every row with the split of the comic file it belongs to.
df['split'] = df['file_name'].map(_split_label)

#### clean dataframe

In [21]:
# Row positions to be removed from df in the next step.
# NOTE(review): these hard-coded positions depend on the row order produced
# when the files were loaded — confirm they still point at the intended rows.
wrong_idx = [1093, 26, 1447]

In [22]:
# Translate the positions in wrong_idx to index labels, drop those rows and
# renumber the index from 0.
df = df.drop(df.index[wrong_idx]).reset_index(drop=True)

In [23]:
# Re-parse row 1129 by hand: here the first blank-line-separated part is read
# as the speaker and the second as the emotion (presumably this annotation
# lists them in the opposite order to the other rows — TODO confirm).
annotation_parts = df.iloc[1129]['raw_annotation'].split("\n\n")
speaker = annotation_parts[0].split(":")[1]
emotion = annotation_parts[1].split(":")[1]

In [24]:
# Write the manually parsed values back into row 1129. Columns are resolved
# by name so a change in column order cannot redirect the write.
df.iloc[1129, df.columns.get_loc('emotion')] = emotion
df.iloc[1129, df.columns.get_loc('speaker_id')] = speaker

In [25]:
# Re-parse row 1411 by hand: here the first blank-line-separated part is read
# as the speaker and the second as the emotion (presumably this annotation
# lists them in the opposite order to the other rows — TODO confirm).
annotation_parts = df.iloc[1411]['raw_annotation'].split("\n\n")
speaker = annotation_parts[0].split(":")[1]
emotion = annotation_parts[1].split(":")[1]

In [26]:
# Write the manually parsed values back into row 1411. Columns are resolved
# by name so a change in column order cannot redirect the write.
df.iloc[1411, df.columns.get_loc('emotion')] = emotion
df.iloc[1411, df.columns.get_loc('speaker_id')] = speaker

In [27]:
# Keep only rows whose emotion could be parsed. reset_index() (no drop) also
# stores the previous row numbers in a new 'index' column.
df = df.loc[df['emotion'] != "no annotation"].reset_index()

In [28]:
# Save the utterance-level table to the current working directory (the frame's
# index is written as the first CSV column).
df.to_csv("comics_data_processed.csv")

### dataframe grouped by titles

In [29]:
# One row per comic file: its utterances and emotions collected into lists,
# plus the set of split labels seen for that file.
df_by_title = df.groupby('file_name').agg({'utterance': list, 'emotion': list, 'split': set}).reset_index()

In [30]:
def get_full_title_text(x):
    """Join all utterances of one title into a single space-separated string.

    Args:
        x: Row-like object whose ``utterance`` attribute is an iterable of
            strings.

    Returns:
        The utterances concatenated with one space between neighbours.
    """
    return ' '.join(x.utterance)

In [31]:
# Build the full transcript text of every title.
df_by_title['full_title_text'] = df_by_title.apply(get_full_title_text, axis=1)

In [32]:
# Mark the aggregated columns as lists with an "_l" suffix.
df_by_title = df_by_title.rename({'utterance': 'utterances_l', 'emotion': 'emotions_l'}, axis='columns')

In [33]:
def process_split(x):
    """Return the first element of a row's ``split`` collection.

    Args:
        x: Row-like object whose ``split`` attribute is an iterable of split
            labels.

    Returns:
        The first label obtained when iterating ``x.split``.

    Raises:
        IndexError: If ``x.split`` is empty.
    """
    labels = tuple(x.split)
    return labels[0]

In [34]:
# Collapse each title's set of split labels to a single label.
df_by_title['split'] = df_by_title.apply(process_split, axis=1)

In [36]:
def find_utterance_indices(row):
    """Locate every utterance of a title inside the title's full text.

    Utterances are searched for from left to right: each search starts where
    the previously located utterance ended, so a repeated utterance (or one
    that also occurs inside an earlier utterance) is mapped to its own
    occurrence instead of always to the first one. If nothing is found past
    that point the whole text is searched, so utterances that are not in
    textual order still resolve to their first occurrence.

    Args:
        row: Row-like object with ``full_title_text`` (str) and
            ``utterances_l`` (iterable of str).

    Returns:
        A ``(start_indices, end_indices)`` tuple of two lists with one entry
        per utterance; ``end`` is exclusive. An utterance that does not occur
        in the text yields ``-1`` in both lists.
    """
    title_text = row.full_title_text

    start_indices = []
    end_indices = []
    cursor = 0  # end of the most recently located in-order utterance

    for utterance in row.utterances_l:
        start = title_text.find(utterance, cursor)
        if start != -1:
            cursor = start + len(utterance)
        else:
            # Out-of-order utterance: fall back to a search over the whole
            # text and leave the cursor where it is.
            start = title_text.find(utterance)

        if start == -1:
            start_indices.append(-1)
            end_indices.append(-1)
        else:
            start_indices.append(start)
            end_indices.append(start + len(utterance))

    return start_indices, end_indices

In [37]:
# Store each title's (start_indices, end_indices) pair in one helper column
# whose label is the tuple ('start_indices', 'end_indices').
df_by_title[('start_indices', 'end_indices')] = df_by_title.apply(find_utterance_indices, axis=1)

In [38]:
# The previous cell stored each (start_indices, end_indices) pair in a single
# tuple-named helper column. Pop that column so it does not linger in the
# frame (and in the exported CSV), then expand the pairs into two real columns.
index_pairs = df_by_title.pop(('start_indices', 'end_indices'))
df_by_title[['start_indices', 'end_indices']] = pd.DataFrame(index_pairs.tolist(), index=df_by_title.index)

In [40]:
# Save the title-level table to the current working directory (the frame's
# index is written as the first CSV column).
df_by_title.to_csv("comics_data_by_title.csv")

### Prepare prompts

In [41]:
# Formatting Fx
# Build question
# Build answer

In [42]:
def formatting_fct(instruction="", input="", output=""):
    """Bundle one training example into an instruction/input/output record.

    Args:
        instruction: Task description shown to the model.
        input: The tagged transcript (the question).
        output: The expected answer.

    Returns:
        A dict with the keys ``"instruction"``, ``"input"`` and ``"output"``
        (in that order); every value is rendered as a string.
    """
    fields = (
        ("instruction", instruction),
        ("input", input),
        ("output", output),
    )
    # format(value) is what an f-string placeholder without a format spec does.
    return {key: format(value) for key, value in fields}

In [43]:
def build_instruction(nr_utterances):
    """Return the instruction prompt for a transcript with ``nr_utterances`` utterances.

    The wording is fixed; only the required answer length is interpolated.

    Args:
        nr_utterances: Number of utterances in the transcript; rendered into
            the prompt as the required length of the answer list.

    Returns:
        The instruction text, ending in a space followed by a newline.
    """
    # NOTE(review): the prompt asks for the JSON key "emotion_class" holding a
    # flat list of labels, while build_answer emits the key "emotion_classes"
    # holding a list of lists — confirm which format is intended.
    instruction = (
        '### You are an expert in Emotion Analysis. '
        'You are given the transcript of a comic book which contains numbered '
        'character utterances enclosed by <UT></UT> tags. '
        'Your task is to classify each utterance in the comic transcript as on '
        'the following emotion classes: "Anger" (AN), "Disgust" (DI), "Fear" (FE), '
        '"Sadness" (SA), "Surprise" (SU) or "Joy" (JO). '
        'You must return a list of emotion classes, strictly of length '
        f'{nr_utterances}, in following JSON format: '
        '{"emotion_class": ["emotion_class (str)", "emotion_class (str)" ... '
        '"emotion_class (str)"]} '
        'where each element "emotion_class (str)" is replaced by one of the '
        'following abbreviated emotion class labels: '
        '"AN", "DI", "FE", "SA", "SU" or "JO". \n'
    )
    return instruction

In [44]:
# def build_tagged_text(text, start_indices, end_indices):

#     offset = 0

#     for i, (start_i, end_i) in enumerate(zip(start_indices, end_indices)):
            
#         start_tag = "<UT" + str(i+1) + ">"
#         end_tag = "</UT" + str(i+1) + ">"
        
#         start_idx = start_i + offset
#         end_idx = end_i + offset

#         offset = offset + (len(start_tag)  + len(end_tag))
        
#         text_r = text[start_idx:end_idx]
#         new_text = start_tag + text_r + end_tag
#         text = text.replace(text_r, new_text)

#         question = f"""### Here is the comic transcript: {text}"""

#     return question

In [45]:
def build_tagged_text(utterances_l):
    """Build the question text: every utterance wrapped in numbered <UT> tags.

    Utterance ``k`` (counting from 1) becomes ``<UTk>utterance</UTk>``; the
    tagged utterances are concatenated without a separator and prefixed with
    the transcript header.

    Args:
        utterances_l: Iterable of utterance strings, in reading order.

    Returns:
        The header followed by the tagged transcript.
    """
    tagged_parts = [
        f"<UT{number}>" + utterance + f"</UT{number}>"
        for number, utterance in enumerate(utterances_l, start=1)
    ]
    return "### Here is the comic transcript: " + ''.join(tagged_parts)

In [46]:
def build_answer(title_emotions):
    """Build the JSON answer string for one title.

    Each emotion annotation is turned into a list of labels:

    * the literal ``'Neutral'`` becomes ``['Neutral']``;
    * anything else is split on ``-``; tokens containing the character ``0``
      are dropped and the last character of every remaining token is removed
      (presumably tokens look like ``AN1`` = label + intensity digit —
      TODO confirm against the annotation guidelines).

    Args:
        title_emotions: Iterable of emotion annotation strings, one per
            utterance.

    Returns:
        A JSON string of the form ``{"emotion_classes": [[...], ...]}`` with
        one inner list per utterance.
    """
    # NOTE(review): the key is "emotion_classes" and the value a list of
    # lists, whereas the prompt from build_instruction asks for the key
    # "emotion_class" with a flat list — confirm which format is intended.
    title_emotions_l = []

    for emotion in title_emotions:
        if emotion == 'Neutral':
            title_emotions_l.append([emotion])
            continue

        title_emotions_l.append(
            [token[:-1] for token in emotion.split("-") if '0' not in token]
        )

    return json.dumps({"emotion_classes": title_emotions_l})

### Build Data Files

In [47]:
# Titles assigned to the training split.
df_train = df_by_title[df_by_title['split'] == 'TRAIN'].reset_index()

# One prompt record per title: instruction (with the expected answer length),
# tagged transcript and JSON answer. Iterating the two list columns directly
# avoids iterrows() and the per-row positional lookups.
data_file_train = [
    formatting_fct(
        build_instruction(len(utterances)),
        build_tagged_text(utterances),
        build_answer(emotions),
    )
    for utterances, emotions in zip(df_train['utterances_l'], df_train['emotions_l'])
]

In [48]:
# Number of training records (one per training title).
len(data_file_train)

22

In [49]:
# Preview the first three training records; slicing avoids an IndexError when
# fewer than three titles are in the training split.
for record in data_file_train[:3]:
    print(record)

{'instruction': '### You are an expert in Emotion Analysis. You are given the transcript of a comic book which contains numbered character utterances enclosed by <UT></UT> tags. Your task is to classify each utterance in the comic transcript as on the following emotion classes: "Anger" (AN), "Disgust" (DI), "Fear" (FE), "Sadness" (SA), "Surprise" (SU) or "Joy" (JO). You must return a list of emotion classes, strictly of length 34, in following JSON format: {"emotion_class": ["emotion_class (str)", "emotion_class (str)" ... "emotion_class (str)"]} where each element "emotion_class (str)" is replaced by one of the following abbreviated emotion class labels: "AN", "DI", "FE", "SA", "SU" or "JO". \n', 'input': '### Here is the comic transcript: <UT1>In Rio de Janeiro my brakes were cut.</UT1><UT2>In Mexico l was slipped psilocybin while fighting a pack of jaguars.</UT2><UT3>And now, in SHANGHAI…</UT3><UT4>< I\'VE GOT YOU! GO LIMP! > *</UT4><UT5>< THIS DOESN\'T --MY POLYETHYLENE LINES ARE I

In [51]:
# Titles assigned to the test split.
df_test = df_by_title[df_by_title['split'] == 'TEST'].reset_index()

# One prompt record per title: instruction (with the expected answer length),
# tagged transcript and JSON answer. Iterating the two list columns directly
# avoids iterrows() and the per-row positional lookups.
data_file_test = [
    formatting_fct(
        build_instruction(len(utterances)),
        build_tagged_text(utterances),
        build_answer(emotions),
    )
    for utterances, emotions in zip(df_test['utterances_l'], df_test['emotions_l'])
]

### Create and save JSON files

In [53]:
# Write the training records as one JSON array into ../datasets, relative to
# the current working directory.
file_path = os.path.join(os.getcwd(), "../datasets/comics_train.json")

with open(file_path, 'w') as file:
    file.write(json.dumps(data_file_train))

In [54]:
# Write the test records as one JSON array into ../datasets, relative to the
# current working directory.
file_path = os.path.join(os.getcwd(), "../datasets/comics_test.json")

with open(file_path, 'w') as file:
    file.write(json.dumps(data_file_test))