# Prepare dataset for BERT fine-tuning

- Create a `DatasetDict` (train / validation / test splits) from the comics dataframe

## Libraries

In [1]:
import os
import torch
import datasets
import transformers

import pandas as pd

from pathlib import Path

from datasets import Dataset
from datasets import DatasetDict

In [2]:
# Record the library versions this notebook was executed with.
for label, module in (
    ('pandas:\t\t', pd),
    ('transformers:\t', transformers),
    ('datasets:\t', datasets),
):
    print(label, module.__version__)

pandas:		 2.2.2
transformers:	 4.44.2
datasets:	 2.21.0


### Load and Process dataframe

In [6]:
# Location of the annotated comics CSV, resolved against the current working directory.
dataset_file = Path.cwd().joinpath(
    "emotion_analysis_comics",
    "dataset_files",
    "comics_dataset.csv",
)

In [7]:
# Load the annotated utterances; index_col=False tells pandas not to use the
# first CSV column as the row index.
dataset_df = pd.read_csv(
    filepath_or_buffer=dataset_file,
    index_col=False,
)

In [8]:
# Preview the loaded dataframe (the rendered output is truncated to the first and last rows).
dataset_df

Unnamed: 0,file_name,page_nr,panel_nr,balloon_nr,utterance,raw_annotation,raw_emotion,raw_speaker_id,emotion,speaker_id,split
0,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,1,DID YOU HAVE TO ELECTROCUTE HER SO HARD?,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE3-SA0-SU5-JO0,ID-1,TEST
1,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,2,IT'S NOT LIKE I HAVE DIFFERENT SETTINGS.,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU5-JO0,ID-2,TEST
2,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,2,3,YOU'RE ELECTROCUTIONER. IT'S YOUR WHOLE THING....,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN0-DI0-FE2-SA0-SU0-JO0,ID-1,TEST
3,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,3,1,"OH, HEY. I THINK SHE'S AWAKE.",2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN0-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-2,AN0-DI0-FE0-SA0-SU4-JO0,ID-2,TEST
4,QC copy - 1500 - 04 Nightwing 19 _Nightwing 95...,1,4,1,"WELCOME BACK, MADAM MAYOR. BLOCKBUSTER IS PRET...",2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-08-27 - aselermekova20\nFeeling:AN3-DI0-F...,2024-09-05 - aidaraliev12345\nSpokenBy:ID-1,AN3-DI0-FE0-SA0-SU0-JO0,ID-1,TEST
...,...,...,...,...,...,...,...,...,...,...,...
6731,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,19,4,1,POR--,2024-08-01 - aidaraliev12345\nFeeling:AN0-DI0-...,2024-08-01 - aidaraliev12345\nFeeling:AN0-DI0-...,\n2024-08-01 - aidaraliev12345\nSpokenBy:ID-3,AN0-DI0-FE0-SA5-SU0-JO0,ID-3,TRAIN
6732,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,19,5,3,… POR FAVOR…,2024-08-01 - aidaraliev12345\nFeeling:AN0-DI0-...,2024-08-01 - aidaraliev12345\nFeeling:AN0-DI0-...,\n2024-08-01 - aidaraliev12345\nSpokenBy:ID-3,AN0-DI0-FE0-SA5-SU0-JO0,ID-3,TRAIN
6733,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,20,1,1,"COHEN, HERE--",2024-08-01 - aidaraliev12345\nFeeling:AN0-DI0-...,2024-08-01 - aidaraliev12345\nFeeling:AN0-DI0-...,\n2024-08-01 - aidaraliev12345\nSpokenBy:Liz C...,AN0-DI0-FE0-SA0-SU0-JO3,Liz Cohen,TRAIN
6734,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,20,1,2,"@AGENT COHEN, IT'S EMMA.",2024-08-01 - aidaraliev12345\nFeeling:AN0-DI0-...,2024-08-01 - aidaraliev12345\nFeeling:AN0-DI0-...,\n2024-08-01 - aidaraliev12345\nSpokenBy:Emma,AN0-DI0-FE3-SA0-SU0-JO0,Emma,TRAIN


In [9]:
# List the columns available right after loading the CSV.
dataset_df.columns

Index(['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance',
       'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion',
       'speaker_id', 'split'],
      dtype='object')

In [11]:
# Two-letter prefixes used in the encoded `emotion` column, mapped to the
# emotion names emitted in the generated tags.
emotion_map = {
    'AN': 'anger',
    'DI': 'disgust',
    'FE': 'fear',
    'SA': 'sadness',
    'SU': 'surprise',
    'JO': 'joy'
}

def extract_emotions(row):
    """Turn a row's encoded ``emotion`` string into ``"name:intensity"`` tags.

    The encoded value is either the literal ``'Neutral'`` or a ``-``-separated
    list of segments such as ``'AN0-DI0-FE3-SA0-SU5-JO0'``, where each segment
    is a two-letter prefix from ``emotion_map`` followed by an integer
    intensity.

    Args:
        row: Any object exposing an ``emotion`` attribute (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        ``['neutral']`` for a neutral annotation; otherwise a list with one
        ``"<emotion>:<intensity>"`` string per segment whose intensity is
        greater than zero, in the order the segments appear. Segments that
        cannot be parsed are reported on stdout and skipped. A missing or
        non-string value yields an empty list.
    """
    emotion_str = row.emotion

    # Empty CSV cells are loaded by pandas as NaN (a float), which has no
    # `.split`; report and skip instead of raising AttributeError mid-apply.
    if not isinstance(emotion_str, str):
        print(f"Warning: Skipping invalid emotion entry: '{emotion_str}'")
        return []

    # Tolerate stray whitespace around the whole annotation.
    emotion_str = emotion_str.strip()

    if emotion_str == 'Neutral':
        return ['neutral']

    tags = []

    for emotion in emotion_str.split('-'):
        emotion = emotion.strip()
        abbrev = emotion[:2]  # Two-letter emotion prefix
        value_part = emotion[2:]  # Intensity digits

        # isdecimal() only accepts characters that int() can parse, whereas
        # isdigit() also accepts e.g. superscript digits that make int() raise.
        if abbrev in emotion_map and value_part.isdecimal():
            value = int(value_part)
            # Zero intensity means the emotion is absent; emit no tag for it.
            if value > 0:
                tags.append(emotion_map[abbrev].lower() + ":" + str(value))
        else:
            print(f"Warning: Skipping invalid emotion entry: '{emotion}'")
    return tags

In [12]:
# Build the per-utterance tag list row by row; the function already takes a
# row, so it is passed to apply directly.
dataset_df['utterance_emotion'] = dataset_df.apply(extract_emotions, axis=1)

In [13]:
def get_unique_emotion(row):
    """Pick the single dominant emotion label for a row.

    Args:
        row: Any object exposing a ``utterance_emotion`` attribute holding a
            list of ``"name:intensity"`` tags, or ``['neutral']``.

    Returns:
        ``'neutral'`` when the list contains ``'neutral'`` or is empty (no
        emotion had a positive intensity); otherwise the name of the tag with
        the highest intensity. Ties are resolved in favour of the tag that
        appears first in the list.
    """
    utterance_emotion = row.utterance_emotion

    if 'neutral' in utterance_emotion:
        return 'neutral'

    # No tag at all means every intensity was zero; treat that as neutral
    # instead of letting max() raise ValueError on an empty sequence.
    if len(utterance_emotion) == 0:
        return 'neutral'

    # Compare intensities numerically: as strings, "9" would rank above "10".
    # max() returns the first maximal element, which keeps first-wins ties.
    strongest = max(utterance_emotion, key=lambda tag: int(tag.split(":")[1]))
    return strongest.split(":")[0]

In [14]:
# Collapse each row's tag list into one label; the function is passed to
# apply directly since it already takes a row.
dataset_df['unique_emotion'] = dataset_df.apply(get_unique_emotion, axis=1)

In [15]:
# Count how many rows carry each `unique_emotion` label.
dataset_df['unique_emotion'].value_counts()

unique_emotion
anger       1906
joy         1267
sadness     1069
fear        1013
surprise     968
neutral      422
disgust       91
Name: count, dtype: int64

In [16]:
# Confirm that `utterance_emotion` and `unique_emotion` are now part of the dataframe.
dataset_df.columns

Index(['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance',
       'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion',
       'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
      dtype='object')

In [17]:
# Partition rows by the predefined `split` column and give each part a fresh 0..n-1 index.
train_df, test_df = (
    dataset_df.loc[dataset_df['split'] == label].reset_index(drop=True)
    for label in ('TRAIN', 'TEST')
)

## Prepare Dataset object

In [26]:
# Wrap both dataframes as Hugging Face `Dataset` objects.
dataset_train, dataset_test = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [27]:
# Hold out 10% of the TRAIN rows as a validation set. The split shuffles the
# rows, so a fixed seed is passed to get the same partition on every run.
# NOTE: this cell rebinds `dataset_train`; re-running it without first
# re-running the cell that builds `dataset_train` from `train_df` would split
# the already-reduced training set again.
train_val_datasets = dataset_train.train_test_split(train_size=0.9, seed=42)
dataset_train = train_val_datasets['train']
dataset_val = train_val_datasets['test']

In [28]:
# Bundle the three partitions under their conventional split names.
dataset = DatasetDict(
    {
        "train": dataset_train,
        "test": dataset_test,
        "validation": dataset_val,
    }
)

In [29]:
# Show the resulting splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 5075
    })
    test: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 1097
    })
    validation: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 564
    })
})

In [30]:
# sanity check: distinct values of the original `split` column found in each partition
tuple(set(dataset[name]['split']) for name in ('train', 'test', 'validation'))

({'TRAIN'}, {'TEST'}, {'TRAIN'})

## Save dataset

In [31]:
# Persist the DatasetDict next to the BERT experiments.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first.
output_dir = "emotion_analysis_comics/bert/datasets/"
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): torch.save pickles the object. Only load this file from a
# trusted location, in an environment that has the `datasets` package
# installed. `dataset.save_to_disk(...)` would be the library-native format,
# but the .pt file is kept so existing consumers of this path keep working.
torch.save(dataset, os.path.join(output_dir, 'comics_dataset_complete.pt'))