# Prepare the dataset for BERT fine-tuning

- Create a `datasets.DatasetDict` (train / validation / test) from the comics dataframe

## Libraries

In [1]:
import os
import torch
import datasets
import transformers

import pandas as pd

from pathlib import Path

from datasets import Dataset
from datasets import DatasetDict

In [2]:
# Record the library versions this notebook was run with.
print(f"pandas:\t\t {pd.__version__}")
print(f"transformers:\t {transformers.__version__}")
print(f"datasets:\t {datasets.__version__}")

pandas:		 2.2.2
transformers:	 4.44.2
datasets:	 2.21.0


## Load and process dataframe

In [6]:
# Location of the comics CSV, resolved against the current working directory.
dataset_file = Path.cwd().joinpath(
    "emotion_analysis_comics", "dataset_files", "comics_dataset.csv"
)

In [7]:
# Read the CSV into a dataframe; index_col=False keeps pandas from using
# the first column as the index.
dataset_df = pd.read_csv(filepath_or_buffer=dataset_file, index_col=False)

In [11]:
# Two-letter codes found in the `emotion` column, mapped to the label
# used in the generated tags.
emotion_map = {
    'AN': 'anger',
    'DI': 'disgust',
    'FE': 'fear',
    'SA': 'sadness',
    'SU': 'surprise',
    'JO': 'joy'
}


def extract_emotions(row):
    """Convert a row's encoded emotion string into a list of tags.

    The string is either the literal ``'Neutral'`` or a ``'-'``-separated
    sequence of entries, each made of a two-letter code from ``emotion_map``
    followed by a non-negative integer value (e.g. ``'AN0-FE2-JO1'``).

    Args:
        row: Object exposing an ``emotion`` string attribute (e.g. a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        ``['neutral']`` for the ``'Neutral'`` string; otherwise a list of
        ``"<label>:<value>"`` tags, one per entry whose value is greater
        than zero, in the order they appear. Entries with an unknown code
        or a non-integer value are skipped with a printed warning.
    """
    emotion_str = row.emotion

    if emotion_str == 'Neutral':
        return ['neutral']

    tags = []
    for emotion in emotion_str.split('-'):
        abbrev, value_part = emotion[:2], emotion[2:]

        # isdecimal() only accepts characters that int() can parse;
        # isdigit() also accepts e.g. superscript digits, on which int()
        # raises ValueError instead of the entry being skipped.
        if abbrev not in emotion_map or not value_part.isdecimal():
            print(f"Warning: Skipping invalid emotion entry: '{emotion}'")
            continue

        value = int(value_part)
        # Entries with a value of zero are dropped without a warning.
        if value > 0:
            tags.append(f"{emotion_map[abbrev].lower()}:{value}")

    return tags

In [12]:
# Row-wise: turn each encoded emotion string into its list of tags.
dataset_df['utterance_emotion'] = dataset_df.apply(extract_emotions, axis=1)

In [13]:
def get_unique_emotion(row):
    """Return the single dominant emotion label for a row.

    ``row.utterance_emotion`` is expected to be a list of tags: either
    ``'neutral'`` or strings of the form ``"<label>:<value>"`` with an
    integer value.

    Args:
        row: Object exposing an ``utterance_emotion`` attribute (e.g. a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        ``'neutral'`` as soon as a ``'neutral'`` tag is encountered,
        otherwise the label of the tag with the highest value. Ties go to
        the tag that appears first in the list.

    Raises:
        ValueError: If the list holds no tags, or a value is not an integer.
        IndexError: If a tag has no ``":"`` separator.
    """
    scored = []
    for element in row.utterance_emotion:
        if element == 'neutral':
            return 'neutral'
        parts = element.split(":")
        # Compare values numerically: as raw strings "9" would rank
        # above "10".
        scored.append((parts[0], int(parts[1])))

    if not scored:
        raise ValueError("utterance_emotion contains no emotion tags")

    # max() returns the first maximal item, so ties keep the earliest tag.
    label, _ = max(scored, key=lambda pair: pair[1])
    return label

In [14]:
# Row-wise: reduce each list of tags to one emotion label.
dataset_df['unique_emotion'] = dataset_df.apply(get_unique_emotion, axis=1)

In [17]:
# Partition the rows by the value of their 'split' column and renumber
# each subset from 0.
train_df = dataset_df.loc[dataset_df['split'] == 'TRAIN'].reset_index(drop=True)
test_df = dataset_df.loc[dataset_df['split'] == 'TEST'].reset_index(drop=True)

## Prepare Dataset object

In [26]:
# Wrap both dataframes as `datasets.Dataset` objects.
dataset_train, dataset_test = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [27]:
# Hold out 10% of the TRAIN rows for validation. The fixed seed makes the
# shuffle, and therefore the train/validation membership, reproducible
# across runs.
train_val_datasets = dataset_train.train_test_split(train_size=0.9, seed=42)
dataset_train = train_val_datasets['train']
# train_test_split names the held-out part 'test'; here it is used as
# the validation set.
dataset_val = train_val_datasets['test']

In [28]:
# Bundle the three splits under their names.
dataset = DatasetDict(
    train=dataset_train,
    test=dataset_test,
    validation=dataset_val,
)

In [29]:
# Display the DatasetDict summary (split names, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 5075
    })
    test: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 1097
    })
    validation: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 564
    })
})

In [30]:
# Sanity check: distinct values of the original 'split' column inside each
# split, in the order train, test, validation.
tuple(set(dataset[name]['split']) for name in ('train', 'test', 'validation'))

({'TRAIN'}, {'TEST'}, {'TRAIN'})

## Save dataset

In [31]:
# Make sure the target directory exists; torch.save does not create it.
output_dir = Path("emotion_analysis_comics/bert/datasets/")
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE: torch.save pickles the DatasetDict, so the file must be read back
# with torch.load and should only be loaded from a trusted source.
torch.save(dataset, output_dir / 'comics_dataset_complete.pt')