# Prepare dataset for BERT fine-tuning

- Create the train/validation/test dataset from the comics dataframe

## Libraries

In [1]:
import os
import torch
import pickle
import datasets
import transformers

import numpy as np
import pandas as pd

from transformers import BertTokenizer

from datasets import concatenate_datasets
from datasets import Dataset
from datasets import ClassLabel
from datasets import DatasetDict

In [2]:
# Print the installed versions of the main libraries used by this notebook.
print('pandas:\t\t', pd.__version__)
print('transformers:\t', transformers.__version__)
print('datasets:\t', datasets.__version__)

pandas:		 2.2.2
transformers:	 4.44.2
datasets:	 2.21.0


### Load and Process dataframe

In [6]:
# Load the processed comics CSV into a dataframe (path is relative to the
# working directory the notebook is run from).
dataset_df = pd.read_csv("emotion_analysis_comics/bert/datasets/comics_data_processed.csv")

In [8]:
# Discard the first two columns of the loaded CSV.
# NOTE(review): presumably leftover index columns from earlier saves — confirm.
leading_columns = [dataset_df.columns[position] for position in (0, 1)]
dataset_df = dataset_df.drop(columns=leading_columns)

In [9]:
dataset_df.columns

Index(['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance',
       'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion',
       'speaker_id', 'split'],
      dtype='object')

In [14]:
# Two-letter annotation codes and the emotion name each one stands for.
emotion_map = {
    'AN': 'anger',
    'DI': 'disgust',
    'FE': 'fear',
    'SA': 'sadness',
    'SU': 'surprise',
    'JO': 'joy',
}


def extract_emotions(row):
    """Turn a row's ``emotion`` string into a list of emotion tags.

    The string ``'Neutral'`` maps to ``['neutral']``. Any other value is split
    on ``'-'`` into entries made of a two-letter code followed by digits
    (e.g. ``'AN3'``). Each entry whose code is in ``emotion_map`` and whose
    value is a positive integer becomes a ``'<emotion>:<value>'`` tag.

    Entries with a value of 0 are dropped silently; entries with an unknown
    code or a non-numeric value are skipped with a printed warning.

    Args:
        row: Object exposing the annotation string as ``row.emotion``.

    Returns:
        List of tag strings, in the order the entries appear; may be empty.
    """
    annotation = row.emotion
    if annotation == 'Neutral':
        return ['neutral']

    tags = []
    for emotion in annotation.split('-'):
        code, digits = emotion[:2], emotion[2:]

        # Guard clause: report malformed entries and move on to the next one.
        if code not in emotion_map or not digits.isdigit():
            print(f"Warning: Skipping invalid emotion entry: '{emotion}'")
            continue

        intensity = int(digits)
        if intensity > 0:
            tags.append(f"{emotion_map[code].lower()}:{intensity}")

    return tags

In [15]:
# Row-wise: parse each row's `emotion` string into its list of emotion tags.
dataset_df['utterance_emotion'] = dataset_df.apply(extract_emotions, axis=1)

In [32]:
def get_unique_emotion(row):
    """Return the single dominant emotion name for a row.

    ``row.utterance_emotion`` is a list of tags that are either the bare string
    ``'neutral'`` or of the form ``'<emotion>:<value>'``. If a ``'neutral'``
    tag is encountered, ``'neutral'`` is returned immediately. Otherwise the
    emotion with the largest value is returned; on ties the earliest tag wins.

    Values are compared as integers. Comparing the raw strings would rank
    ``'9'`` above ``'10'`` and pick the wrong emotion for multi-digit values.

    Args:
        row: Object exposing the tag list as ``row.utterance_emotion``.

    Returns:
        The dominant emotion name (the part of the tag before ``':'``).

    Raises:
        ValueError: If the tag list is empty or a tag's value is not an integer.
        IndexError: If a non-neutral tag contains no ``':'`` separator.
    """
    best_name = None
    best_value = None

    for tag in row.utterance_emotion:
        if tag == 'neutral':
            return 'neutral'

        parts = tag.split(":")
        value = int(parts[1])

        # Strict '>' keeps the first tag when several share the top value.
        if best_value is None or value > best_value:
            best_name, best_value = parts[0], value

    if best_value is None:
        raise ValueError("utterance_emotion contains no emotion tags")

    return best_name

In [33]:
# Row-wise: reduce each row's tag list to one emotion name.
dataset_df['unique_emotion'] = dataset_df.apply(get_unique_emotion, axis=1)

In [34]:
dataset_df['unique_emotion'].value_counts()

unique_emotion
anger       1462
joy          884
surprise     876
fear         859
sadness      767
neutral      343
disgust       91
Name: count, dtype: int64

In [35]:
dataset_df.columns

Index(['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance',
       'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion',
       'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
      dtype='object')

In [36]:
# Separate the rows by their `split` label; each part gets a fresh 0..n-1 index.
split_labels = dataset_df['split']
train_df = dataset_df.loc[split_labels == 'TRAIN'].reset_index(drop=True)
test_df = dataset_df.loc[split_labels == 'TEST'].reset_index(drop=True)

## Prepare Dataset object

In [37]:
# Wrap the pandas train/test frames in `datasets.Dataset` objects.
dataset_train = Dataset.from_pandas(train_df)
dataset_test = Dataset.from_pandas(test_df)

In [38]:
# Hold out 20% of the TRAIN rows as a validation set. The seed is fixed so the
# same rows land in each subset on every run; without it the saved dataset
# would differ between executions of the notebook.
# NOTE: `dataset_train` is overwritten below, so re-running this cell alone
# splits the already reduced training set again.
train_val_datasets = dataset_train.train_test_split(train_size=0.8, seed=42)
dataset_train = train_val_datasets['train']
dataset_val = train_val_datasets['test']

In [39]:
# Bundle the three subsets under the keys "train", "test" and "validation".
dataset = DatasetDict({"train": dataset_train, "test": dataset_test, "validation": dataset_val})

In [40]:
dataset

DatasetDict({
    train: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 2804
    })
    test: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 1776
    })
    validation: Dataset({
        features: ['file_name', 'page_nr', 'panel_nr', 'balloon_nr', 'utterance', 'raw_annotation', 'raw_emotion', 'raw_speaker_id', 'emotion', 'speaker_id', 'split', 'utterance_emotion', 'unique_emotion'],
        num_rows: 702
    })
})

In [41]:
# Sanity check: show the distinct `split` labels present in each subset
# (train and validation should only hold TRAIN rows, test only TEST rows).
tuple(set(dataset[subset]['split']) for subset in ('train', 'test', 'validation'))

({'TRAIN'}, {'TEST'}, {'TRAIN'})

## Save dataset

In [42]:
# Write the whole DatasetDict to 'comics_dataset.pt' with torch.save.
# NOTE(review): torch.save pickles the object, so the file presumably has to be
# read back with torch.load in an environment that has `datasets` installed,
# and possibly with weights_only=False on newer torch versions — confirm
# against the loading code. Only load such files from trusted sources.
torch.save(dataset, os.path.join("emotion_analysis_comics/bert/datasets/", 'comics_dataset.pt'))