In [4]:
import os
import pandas as pd
from pathlib import Path

### Paths

In [5]:
PATH = Path.cwd()

In [6]:
# Dataset root: <parent of the working directory>/multimodal_er/EmoComics35
EC35_DIR = Path.cwd().parent.joinpath("multimodal_er", "EmoComics35")
# Folder holding the .xlsx workbooks that are read below
XLS_DIR = EC35_DIR.joinpath("raw_files", "xls")
# Folder holding the CSV files this notebook reads and writes
DATA_FILES_DIR = EC35_DIR.joinpath("data_files")

### Read xlsx files and build dataframe

In [7]:
# Collect the Excel workbooks in a stable, sorted order: os.listdir returns
# entries in arbitrary order, and the row order of the concatenated frame
# (hence the order of source_file.unique() used for the title mapping)
# follows this list.
# Names starting with "~$" are the lock files Excel leaves next to an open
# workbook; they are not readable workbooks, so they are skipped.
xlsx_files = sorted(
    f for f in os.listdir(XLS_DIR)
    if f.endswith('.xlsx') and not f.startswith('~$')
)

In [8]:
len(xlsx_files)

35

In [9]:
# Load every workbook into its own frame and tag each row with the name of
# the file it came from, so the origin survives the later concatenation.
dfs = [
    pd.read_excel(os.path.join(XLS_DIR, file)).assign(source_file=file)
    for file in xlsx_files
]

In [10]:
# Stack the per-file frames into one table; ignore_index=True discards the
# per-file row labels and numbers the rows consecutively.
ec35_df = pd.concat(dfs, ignore_index=True)

In [11]:
# Keep only rows that have a value in "Annotations", then renumber the index.
ec35_df = ec35_df.dropna(subset=["Annotations"]).reset_index(drop=True)

In [None]:
# Persist the raw concatenation so the Post-process section can reload it.
# The write is skipped when the file already exists, so re-running the
# notebook never overwrites an earlier export, while a fresh checkout still
# gets the file the next section reads.
raw_csv_path = DATA_FILES_DIR / "emocomics35_raw.csv"
if not raw_csv_path.exists():
    raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
    ec35_df.to_csv(raw_csv_path)

### Post-process

In [52]:
# Reload the raw table from disk; the first CSV column is used as the index.
ec35_df = pd.read_csv(DATA_FILES_DIR / "emocomics35_raw.csv", index_col=0)

In [53]:
ec35_df.iloc[2582]["Annotations"]

'2024-08-20 - aidaraliev12345\nFeeling:AN3-DI0-FE0-SA0-SU0-JO0\n\n\n2024-08-20 - aidaraliev12345\nSpokenBy:ID- 42'

In [54]:
ec35_df.iloc[0]["Annotations"].split("\n")[1].split(":")[1]

'AN5-DI0-FE0-SA0-SU0-JO0'

In [55]:
#### File Nr

In [56]:
def get_file_nr(row):
    """Return the second "-"-separated piece of the row's ``source_file``.

    The file name is cut at every "-" and the piece at position 1 is
    returned with surrounding whitespace removed. An ``IndexError`` is
    raised when the name contains no "-".
    """
    pieces = row["source_file"].split("-")
    return pieces[1].strip()

In [57]:
# Row-wise: derive each row's file number from its source_file name.
ec35_df['file_nr'] = ec35_df.apply(get_file_nr, axis=1)

In [58]:
len(ec35_df.file_nr.unique().tolist())

35

In [59]:
#### Speaker

In [60]:
def get_speaker_name(row):
    """Extract the speaker value from the last line of the row's annotation.

    The last line of ``row["Annotations"]`` is split at its first ":" and
    everything after that colon is returned unchanged (no whitespace
    stripping), e.g. ``"SpokenBy:ID- 42"`` yields ``"ID- 42"``.

    Parameters
    ----------
    row : pandas.Series
        A DataFrame row with an ``Annotations`` entry.

    Returns
    -------
    str or None
        The text after the first ":" on the last line, or ``None`` (after
        printing a diagnostic) when the annotation is not a string or its
        last line contains no ":".
    """
    try:
        last_line = row["Annotations"].split("\n")[-1]
        # Split once only, so a value that itself contains ":" is kept whole
        # instead of being cut at its first colon.
        return last_line.split(":", 1)[1]

    # AttributeError: the annotation has no .split (e.g. NaN);
    # TypeError: the annotation is not text (e.g. bytes);
    # IndexError: the last line has no ":" separator.
    except (AttributeError, IndexError, TypeError) as e:
        print(f"Error at index {row.name},{row.Annotations}: {e}")
        return None  # row could not be parsed


In [61]:
# Row-wise: pull the speaker out of each row's annotation text.
ec35_df["speaker"] = ec35_df.apply(get_speaker_name, axis=1)

In [62]:
#### Raw Emotion --> Emotion

In [63]:
def get_raw_emotions(row):
    """Return the raw emotion string from the second line of the annotation.

    The second line of ``row["Annotations"]`` is cut at every ":" and the
    piece at position 1 is returned, e.g. ``"Feeling:AN5-DI0-FE0-SA0-SU0-JO0"``
    yields ``"AN5-DI0-FE0-SA0-SU0-JO0"``.

    Any exception raised while parsing is reported with ``print`` and the
    function returns ``None`` instead.
    """
    try:
        annotation_lines = row["Annotations"].split("\n")
        feeling_fields = annotation_lines[1].split(":")
        return feeling_fields[1]
    except Exception as e:
        print(f"Error at index {row.name}\n{row.Annotations}: {e}\n")
        return None  # row could not be parsed
    
    

In [64]:
# Row-wise: pull the raw emotion string out of each row's annotation text.
ec35_df["raw_emotions"] = ec35_df.apply(get_raw_emotions, axis=1)

In [None]:
# Report every row that still holds a missing value, and in which columns.
missing_mask = ec35_df.isna()
nan_rows = ec35_df[missing_mask.any(axis=1)]

if nan_rows.empty:
    print("No NaN values found in the DataFrame.")
else:
    print("Rows with NaN values found:")
    for index, row in nan_rows.iterrows():
        # Labels of the columns that are missing in this particular row
        nan_columns = [column for column, is_missing in row.isna().items() if is_missing]
        print(f"Index {index}: NaN in columns {nan_columns}")

In [71]:
### NaNs only in the Spanish column

In [72]:
#### Emotions

In [73]:
# Two-letter codes found in the raw emotion string -> full emotion label.
emotion_map = dict(
    AN="Anger",
    DI="Disgust",
    FE="Fear",
    SA="Sadness",
    SU="Surprise",
    JO="Joy",
)

In [79]:
def get_emotion(row):
    """Translate a row's ``raw_emotions`` string into emotion labels.

    ``raw_emotions`` is a "-"-separated sequence of items such as ``"AN3"``:
    a two-letter code followed by a numeric value. Every code that is a key
    of ``emotion_map`` and whose value is greater than 0 contributes its
    label, in the order the codes appear.

    Parameters
    ----------
    row : pandas.Series
        A DataFrame row exposing a ``raw_emotions`` attribute.

    Returns
    -------
    list of str, or str
        * the list of labels with a value above 0 (possibly empty);
        * the *string* ``"[Neutral]"`` (not a list) when ``raw_emotions`` is
          exactly ``"Neutral"`` -- kept as-is for compatibility;
        * an empty list when ``raw_emotions`` is not a string (``None``
          from a failed ``get_raw_emotions`` parse, or NaN), so such rows are
          treated like rows without any emotion instead of raising
          ``AttributeError``.
    """
    raw = row.raw_emotions

    # Nothing to parse: behave like "no emotion annotated".
    if not isinstance(raw, str):
        return []

    if raw == "Neutral":
        return "[Neutral]"

    mapped_emotions = []
    for pair in raw.split("-"):
        code, value_str = pair[:2], pair[2:]
        # Skip unknown codes and items whose value part is not purely digits.
        if code in emotion_map and value_str.isdigit() and int(value_str) > 0:
            mapped_emotions.append(emotion_map[code])

    return mapped_emotions

In [80]:
# Row-wise: translate each raw emotion string into emotion labels.
ec35_df["emotions"] = ec35_df.apply(get_emotion, axis=1)

In [83]:
ec35_df.emotions.value_counts()

emotions
[Joy]                               1135
[Anger]                             1082
[Sadness]                            754
[Surprise]                           517
[Fear, Surprise]                     460
[Fear]                               441
[Neutral]                            439
[Fear, Sadness]                      307
[Anger, Sadness]                     276
[Anger, Surprise]                    242
[Anger, Fear]                        242
[Surprise, Joy]                      236
[Anger, Disgust]                     153
[Sadness, Surprise]                  150
[Anger, Joy]                         132
[Sadness, Joy]                       108
[Anger, Fear, Surprise]               98
[Anger, Fear, Sadness]                59
[Fear, Sadness, Surprise]             46
[Disgust, Surprise]                   31
[Fear, Joy]                           30
[Disgust, Sadness]                    28
[Disgust]                             27
[Anger, Disgust, Sadness]             24
[Anger,

In [90]:
# Keep a row unless its "emotions" value is an empty list; values that are
# not lists are always kept. The index is renumbered afterwards.
keep_mask = ec35_df["emotions"].apply(lambda x: not isinstance(x, list) or len(x) != 0)
ec35_df = ec35_df.loc[keep_mask].reset_index(drop=True)

In [None]:
# 3 samples have no emotion annotation (empty list) and were dropped, bringing the dataset back to 7129 samples.

In [92]:
ec35_df.emotions.value_counts()

emotions
[Joy]                               1135
[Anger]                             1082
[Sadness]                            754
[Surprise]                           517
[Fear, Surprise]                     460
[Fear]                               441
[Neutral]                            439
[Fear, Sadness]                      307
[Anger, Sadness]                     276
[Anger, Surprise]                    242
[Anger, Fear]                        242
[Surprise, Joy]                      236
[Anger, Disgust]                     153
[Sadness, Surprise]                  150
[Anger, Joy]                         132
[Sadness, Joy]                       108
[Anger, Fear, Surprise]               98
[Anger, Fear, Sadness]                59
[Fear, Sadness, Surprise]             46
[Disgust, Surprise]                   31
[Fear, Joy]                           30
[Disgust, Sadness]                    28
[Disgust]                             27
[Anger, Disgust, Sadness]             24
[Anger,

In [93]:
#### Split

In [None]:
# source_file names assigned to the TRAIN split; the next cell labels every
# row whose source_file is not listed here as TEST.
# NOTE(review): the list is empty here although the recorded split counts
# show TRAIN rows -- confirm the entries were not lost.
train_files = [
    
]

In [95]:
# Rows coming from a file listed in train_files are TRAIN, all others TEST.
is_train = ec35_df['source_file'].isin(train_files)
ec35_df['split'] = is_train.map({True: 'TRAIN', False: 'TEST'})

In [97]:
ec35_df.split.value_counts()

split
TRAIN    5803
TEST     1326
Name: count, dtype: int64

In [98]:
### Title

In [None]:
# Comic-book titles, paired by position with ec35_df.source_file.unique()
# when the mapping is built below, so the order here matters.
# NOTE(review): the list is empty here although the recorded len(titles)
# is 35 -- confirm the entries were not lost.
titles = []

In [117]:
len(titles)

35

In [118]:
len(ec35_df.source_file.unique().tolist())

35

In [112]:
# Titles are paired with source files purely by position, so both sequences
# must have the same length; zip() would otherwise silently drop the surplus
# and leave some files without a title.
source_files = ec35_df.source_file.unique().tolist()
if len(source_files) != len(titles):
    raise ValueError(
        f"Expected one title per source file, got {len(titles)} titles "
        f"for {len(source_files)} source files"
    )
mapping = dict(zip(source_files, titles))

In [113]:
# Look up each row's title through its source_file name; names missing from
# the mapping yield NaN.
ec35_df['comicbook_title'] = ec35_df['source_file'].map(mapping)

In [126]:
ec35_df.shape

(7129, 19)

In [127]:
# Write the post-processed table to the data_files folder as emocomics35.csv.
ec35_df.to_csv(DATA_FILES_DIR / "emocomics35.csv")