In [128]:
import os
import pandas as pd
from pathlib import Path

### Paths

In [129]:
# Directory the notebook kernel was started in.
# NOTE(review): PATH is not referenced again in the visible cells — confirm it is still needed.
PATH = Path.cwd()

In [130]:
# Dataset root: a sibling of the current working directory.
EC35_DIR = Path.cwd().parent.joinpath("multimodal_er", "EmoComics35")
# Folder holding the per-comic annotation spreadsheets.
XLS_DIR = EC35_DIR.joinpath("raw_files", "xls")
# Folder the CSV files are read from and written to.
DATA_FILES_DIR = EC35_DIR.joinpath("data_files")

### Read xlsx files and build dataframe

In [7]:
# os.listdir() returns names in arbitrary, filesystem-dependent order, so the
# names are sorted to make the row order of the combined frame reproducible.
# Excel lock files ("~$<name>.xlsx", created while a workbook is open) also end
# in ".xlsx" but are not readable workbooks, so they are skipped.
xlsx_files = sorted(
    f for f in os.listdir(XLS_DIR)
    if f.endswith('.xlsx') and not f.startswith('~$')
)

In [8]:
len(xlsx_files)

35

In [9]:
# Read every spreadsheet into its own frame and tag each row with the name of
# the file it came from.
dfs = []
for file in xlsx_files:
    file_path = os.path.join(XLS_DIR, file)
    df = pd.read_excel(file_path).assign(source_file=file)
    dfs.append(df)

In [10]:
# Stack the per-file frames into a single frame with a fresh 0..n-1 index.
ec35_df = pd.concat(dfs, ignore_index=True)

In [11]:
# Keep only the rows that carry an annotation, then renumber the index.
ec35_df = ec35_df[ec35_df["Annotations"].notna()].reset_index(drop=True)

In [None]:
#ec35_df.to_csv(DATA_FILES_DIR / "emocomics35_raw.csv")

### Post-process

In [131]:
# Load the raw dump from disk, using the first CSV column as the index.
# This rebinds ec35_df, replacing any frame built in memory by earlier cells.
ec35_df = pd.read_csv(DATA_FILES_DIR / "emocomics35_raw.csv", index_col=0)

In [132]:
ec35_df.iloc[2582]["Annotations"]

'2024-08-20 - aidaraliev12345\nFeeling:AN3-DI0-FE0-SA0-SU0-JO0\n\n\n2024-08-20 - aidaraliev12345\nSpokenBy:ID- 42'

In [133]:
ec35_df.iloc[0]["Annotations"].split("\n")[1].split(":")[1]

'AN5-DI0-FE0-SA0-SU0-JO0'

In [134]:
#### File Nr

In [135]:
def get_file_nr(row):
    """Return the file number embedded in ``row["source_file"]``.

    The file name is split on "-" and the second piece is returned with
    surrounding whitespace removed, e.g.
    "QC copy - 1569 - 36 Fantasmas vol. 1 - Ghosted 8.xlsx" -> "1569".
    Raises IndexError when the name contains no "-".
    """
    pieces = row["source_file"].split("-")
    second_piece = pieces[1]
    return second_piece.strip()

In [136]:
# Row-wise extraction of the file number; the function is passed directly.
ec35_df["file_nr"] = ec35_df.apply(get_file_nr, axis=1)

In [137]:
len(ec35_df.file_nr.unique().tolist())

35

In [138]:
#### Speaker

In [139]:
def get_speaker_name(row):
    """Return the speaker recorded on the last line of ``row["Annotations"]``.

    The annotation is split on newlines, the final line is split on ":" and
    the second field is returned, e.g. "SpokenBy:ID- 42" -> "ID- 42".
    Any failure (no ":" on the last line, non-string annotation, ...) is
    reported on stdout and yields None.

    NOTE(review): the lookup is positional — it assumes the speaker entry is
    the last line of the annotation; confirm this holds for every sample.
    """
    try:
        annotation_lines = row["Annotations"].split("\n")
        last_line = annotation_lines[-1]
        fields = last_line.split(":")
        speaker = fields[1]
    except Exception as e:
        print(f"Error at index {row.name},{row.Annotations}: {e}")
        speaker = None  # best-effort: keep the row and mark the speaker as missing
    return speaker


In [140]:
# Row-wise extraction of the speaker; the function is passed directly.
ec35_df["speaker"] = ec35_df.apply(get_speaker_name, axis=1)

In [141]:
#### Raw Emotion --> Emotion

In [142]:
def get_raw_emotions(row):
    """Return the raw emotion code string from ``row["Annotations"]``.

    The second line of the annotation is split on ":" and its second field is
    returned, e.g. "Feeling:AN5-DI0-FE0-SA0-SU0-JO0" -> "AN5-DI0-FE0-SA0-SU0-JO0".
    Any failure (fewer than two lines, no ":" on the second line, non-string
    annotation, ...) is reported on stdout and yields None.

    NOTE(review): the lookup is positional — it assumes the emotion entry is
    the second line of the annotation; confirm this holds for every sample.
    """
    try:
        # Only the first two lines matter, so stop splitting after them.
        second_line = row["Annotations"].split("\n", 2)[1]
        raw_emotions = second_line.split(":", 2)[1]
    except Exception as e:
        print(f"Error at index {row.name}\n{row.Annotations}: {e}\n")
        raw_emotions = None  # best-effort: keep the row and mark the value as missing
    return raw_emotions
    
    

In [143]:
# Row-wise extraction of the raw emotion string; the function is passed directly.
ec35_df["raw_emotions"] = ec35_df.apply(get_raw_emotions, axis=1)

In [144]:
# Collect every row that has a missing value in at least one column.
nan_rows = ec35_df.loc[ec35_df.isna().any(axis=1)]

if nan_rows.empty:
    print("No NaN values found in the DataFrame.")
else:
    print("Rows with NaN values found:")
    for index, row in nan_rows.iterrows():
        # Labels of the columns whose value is missing in this row.
        nan_columns = [column for column, is_missing in row.isna().items() if is_missing]
        print(f"Index {index}: NaN in columns {nan_columns}")

Rows with NaN values found:
Index 5: NaN in columns ['Translated text in Spanish; Castilian']
Index 27: NaN in columns ['Translated text in Spanish; Castilian']
Index 33: NaN in columns ['Translated text in Spanish; Castilian']
Index 115: NaN in columns ['Translated text in Spanish; Castilian']
Index 117: NaN in columns ['Translated text in Spanish; Castilian']
Index 119: NaN in columns ['Translated text in Spanish; Castilian']
Index 120: NaN in columns ['Translated text in Spanish; Castilian']
Index 125: NaN in columns ['Translated text in Spanish; Castilian']
Index 280: NaN in columns ['Translated text in Spanish; Castilian']
Index 283: NaN in columns ['Translated text in Spanish; Castilian']
Index 292: NaN in columns ['Translated text in Spanish; Castilian']
Index 357: NaN in columns ['Translated text in Spanish; Castilian']
Index 398: NaN in columns ['Translated text in Spanish; Castilian']
Index 404: NaN in columns ['Translated text in Spanish; Castilian']
Index 421: NaN in column

In [145]:
### NaNs occur only in the Spanish translation column

In [146]:
#### Emotions

In [147]:
# Two-letter codes used in the raw annotation strings -> emotion names.
emotion_map = dict(
    AN="Anger",
    DI="Disgust",
    FE="Fear",
    SA="Sadness",
    SU="Surprise",
    JO="Joy",
)

In [148]:
def get_emotion(row, code_to_name=None):
    """Translate ``row.raw_emotions`` into emotion names.

    Parameters
    ----------
    row : object with a ``raw_emotions`` attribute (e.g. a DataFrame row)
        ``raw_emotions`` is either "Neutral" or "-"-separated pairs made of a
        two-letter code followed by an integer, e.g. "AN3-DI0-FE0-SA0-SU0-JO0".
    code_to_name : dict, optional
        Mapping from two-letter code to emotion name. Defaults to the
        notebook-level ``emotion_map``.

    Returns
    -------
    list of str
        Names of the emotions whose value is greater than zero, in the order
        they appear in the raw string. The list is empty when nothing matches
        or when ``raw_emotions`` is not a string (None / NaN), which previously
        raised AttributeError.
    str
        The literal "[Neutral]" when ``raw_emotions`` is exactly "Neutral".
        NOTE(review): this is a string, not a list like the other results —
        kept unchanged so the exported values stay the same; confirm that
        downstream readers expect it.
    """
    if code_to_name is None:
        code_to_name = emotion_map

    raw = row.raw_emotions

    # A failed extraction (None) or a value read back from CSV (NaN) carries
    # no emotion information.
    if not isinstance(raw, str):
        return []

    if raw == "Neutral":
        return "[Neutral]"

    mapped_emotions = []
    for pair in raw.split("-"):
        code = pair[:2]       # two-letter emotion code
        value_str = pair[2:]  # value, still as text

        # isdecimal() only accepts characters that int() can parse, unlike
        # isdigit(), which also accepts e.g. superscript digits.
        if code in code_to_name and value_str.isdecimal() and int(value_str) > 0:
            mapped_emotions.append(code_to_name[code])

    return mapped_emotions

In [149]:
# Row-wise translation of the raw codes; the function is passed directly.
ec35_df["emotions"] = ec35_df.apply(get_emotion, axis=1)

In [152]:
ec35_df.emotions.value_counts()

emotions
[Joy]                               1135
[Anger]                             1082
[Sadness]                            754
[Surprise]                           517
[Fear, Surprise]                     460
[Fear]                               441
[Neutral]                            439
[Fear, Sadness]                      307
[Anger, Sadness]                     276
[Anger, Surprise]                    242
[Anger, Fear]                        242
[Surprise, Joy]                      236
[Anger, Disgust]                     153
[Sadness, Surprise]                  150
[Anger, Joy]                         132
[Sadness, Joy]                       108
[Anger, Fear, Surprise]               98
[Anger, Fear, Sadness]                59
[Fear, Sadness, Surprise]             46
[Disgust, Surprise]                   31
[Fear, Joy]                           30
[Disgust, Sadness]                    28
[Disgust]                             27
[Anger, Disgust, Sadness]             24
[Anger,

In [153]:
# Keep a row unless its "emotions" value is an empty list, then renumber the index.
has_emotions = ec35_df["emotions"].apply(
    lambda value: not isinstance(value, list) or len(value) != 0
)
ec35_df = ec35_df[has_emotions].reset_index(drop=True)

In [154]:
# 3 samples have no emotion annotation; dropping them brings the dataset back to 7129 samples.

In [155]:
ec35_df.emotions.value_counts()

emotions
[Joy]                               1135
[Anger]                             1082
[Sadness]                            754
[Surprise]                           517
[Fear, Surprise]                     460
[Fear]                               441
[Neutral]                            439
[Fear, Sadness]                      307
[Anger, Sadness]                     276
[Anger, Surprise]                    242
[Anger, Fear]                        242
[Surprise, Joy]                      236
[Anger, Disgust]                     153
[Sadness, Surprise]                  150
[Anger, Joy]                         132
[Sadness, Joy]                       108
[Anger, Fear, Surprise]               98
[Anger, Fear, Sadness]                59
[Fear, Sadness, Surprise]             46
[Disgust, Surprise]                   31
[Fear, Joy]                           30
[Disgust, Sadness]                    28
[Disgust]                             27
[Anger, Disgust, Sadness]             24
[Anger,

In [156]:
#### Split

In [157]:
# Spreadsheets assigned to the training split; any other file is treated as test.
train_files = [
    "QC copy - 1569 - 36 Fantasmas vol. 1 - Ghosted 8.xlsx",
    "QC copy - 1642 - 23 American Vampire vol. 4 - AVC2 6.xlsx",
    "QC copy - 1567 - 36 Fantasmas vol. 1 - Ghosted 6.xlsx",
    "QC copy - 1570 - 36 Fantasmas vol. 1 - Ghosted 9.xlsx",
    "QC copy - 1517 - 37 John Carpenter Historias para una noche de Halloween 3.xlsx",
    "QC copy - 1571 - 36 Fantasmas vol. 1 - Ghosted 10.xlsx",
    "QC copy - 1500 - 04 Nightwing 19 _Nightwing 95_.xlsx",
    "QC copy - 1518 - 50 Las asombrosas aventuras de las Tortugas Ninja 5.xlsx",
    "QC copy - 1499 - 58 ECC Co_mics 50 _The Jurassic League 4_.xlsx",
    "QC copy - 1735 - 34 The Walking Dead vol 15 - 171_TWD.xlsx",
    "QC copy - 1660 - 39 Dragon Age vol. 3 Engano - Deception 1.xlsx",
    "QC copy - 1503 - 10 Crisis Oscura Flash - FLS 783.xlsx",
    "QC copy - 1513 - 21 Blanco Humano 9.xlsx",
    "QC copy - 1521 - 40 Amor eterno 2.xlsx",
    "QC copy - 1662 - 39 Dragon Age vol. 3 Engano - Deception 3.xlsx",
    "QC copy - 1559 - 36 Fantasmas vol. 1 - Ghosted 2.xlsx",
    "QC copy - 1568 - 36 Fantasmas vol. 1 - Ghosted 7.xlsx",
    "QC copy - 1644 - 23 American Vampire vol. 4 - AVC2 8.xlsx",
    "QC copy - 1734 - 34 The Walking Dead vol 15 - 170_TWD.xlsx",
    "QC copy - 1502 - 09 Mundos sin Liga de la Justicia Green Lantern 1 - DCVMM Lobo.xlsx",
    "QC copy - 1661 - 39 Dragon Age vol. 3 Engano - Deception 2.xlsx",
    "QC copy - 1508 - 48 Pequen_os Titanes 25.xlsx",
    "QC copy - 1501 - 09 Mundos sin Liga de la Justicia Green Lantern 1 - DCWWJL GL 1.xlsx",
    "QC copy - 1561 - 36 Fantasmas vol. 1 - Ghosted 4.xlsx",
    "QC copy - 1737 - 34 The Walking Dead vol 15 - 173_TWD.xlsx",
    "QC copy - 1562 - 36 Fantasmas vol. 1 - Ghosted 5.xlsx",
    "QC copy - 1910 - 35 Ladro_n de ladrones 1 - ToT 11.xlsx",
    "QC copy - 1560 - 36 Fantasmas vol. 1 - Ghosted 3.xlsx",
]

In [158]:
# Rows whose source file is listed in train_files go to TRAIN, all others to TEST.
is_train = ec35_df["source_file"].isin(train_files)
ec35_df["split"] = is_train.map({True: "TRAIN", False: "TEST"})

In [159]:
ec35_df.split.value_counts()

split
TRAIN    5803
TEST     1326
Name: count, dtype: int64

In [160]:
### Title

In [161]:
# Display titles, one per source file. The order of this list matters: it is
# paired positionally with the source files further down.
titles = [
    "Jurassic League #4",
    "Nightwing #95",
    "Dark Crisis: Worlds Without a Justice League - Green Lantern #1",
    "Dark Crisis: Worlds Without a Justice League - Green Lantern - Very Merry Multiverse",
    "Dark Crisis: The Flash #783",
    "Danger Street #1",
    "Tiny Titans #25",
    "Human Target #9",
    "DC vs. Vampires #11",
    "John Carpenter's Tales for a Halloweenight #3",
    "The Amazing Adventures of the Ninja Turtles #5",
    "Sonic The Hedgehog #44",
    "Love Everlasting #2",
    "Fantasmas vol. 1 - Ghosted #2",
    "Fantasmas vol. 1 - Ghosted #3",
    "Fantasmas vol. 1 - Ghosted #4",
    "Fantasmas vol. 1 - Ghosted #5",
    "Fantasmas vol. 1 - Ghosted #1",
    "Fantasmas vol. 1 - Ghosted #6",
    "Fantasmas vol. 1 - Ghosted #7",
    "Fantasmas vol. 1 - Ghosted #8",
    "Fantasmas vol. 1 - Ghosted #9",
    "Fantasmas vol. 1 - Ghosted #10",
    "American Vampire vol. 4 - #6",
    "American Vampire vol. 4 - #7",
    "American Vampire vol. 4 - #8",
    "Dragon Age vol. 3 Engano - Deception #1",
    "Dragon Age vol. 3 Engano - Deception #2",
    "Dragon Age vol. 3 Engano - Deception #3",
    "The Walking Dead vol 15 - #169",
    "The Walking Dead vol 15 - #170",
    "The Walking Dead vol 15 - #171",
    "The Walking Dead vol 15 - #173",
    "Thief Of Thieves 1 - #11",
    "Stillwater #13",
]

In [162]:
len(titles)

35

In [163]:
len(ec35_df.source_file.unique().tolist())

35

In [164]:
# Pair every source file with its display title.
#
# `titles` is written in ascending order of the file number (the second
# "-"-separated field of the file name); this matches every file listed in
# `train_files` — TODO confirm for the files that are not listed there.
# The file names are therefore sorted by that number explicitly instead of
# relying on the row order of the frame, which only reflects the order in which
# the spreadsheets happened to be read.
source_files_by_nr = sorted(
    ec35_df.source_file.unique().tolist(),
    key=lambda name: int(name.split("-")[1]),
)

# zip() stops silently at the shorter input, so a length mismatch would leave
# files without a title or shift the pairing unnoticed.
if len(source_files_by_nr) != len(titles):
    raise ValueError(
        f"Expected one title per source file, got {len(source_files_by_nr)} "
        f"files and {len(titles)} titles"
    )

mapping = dict(zip(source_files_by_nr, titles))

In [165]:
# Look up the display title of each row's source file.
source_to_title = ec35_df["source_file"].map(mapping)
ec35_df["comicbook_title"] = source_to_title

In [166]:
ec35_df.shape

(7129, 19)

In [167]:
# Write the post-processed frame, index included, next to the raw dump.
output_csv = DATA_FILES_DIR / "emocomics35.csv"
ec35_df.to_csv(output_csv)