In [2]:
import pandas as pd
from pathlib import Path

### Path

In [39]:
# Root of the EmoComics35 dataset, resolved from the parent of the current working
# directory (the notebook has to be started from a folder next to `multimodal_er`).
EC35_DIR = Path.cwd().parent.joinpath("multimodal_er", "EmoComics35")
# Folder that holds the CSV files read and written by this notebook.
DATA_FILES_DIR = EC35_DIR.joinpath("data_files")

In [40]:
# Source CSV that the notebook loads into `df`.
DATA_FILE = DATA_FILES_DIR.joinpath("emocomics35.csv")

### Pandas DataFrame

In [41]:
# Load the source CSV; its first column is used as the DataFrame index.
df = pd.read_csv(filepath_or_buffer=DATA_FILE, index_col=0)

In [43]:
# Inspect the available columns before a subset of them is selected below.
df.columns

Index(['Source text in English', 'Number of words', 'Signs with spaces',
       'Signs without spaces', 'Translated text in Spanish; Castilian',
       'Number of words.1', 'Signs with spaces.1', 'Signs without spaces.1',
       'Page', 'Panel', 'Balloon', 'Annotations', 'source_file', 'file_nr',
       'speaker', 'raw_emotions', 'emotions', 'split', 'comicbook_title'],
      dtype='object')

In [44]:
# Columns kept for the rest of the notebook, in the order they are selected.
columns = [
    'Source text in English',
    'Page',
    'Panel',
    'Balloon',
    'source_file',
    'file_nr',
    'speaker',
    'emotions',
    'split',
    'comicbook_title',
]

In [45]:
# Restrict the frame to the selected columns (all rows kept, column order from `columns`).
df = df.loc[:, columns]

In [47]:
# Rename by explicit old -> new mapping rather than by position, so a change in the
# column selection order can never attach a label to the wrong column.
# 'Page', 'Panel' and 'Balloon' already have their final names.
df = df.rename(
    columns={
        'Source text in English': 'Utterance',
        'source_file': 'Source_file',
        'file_nr': 'File_nr',
        'speaker': 'Speaker',
        'emotions': 'Emotions',
        'split': 'Split',
        'comicbook_title': 'Comicbook_title',
    }
)

### Page Grouping

In [65]:
# Build one row per (Source_file, Page) pair; every other column is collected into a
# list holding the values of all rows that belong to that page.
page_df = (
    df.groupby(["Source_file", "Page"])
    .agg(
        dict.fromkeys(
            [
                "Utterance",
                "Speaker",
                "Emotions",
                "Panel",
                "Balloon",
                "File_nr",
                "Split",
                "Comicbook_title",
            ],
            list,
        )
    )
    .reset_index()
)

In [67]:
def clean_columns(row, col):
    """Return the first element of the sequence stored under ``col`` in ``row``.

    Args:
        row: Mapping-like object (here a DataFrame row) indexable by column name.
        col: Name of the column whose value is an indexable sequence.

    Returns:
        The element at position 0 of ``row[col]``.
    """
    values = row[col]
    return values[0]

In [68]:
# File number, split and title were aggregated into per-page lists by the grouping
# step; replace each list with its first entry.
for col_name in ("File_nr", "Split", "Comicbook_title"):
    page_df[col_name] = page_df.apply(clean_columns, axis=1, args=(col_name,))

In [70]:
# Rename by explicit old -> new mapping rather than by position, so a change in the
# aggregation order can never attach a label to the wrong column.
# "Page" and "Split" already have their final names.
page_df = page_df.rename(
    columns={
        "Source_file": "SourceFile",
        "Utterance": "PageUtterances",
        "Speaker": "PageSpeakers",
        "Emotions": "PageEmotions",
        "Panel": "PagePanels",
        "Balloon": "PageBalloons",
        "File_nr": "FileNr",
        "Comicbook_title": "ComicBookTitle",
    }
)

In [76]:
def check_egalite(row):
    """Tell whether the per-page list columns of ``row`` all have the same length.

    Args:
        row: Object exposing ``PageUtterances``, ``PageSpeakers``, ``PageEmotions``,
            ``PagePanels`` and ``PageBalloons`` as sized attributes.

    Returns:
        int: 1 when the five lengths are equal, 0 otherwise.
    """
    list_attributes = (
        "PageUtterances",
        "PageSpeakers",
        "PageEmotions",
        "PagePanels",
        "PageBalloons",
    )

    previous_length = None
    for attribute in list_attributes:
        current_length = len(getattr(row, attribute))
        # Stop at the first mismatch between neighbouring columns.
        if previous_length is not None and current_length != previous_length:
            return 0
        previous_length = current_length

    return 1

In [77]:
# Row-wise sanity flag: 1 when all per-page lists have the same length, else 0.
page_df["egalite_check"] = page_df.apply(check_egalite, axis=1)

In [81]:
# Count how many pages passed (1) or failed (0) the list-length consistency check.
page_df["egalite_check"].value_counts()

egalite_check
1    874
Name: count, dtype: int64

In [83]:
# Remove the temporary sanity-check column by name. Dropping "the last column" by
# position would delete a real data column if this cell were executed a second time;
# errors="ignore" makes a re-run a no-op instead.
page_df = page_df.drop(columns=["egalite_check"], errors="ignore")

In [None]:
# NOTE(review): the "Add Images" section below reads emocomics35_pg.csv, and this
# disabled export is the only visible place that writes it. Re-enable it when the
# file does not exist yet (e.g. on a fresh checkout).
#page_df.to_csv(DATA_FILES_DIR / "emocomics35_pg.csv")

### Add Images

In [85]:
# Reload the page-level table from disk; its first CSV column is used as the index.
pg_df = pd.read_csv(DATA_FILES_DIR.joinpath("emocomics35_pg.csv"), index_col=0)

In [87]:
# Root folder of the raw page images; image paths are built below from this folder.
EC35_IMAGES_DIR = EC35_DIR.joinpath("raw_files", "images")

In [93]:
def get_image_file_path(row, images_root=None):
    """Build the path of the page image that belongs to one page-level row.

    Args:
        row: Object exposing ``Page`` (integer page number) and ``FileNr`` attributes,
            e.g. a row of ``pg_df``.
        images_root: Folder that contains one sub-folder per comic. When omitted, the
            notebook-level ``EC35_IMAGES_DIR`` is used, which keeps existing
            single-argument calls working unchanged.

    Returns:
        pathlib.Path: ``<images_root>/00<FileNr>/images/page<Page as 5 digits>.jpg``.
    """
    if images_root is None:
        # Looked up at call time so the function can be defined before the constant.
        images_root = EC35_IMAGES_DIR

    # NOTE(review): the prefix is a fixed "00", not zero-padding to a fixed width,
    # so FileNr 12 gives "0012" — confirm this matches the folder names on disk.
    comics_dir = f"00{row.FileNr}"
    file_name = f"page{row.Page:05d}.jpg"

    return Path(images_root) / comics_dir / "images" / file_name

In [96]:
# Attach to every page row the path of its page image.
pg_df['image_path'] = pg_df.apply(get_image_file_path, axis=1)

In [106]:
# Persist the page table, now including the image_path column.
pg_df.to_csv(path_or_buf=DATA_FILES_DIR / "emocomics35_pg_images.csv")

In [None]:
### Hugging Face Dataset

In [3]:
# Reload the table exported above through the shared DATA_FILES_DIR constant instead
# of a machine-specific absolute path, so the notebook runs on any checkout.
pg_df = pd.read_csv(DATA_FILES_DIR / "emocomics35_pg_images.csv", index_col=0)

In [5]:
import datasets
from PIL import Image

In [6]:
def load_image(example):
    """Open the image referenced by ``example["image_path"]`` and attach it.

    Args:
        example: Mutable mapping with an ``"image_path"`` entry.

    Returns:
        The same ``example`` object, with the opened image stored under ``"image"``.
    """
    image_path = example["image_path"]
    example["image"] = Image.open(image_path)
    return example

In [8]:
# Wrap the page-level DataFrame in a Hugging Face dataset.
hf_dataset = datasets.Dataset.from_pandas(df=pg_df)

In [9]:
# Add an "image" entry to every example by opening the file at its image_path.
hf_dataset = hf_dataset.map(function=load_image)

Map:   0%|          | 0/874 [00:00<?, ? examples/s]

In [11]:
# Save under the dataset root (EC35_DIR/datasets/emocomics35_pg) instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
hf_dataset.save_to_disk(str(EC35_DIR / "datasets" / "emocomics35_pg"))

Saving the dataset (0/5 shards):   0%|          | 0/874 [00:00<?, ? examples/s]