In [2]:
import pandas as pd
from pathlib import Path

### Path

In [39]:
# Dataset root, resolved relative to the parent of the current working directory.
EC35_DIR = Path.cwd().parent.joinpath("multimodal_er", "EmoComics35")
# Sub-folder holding the CSV data files.
DATA_FILES_DIR = EC35_DIR.joinpath("data_files")

In [40]:
# Source CSV inside the data-files folder.
DATA_FILE = DATA_FILES_DIR.joinpath("emocomics35.csv")

### Pandas DataFrame

In [41]:
# Read the source CSV; its first column is used as the row index.
df = pd.read_csv(filepath_or_buffer=DATA_FILE, index_col=0)

In [43]:
# Show the column names of the loaded frame.
df.columns

Index(['Source text in English', 'Number of words', 'Signs with spaces',
       'Signs without spaces', 'Translated text in Spanish; Castilian',
       'Number of words.1', 'Signs with spaces.1', 'Signs without spaces.1',
       'Page', 'Panel', 'Balloon', 'Annotations', 'source_file', 'file_nr',
       'speaker', 'raw_emotions', 'emotions', 'split', 'comicbook_title'],
      dtype='object')

In [44]:
# Subset of source columns to keep, in the order used by the later rename.
columns = [
    'Source text in English',
    'Page',
    'Panel',
    'Balloon',
    'source_file',
    'file_nr',
    'speaker',
    'emotions',
    'split',
    'comicbook_title',
]

In [45]:
# Keep only the selected columns (all rows).
df = df.loc[:, columns]

In [47]:
# Positional rename of the ten kept columns.
df.columns = [
    'Utterance',
    'Page',
    'Panel',
    'Balloon',
    'Source_file',
    'File_nr',
    'Speaker',
    'Emotions',
    'Split',
    'Comicbook_title',
]

### Page Grouping

In [65]:
# Collapse the rows of each (source file, page) pair into one row whose
# remaining columns hold Python lists of the grouped values.
# The key order below fixes the output column order, which the later
# positional rename of ``page_df.columns`` relies on.
page_df = (
    df.groupby(["Source_file", "Page"])
    .agg(
        dict.fromkeys(
            [
                "Utterance",
                "Speaker",
                "Emotions",
                "Panel",
                "Balloon",
                "File_nr",
                "Split",
                "Comicbook_title",
            ],
            list,
        )
    )
    .reset_index()
)

In [67]:
def clean_columns(row, col):
    """Return the first element of the sequence stored under ``row[col]``."""
    values = row[col]
    return values[0]

In [68]:
# Replace each list-valued column below by its first element.
# NOTE(review): presumably these values are constant within a page — confirm.
for column_name in ("File_nr", "Split", "Comicbook_title"):
    page_df[column_name] = page_df.apply(
        lambda row, column_name=column_name: clean_columns(row, column_name),
        axis=1,
    )

In [70]:
# Positional rename of the page-level columns.
page_df.columns = [
    "SourceFile",
    "Page",
    "PageUtterances",
    "PageSpeakers",
    "PageEmotions",
    "PagePanels",
    "PageBalloons",
    "FileNr",
    "Split",
    "ComicBookTitle",
]

In [76]:
def check_egalite(row):
    """Return 1 when the five per-page lists all have the same length, else 0."""
    lengths_match = (
        len(row.PageUtterances)
        == len(row.PageSpeakers)
        == len(row.PageEmotions)
        == len(row.PagePanels)
        == len(row.PageBalloons)
    )
    if lengths_match:
        return 1
    return 0

In [77]:
# Row-wise flag: 1 if all per-page lists have equal length, 0 otherwise.
page_df["egalite_check"] = page_df.apply(check_egalite, axis=1)

In [81]:
# Count how many pages passed (1) or failed (0) the length-consistency check.
page_df["egalite_check"].value_counts()

egalite_check
1    874
Name: count, dtype: int64

In [83]:
# Remove the temporary sanity-check column by name. Dropping by position
# (``columns[-1]``) would delete a real data column if this cell is re-run;
# ``errors="ignore"`` makes the cell safe to execute more than once.
page_df = page_df.drop(columns="egalite_check", errors="ignore")

In [None]:
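# Currently disabled; un-comment to (re)generate the page-level CSV that the
# "Add Images" section below reads back.
#page_df.to_csv(DATA_FILES_DIR / "emocomics35_pg.csv")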

### Add Images

In [85]:
# Reload the page-level table from disk; the first CSV column is the index.
# NOTE(review): list-valued columns presumably come back as strings after the
# CSV round trip — confirm before relying on them as lists.
pg_df = pd.read_csv(DATA_FILES_DIR.joinpath("emocomics35_pg.csv"), index_col=0)

In [87]:
# Root folder of the raw page images.
EC35_IMAGES_DIR = EC35_DIR.joinpath("raw_files", "images")

In [93]:
def get_image_file_path(row):
    """Build the image path for the page described by ``row``.

    The result is ``EC35_IMAGES_DIR / "00<FileNr>" / "images" / "page<Page>.jpg"``,
    with the page number zero-padded to five digits.

    Args:
        row: Object exposing ``Page`` and ``FileNr`` attributes.

    Returns:
        pathlib.Path: Path of the page image (existence is not checked).
    """
    page_number = row.Page
    comic_folder = f"00{row.FileNr!s}"
    image_folder = Path(EC35_IMAGES_DIR).joinpath(comic_folder, "images")
    return image_folder / f"page{page_number:05d}.jpg"

In [96]:
# Attach the computed image path to every page row.
pg_df["image_path"] = pg_df.apply(get_image_file_path, axis=1)

In [106]:
# Persist the page table, now including the image paths.
pg_df.to_csv(DATA_FILES_DIR.joinpath("emocomics35_pg_images.csv"))

In [None]:
# Build the Hugging Face dataset

In [17]:
# Read the page table with image paths back from the data-files folder.
# Uses DATA_FILES_DIR instead of a hard-coded absolute user path so the
# notebook is not tied to one machine.
pg_df = pd.read_csv(DATA_FILES_DIR / "emocomics35_pg_images.csv", index_col=0)

In [18]:
# Display the page-level frame.
pg_df

Unnamed: 0,SourceFile,Page,PageUtterances,PageSpeakers,PageEmotions,PagePanels,PageBalloons,FileNr,Split,ComicBookTitle,image_path
0,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,1,['THIS VILE THING ATTACKED THE SMALL BEASTS OF...,"['AQUANYX', 'AQUANYX', 'ID-1', 'ID-1', 'AQUANY...","[""['Anger']"", ""['Anger']"", ""['Fear']"", ""['Fear...","[1, 1, 1, 2, 3, 3, 3, 4, 5, 6]","[2, 3, 4, 1, 1, 2, 3, 1, 2, 1]",1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
1,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,2,"['NO-- #GKKK…#', '#CHOMP!', 'BY THE SKIN OF M...","['ID-1', 'BLACKMANTASAURUS', 'AQUANYX', 'AQUAN...","[""['Fear']"", ""['Anger']"", ""['Surprise']"", ""['A...","[1, 1, 2, 3, 3, 3, 3, 3, 3]","[1, 2, 1, 1, 2, 3, 5, 6, 7]",1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
2,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,3,"['COME ON, BEAST!', 'SHOW YOURSELF!', 'WHY DO ...","['AQUANYX', 'AQUANYX', 'AQUANYX', 'AQUANYX']","[""['Joy']"", ""['Joy']"", ""['Anger']"", ""['Anger']""]","[1, 1, 1, 1]","[1, 2, 5, 6]",1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
3,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,4,['#AARGH! '],['AQUANYX'],"[""['Fear', 'Surprise']""]",[2],[2],1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
4,QC copy - 1499 - 58 ECC Co_mics 50 _The Jurass...,5,"['I, THE GREEN TORCH, HAVE BEEN TASKED WITH PR...","['GREEN TORCH', 'GREEN TORCH', 'ATROCITAURUS',...","[""['Anger']"", ""['Anger']"", ""['Fear']"", ""['Fear...","[1, 1, 1, 3, 4, 5]","[2, 3, 5, 1, 1, 2]",1499,TRAIN,Jurassic League #4,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
...,...,...,...,...,...,...,...,...,...,...,...
869,QC copy - 2200 - Stillwater 13.xlsx,16,"[""WE WERE IN GALEN'S OFFICE. YOU WERE ABOUT TO...","['LAURA', 'LAURA', 'LAURA', 'DANIEL', 'DANIEL'...","[""['Anger']"", ""['Anger']"", ""['Anger']"", ""['Ang...","[1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6]","[1, 2, 3, 1, 2, 2, 3, 4, 5, 1, 2, 1, 2, 1]",2200,TEST,Stillwater #13,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
870,QC copy - 2200 - Stillwater 13.xlsx,17,"['SO WHAT ARE WE GOING TO DO?', 'THE WAY I SEE...","['ID-6', 'GALEN', 'ID-7', 'GALEN', 'GALEN', 'G...","[""['Sadness', 'Surprise']"", ""['Anger']"", ""['An...","[3, 3, 3, 3, 4, 4, 5]","[1, 2, 3, 4, 1, 2, 1]",2200,TEST,Stillwater #13,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
871,QC copy - 2200 - Stillwater 13.xlsx,18,"[""KIDDIE COUNCIL'S BEEN GOING A LONG TIME... ""...","['TED', 'KREEGS', 'ID-8', 'ID-8', 'GALEN', 'GA...","[""['Anger', 'Sadness']"", ""['Anger']"", ""['Anger...","[1, 1, 1, 2, 3, 4, 5, 6, 6, 7, 7]","[1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 2]",2200,TEST,Stillwater #13,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
872,QC copy - 2200 - Stillwater 13.xlsx,19,"[""IT'S BEEN… PEACEFUL. ASIDE FROM SHIT LIKE TH...","['KREEGS', 'GALEN', 'GALEN', 'KREEGS', 'GALEN'...","[""['Anger']"", ""['Joy']"", ""['Joy']"", ""['Anger',...","[1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6]","[1, 2, 3, 1, 2, 1, 1, 2, 2, 3, 1, 2]",2200,TEST,Stillwater #13,/Utilisateurs/umushtaq/multimodal_er/EmoComics...


In [19]:
# Preview only the first few image paths instead of dumping every row's
# path into the cell output.
pg_df["image_path"].head(10).tolist()

['/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00001.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00002.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00003.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00004.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00005.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00006.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00007.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00008.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00009.jpg',
 '/Utilisateurs/umushtaq/multimodal_er/EmoComics35/raw_files/images/001499/images/page00010.jpg',
 '/Utilisateurs/umus

In [49]:
# Separate the rows by their ``Split`` label and renumber each part from 0.
train_df = pg_df.loc[pg_df["Split"].eq("TRAIN")].reset_index(drop=True)
test_df = pg_df.loc[pg_df["Split"].eq("TEST")].reset_index(drop=True)

In [67]:
# Display the training frame.
train_df

Unnamed: 0,SourceFile,Page,PageUtterances,PageSpeakers,PageEmotions,PagePanels,PageBalloons,FileNr,Split,ComicBookTitle,image_path
0,QC copy - 1570 - 36 Fantasmas vol. 1 - Ghosted...,20,"['JACKSON WINTERS!', 'YOU WILL DIE!', 'PLEASE ...","['Nina', 'Nina', 'Robby Trick', 'JACKSON T. WI...","[""['Anger', 'Disgust']"", ""['Anger', 'Disgust']...","[1, 1, 2, 2, 2]","[2, 3, 1, 2, 3]",1570,TRAIN,Fantasmas vol. 1 - Ghosted #9,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
1,QC copy - 1642 - 23 American Vampire vol. 4 - ...,16,"['"" THREE TIMES SINCE THAT FIRST WAR, THE GRAY...","['Felicia Book', 'Felicia Book', 'Felicia Book...","['[Neutral]', '[Neutral]', '[Neutral]', '[Neut...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",1642,TRAIN,American Vampire vol. 4 - #6,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
2,QC copy - 1517 - 37 John Carpenter Historias p...,61,"['ALÉ…?', 'THANK YOU, MOTGA, FOR YOUR STRENGTH...","['Jennifer', 'ID- 21', 'ID- 20', 'Alejandro', ...","[""['Fear', 'Sadness']"", ""['Joy']"", ""['Joy']"", ...","[1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5]","[1, 2, 1, 2, 1, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6]",1517,TRAIN,John Carpenter's Tales for a Halloweenight #3,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
3,QC copy - 1508 - 48 Pequen_os Titanes 25.xlsx,8,"['WHAT CHA GOT THERE, KID?', 'I GOT ARROWS, MA...","['Mr.John', 'Speedy']","[""['Surprise', 'Joy']"", '[Neutral]']","[3, 4]","[1, 1]",1508,TRAIN,Tiny Titans #25,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
4,QC copy - 1562 - 36 Fantasmas vol. 1 - Ghosted...,12,"[""IF YOU ROB A BANK YOU DON'T TAKE JUST THE HU...","['JACKSON T. WINTERS', 'JACKSON T. WINTERS', '...","[""['Joy']"", ""['Joy']"", ""['Anger', 'Fear', 'Sur...","[1, 1, 2, 3, 3, 3, 3, 3]","[1, 2, 1, 1, 2, 3, 4, 5]",1562,TRAIN,Fantasmas vol. 1 - Ghosted #5,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
...,...,...,...,...,...,...,...,...,...,...,...
569,QC copy - 1661 - 39 Dragon Age vol. 3 Engano -...,16,"['DELIVERED ON TIME, VIA MABARI.', ""I LIKE THI...","['ID-6', 'ID-6', 'ID-6', 'ID-7', 'FLORIAN', 'I...","[""['Surprise', 'Joy']"", ""['Joy']"", ""['Joy']"", ...","[2, 2, 3, 3, 4, 4, 4, 5, 6, 6]","[1, 2, 1, 2, 1, 2, 3, 1, 1, 2]",1661,TRAIN,Dragon Age vol. 3 Engano - Deception #2,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
570,QC copy - 1570 - 36 Fantasmas vol. 1 - Ghosted...,18,"['My big mouth.', ""HEY, NINA… WHAT YOU DID WRO...","['JACKSON T. WINTERS', 'JACKSON T. WINTERS', '...","[""['Joy']"", ""['Anger', 'Surprise']"", ""['Anger'...","[1, 1, 1, 2, 2, 3, 3, 4, 4, 4]","[1, 2, 3, 1, 2, 1, 2, 1, 2, 3]",1570,TRAIN,Fantasmas vol. 1 - Ghosted #9,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
571,QC copy - 1910 - 35 Ladro_n de ladrones 1 - To...,14,"['EMMA? GOT YOU A TOFU WITH WASABI--', 'WHAT W...","['Manny', 'Manny', 'Emma', 'Manny', 'Manny']","[""['Surprise', 'Joy']"", ""['Anger', 'Surprise']...","[2, 5, 5, 6, 6]","[3, 4, 5, 3, 4]",1910,TRAIN,Thief Of Thieves 1 - #11,/Utilisateurs/umushtaq/multimodal_er/EmoComics...
572,QC copy - 1662 - 39 Dragon Age vol. 3 Engano -...,19,"['YOU WERE RIGHT ABOUT HER.', 'WAS I? I THINK ...","['VAEA', 'AARON', 'VAEA', 'AARON', 'AARON', 'A...","[""['Sadness', 'Surprise']"", ""['Sadness']"", ""['...","[2, 2, 2, 2, 2, 2, 2, 2, 6, 7]","[1, 2, 3, 4, 5, 6, 7, 8, 1, 1]",1662,TRAIN,Dragon Age vol. 3 Engano - Deception #3,/Utilisateurs/umushtaq/multimodal_er/EmoComics...


In [50]:
from sklearn.model_selection import train_test_split

In [51]:
# Hold out 20% of the training pages for validation. A fixed random_state
# makes the split reproducible across kernel restarts.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [52]:
# Renumber both frames from 0, discarding the shuffled index left by the split.
train_df, val_df = (
    frame.reset_index(drop=True) for frame in (train_df, val_df)
)

In [53]:
import datasets
from PIL import Image

In [54]:
def load_image(example):
    """Open the file at ``example["image_path"]`` and store it under ``"image"``.

    The mapping is modified in place and also returned.
    """
    image_file = example["image_path"]
    example["image"] = Image.open(image_file)
    return example

In [55]:
# Convert each pandas frame into a datasets.Dataset without its index column.
train_dataset, test_dataset, val_dataset = (
    datasets.Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, test_df, val_df)
)

In [56]:
# Add an "image" column to every split by opening each row's image file.
train_dataset_hf, test_dataset_hf, val_dataset_hf = [
    split.map(load_image)
    for split in (train_dataset, test_dataset, val_dataset)
]

Map:   0%|          | 0/574 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [57]:
# Show the features and row count of the training split.
train_dataset_hf

Dataset({
    features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
    num_rows: 574
})

In [58]:
# Show the features and row count of the test split.
test_dataset_hf

Dataset({
    features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
    num_rows: 156
})

In [60]:
# Show the features and row count of the validation split.
val_dataset_hf

Dataset({
    features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
    num_rows: 144
})

In [61]:
from datasets import Dataset, DatasetDict

In [65]:
# Bundle the three splits under their conventional names.
dataset_dict = DatasetDict(
    dict(
        train=train_dataset_hf,
        test=test_dataset_hf,
        validation=val_dataset_hf,
    )
)

In [66]:
# Save all splits under the dataset root. The target is derived from
# EC35_DIR rather than a hard-coded absolute user path.
dataset_dict.save_to_disk(str(EC35_DIR / "datasets" / "emocomics35_pg_ttv"))

Saving the dataset (0/4 shards):   0%|          | 0/574 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/144 [00:00<?, ? examples/s]

In [68]:
# Show the splits, their features and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
        num_rows: 574
    })
    test: Dataset({
        features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
        num_rows: 156
    })
    validation: Dataset({
        features: ['SourceFile', 'Page', 'PageUtterances', 'PageSpeakers', 'PageEmotions', 'PagePanels', 'PageBalloons', 'FileNr', 'Split', 'ComicBookTitle', 'image_path', 'image'],
        num_rows: 144
    })
})

In [69]:
from tqdm import tqdm

In [None]:
# Prompt text placed in the user turn of every conversation sample.
# NOTE(review): this prompt asks for an Amazon product description, but
# convert_to_conversation pairs it with ``PageEmotions`` as the assistant
# reply — it looks like a leftover from a template. Confirm and replace it
# with an instruction that matches the emotion-labelling target.
instruction = """
You are an expert Amazon worker who is good at writing product descriptions. 
Write the product description accurately by looking at the image.
"""


def convert_to_conversation(sample):
    """Wrap one dataset row into a two-turn chat example.

    The user turn carries the module-level ``instruction`` text followed by the
    row's ``image``; the assistant turn carries the row's ``PageEmotions``.

    Args:
        sample: Mapping with at least the keys ``"image"`` and ``"PageEmotions"``.

    Returns:
        dict: ``{"messages": [user_turn, assistant_turn]}``.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["PageEmotions"]}],
    }
    return {"messages": [user_turn, assistant_turn]}


# Convert every split into conversation samples. Each output list is built
# from the split of the same name (validation and test must not be crossed,
# otherwise evaluation runs on the wrong data).
train_dataset_p = [convert_to_conversation(sample) for sample in tqdm(dataset_dict["train"])]
val_dataset_p = [convert_to_conversation(sample) for sample in tqdm(dataset_dict["validation"])]
test_dataset_p = [convert_to_conversation(sample) for sample in tqdm(dataset_dict["test"])]

 62%|██████▏   | 358/574 [01:20<05:14,  1.46s/it]