# Dataset: Concadia

Concadia is a dataset introduced in our paper. It contains Wikipedia images together with their respective captions, alt descriptions, and the broader context in which the images are situated. We use this corpus to argue for a clear distinction between descriptions and captions, and to show the similarities and differences between the two text forms. We further argue that captions and broader context are an important resource that can inform the generation of descriptions, which are very sparse across the Web but absolutely crucial for making images accessible.

In [1]:
import pandas as pd
import numpy as np
from typing import *
from matplotlib import pyplot as plt 
from pathlib import Path


In [2]:
def read_meta(file_path:Path|str) -> pd.DataFrame:
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError
    js = pd.read_json(file_path)
    df = pd.json_normalize(js['images'])
    
    return df

In [3]:
# Load the full metadata table (all splits) and preview the first rows.
meta = read_meta("../../dataset/meta.json")
meta.head()

Unnamed: 0,article_id,filename,orig_filename,split,description.raw,description.tokens,caption.raw,caption.tokens,context.raw,context.tokens
0,Autism,0.jpg,https://upload.wikimedia.org/wikipedia/commons...,train,Sleeping boy beside a dozen or so toys arrange...,"[Sleeping, boy, beside, a, dozen, or, so, toys...",A young boy with autism who has arranged his t...,"[A, young, boy, with, autism, who, has, arrang...",Autistic individuals can display many forms o...,"[Autistic, individuals, can, display, many, fo..."
1,Autism,1.jpg,https://upload.wikimedia.org/wikipedia/commons...,train,"Three diagrams of chromosome pairs A, B that a...","[Three, diagrams, of, chromosome, pairs, A, ,,...","Deletion (1), duplication (2) and inversion (3...","[Deletion, (, 1, ), ,, duplication, (, 2, ), a...","Autism has a strong genetic basis, although t...","[Autism, has, a, strong, genetic, basis, ,, al..."
2,Autism,2.jpg,https://upload.wikimedia.org/wikipedia/commons...,train,"A young child points, in front of a woman who ...","[A, young, child, points, ,, in, front, of, a,...",A three-year-old with autism points to fish in...,"[A, three-year-old, with, autism, points, to, ...",The main goals when treating children with au...,"[The, main, goals, when, treating, children, w..."
3,Autism,4.jpg,https://upload.wikimedia.org/wikipedia/commons...,train,Bar chart versus time. The graph rises steadil...,"[Bar, chart, versus, time, ., The, graph, rise...","Reports of autism cases per 1,000 children gre...","[Reports, of, autism, cases, per, 1,000, child...",Most recent reviews tend to estimate a preval...,"[Most, recent, reviews, tend, to, estimate, a,..."
4,Autism,5.jpg,https://upload.wikimedia.org/wikipedia/commons...,train,"Balding man in his early 60s in coat and tie, ...","[Balding, man, in, his, early, 60s, in, coat, ...",Leo Kanner introduced the label early infantil...,"[Leo, Kanner, introduced, the, label, early, i...",The word autism first took its modern sense i...,"[The, word, autism, first, took, its, modern, ..."


In [5]:
# NOTE: DataFrame.size is rows x columns (total number of cells), not the
# number of records; use len(meta) for the row count.
meta.size

969180

In [6]:
from torch.utils.data import Dataset
from cv2 import imread
class Concadia(Dataset):
    def __init__(self, src_dataset:Path|str, src_meta:Path|str, split:Literal['train','val','test']):
        super().__init__()
        src_dataset = Path(src_dataset)
        if not src_dataset.exists():
            raise FileNotFoundError
        
        self.meta = read_meta(src_meta).query(f"split == '{split}'")
        self.src_dataset = src_dataset

    def __len__(self):
        return self.meta.size
    
    def __getitem__(self, index):
        record = self.meta.iloc[index]
        
        img_path = self.src_dataset.joinpath(record['filename'])
        img = imread(img_path)
       
        return (img, record['description.raw'])

        



In [8]:
# Build one dataset object per split; all three share the same image
# directory and metadata file. Note that dataset_re holds the "train" split.
dataset_re = Concadia("../../dataset/resized", "../../dataset/meta.json", "train")
dataset_te = Concadia("../../dataset/resized", "../../dataset/meta.json", "test")
dataset_val = Concadia("../../dataset/resized", "../../dataset/meta.json", "val")

In [10]:
# Report len() of each split, printed in the order: test, train, val.
print(len(dataset_te))
print(len(dataset_re))
print(len(dataset_val))

96910
775340
96930
