# ImageNet Dataset

In [1]:
from datasets import ImageNetDataset

In [4]:
ImageNetDataset.make_dataset_index(
    train_data_path= "../datasets/imagenet/imagenet/Data/CLS-LOC/train",
    val_data_path= "../datasets/imagenet/imagenet/Data/CLS-LOC/val",
    index_path= "../datasets/imagenet/beit3/",
)

Write ../datasets/imagenet/beit3/imagenet.train.index.jsonl with 1281167 items !
Write ../datasets/imagenet/beit3/imagenet.val.index.jsonl with 50000 items !


# COCO Captioning

In [5]:
from datasets import CaptioningDataset
from transformers import XLMRobertaTokenizer

In [7]:
tokenizer = XLMRobertaTokenizer("beit3.spm")

CaptioningDataset.make_coco_captioning_dataset_index(
    data_path="../datasets/coco",
    tokenizer=tokenizer,
)

read ../datasets/coco\dataset_coco.json
Find 113287 images and 566747 image-text pairs for karpathy dataset train split !
Write ../datasets/coco\coco_captioning.train.jsonl with 566747 items !
read ../datasets/coco\dataset_coco.json
Find 5000 images and 5000 image-text pairs for karpathy dataset val split !
Write ../datasets/coco\coco_captioning.val.jsonl with 5000 items !
read ../datasets/coco\dataset_coco.json
Find 5000 images and 5000 image-text pairs for karpathy dataset test split !
Write ../datasets/coco\coco_captioning.test.jsonl with 5000 items !


# VQA

In [1]:
from datasets import VQAv2Dataset
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer("beit3.spm")

VQAv2Dataset.make_dataset_index(
    data_path="../datasets/coco",
    tokenizer=tokenizer,
    annotation_data_path="../datasets/coco/vqa",
)

not all images have caption annotations
82783 82774 82774
Write ../datasets/coco\vqa.train.jsonl with 434867 items !
not all images have caption annotations
40504 40503 40503
Write ../datasets/coco\vqa.val.jsonl with 210051 items !
all images have caption annotations
81434 81434 81434
Write ../datasets/coco\vqa.test.jsonl with 447793 items !
not all images have caption annotations
81434 36807 36807
Write ../datasets/coco\vqa.test-dev.jsonl with 107394 items !
Contains 40503 image and 210051 pairs for val set!
Write ../datasets/coco\vqa.trainable_val.jsonl with 204726 items !
Write ../datasets/coco\vqa.rest_val.jsonl with 5325 items !


## retrieval dataset

In [1]:
from datasets import RetrievalDataset
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer("beit3.spm")

RetrievalDataset.make_coco_dataset_index(
    data_path="../datasets/coco",
    tokenizer=tokenizer,
)

read ../datasets/coco\dataset_coco.json
Find 113287 images and 566747 image-text pairs for karpathy dataset train split !
Write ../datasets/coco\coco_retrieval.train.jsonl with 566747 items !
read ../datasets/coco\dataset_coco.json
Find 5000 images and 25010 image-text pairs for karpathy dataset val split !
Write ../datasets/coco\coco_retrieval.val.jsonl with 25010 items !
read ../datasets/coco\dataset_coco.json
Find 5000 images and 25010 image-text pairs for karpathy dataset test split !
Write ../datasets/coco\coco_retrieval.test.jsonl with 25010 items !


# proprossing the movie dataset

In [1]:

def get_sentencepiece_model_for_beit3():
    from transformers import XLMRobertaTokenizer
    return XLMRobertaTokenizer("./beit3.spm")

tokenizer = get_sentencepiece_model_for_beit3()

In [9]:
import json
ann = json.load(open("../zixuan_recsystem/data/Amazon_Sports_Dataset/raw_text.json", "r"))
items = []
for image_id, data in ann.items():
    tokens = tokenizer.tokenize(data)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    items.append({
        "image_id": image_id,
        "tokens": tokens,
        "text_segment": token_ids,
    })

In [4]:
def _write_data_into_jsonl(items, jsonl_file):
    with open(jsonl_file, mode="w", encoding="utf-8") as writer:
        for data in items:
            writer.write(json.dumps(data, indent=None))
            writer.write('\n')
    print("Write %s with %d items !" % (jsonl_file, len(items)))

In [10]:
_write_data_into_jsonl(items, '../zixuan_recsystem/data/Amazon_Sports_Dataset/raw_text_tokenize.json')

Write ../zixuan_recsystem/data/Amazon_Sports_Dataset/raw_text_tokenize.json with 18357 items !


In [6]:
dataset_names = ['Amazon_Elec_Dataset',
                 'Amazon_Clothing_Dataset', 'Amazon_Baby_Dataset',
                 ]
#'Amazon_Sports_Dataset'

In [8]:
for dataset_name in dataset_names:
    ann = json.load(open("../zixuan_recsystem/data/%s/raw_text.json" % dataset_name, "r"))
    items = []
    print("Processing %s" % dataset_name)
    for image_id, data in ann.items():
        tokens = tokenizer.tokenize(data)
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        items.append({
            "image_id": image_id,
            "tokens": tokens,
            "text_segment": token_ids,
        })
    _write_data_into_jsonl(items, "../zixuan_recsystem/data/%s/raw_text.json" % dataset_name)

Processing Amazon_Elec_Dataset
Write ../zixuan_recsystem/data/Amazon_Elec_Dataset/raw_text.json with 63001 items !
Processing Amazon_Clothing_Dataset
Write ../zixuan_recsystem/data/Amazon_Clothing_Dataset/raw_text.json with 23033 items !
Processing Amazon_Baby_Dataset
Write ../zixuan_recsystem/data/Amazon_Baby_Dataset/raw_text.json with 7050 items !


## Preprocessing the raw text in AtoMiC dataset

In [1]:
def get_sentencepiece_model_for_beit3():
    from transformers import XLMRobertaTokenizer
    return XLMRobertaTokenizer("./beit3.spm")

tokenizer = get_sentencepiece_model_for_beit3()

In [2]:
def _write_data_into_jsonl(items, jsonl_file):
    with open(jsonl_file, mode="w", encoding="utf-8") as writer:
        for data in items:
            writer.write(json.dumps(data, indent=None))
            writer.write('\n')
    print("Write %s with %d items !" % (jsonl_file, len(items)))

In [3]:
import json
from tqdm import tqdm
dataset_path = "../datasets/ATOMIC/"
splits = ["train","validation"] #["train", "dev", "test"]
for split in splits:
    items = []
    ann = []
    with open(f"../datasets/ATOMIC/Atomic_text_{split}.json", "r") as reader:
        for line in reader:
            data = json.loads(line)
            ann.append(data)

    for idx in tqdm(range(len(ann))):
        new_dict = ann[idx]
        # context_page_des_tokens = tokenizer.tokenize(new_dict["context_page_description"])
        context_sec_des_tokens = tokenizer.tokenize(new_dict["context_section_description"])
        page_title_tokens = tokenizer.tokenize(new_dict["page_title"])
        section_title_tokens = tokenizer.tokenize(new_dict["section_title"])
        category_tokens = tokenizer.tokenize(''.join(new_dict["category"]))


        # context_page_des_tokens_ids = tokenizer.convert_tokens_to_ids(context_page_des_tokens)
        context_sec_des_tokens_ids = tokenizer.convert_tokens_to_ids(context_sec_des_tokens)
        page_title_tokens_ids = tokenizer.convert_tokens_to_ids(page_title_tokens)
        section_title_tokens_ids = tokenizer.convert_tokens_to_ids(section_title_tokens)

        # new_dict["context_page_des_tokens"] = context_page_des_tokens
        # new_dict["context_sec_des_tokens"] = context_sec_des_tokens
        # new_dict["page_title_tokens"] = page_title_tokens
        # new_dict["section_title_tokens"] = section_title_tokens
        # new_dict["category_tokens"] = category_tokens
        # new_dict["context_page_des_tokens_ids"] = context_page_des_tokens_ids
        new_dict["context_sec_des_tokens_ids"] = context_sec_des_tokens_ids
        new_dict["page_title_tokens_ids"] = page_title_tokens_ids
        new_dict["section_title_tokens_ids"] = section_title_tokens_ids

        items.append(new_dict)
    _write_data_into_jsonl(items, f"../datasets/ATOMIC/Atomic_text_tokenized_{split}.json")

 48%|████▊     | 1431425/3002458 [11:18<12:41, 2062.19it/s]

# Write images data into numpy file

In [1]:
from datasets import load_dataset
from tqdm import tqdm
split = 'train'
images = load_dataset("TREC-AToMiC/AToMiC-Images-v0.2", split=split)
qrels = load_dataset("TREC-AToMiC/AToMiC-Qrels-v0.2", split=split)

Found cached dataset parquet (C:/Users/longk/.cache/huggingface/datasets/TREC-AToMiC___parquet/TREC-AToMiC--AToMiC-Images-v0.2-275960c34975be87/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
Found cached dataset parquet (C:/Users/longk/.cache/huggingface/datasets/TREC-AToMiC___parquet/TREC-AToMiC--AToMiC-Qrels-v0.2-f1634624281fffa7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


In [2]:

image_ids = set(qrels["image_id"])
id2pos = {}
for pos, _id in tqdm(enumerate(images['image_id']), total=len(images['image_id'])):
    id2pos[_id] = pos


100%|██████████| 11019202/11019202 [00:03<00:00, 3187237.54it/s]


In [4]:
train_image_ids = []
for idx, image_id in tqdm(enumerate(images["image_id"]), total=len(images["image_id"])):
    if image_id in image_ids:
        train_image_ids.append(idx)

100%|██████████| 11019202/11019202 [00:03<00:00, 3285467.92it/s]


In [5]:
len(train_image_ids)

3386183

In [None]:
valid_image_ids = []
for image_id in tqdm(qrels["image_id"], total=len(qrels["image_id"])):
    valid_image_ids.append({'image': images[id2pos[image_id]]['image']})

In [15]:
len(valid_image_ids)

110

In [16]:
valid_image_ids[0]['image']

{'image': <PIL.WebPImagePlugin.WebPImageFile image mode=RGB size=443x256>}

In [17]:
import numpy as np
import os
np.save(os.path.join("../datasets/ATOMIC", f"{split}_images.npy"), valid_image_ids)

In [18]:
train_images = np.load("../datasets/ATOMIC/train_images.npy", allow_pickle=True)

In [19]:
train_images[0]

{'image': <PIL.WebPImagePlugin.WebPImageFile image mode=RGB size=443x256>}

# split the dataset

In [None]:
from tqdm.auto import tqdm
from collections import defaultdict

train_text_ids = set(dataset_qrels["train"]["text_id"])
validation_text_ids = set(dataset_qrels["validation"]["text_id"])
test_text_ids = set(dataset_qrels["test"]["text_id"])

train_image_ids = set(dataset_qrels["train"]["image_id"])
validation_image_ids = set(dataset_qrels["validation"]["image_id"])
test_image_ids = set(dataset_qrels["test"]["image_id"])

split_dict_texts = defaultdict(list)
split_dict_images = defaultdict(list)

for index, row in tqdm(
        enumerate(dataset_text['train']['text_id']), total=len(dataset_text['train']), desc="build split dict"
):
    if row in train_text_ids:
        split_dict_texts["train"].append(index)
    elif row in validation_text_ids:
        split_dict_texts["validation"].append(index)
    elif row in test_text_ids:
        split_dict_texts["test"].append(index)
    else:
        split_dict_texts["other"].append(index)

for index, row in tqdm(
        enumerate(dataset_image['train']['image_id']), total=len(dataset_image['train']), desc="build split dict"
):
    if row in train_image_ids:
        split_dict_images["train"].append(index)
    elif row in validation_image_ids:
        split_dict_images["validation"].append(index)
    elif row in test_image_ids:
        split_dict_images["test"].append(index)
    else:
        split_dict_images["other"].append(index)

