In [1]:
import os
import cv2
import h5py
import json
import random
import numpy as np

from tqdm import tqdm
from collections import Counter

In [2]:
# --- Configuration -----------------------------------------------------------
dataset_name = "flickr8k"

# Root folder that holds the raw datasets. Set the CAPTION_DATA_DIR environment
# variable to relocate it on another machine without editing the notebook; the
# fallback is the original location.
data_dir = os.environ.get("CAPTION_DATA_DIR", "/home/pervinco/Datasets")

captions_per_image = 5    # number of captions stored for every image
min_word_frequency = 5    # words must occur more often than this to enter the vocabulary
max_sequence_length = 50  # captions with more tokens than this are discarded

# Every path below is derived from data_dir.
output_dir = f"{data_dir}/ImageCaption_Dataset"
image_dir = f"{data_dir}/Flickr8k_dataset/Images"
karpathy_caption_data_path = f"{data_dir}/karpathy_caption_datasets/dataset_{dataset_name}.json"

In [3]:
# Load the Karpathy-split caption file; its top-level keys are 'images' and 'dataset'.
with open(karpathy_caption_data_path, 'r') as f:
    data = json.load(f)

# Dataset name followed by the number of image records.
print(data['dataset'], len(data['images']), sep='\n')

# Raw view of the first two image records, then a blank separator line.
print(data['images'][0], data['images'][1], sep='\n')
print()

# Walk through the first record field by field; the 'sentences' field is
# additionally expanded so that each caption entry is printed on its own line.
for field, content in data['images'][0].items():
    print(field, content)

    if field != 'sentences':
        continue

    for caption in content:
        print(caption)

flickr8k
8000
{'sentids': [0, 1, 2, 3, 4], 'imgid': 0, 'sentences': [{'tokens': ['a', 'black', 'dog', 'is', 'running', 'after', 'a', 'white', 'dog', 'in', 'the', 'snow'], 'raw': 'A black dog is running after a white dog in the snow .', 'imgid': 0, 'sentid': 0}, {'tokens': ['black', 'dog', 'chasing', 'brown', 'dog', 'through', 'snow'], 'raw': 'Black dog chasing brown dog through snow', 'imgid': 0, 'sentid': 1}, {'tokens': ['two', 'dogs', 'chase', 'each', 'other', 'across', 'the', 'snowy', 'ground'], 'raw': 'Two dogs chase each other across the snowy ground .', 'imgid': 0, 'sentid': 2}, {'tokens': ['two', 'dogs', 'play', 'together', 'in', 'the', 'snow'], 'raw': 'Two dogs play together in the snow .', 'imgid': 0, 'sentid': 3}, {'tokens': ['two', 'dogs', 'running', 'through', 'a', 'low', 'lying', 'body', 'of', 'water'], 'raw': 'Two dogs running through a low lying body of water .', 'imgid': 0, 'sentid': 4}], 'split': 'train', 'filename': '2513260012_03d33305cf.jpg'}
{'sentids': [5, 6, 7, 8

In [4]:
def _read_image_chw(path, size=256):
    """Read an image with OpenCV and return it as a (3, size, size) array.

    Raises:
        OSError: if OpenCV cannot read the file (missing or undecodable).
    """
    # IMREAD_COLOR always produces a 3-channel image (grayscale files are
    # expanded by OpenCV), so no manual channel replication is required.
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError(f"cv2.imread could not read image (missing or unreadable file): {path}")

    # NOTE(review): OpenCV yields channels in BGR order and they are stored
    # as-is. Confirm that the consumer of the HDF5 file expects BGR (or
    # converts), especially if it applies RGB-ordered normalisation.
    img = cv2.resize(img, (size, size))
    img = img.transpose(2, 0, 1)  # height, width, channel -> channel, height, width
    assert img.shape == (3, size, size)
    assert np.max(img) <= 255
    return img


def create_input_files(ds_name, json_path, img_ds_path, captions_per_image, min_word_freq, output_path, max_seq_len=100):
    """Build the word map, image and caption files for train/val/test splits.

    Files written to ``output_path`` (``<base>`` encodes the dataset name,
    ``captions_per_image`` and ``min_word_freq``):

    * ``WORDMAP_<base>.json``          - token -> integer id mapping
    * ``<SPLIT>_IMAGES_<base>.hdf5``   - dataset 'images', shape (N, 3, 256, 256), uint8
    * ``<SPLIT>_CAPTIONS_<base>.json`` - encoded captions, ``captions_per_image`` per image
    * ``<SPLIT>_CAPLENS_<base>.json``  - caption lengths including <sos> and <eos>

    where ``<SPLIT>`` is TRAIN, VAL or TEST.

    Args:
        ds_name: one of "coco", "flickr8k", "flickr30k".
        json_path: JSON file with an 'images' list; every entry provides
            'sentences' (each with a 'tokens' list), 'split' and 'filename'
            (plus 'filepath' when ``ds_name`` is "coco").
        img_ds_path: folder containing the image files.
        captions_per_image: number of captions stored per image. Images with
            fewer usable captions are padded by re-drawing from their own
            captions; images with more are randomly sub-sampled.
        min_word_freq: a word is kept in the vocabulary only if it occurs
            strictly more than this many times; other words map to <unk>.
        output_path: folder for the generated files (created if missing).
        max_seq_len: captions with more tokens than this are dropped.

    Raises:
        AssertionError: if ``ds_name`` is not a supported dataset name.
        OSError: if an image file cannot be read.
    """
    assert ds_name in {"coco", "flickr8k", "flickr30k"}
    os.makedirs(output_path, exist_ok=True)

    with open(json_path, 'r') as json_file:
        data = json.load(json_file)

    word_freq = Counter()
    # Output split name -> parallel lists of image paths and caption lists.
    split_images = {'TRAIN': [], 'VAL': [], 'TEST': []}
    split_captions = {'TRAIN': [], 'VAL': [], 'TEST': []}
    # Split label in the JSON -> output split ('restval' is folded into TRAIN).
    split_names = {'train': 'TRAIN', 'restval': 'TRAIN', 'val': 'VAL', 'test': 'TEST'}

    for img in data['images']:
        captions = []
        for c in img['sentences']:  # every image comes with several captions (sentences)
            # Word frequencies are counted over all captions, including the
            # ones dropped below for being too long.
            word_freq.update(c['tokens'])

            if len(c['tokens']) <= max_seq_len:
                captions.append(c['tokens'])  # keep captions of at most max_seq_len tokens

        if len(captions) == 0:
            continue

        if ds_name == "coco":
            path = os.path.join(img_ds_path, img['filepath'], img['filename'])
        else:
            path = os.path.join(img_ds_path, img['filename'])

        split = split_names.get(img['split'])
        if split is None:
            continue  # images with any other split label are ignored

        split_images[split].append(path)
        split_captions[split].append(captions)

    # Ids 1..len(words) go to regular words, the special tokens follow, and 0
    # is reserved for padding.
    words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    word_map = {k: v + 1 for v, k in enumerate(words)}
    word_map['<unk>'] = len(word_map) + 1
    word_map['<sos>'] = len(word_map) + 1
    word_map['<eos>'] = len(word_map) + 1
    word_map['<pad>'] = 0

    # Fixed seed so the caption sampling below is reproducible.
    random.seed(123)
    base_filename = ds_name + '_' + str(captions_per_image) + '_cap_per_img_' + str(min_word_freq) + '_min_word_freq'
    with open(os.path.join(output_path, 'WORDMAP_' + base_filename + '.json'), 'w') as json_file:
        json.dump(word_map, json_file)

    for split in ('TRAIN', 'VAL', 'TEST'):
        impaths, imcaps = split_images[split], split_captions[split]
        assert len(impaths) == len(imcaps)

        # Mode 'w' replaces any file left by a previous run; with 'a' a re-run
        # fails in create_dataset because 'images' already exists.
        with h5py.File(os.path.join(output_path, split + "_IMAGES_" + base_filename + ".hdf5"), 'w') as h:
            h.attrs["captions_per_image"] = captions_per_image
            images = h.create_dataset('images', (len(impaths), 3, 256, 256), dtype='uint8')
            print("Reading %s images and captions, storing to file...\n" % split)

            caplens = []
            enc_captions = []
            for i, path in enumerate(tqdm(impaths)):
                if len(imcaps[i]) < captions_per_image:
                    # Too few captions: pad by re-drawing from the same image's captions.
                    captions = imcaps[i] + [random.choice(imcaps[i]) for _ in range(captions_per_image - len(imcaps[i]))]
                else:
                    captions = random.sample(imcaps[i], k=captions_per_image)

                assert len(captions) == captions_per_image

                images[i] = _read_image_chw(path, 256)

                for c in captions:
                    # <sos> + word ids + <eos>, padded to a fixed length of max_seq_len + 2.
                    enc_c = [word_map['<sos>']] + [word_map.get(word, word_map['<unk>']) for word in c] + [word_map['<eos>']] + [word_map['<pad>']] * (max_seq_len - len(c))

                    # Caption length counts <sos> and <eos> but not the padding.
                    c_len = len(c) + 2

                    enc_captions.append(enc_c)
                    caplens.append(c_len)

            # Sanity check
            assert images.shape[0] * captions_per_image == len(enc_captions) == len(caplens)

            # Save encoded captions and their lengths to JSON files
            with open(os.path.join(output_path, split + '_CAPTIONS_' + base_filename + '.json'), 'w') as json_file:
                json.dump(enc_captions, json_file)

            with open(os.path.join(output_path, split + '_CAPLENS_' + base_filename + '.json'), 'w') as json_file:
                json.dump(caplens, json_file)


In [5]:
# Generate the word map, HDF5 image files and caption JSON files using the
# values defined in the configuration cell.
create_input_files(
    ds_name=dataset_name,
    json_path=karpathy_caption_data_path,
    img_ds_path=image_dir,
    captions_per_image=captions_per_image,
    min_word_freq=min_word_frequency,
    output_path=output_dir,
    max_seq_len=max_sequence_length,
)

Reading TRAIN images and captions, storing to file...



100%|██████████| 6000/6000 [00:12<00:00, 476.29it/s]


Reading VAL images and captions, storing to file...



100%|██████████| 1000/1000 [00:02<00:00, 460.56it/s]


Reading TEST images and captions, storing to file...



100%|██████████| 1000/1000 [00:02<00:00, 460.19it/s]
