# Preparing the dataset

We build an image dataset stored as TFRecord files, with the images encoded in WebP format.

In [None]:
# install dependencies
!pip install img2dataset tensorflow tensorflow_io wandb

## Getting URL list

We use the [Open Images Dataset](https://storage.googleapis.com/openimages/web/index.html).

In [None]:
# Download OpenImages (train shard 0 + validation set)
!wget https://storage.googleapis.com/cvdf-datasets/oid/open-images-dataset-validation.tsv -O open-images-dataset-validation.tsv
!wget https://storage.googleapis.com/cvdf-datasets/oid/open-images-dataset-train0.tsv -O open-images-dataset-train0.tsv

We reformat the input files to keep only the URLs.

In [None]:
import pandas as pd

# Reduce each downloaded TSV to a single "url" column, one output file per split.
split_by_tsv = {
    "open-images-dataset-validation.tsv": "valid",
    "open-images-dataset-train0.tsv": "train",
}
for tsv_path, split in split_by_tsv.items():
    # Skip the first line of the file and keep only column 0, named "url".
    df = pd.read_csv(
        tsv_path,
        sep="\t",
        usecols=[0],
        names=["url"],
        skiprows=1,
    )
    # Written tab-separated, with a header row and without the index.
    df.to_csv(f"{split}.txt", sep="\t", index=False)

The datasets may be large, so we truncate each URL list to a fixed maximum number of entries.

In [None]:
# Truncate each URL list, in place, to at most `max_items` rows.
for (path, max_items) in [('train.txt', 10_000), ('valid.txt', 5_000)]:
    # Read with the same tab separator used when writing the file back below;
    # the default comma separator would split any URL that contains a comma.
    df = pd.read_csv(path, sep="\t")
    # Report the number of rows actually kept: the file may hold fewer than max_items.
    print(f"{path}: keeping {min(max_items, len(df))} / {len(df)}")
    df = df[:max_items]
    df.to_csv(path, sep="\t", index=False)

## Download images

In [None]:
!mkdir openimages

In [None]:
# Settings for the validation split; each one is forwarded to the img2dataset
# command line option of the same name (input_file goes to --url_list).

# input list and output location
input_file = "valid.txt"
output_folder = "openimages/valid"

# size / aspect-ratio options
min_image_size = 512
max_aspect_ratio = 2.0
image_size = 256

# encoding options
encode_format = "webp"
encode_quality = 100

# sharding and parallelism options
number_sample_per_shard = 1000
processes_count = 80
thread_count = 16

In [None]:
!img2dataset \
  --url_list $input_file \
  --image_size $image_size \
  --output_folder $output_folder \
  --processes_count $processes_count \
  --thread_count $thread_count \
  --resize_mode center_crop \
  --encode_quality $encode_quality \
  --encode_format $encode_format \
  --output_format tfrecord \
  --number_sample_per_shard $number_sample_per_shard \
  --extract_exif false \
  --min_image_size $min_image_size \
  --max_aspect_ratio $max_aspect_ratio \
  --enable_wandb

In [None]:
# Train split: override only the settings that differ from the validation run;
# every other parameter keeps the value assigned earlier.
number_sample_per_shard = 2000
image_size = 304  # training will apply a random crop to these images
output_folder = "openimages/train"
input_file = "train.txt"

In [None]:
!img2dataset \
  --url_list $input_file \
  --image_size $image_size \
  --output_folder $output_folder \
  --processes_count $processes_count \
  --thread_count $thread_count \
  --resize_mode center_crop \
  --encode_quality $encode_quality \
  --encode_format $encode_format \
  --output_format tfrecord \
  --number_sample_per_shard $number_sample_per_shard \
  --extract_exif false \
  --min_image_size $min_image_size \
  --max_aspect_ratio $max_aspect_ratio \
  --enable_wandb

## Dataloader

The images have been saved as TFRecord files, which we now load through the `Dataset` helper.

In [None]:
from vit_vqgan.data import Dataset, logits_to_image
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Build the data pipeline from the train / validation shard folders.
# The size and aspect-ratio values repeat the settings used for the download step.
dataset = Dataset(
    train_folder="openimages/train",
    valid_folder="openimages/valid",
    train_batch_size=100,
    valid_batch_size=100,
    image_size=256,
    min_original_image_size=512,
    max_original_aspect_ratio=2.0,
)

In [None]:
# Pull the first batch from the training split and display it
train_batches = iter(dataset.train)
sample_batch = next(train_batches)
sample_batch

In [None]:
# Draw the first nine elements of the batch on a 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(10, 10))
for idx, ax in enumerate(axes.flat):
    # logits_to_image converts one batch element into something imshow can draw
    ax.imshow(logits_to_image(sample_batch[idx], format=dataset.format))