# Imports

In [None]:
import os
import json
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import matplotlib
# Draw every figure on a white background.
matplotlib.rcParams['figure.facecolor'] = 'white'

In [None]:
%run ../imagenet.py

# Prepare metadata

In [None]:
import scipy.io as sio

In [None]:
# Base folder that holds the ImageNet files. With the empty default, every
# path built from it is relative to the current working directory.
DATA_DIR = '' # Fill with base folder if needed

In [None]:
# Folder of the ILSVRC2012 development kit, located directly under DATA_DIR.
DEVKIT_DIR = os.path.join(DATA_DIR, 'ILSVRC2012_devkit_t12')

In [None]:
def parse_meta_mat(devkit_root):
    """Read the synset table of the devkit and build the class lookup maps.

    Adapted from the torchvision ImageNet dataset:
    https://pytorch.org/vision/stable/_modules/torchvision/datasets/imagenet.html#ImageNet

    Args:
        devkit_root: folder that contains ``data/meta.mat``.

    Returns:
        A pair ``(idx_to_wnid, wnid_to_classes)``. The first maps the class
        index stored in the file to its WordNet id; the second maps each
        WordNet id to a tuple with the names of the class. Only synsets
        without children are included.
    """
    meta_path = os.path.join(devkit_root, "data", "meta.mat")
    synsets = sio.loadmat(meta_path, squeeze_me=True)['synsets']

    # Field 4 of every record is its number of children; keep the leaves only.
    leaves = [record for record in synsets if record[4] == 0]

    # Transpose the records and take their first three fields:
    # class index, WordNet id and the comma-separated class names.
    indices, wnids, names = list(zip(*leaves))[:3]

    idx_to_wnid = dict(zip(indices, wnids))
    wnid_to_classes = {
        wnid: tuple(name.split(', '))
        for wnid, name in zip(wnids, names)
    }
    return idx_to_wnid, wnid_to_classes

In [None]:
# Build both lookup tables from the devkit and show how many entries each has.
idx_to_wnid, wnid_to_classes = parse_meta_mat(DEVKIT_DIR)
len(idx_to_wnid), len(wnid_to_classes)

In [None]:
# WordNet ids listed by ascending class index; the indices are looked up
# starting at 1, so position 0 of the list holds class index 1.
wnid_ordered = [
    idx_to_wnid[class_idx]
    for class_idx in range(1, len(idx_to_wnid) + 1)
]
len(wnid_ordered), wnid_ordered[:3]

In [None]:
# First listed name of every class, in the same order as wnid_ordered.
classes_ordered = [wnid_to_classes[synset_id][0] for synset_id in wnid_ordered]
len(classes_ordered), classes_ordered[:3]

In [None]:
# Write the ordered WordNet ids to dataset/wnids.txt, one id per line.
with open(os.path.join(DATA_DIR, 'dataset', 'wnids.txt'), 'w') as f:
    f.writelines(f'{wnid}\n' for wnid in wnid_ordered)

In [None]:
# Save the wnid -> class-names mapping as JSON; the tuples of names are
# serialized as JSON arrays.
with open(os.path.join(DATA_DIR, 'dataset', 'wnid_to_label.json'), 'w') as f:
    json.dump(wnid_to_classes, f, indent=2)

## Validation metadata

In [None]:
# The ground-truth file holds one class index per line, counted from 1.
# Subtract 1 from each so the values are 0-based list positions.
fpath = os.path.join(DEVKIT_DIR, 'data', 'ILSVRC2012_validation_ground_truth.txt')
val_gts = []
with open(fpath, 'r') as f:
    for line in f:
        val_gts.append(int(line.strip()) - 1)
len(val_gts)

In [None]:
# File names found in the validation image folder, in lexicographic order.
image_names = os.listdir(os.path.join(DATA_DIR, 'dataset', 'images', 'val'))
image_names.sort()
len(image_names), image_names[:3]

In [None]:
# Pair each validation image with the wnid of its ground-truth class. The
# pairing is purely positional: the i-th file name (in sorted order) receives
# the i-th label of the ground-truth file.
if len(image_names) != len(val_gts):
    # zip() would silently stop at the shorter list, and an extra or missing
    # file could leave the labels shifted against the images, so stop here.
    raise ValueError(
        f'Found {len(image_names)} validation images '
        f'but {len(val_gts)} ground-truth labels'
    )

metadata = [
    (image_name, wnid_ordered[gt_idx])
    for image_name, gt_idx in zip(image_names, val_gts)
]
len(metadata), metadata[:3]

In [None]:
# One row per validation image: its file name and the wnid of its class.
df = pd.DataFrame(metadata, columns=['image_name', 'wnid'])
df.head(3)

In [None]:
# Save the validation metadata, leaving out the DataFrame index column.
df.to_csv(os.path.join(DATA_DIR, 'dataset', 'val_metadata.csv'), index=False)

## Train data

In [None]:
# Collect the per-class folders of the training set. Entries whose name
# contains '.tar' are skipped as before; in addition, only real directories
# are kept, because each entry is listed with os.listdir() later and a stray
# file would make that call fail.
parent_folder = os.path.join(DATA_DIR, 'ILSVRC2012_img_train')
folders = [
    name
    for name in os.listdir(parent_folder)
    if '.tar' not in name and os.path.isdir(os.path.join(parent_folder, name))
]
len(folders)

In [None]:
# One (relative image path, wnid) pair per training image; the folder name
# is used as the wnid and as the first component of the relative path.
metadata = [
    (os.path.join(folder, image_name), folder)
    for folder in folders
    for image_name in os.listdir(os.path.join(parent_folder, folder))
]
len(metadata)

In [None]:
# One row per training image: its relative path and the wnid of its class.
df = pd.DataFrame(metadata, columns=['image_name', 'wnid'])
print(len(df))
df.head()

In [None]:
# Save the training metadata, leaving out the DataFrame index column.
fpath = os.path.join(DATA_DIR, 'dataset', 'train_metadata.csv')
df.to_csv(fpath, index=False)

# Compute mean and std

In [None]:
%run ../../utils/images.py

In [None]:
# NOTE(review): DATASET_DIR is not assigned anywhere in this notebook;
# presumably it is brought in by one of the %run scripts — confirm.
# Only the image names are kept; the DataFrame is deleted once they are read.
df = pd.read_csv(os.path.join(DATASET_DIR, 'train_metadata.csv'))
train_images = list(df['image_name'])
del df
len(train_images)

In [None]:
# Folder that the training image names are resolved against.
images_dir = os.path.join(DATASET_DIR, 'images', 'train')

In [None]:
# NOTE(review): compute_mean_std and ImageFolderIterator are not defined in
# this notebook; presumably they come from the utils/images.py script that
# is run above — confirm their contract there.
mean, std = compute_mean_std(ImageFolderIterator(images_dir, train_images), show=True)
mean, std

In [None]:
# Display the computed statistics again.
mean, std

# Create mini-imagenet

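A smaller version of the dataset that keeps every class and draws the same number of samples from each one. The chosen samples are flagged in a `mini` column of the metadata CSV.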

In [None]:
import random

In [None]:
# Split to subsample. The sampling cell below uses a different number of
# samples per class for 'val' than for any other value.
split = 'train'

In [None]:
# Load the metadata of the chosen split. fpath is kept because the same file
# is written back at the end of this section.
fpath = os.path.join(DATASET_DIR, f'{split}_metadata.csv')
df = pd.read_csv(fpath)
df.head()

In [None]:
# Map every wnid to the list of image names that belong to it.
grouped_by_class = {
    wnid: list(image_names_of_class)
    for wnid, image_names_of_class in df.groupby('wnid')['image_name']
}
len(grouped_by_class)

In [None]:
# Draw the same number of images from every class, without replacement:
# 10 per class for the 'val' split and 100 per class otherwise.
n_samples_by_class = 100
if split == 'val':
    n_samples_by_class = 10

chosen_samples = [
    image_name
    for samples_by_class in grouped_by_class.values()
    for image_name in random.sample(samples_by_class, n_samples_by_class)
]
len(chosen_samples)

In [None]:
# Convert to a set so the membership tests in the next cell are fast.
chosen_samples = set(chosen_samples)

In [None]:
# Flag the chosen images: 1 when the image belongs to the mini subset, 0 otherwise.
is_chosen = df['image_name'].isin(chosen_samples)
df['mini'] = is_chosen.astype(int)
df.head()

In [None]:
# Overwrite the split's metadata CSV, which now includes the 'mini' column.
df.to_csv(fpath, index=False)

# Load one sample

In [None]:
from PIL import Image
from collections import Counter
from tqdm.auto import tqdm
import warnings

In [None]:
# Raise every warning as an exception so it can be caught with try/except.
warnings.filterwarnings('error')

In [None]:
# Load the metadata of the split whose images are checked below.
split = 'train'
df = pd.read_csv(os.path.join(DATASET_DIR, f'{split}_metadata.csv'))
df.head()

## Check errors

- Check samples with errors
- Get shapes (failed)

In [None]:
# Scan every image of the split, counting the image sizes that occur and
# collecting the files whose opening raises a warning (warnings are promoted
# to exceptions by the warnings.filterwarnings('error') call made earlier).
shapes = Counter()
errors = []

for image_name in tqdm(df['image_name']):
    image_path = os.path.join(DATASET_DIR, 'images', split, image_name)

    try:
        image = Image.open(image_path)
    except UserWarning as e:
        errors.append((image_name, e))
        # Skip the size bookkeeping: `image` was not assigned for this file,
        # so counting here would fail or re-count the previous image.
        continue

    # The context manager closes the file even if reading the size fails.
    with image:
        shapes[image.size] += 1

len(shapes)

In [None]:
# Number of distinct image sizes recorded and number of files that raised.
len(shapes), len(errors)

# Fix EXIF errors

In [None]:
# Open one specific training image, convert it to RGB and display it.
fpath = os.path.join(DATASET_DIR, 'images/train/n04152593/n04152593_17460.JPEG')

# The context manager closes the file even when convert() raises, which can
# happen here because warnings are turned into exceptions in this notebook.
with Image.open(fpath) as image_fp:
    image = image_fp.convert('RGB')

plt.imshow(image)
print(image.size)

In [None]:
import piexif

In [None]:
# Strip the EXIF data from the image at fpath.
# NOTE(review): with a single argument this presumably rewrites the file in
# place — confirm against the piexif documentation before running on originals.
piexif.remove(fpath)

# Load class

In [None]:
%run ../imagenet.py
%run ../__init__.py
%run ../../utils/common.py

In [None]:
# NOTE(review): ImageNetDataset is not defined in this notebook; presumably it
# is provided by the %run of ../imagenet.py — confirm.
dataset = ImageNetDataset('train')
len(dataset)

In [None]:
# Fetch one item with a negative index and inspect its image size and labels.
item = dataset[-10]
item.image.size(), item.labels

In [None]:
# Show the image, titled with the class name looked up from its label.
# permute(1, 2, 0) moves the first axis to the end; presumably the image is
# (channels, height, width) and imshow needs channels last — confirm.
plt.title(dataset._wnid_to_label_name[dataset.labels[item.labels]])
plt.imshow(tensor_to_range01(item.image).permute(1, 2, 0))

In [None]:
# Build a dataloader through the project helper and show its dataset size.
# NOTE(review): prepare_data_classification is not defined in this notebook;
# presumably it comes from one of the %run scripts above — confirm.
kwargs = {
    'dataset_name': 'imagenet',
    'dataset_type': 'train',
    'shuffle': True,
}
dataloader = prepare_data_classification(**kwargs)
len(dataloader.dataset)

In [None]:
# Pull only the first batch; `batch` stays bound after the loop exits.
for batch in dataloader:
    break