# CIFAR-10 Dataset Maker

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import random

import tensorflow as tf
import numpy as np
import pandas as pd
import PIL.Image as Image

In [None]:
# Load the CIFAR-10 train and test splits, then merge them into a single pool
# of images and labels.
train_split, test_split = tf.keras.datasets.cifar10.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split
images = np.concatenate([train_images, test_images], axis=0)
labels = np.concatenate([train_labels, test_labels], axis=0)
print(f'Train: {len(train_images)}, Test: {len(test_images)}, Total: {len(images)}')

In [None]:
# Human-readable class names; the list position is the numeric CIFAR-10 label.
# https://www.tensorflow.org/api_docs/python/tf/keras/datasets/cifar10/load_data
labelnames = [
    'Airplane',    # 0
    'Automobile',  # 1
    'Bird',        # 2
    'Cat',         # 3
    'Deer',        # 4
    'Dog',         # 5
    'Frog',        # 6
    'Horse',       # 7
    'Ship',        # 8
    'Truck',       # 9
]
labelnames

In [None]:
# Load the data distribution table (csv), indexed by its 'Clients' column.
raw_table = pd.read_csv('div-cifar10.csv', index_col='Clients')
# Treat empty cells as zero and discard the 'Total' row and 'Total' column.
div = raw_table.fillna(0).drop(index=['Total'], columns=['Total'])
div

In [None]:
# Split every row of `div` into a '<client> train' row and a '<client> test'
# row, scaling the per-label counts by the matching ratio.
test_ratio = 0.15
train_ratio = 1 - test_ratio
row_index = []
row_data = []
for client, quota in div.iterrows():
    row_index.extend((f'{client} train', f'{client} test'))
    # np.rint rounds each scaled count to the nearest whole number.
    row_data.extend((np.rint(quota * train_ratio), np.rint(quota * test_ratio)))
df = pd.DataFrame(data=row_data, index=row_index, dtype=div.iloc[0].dtype)
df

In [None]:
# Count the images available for each label.
total = {}
for label_row in labels:
    # Each entry of `labels` is indexed with [0] to get the numeric class id.
    name = labelnames[label_row[0]]
    if name in total:
        total[name] += 1
    else:
        total[name] = 1
total

In [None]:
# Column sums: requested images per label across all client/split rows.
df.sum(axis=0)

In [None]:
# Row sums: requested images per client/split row across all labels.
df.sum(axis=1)

In [None]:
# Absolute path of the output directory; a leading '~' would be expanded first.
expanded_root = os.path.expanduser('dataset-cifar10')
dataset_root = os.path.abspath(expanded_root)

In [None]:
# Assign every image to a randomly chosen '<client> <split>' row of the quota
# table that still has room for the image's label, and save it as
# <dataset_root>/<client>/<split>/<label>/<NNNN>.jpg.
left = df.copy(deep=True)  # remaining quota per row and label; df is not modified
counter = {}  # images written so far per label; also used as the file number
skipped = {}  # images per label that found no row with remaining quota
for image_array, label_row in zip(images, labels):
    label = labelnames[label_row[0]]
    # Rows that can still accept an image of this label.
    candidates = left.loc[left.loc[:, label] > 0].index
    if len(candidates) == 0:
        # The quota for this label is used up; random.choice() would raise
        # IndexError on the empty index, so record the image and move on.
        skipped[label] = skipped.get(label, 0) + 1
        continue
    row_key = random.choice(candidates)
    # Split on the last space only, so client names may themselves contain spaces.
    party, split = row_key.rsplit(' ', 1)
    num = counter.get(label, 0)
    odir = os.path.join(dataset_root, f'{party}', f'{split}', label)
    os.makedirs(odir, exist_ok=True)
    opath = os.path.join(odir, f'{num:04d}.jpg')
    image = Image.fromarray(image_array)
    image.save(opath)
    left.loc[row_key, label] -= 1
    counter[label] = num + 1
if skipped:
    print(f'Skipped (no quota left): {skipped}')
# Whatever quota remains unfilled after all images have been processed.
left