# Fashion MNIST Dataset Maker

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import random

import tensorflow as tf
import numpy as np
import pandas as pd
import PIL.Image as Image

In [None]:
# Load the Fashion-MNIST arrays through Keras and pool the train and test
# parts into single `images` / `labels` arrays.
# https://www.tensorflow.org/api_docs/python/tf/keras/datasets/fashion_mnist
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.fashion_mnist.load_data()
)
# Train samples come first, test samples are appended after them.
images = np.concatenate([train_images, test_images], axis=0)
labels = np.concatenate([train_labels, test_labels], axis=0)
print(f'Train: {len(train_images)}, Test: {len(test_images)}, Total: {len(images)}')

In [None]:
# Human-readable class names; position i is the name used for label id i.
# These names are also used as directory names when the dataset is written.
# https://www.tensorflow.org/api_docs/python/tf/keras/datasets/fashion_mnist/load_data
labelnames = [
    'T-shirt',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]
labelnames

In [None]:
# Data distribution table (csv): one row per client, one column per label.
# Empty cells become 0 and the 'Total' summary row/column is discarded.
div = (
    pd.read_csv('div-fashionmnist.csv', index_col='Clients')
    .fillna(0)
    .drop(columns=['Total'])
    .drop(index=['Total'])
)
div

In [None]:
# Split every client's per-label quota into a train part and a test part.
# The result has two rows per client, named '<client> train' and
# '<client> test', holding the quota rounded to the nearest integer.
test_ratio = 0.15
train_ratio = 1 - test_ratio
row_index = []
row_data = []
for client, quota in div.iterrows():
    # Train row first, then test row, for each client.
    for split, ratio in (('train', train_ratio), ('test', test_ratio)):
        row_index.append(f'{client} {split}')
        row_data.append(np.rint(quota * ratio))
df = pd.DataFrame(data=row_data, index=row_index, dtype=div.iloc[0].dtype)
df

In [None]:
# Count how many images carry each label name
total = {}
for label_id in labels:
    name = labelnames[label_id]
    if name in total:
        total[name] += 1
    else:
        total[name] = 1
total

In [None]:
# Output directory for the generated dataset, resolved to an absolute path.
dataset_root = os.path.expanduser('dataset-fashionmnist')
dataset_root = os.path.abspath(dataset_root)
dataset_root

In [None]:
# Write dataset
# Every image is saved as <dataset_root>/<party>/<split>/<label>/<NNNN>.jpg.
# The '<party> <split>' row receiving an image is drawn at random among the
# rows of `df` that still have quota left for the image's label. `left` holds
# the remaining quota and is displayed at the end (all zeros when the quota
# table matches the number of images exactly).
left = df.copy(deep=True)
counter = {}
for pixels, label_id in zip(images, labels):
    label = labelnames[label_id]
    # Rows that can still accept an image of this label.
    candidates = left.index[(left[label] > 0).to_numpy()]
    if len(candidates) == 0:
        # Quota for this label is used up (table total is smaller than the
        # number of images): skip the surplus image instead of letting
        # random.choice() fail on an empty sequence.
        continue
    row_key = random.choice(candidates)
    # Row names are '<party> <split>'; split on the LAST space so that party
    # names containing spaces still unpack into exactly two parts.
    party, split = row_key.rsplit(' ', 1)
    # Running per-label counter gives each file of a label a unique number.
    num = counter.get(label, 0)
    odir = os.path.join(dataset_root, party, split, label)
    os.makedirs(odir, exist_ok=True)
    opath = os.path.join(odir, f'{num:04d}.jpg')
    Image.fromarray(pixels).save(opath)
    left.loc[row_key, label] -= 1
    counter[label] = num + 1
left

## Create symbolic links for each round

In [None]:
# Round table (csv): one row per client, one column per label; each cell is
# read as an integer round number. Empty cells become -1 and the 'Total'
# summary row/column is discarded.
found = (
    pd.read_csv('round-fashionmnist.csv', index_col='Clients')
    .fillna(-1)
    .astype('int')
    .drop(columns=['Total'])
    .drop(index=['Total'])
)
found

In [None]:
# Distinct round numbers present in the table, in ascending order
# (np.unique already returns its result sorted).
# -1 is the filler for empty cells and is not a round; it is filtered out
# rather than removed with list.remove(), which would raise ValueError for a
# table that has no empty cells.
found_round = [r for r in np.unique(found.values).tolist() if r != -1]
found_round

In [None]:
# For every round, collect the labels whose round number is <= that round.
# found_label['all'][round]    -> labels reached by at least one client
# found_label[client][round]   -> labels reached by that client
found_label = {}
for round_no in found_round:
    # True where a cell holds a real round number (> -1) not later than round_no.
    reached = (found > -1) & (found <= round_no)
    found_label.setdefault('all', {})[round_no] = found.columns[reached.any()]
    for client in found.index:
        found_label.setdefault(client, {})[round_no] = found.columns[reached.loc[client, :]]
found_label

In [None]:
# Root directory of the per-round scenario trees, placed inside the dataset directory.
scenario_root = os.path.join(dataset_root, 'scenario')
scenario_root

In [None]:
# Build the scenario tree: for each client and round, create
#   <scenario_root>/<client>/<round>/<subset>/<label>
# as a relative symlink to <dataset_root>/<client>/<subset>/<label>.
# NOTE: the link target is not checked for existence, so a client without
# images for a given label/subset gets a dangling link.
for c, rounds in found_label.items():
    if c == 'all':
        # 'all' is the union over clients, not a client directory.
        continue
    # Named `round_labels` (not `labels`) so the notebook-level `labels`
    # array created when the dataset was loaded is not overwritten.
    for fr, round_labels in rounds.items():
        for label in round_labels:
            for subset in ['train', 'test']:
                src_path = os.path.join(dataset_root, c, subset, label)
                scenario_base = os.path.join(scenario_root, c, f'{fr}', subset)
                os.makedirs(scenario_base, exist_ok=True)
                dst_path = os.path.join(scenario_base, label)
                if os.path.lexists(dst_path):
                    # Link left by an earlier run of this cell; os.symlink
                    # would raise FileExistsError, so keep it and move on.
                    continue
                # Relative target keeps the links valid if the whole dataset
                # directory is moved.
                src_relpath = os.path.relpath(src_path, scenario_base)
                os.symlink(src_relpath, dst_path)