In [1]:
import os
import numpy as np
import shutil
import subprocess

from IPython.core.debugger import Tracer

In [2]:
# Dataset root. The cells below read `trainLabels.csv` and the `train/` folder
# from here and write the generated class-folder trees next to them.
data_dir = "/tanData/datasets/cifar10"

# Seed for the NumPy RandomState that shuffles image indices when the labeled
# subset is chosen.
seed_val = 0

In [3]:
# Build {image index -> class label} from trainLabels.csv; the first row is
# skipped as a header and every remaining row is split on ','.
labels_csv = os.path.join(data_dir, 'trainLabels.csv')
with open(labels_csv, 'r') as f:
    lines = f.readlines()[1:]
tokens = [row.rstrip().split(',') for row in lines]
idx_label = {int(idx): label for idx, label in tokens}
labels = set(idx_label.values())

# Every entry of the train/ folder is counted as one training image.
num_train = len(os.listdir(os.path.join(data_dir, 'train')))

# Per-class quota of images that go to train_data/ in the copy loop
# (90% of the training set, split evenly over the classes).
num_train_tuning = int(num_train * (1 - 0.1))
num_train_tuning_per_label = num_train_tuning // len(labels)

# Size of the labeled subset, overall and per class.
num_train_sup = 4000
num_train_sup_per_label = num_train_sup // len(labels)

# Whole number of unlabeled images per labeled image; the copy loop uses it as
# the number of duplicates written for each labeled image.
ratio_unsup_sup = (num_train - num_train_sup) // num_train_sup

# select labeled data
# Image indices 1..num_train are shuffled with a seeded generator; for each
# class, the first `num_train_sup_per_label` indices of that class in shuffled
# order form the labeled subset.
data_rng = np.random.RandomState(seed_val)
indx = data_rng.permutation(range(1, num_train + 1))
indx_sup = []

for c in labels:
    remaining = num_train_sup_per_label
    for i in indx:
        if remaining <= 0:
            # quota for this class is filled; stop scanning the permutation
            break
        if idx_label[i] == c:
            indx_sup.append(i)
            remaining -= 1

# Per-class counter of images already assigned to train_data/ (filled by the
# copy loop further down).
label_count = {}

def mkdir_if_not_exist(path):
    """Create a directory (and any missing parents) unless the path already exists.

    Args:
        path: sequence of path components; they are combined with
            ``os.path.join`` to form the directory to create.

    Returns:
        None. Nothing is done when the joined path already exists.
    """
    target = os.path.join(*path)
    if os.path.exists(target):
        return
    os.makedirs(target)
        
# Copy every file of train/ into class sub-folders of four trees under data_dir:
#   train_valid_sup/<label>/    labeled subset; each image is written
#                               `ratio_unsup_sup` times as '<idx>_<k>.png'
#   train_valid_unsup/<label>/  every image that is not in the labeled subset
#   train_data/<label>/         first `num_train_tuning_per_label` images seen
#                               for a class
#   valid_data/<label>/         all further images of that class
train_dir = os.path.join(data_dir, 'train')

# Set lookup instead of scanning the `indx_sup` list once per file. The values
# are cast to plain ints so they compare with the ints parsed from file names.
sup_ids = {int(i) for i in indx_sup}

# os.listdir returns entries in arbitrary order; sorting makes the
# train_data/valid_data assignment below reproducible between runs.
for train_file in sorted(os.listdir(train_dir)):
    idx = int(train_file.split('.')[0])  # file names are '<index>.<ext>'
    label = idx_label[idx]
    src = os.path.join(train_dir, train_file)

    if idx in sup_ids:
        mkdir_if_not_exist([data_dir, 'train_valid_sup', label])
        # Duplicate each labeled image so the labeled tree ends up with about
        # as many files as the unlabeled one.
        for copy_no in range(ratio_unsup_sup):
            shutil.copy(src,
                        os.path.join(data_dir, 'train_valid_sup', label,
                                     '%i_%i.png' % (idx, copy_no)))
    else:
        mkdir_if_not_exist([data_dir, 'train_valid_unsup', label])
        shutil.copy(src, os.path.join(data_dir, 'train_valid_unsup', label))

    # TODO: SPLIT SUP AND UNSUP FOR THIS PART
    if label_count.get(label, 0) < num_train_tuning_per_label:
        split = 'train_data'
        label_count[label] = label_count.get(label, 0) + 1
    else:
        split = 'valid_data'
    mkdir_if_not_exist([data_dir, split, label])
    shutil.copy(src, os.path.join(data_dir, split, label))

# subprocess.Popen(["/bin/bash", "make_rec_cifar10_semisup.sh", data_dir, "train_valid_sup", "cifar10_train_valid_sup"])
# subprocess.Popen(["/bin/bash", "make_rec_cifar10_semisup.sh", data_dir, "train_valid_unsup", "cifar10_train_valid_unsup"])

In [4]:
%%bash
# For each class-folder tree, build the image list and the .rec file with
# MXNet's im2rec.py, then delete the tree.
#
# Stop at the first failing command or unset variable: without this, a failed
# im2rec.py run would still fall through to the `rm -rf` of the source folders.
set -eu

DATA_DIR=/tanData/datasets/cifar10
data_name=( "train_valid_sup" "train_valid_unsup" )
list_name=( "cifar10_train_valid_sup" "cifar10_train_valid_unsup" )
MX_DIR=/mxnet

for ((i=0;i<${#data_name[@]};++i)); do
    src_dir="${DATA_DIR}/${data_name[i]}"
    prefix="${DATA_DIR}/${list_name[i]}"

    # clean stuffs (the glob stays outside the quotes so it still expands)
    rm -rf -- "${prefix}".*
    # make list for all classes
    python "${MX_DIR}/tools/im2rec.py" --list --exts '.png' --recursive "${prefix}" "${src_dir}"
    # make .rec file for all classes
    python "${MX_DIR}/tools/im2rec.py" --exts '.png' --quality 95 --num-thread 16 --color 1 "${prefix}" "${src_dir}"
    # remove folders
    rm -rf -- "${src_dir}"
done

airplane 0
automobile 1
bird 2
cat 3
deer 4
dog 5
frog 6
horse 7
ship 8
truck 9
Creating .rec file from /tanData/datasets/cifar10/cifar10_train_valid_sup.lst in /tanData/datasets/cifar10
time: 0.000874042510986  count: 0
time: 0.0772421360016  count: 1000
time: 0.0599570274353  count: 2000
time: 0.0572907924652  count: 3000
time: 0.0515990257263  count: 4000
time: 0.0481150150299  count: 5000
time: 0.0506100654602  count: 6000
time: 0.0502760410309  count: 7000
time: 0.0479800701141  count: 8000
time: 0.0484578609467  count: 9000
time: 0.0524051189423  count: 10000
time: 0.0533399581909  count: 11000
time: 0.0526969432831  count: 12000
time: 0.052994966507  count: 13000
time: 0.0576801300049  count: 14000
time: 0.0499358177185  count: 15000
time: 0.0518991947174  count: 16000
time: 0.0480008125305  count: 17000
time: 0.059005022049  count: 18000
time: 0.0603930950165  count: 19000
time: 0.0572309494019  count: 20000
time: 0.0517270565033  count: 21000
time: 0.0549068450928  count: 2200