In [14]:
from scipy.io import loadmat
from os import path
import os
from toolz import juxt, compose
import pandas as pd
from glob import iglob
from itertools import chain

In [2]:
data_dir = path.join(os.getenv('AZ_BATCHAI_INPUT_DATASET'), 'imagenet')

In [3]:
data = loadmat(path.join(data_dir, 'ILSVRC2012_devkit_t12', 'data', 'meta.mat'))

In [4]:
def _index_from(synset):
    return synset[0][0][0][0]

def _wnid_from(synset):
    return str(synset[0][1][0])

def _name_from(synset):
    return str(synset[0][2][0])

In [5]:
def _extract_from(synset):
    extract = juxt(_index_from, _wnid_from, _name_from)
    return extract(synset)

In [6]:
label_list = list(map(_extract_from, data['synsets']))

In [7]:
df = pd.DataFrame(label_list, columns=('num_index', 'wnid', 'label')).set_index('num_index')

In [8]:
train_dir=path.join(data_dir,'train')

In [9]:
def _extract_to_directory(wnid):
    out_dir = path.join(train_dir, wnid)
    tar_file = path.join(train_dir, '{}.tar'.format(wnid))
    print(out_dir)
    !mkdir -p $out_dir
    !tar -C $out_dir -xf $tar_file

In [30]:
filenames = [iglob(path.join(train_dir, wnid, '*.*')) for wnid in df.loc[1:1000]['wnid'].tolist()]

In [31]:
ff = list(chain(*filenames))

In [32]:
data_df = pd.DataFrame({'filenames':ff})

In [33]:
index_to_wnid_dict = df.loc[1:1000]['wnid'].to_dict()

In [34]:
wnid_labels = [path.split(name)[-1].split('_')[0] for name in ff]

In [35]:
data_df = data_df.assign(wnid=wnid_labels)

In [36]:
data_df = data_df.assign(num_id=data_df['wnid'].replace(to_replace=list(index_to_wnid_dict.values()), 
                                                                    value=list(index_to_wnid_dict.keys())))

In [48]:
extract_wnid_dir = compose(path.basename, path.dirname)

In [49]:
convert_filename = lambda x: path.join(extract_wnid_dir(x), path.basename(x))

In [51]:
data_df=data_df.assign(filenames=data_df['filenames'].apply(convert_filename))

In [52]:
data_df.head()

Unnamed: 0,filenames,wnid,num_id
0,n02119789/n02119789_12009.JPEG,n02119789,1
1,n02119789/n02119789_4083.JPEG,n02119789,1
2,n02119789/n02119789_14450.JPEG,n02119789,1
3,n02119789/n02119789_11832.JPEG,n02119789,1
4,n02119789/n02119789_5459.JPEG,n02119789,1


In [64]:
data_df.to_csv(path.join(data_dir, 'train.csv'))

### Validation data 

In [53]:
!mkdir -p {path.join(data_dir, 'validation')}

In [54]:
!tar -C {path.join(data_dir, 'validation')} -xf {path.join(data_dir, 'ILSVRC2012_img_val.tar')}

In [55]:
f=open(path.join(data_dir, 'ILSVRC2012_devkit_t12', 'data', 'ILSVRC2012_validation_ground_truth.txt'))

In [56]:
convert_label = compose(int, str.strip)

In [57]:
labels = list(map(convert_label, f.readlines()))

In [58]:
files = list(sorted(iglob(path.join(data_dir, 'validation', '*.JPEG'))))

In [68]:
valid_df=pd.DataFrame({'filenames':files, 'num_id':labels})

In [70]:
valid_df=valid_df.assign(filenames=valid_df['filenames'].apply(path.basename))

In [71]:
valid_df.head()

Unnamed: 0,filenames,num_id
0,ILSVRC2012_val_00000001.JPEG,490
1,ILSVRC2012_val_00000002.JPEG,361
2,ILSVRC2012_val_00000003.JPEG,171
3,ILSVRC2012_val_00000004.JPEG,822
4,ILSVRC2012_val_00000005.JPEG,297


In [65]:
!ls {data_dir}

ILSVRC2012_devkit_t12	      ILSVRC2012_img_train.tar	train.csv
ILSVRC2012_devkit_t12.tar.gz  ILSVRC2012_img_val.tar	validation
ILSVRC2012_img_test.tar       train			validation.csv


In [72]:
valid_df.to_csv(path.join(data_dir, 'validation.csv'))