# Amazon from Space to TFRecords

In [1]:
import tensorflow as tf
import numpy as np
import glob
from PIL import Image
import skimage.io

import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [12]:
data_path = '/data/amazon-from-space'
train_file_paths = f'{data_path}/train-jpg/'
test_file_paths = f'{data_path}/test-jpg/'
train_records_path = f'{data_path}/train-jpg.tfrecords'
test_records_path = f'{data_path}/test-jpg.tfrecords'
file_format = '.jpg'

In [13]:
train_info = pd.read_csv(data_path + '/train.csv')
test_info  = pd.read_csv(data_path + '/test.csv')

file_paths = (train_file_paths + train_info.image_name + file_format).values
labels = train_info.tags.str.split(' ').values

In [14]:
file_paths[:5]

array(['/data/amazon-from-space/train-jpg/train_0.jpg',
       '/data/amazon-from-space/train-jpg/train_1.jpg',
       '/data/amazon-from-space/train-jpg/train_2.jpg',
       '/data/amazon-from-space/train-jpg/train_3.jpg',
       '/data/amazon-from-space/train-jpg/train_4.jpg'], dtype=object)

In [5]:
labels[:5]

array([list(['haze', 'primary']),
       list(['agriculture', 'clear', 'primary', 'water']),
       list(['clear', 'primary']), list(['clear', 'primary']),
       list(['agriculture', 'clear', 'habitation', 'primary', 'road'])],
      dtype=object)

In [6]:
CLASSES = np.unique(sum(np.unique(labels).tolist(), []))

en = LabelEncoder().fit(CLASSES)

In [7]:
class AmazonFromSpace:
    classes_ = np.asarray(
        ['agriculture', 'artisinal_mine', 'bare_ground', 'blooming', 'blow_down',
         'clear', 'cloudy', 'conventional_mine', 'cultivation', 'habitation', 'haze',
         'partly_cloudy', 'primary', 'road', 'selective_logging', 'slash_burn', 'water'])

    @classmethod
    def int2str(cls, indices):
        return cls.classes_[indices]
    
    @staticmethod
    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.tobytes()]))

    @staticmethod
    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    
    @classmethod
    def convert(cls, file_paths, labels, writer):
        samples = len(file_paths)

        for ix, (p, label) in enumerate(zip(file_paths, labels)):
            img = skimage.io.imread(p)
            h, w, c = img.shape

            feature = {
              'height': cls._int64_feature(h),
              'width': cls._int64_feature(w),
              'channels': cls._int64_feature(c),
              'label': tf.train.Feature(int64_list=tf.train.Int64List(value=label)),
              'image': cls._bytes_feature(img),
            }

            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

            if ix % 100 == 0: print('.', end='')
            if ix % int(samples/10) == 0: print(f'\n{ix/samples:.0%}', end='')

## Copying Training Data

In [8]:
encoded_labels = [en.transform(l) for l in labels]
file_paths, encoded_labels[:5]

(array(['/data/amazon-from-space/train-jpg/train_0.tif',
        '/data/amazon-from-space/train-jpg/train_1.tif',
        '/data/amazon-from-space/train-jpg/train_2.tif', ...,
        '/data/amazon-from-space/train-jpg/train_40476.tif',
        '/data/amazon-from-space/train-jpg/train_40477.tif',
        '/data/amazon-from-space/train-jpg/train_40478.tif'], dtype=object),
 [array([10, 12]),
  array([ 0,  5, 12, 16]),
  array([ 5, 12]),
  array([ 5, 12]),
  array([ 0,  5,  9, 12, 13])])

In [16]:
# ! rm -rf /mnt/files/datasets/amazon-from-space/train.tfrecords

with tf.io.TFRecordWriter(train_records_path) as w:
    AmazonFromSpace.convert(file_paths, encoded_labels, w)

.0%........................................10%........................................20%.........................................30%........................................40%.........................................50%........................................60%.........................................70%........................................80%.........................................90%........................................100%

## Test Dataset

### Apply name fix proposed in discussions

In [None]:
import os

BASEPATH = '/mnt/files/datasets/amazon-from-space/'
WORKING = '/mnt/files/datasets/amazon-from-space/working'

CSVPATH = os.path.join(BASEPATH, 'test_v2_file_mapping.csv')
JPGPATH = os.path.join(BASEPATH, 'test-jpg-v2')
TIFPATH = os.path.join(BASEPATH, 'test-tif-v2')
FIXEDPATH = os.path.join(WORKING, 'fixed')

In [None]:
import shutil

def copy_and_rename(df, num_files=500):
    n = 0

    if not os.path.exists(FIXEDPATH):
        os.mkdir(FIXEDPATH)

    for index, row in df.iterrows():
        old = os.path.join(TIFPATH, row['old'])
        new = os.path.join(FIXEDPATH, row['new'])
        shutil.copy(old, new)
        n += 1
        if n % 500 == 0: print('Copied {}'.format(n))

In [None]:
copy_and_rename(test_info)

In [None]:
mv $TIFPATH /tmp/backup
mv $FIXEDPATH $test_file_paths

### Convert test dataset to tfrecords

In [17]:
import os

test_files = np.asarray([os.path.join(test_file_paths, f) for f in os.listdir(test_file_paths)])
test_labels = [[]]*len(test_files)

test_files[:5]

array(['/data/amazon-from-space/test-jpg/test_17823.jpg',
       '/data/amazon-from-space/test-jpg/test_29423.jpg',
       '/data/amazon-from-space/test-jpg/file_7472.jpg',
       '/data/amazon-from-space/test-jpg/file_11084.jpg',
       '/data/amazon-from-space/test-jpg/file_16408.jpg'], dtype='<U47')

In [18]:
# ! rm -rf $test_records_path

with tf.io.TFRecordWriter(test_records_path) as w:
    AmazonFromSpace.convert(test_files, test_labels, w)

.0%.............................................................10%.............................................................20%.............................................................30%.............................................................40%.............................................................50%..............................................................60%.............................................................70%.............................................................80%.............................................................90%.............................................................100%