In [1]:
from geoimages import etl
import h5py

Using TensorFlow backend.


In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to local modules take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

### Unzip files, rename to include labels, generate metadata.

In [3]:
## Extract the image archive, rename the files, then build and save the
## metadata JSON covering every image.
all_zip_path = '../datasets/geotechnical_images.zip'
all_metadata_path = '../datasets/metadata_all.json'

images_md = etl.Images()
images_md.unzip_images(all_zip_path)
images_md.rename_images()

# md_all is indexable by 'images'; its length is reported as the image count.
md_all = images_md.generate_image_metadata()
images_md.write_json(md_all, all_metadata_path)

total_images = len(md_all['images'])
print('Total images: ' + str(total_images))

Total images: 29998


### ETL Training data.

In [4]:
## Sample images for the training set, then build and save the metadata JSON
## describing that sample.
train_dir = '../images/train'
train_metadata_path = '../datasets/metadata_train.json'

images_train = etl.Images()

# NOTE(review): dev_path is presumably the destination folder that
# sample_images fills with the sampled files -- confirm in etl.Images.
images_train.dev_path = train_dir
images_train.sample_images(sample_percent=0.70)

# images_path is switched to the training folder only after sampling, so the
# metadata generated below presumably describes the sampled files only.
images_train.images_path = train_dir
md_train = images_train.generate_image_metadata()
images_train.write_json(md_train, train_metadata_path)

# The count is read from the object's images_meta_data attribute, not md_train.
n_train_images = len(images_train.images_meta_data['images'])
print('Total training set images: ' + str(n_train_images))

Total training set images: 20998


In [8]:
## etl training data, save
# Convert the sampled training images into two array-like objects.
# NOTE(review): presumably X holds pixel data and Y the labels -- confirm
# against etl.Images.images_to_x_y.
X_Train_Orig, Y_Train_Orig = images_train.images_to_x_y()

# Write both arrays to HDF5 with maximum gzip compression. The context manager
# closes the file even when a create_dataset call raises, so a failed run
# cannot leave an open handle on a partially written file.
with h5py.File('../datasets/image_classification_train.h5', 'w') as hf:
    hf.create_dataset('X_Train_Orig', data=X_Train_Orig, compression="gzip", compression_opts=9)
    # Y is cast to a byte-string dtype ('S') before writing; presumably it is a
    # unicode string array, which h5py cannot store directly -- TODO confirm.
    hf.create_dataset('Y_Train_Orig', data=Y_Train_Orig.astype('S'), compression="gzip", compression_opts=9)

print('X_Train_Orig shape: ' + str(X_Train_Orig.shape))
print('Y_Train_Orig shape: ' + str(Y_Train_Orig.shape))

X_Train_Orig shape: (20998, 28, 28, 3)
Y_Train_Orig shape: (20998,)


### ETL Development data.

In [10]:
## Sample images for the development set, then build and save the metadata
## JSON describing that sample.
dev_dir = '../images/dev'
dev_metadata_path = '../datasets/metadata_dev.json'

images_dev = etl.Images()

# NOTE(review): the recorded count for this cell (5400) is 60% of the 9000
# images left after the training sample rather than 60% of all images, which
# suggests sample_images draws from the remaining pool -- confirm in
# etl.Images. If so, this cell must run after the training sample and is not
# idempotent.
images_dev.dev_path = dev_dir
images_dev.sample_images(sample_percent=0.60)

# images_path is switched to the dev folder only after sampling, so the
# metadata generated below presumably describes the sampled files only.
images_dev.images_path = dev_dir
md_dev = images_dev.generate_image_metadata()
images_dev.write_json(md_dev, dev_metadata_path)

# The count is read from the object's images_meta_data attribute, not md_dev.
n_dev_images = len(images_dev.images_meta_data['images'])
print('Total development set images: ' + str(n_dev_images))

Total development set images: 5400


In [12]:
## etl development data, save
# Convert the sampled development images into two array-like objects.
# NOTE(review): presumably X holds pixel data and Y the labels -- confirm
# against etl.Images.images_to_x_y.
X_Dev_Orig, Y_Dev_Orig = images_dev.images_to_x_y()

# Write both arrays to HDF5 with maximum gzip compression. The context manager
# closes the file even when a create_dataset call raises, so a failed run
# cannot leave an open handle on a partially written file.
with h5py.File('../datasets/image_classification_dev.h5', 'w') as hf:
    hf.create_dataset('X_Dev_Orig', data=X_Dev_Orig, compression="gzip", compression_opts=9)
    # Y is cast to a byte-string dtype ('S') before writing; presumably it is a
    # unicode string array, which h5py cannot store directly -- TODO confirm.
    hf.create_dataset('Y_Dev_Orig', data=Y_Dev_Orig.astype('S'), compression="gzip", compression_opts=9)

print('X_Dev_Orig shape: ' + str(X_Dev_Orig.shape))
print('Y_Dev_Orig shape: ' + str(Y_Dev_Orig.shape))

X_Dev_Orig shape: (5400, 28, 28, 3)
Y_Dev_Orig shape: (5400,)


### ETL Test data.

In [13]:
## Sample images for the test set, then build and save the metadata JSON
## describing that sample.
test_dir = '../images/test'
test_metadata_path = '../datasets/metadata_test.json'

images_test = etl.Images()

# NOTE(review): the recorded count for this cell (3564) is 99% of the 3600
# images left after the training and dev samples, which suggests
# sample_images draws from the remaining pool and leaves about 1% unused --
# confirm in etl.Images. If so, this cell must run after the dev sample and
# is not idempotent.
images_test.dev_path = test_dir
images_test.sample_images(sample_percent=0.99)

# images_path is switched to the test folder only after sampling, so the
# metadata generated below presumably describes the sampled files only.
images_test.images_path = test_dir
md_test = images_test.generate_image_metadata()
images_test.write_json(md_test, test_metadata_path)

# The count is read from the object's images_meta_data attribute, not md_test.
n_test_images = len(images_test.images_meta_data['images'])
print('Total test set images: ' + str(n_test_images))

Total test set images: 3564


In [16]:
## etl test data, save
# Convert the sampled test images into two array-like objects.
# NOTE(review): presumably X holds pixel data and Y the labels -- confirm
# against etl.Images.images_to_x_y.
X_Test_Orig, Y_Test_Orig = images_test.images_to_x_y()

# Write both arrays to HDF5 with maximum gzip compression. The context manager
# closes the file even when a create_dataset call raises, so a failed run
# cannot leave an open handle on a partially written file.
with h5py.File('../datasets/image_classification_test.h5', 'w') as hf:
    hf.create_dataset('X_Test_Orig', data=X_Test_Orig, compression="gzip", compression_opts=9)
    # Y is cast to a byte-string dtype ('S') before writing; presumably it is a
    # unicode string array, which h5py cannot store directly -- TODO confirm.
    hf.create_dataset('Y_Test_Orig', data=Y_Test_Orig.astype('S'), compression="gzip", compression_opts=9)

print('X_Test_Orig shape: ' + str(X_Test_Orig.shape))
print('Y_Test_Orig shape: ' + str(Y_Test_Orig.shape))

X_Test_Orig shape: (3564, 28, 28, 3)
Y_Test_Orig shape: (3564,)
