This notebook downloads the Omniglot dataset and preprocesses it for meta-learning.

# Download Omniglot

In [10]:
# Fetch the two Omniglot archives into the shared data folder.
# -nc (no-clobber) stops a re-run of this cell from saving duplicate copies
# (images_background.zip.1, ...) next to an archive that is already there.
!wget -nc -P ../../data/omniglot/ https://github.com/brendenlake/omniglot/raw/master/python/images_background.zip
!wget -nc -P ../../data/omniglot/ https://github.com/brendenlake/omniglot/raw/master/python/images_evaluation.zip

--2020-01-12 19:05:21--  https://github.com/brendenlake/omniglot/raw/master/python/images_background.zip
Resolving github.com (github.com)... 140.82.118.3
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/brendenlake/omniglot/master/python/images_background.zip [following]
--2020-01-12 19:05:21--  https://raw.githubusercontent.com/brendenlake/omniglot/master/python/images_background.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.24.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.24.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9464212 (9.0M) [application/zip]
Saving to: ‘../../data/omniglot/images_background.zip’


2020-01-12 19:05:22 (83.3 MB/s) - ‘../../data/omniglot/images_background.zip’ saved [9464212/9464212]

--2020-01-12 19:05:22--  https://github.com/brendenlake/omniglot

In [1]:
# Unpack both archives next to each other in the data folder.
# -o overwrites files that already exist instead of asking, so the cell
# cannot stall on an interactive prompt when it is run a second time.
!unzip -q -o ../../data/omniglot/images_background.zip -d ../../data/omniglot/
!unzip -q -o ../../data/omniglot/images_evaluation.zip -d ../../data/omniglot/

In [7]:
# The downloaded archives are no longer needed once unpacked.
# Only plain files are targeted, so the recursive flag is dropped on purpose.
!rm -f ../../data/omniglot/*.zip

# Process raw data

In [2]:
from skimage import io
from skimage import transform
import zipfile
import shutil
import os

In [3]:
# Base directory (relative to this notebook) that holds the Omniglot image folders
DATA_PATH = '../../data/omniglot/'

## Directory functions

In [4]:
def mkdir(dir):
    """Create a directory, ignoring filesystem errors

    Only the last path component is created; parent directories are not.

    # Arguments:
        dir: Path of directory to create
    """
    try:
        os.mkdir(dir)
    except OSError:
        # Best-effort by design: an existing directory, a missing parent or a
        # permission problem is skipped silently. Anything that is not a
        # filesystem error (e.g. KeyboardInterrupt) is allowed to propagate.
        pass


def rmdir(dir):
    """Recursively remove a directory and contents, ignoring filesystem errors

    # Arguments:
        dir: Path of directory to recursively remove
    """
    try:
        shutil.rmtree(dir)
    except OSError:
        # Best-effort by design: a missing directory or a permission problem
        # is skipped silently. Anything that is not a filesystem error
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        pass

# Augmenting and normalising functions

Rotate each character image, scale it from 105x105 to 28x28 and normalise the pixel values. Requires skimage v0.14.0.

In [5]:
# Parameters
output_shape = (28, 28)

def handle_characters(alphabet_folder, character_folder, rotate):
    """Rotate, resize and normalise every image of one character

    Walks `character_folder` and, for each directory visited, writes the
    processed copy of every file in it to
    `{alphabet_folder}.{rotate}/<directory name>/<file name>`.

    # Arguments:
        alphabet_folder: Path of the source alphabet folder; ".{rotate}" is
            appended to it to form the output alphabet folder
        character_folder: Path of the folder holding the images of a character
        rotate: Rotation angle handed to `transform.rotate`
    """
    for root, _, character_images in os.walk(character_folder):
        # normpath drops a trailing separator, which would otherwise leave
        # the character name empty
        character_name = os.path.basename(os.path.normpath(root))
        target_folder = os.path.join(f'{alphabet_folder}.{rotate}', character_name)
        mkdir(target_folder)
        for img_path in character_images:
            img = io.imread(os.path.join(root, img_path))
            img = transform.rotate(img, angle=rotate)
            img = transform.resize(img, output_shape, anti_aliasing=True)
            # Min-max scale; a constant image has zero range and is mapped to
            # all zeros instead of dividing by zero (which would give NaNs)
            lowest, highest = img.min(), img.max()
            if highest > lowest:
                img = (img - lowest) / (highest - lowest)
            else:
                img = img - lowest
            io.imsave(os.path.join(target_folder, img_path), img)


def handle_alphabet(folder):
    """Build the rotated copies of an alphabet, then remove the original

    # Arguments:
        folder: Path of the alphabet folder to process
    """
    alphabet_name = folder.split('/')[-1]
    print('{}...'.format(alphabet_name))

    for rotate in (0, 90, 180, 270):
        # One output folder per rotation, named "<folder>.<angle>"
        augmented_folder = f'{folder}.{rotate}'
        mkdir(augmented_folder)

        # Every sub-directory met while walking the alphabet is handed over
        # as a character folder; its processed images land in the new folder
        for parent, subfolders, _files in os.walk(folder):
            for subfolder in subfolders:
                character_path = parent + '/' + subfolder
                handle_characters(folder, character_path, rotate)

    # Delete original alphabet
    rmdir(folder)

# Start processing

In [6]:
# Both image sets go through exactly the same steps.
for set_label, set_folder in (('background', 'images_background'),
                              ('evaluation', 'images_evaluation')):
    print(f'Processing {set_label} set...')
    set_path = os.path.join(DATA_PATH, set_folder)
    # Only the immediate sub-folders are alphabets. Taking just the first
    # os.walk entry avoids descending into folders that handle_alphabet is
    # creating and deleting while the loop runs; a missing set folder still
    # yields nothing instead of raising.
    _, alphabets, _ = next(os.walk(set_path), (set_path, [], []))
    for alphabet in sorted(alphabets):
        handle_alphabet(os.path.join(set_path, alphabet))

Processing background set...
Alphabet_of_the_Magi...


  warn("The default mode, 'constant', will be changed to 'reflect' in "
  .format(dtypeobj_in, dtypeobj_out))


Anglo-Saxon_Futhorc...
Arcadian...
Armenian...
Asomtavruli_(Georgian)...
Balinese...
Bengali...
Blackfoot_(Canadian_Aboriginal_Syllabics)...
Braille...
Burmese_(Myanmar)...
Cyrillic...
Early_Aramaic...
Futurama...
Grantha...
Greek...
Gujarati...
Hebrew...
Inuktitut_(Canadian_Aboriginal_Syllabics)...
Japanese_(hiragana)...
Japanese_(katakana)...
Korean...
Latin...
Malay_(Jawi_-_Arabic)...
Mkhedruli_(Georgian)...
N_Ko...
Ojibwe_(Canadian_Aboriginal_Syllabics)...
Sanskrit...
Syriac_(Estrangelo)...
Tagalog...
Tifinagh...
Processing evaluation set...
Angelic...
Atemayar_Qelisayer...
Atlantean...
Aurek-Besh...
Avesta...
Ge_ez...
Glagolitic...
Gurmukhi...
Kannada...
Keble...
Malayalam...
Manipuri...
Mongolian...
Old_Church_Slavonic_(Cyrillic)...
Oriya...
Sylheti...
Syriac_(Serto)...
Tengwar...
Tibetan...
ULOG...


In [23]:
# Re-archive the processed image folders. Each `!` line runs in its own
# subshell, so a plain `cd` is enough (no pushd/popd), and the relative path
# is the same data folder the other cells use.
!cd ../../data/omniglot && zip -q -r images_background.zip ./images_background/
!cd ../../data/omniglot && zip -q -r images_evaluation.zip ./images_evaluation/

~/SageMaker/meta_learning/data/omniglot ~/SageMaker/meta_learning/sagemaker_initiators/prepare_omniglot
~/SageMaker/meta_learning/sagemaker_initiators/prepare_omniglot


In [1]:
import sagemaker
from sagemaker.session import Session

In [24]:
# Upload both processed archives with a single SageMaker session and keep the
# value returned for each of them instead of overwriting the first one.
session = sagemaker.Session()
uploaded = {}
for archive in ('images_background.zip', 'images_evaluation.zip'):
    uploaded[archive] = session.upload_data(
        path='../../data/omniglot/' + archive,
        bucket='test-meta-learning-data',
        key_prefix='data/omniglot',
    )

# `inputs` keeps the value it had before: the result of the evaluation upload
inputs = uploaded['images_evaluation.zip']