In [None]:
import os
import pickle

from PIL import Image
from PIL import ImageOps

from urllib.request import urlretrieve
import zipfile

In [None]:
# Target directory (inside the current working directory) that receives the
# downloaded archives, the extracted image folders and the pickle files.
# Note that the trailing slash is part of the value.
tpath = os.path.join(
    os.getcwd(),
    'omniglot/',
)

In [None]:
# Download and extract omniglot
origin_folder = "https://github.com/brendenlake/omniglot/raw/master/python/"

fnames = ["images_evaluation.zip", "images_background.zip"]

# Create the target directory once, using the same path the archives are
# written to (a cwd-relative 'omniglot/' could diverge from tpath).
os.makedirs(tpath, exist_ok=True)

for fname in fnames:
    # URLs always use '/', so build them with plain string operations instead
    # of os.path.join, which is meant for filesystem paths.
    origin = origin_folder.rstrip('/') + '/' + fname
    fpath = os.path.join(tpath, fname)

    # Skip the download when a valid archive is already on disk so that
    # re-running the notebook does not fetch the data again. A missing file
    # makes is_zipfile() return False, and an incomplete one normally does
    # too, so both cases are (re-)downloaded.
    if not zipfile.is_zipfile(fpath):
        urlretrieve(origin, fpath)

    # The context manager closes the archive handle after extraction.
    with zipfile.ZipFile(fpath) as archive:
        archive.extractall(tpath)

In [None]:
# Open all images and collect them in a nested list
def _visible_entries(path, want_dirs):
    """Return the sorted full paths of the non-hidden entries of ``path``.

    Entries whose name starts with '.' (e.g. ``.DS_Store`` or
    ``.ipynb_checkpoints``) are skipped. With ``want_dirs=True`` only
    directories are kept, otherwise only regular files.
    """
    keep = os.path.isdir if want_dirs else os.path.isfile
    entries = []
    for name in sorted(os.listdir(path)):
        if name.startswith('.'):
            continue
        full_path = os.path.join(path, name)
        if keep(full_path):
            entries.append(full_path)
    return entries


def load_chars(path):
    """Load every image found under ``path/<alphabet>/<character>/``.

    The directory tree is walked in sorted name order at each of its three
    levels. Hidden entries and entries of the wrong kind (files where a
    directory is expected and vice versa) are ignored, so stray files do not
    crash the loader or shift the alphabet indices.

    Each image is converted to 8-bit grayscale ('L'), inverted, and finally
    converted to PIL mode '1'.

    Parameters
    ----------
    path : str
        Root directory containing one sub-directory per alphabet.

    Returns
    -------
    chars : list
        ``chars[a][c][i]`` is the processed PIL image of instance ``i`` of
        character ``c`` in alphabet ``a``.
    char_locs : list
        Same nesting as ``chars``; holds the file path each image was read
        from.
    """
    chars = []
    char_locs = []

    for alph_dir in _visible_entries(path, want_dirs=True):
        alph_chars = []
        alph_char_locs = []

        for char_dir in _visible_entries(alph_dir, want_dirs=True):
            inst_paths = _visible_entries(char_dir, want_dirs=False)
            char_instances = []

            for inst_path in inst_paths:
                # convert() yields a new in-memory image, so the file handle
                # can be released as soon as the pixel data has been read.
                with Image.open(inst_path) as raw_im:
                    gray_im = raw_im.convert('L')
                inverted_im = ImageOps.invert(gray_im)
                char_instances.append(inverted_im.convert('1'))

            alph_chars.append(char_instances)
            alph_char_locs.append(inst_paths)

        chars.append(alph_chars)
        char_locs.append(alph_char_locs)

    return chars, char_locs

In [None]:
# Load both extracted Omniglot folders; each call yields the nested image list
# together with the matching nested list of file paths.
chars_train, char_locs_train = load_chars(os.path.join(tpath, 'images_background/'))
chars_eval, char_locs_eval = load_chars(os.path.join(tpath, 'images_evaluation/'))

In [None]:
# Write dataset to pickle files
os.makedirs(tpath, exist_ok=True)

# Number of leading alphabets of images_evaluation that form the evaluation
# split; all alphabets after them form the test split.
n_eval_alphabets = 10

# (file name, object) pairs, written in this order:
#   train - all alphabets from images_background
#   eval  - the first n_eval_alphabets alphabets from images_evaluation
#   test  - the remaining alphabets from images_evaluation
pickle_targets = [
    ('chars_train.pickle', chars_train),
    ('char_locs_train.pickle', char_locs_train),
    ('chars_eval.pickle', chars_eval[:n_eval_alphabets]),
    ('char_locs_eval.pickle', char_locs_eval[:n_eval_alphabets]),
    ('chars_test.pickle', chars_eval[n_eval_alphabets:]),
    ('char_locs_test.pickle', char_locs_eval[n_eval_alphabets:]),
]

for pickle_name, payload in pickle_targets:
    # os.path.join works whether or not tpath ends with a path separator.
    with open(os.path.join(tpath, pickle_name), 'wb') as fp:
        pickle.dump(payload, fp)