In [1]:
import scipy.misc
import numpy as np

def noise(noise_level, shape):
    """Draw a random 0/1 mask in which each cell is 1 with probability ``noise_level``.

    Args:
        noise_level: Probability of drawing a 1 for any single cell. It is
            handed to ``np.random.choice`` as a probability, so it has to
            lie in [0, 1].
        shape: Shape of the mask, forwarded as the ``size`` of the draw.

    Returns:
        An array of the requested shape containing only 0s and 1s.
    """
    zero_probability = 1 - noise_level
    return np.random.choice(
        [0, 1],
        size=shape,
        p=[zero_probability, noise_level],
    )

def noise_masks(noise_levels, shape):
    """Create one random 0/1 mask per entry of ``noise_levels``.

    Args:
        noise_levels: Iterable of noise levels; each one is passed to
            ``noise`` in order.
        shape: Shape shared by every generated mask.

    Returns:
        A list of masks, in the same order as ``noise_levels``.
    """
    masks = []
    for level in noise_levels:
        masks.append(noise(level, shape))
    return masks

def noisy_images(image_filename, min_noise, max_noise, count):
    """Load an image and return ``count`` progressively noisier copies of it.

    The image is read with ``flatten=True`` and divided by 255. Noise levels
    are spaced evenly from ``min_noise`` to ``max_noise`` (both included), and
    each copy is the element-wise logical XOR of the image with a random mask
    drawn at that level.

    Args:
        image_filename: Path of the image file to read.
        min_noise: Noise level used for the first copy.
        max_noise: Noise level used for the last copy.
        count: Number of copies to produce.

    Returns:
        A list of ``count`` boolean arrays with the same shape as the image.
    """
    pixels = scipy.misc.imread(image_filename, flatten=True) / 255
    levels = np.linspace(min_noise, max_noise, count)
    masks = noise_masks(levels, np.shape(pixels))

    corrupted = []
    for mask in masks:
        # logical_xor treats every non-zero pixel as True, so a mask value of
        # 1 inverts the truth value of the pixel underneath it.
        corrupted.append(np.logical_xor(pixels, mask))
    return corrupted

def random_images(min_noise, max_noise, count, shape):
    """Return ``count`` purely random 0/1 images of the given shape.

    The probability of a 1 rises in even steps from ``min_noise`` for the
    first image to ``max_noise`` for the last one.

    Args:
        min_noise: Noise level of the first image.
        max_noise: Noise level of the last image.
        count: Number of images to generate.
        shape: Shape of every generated image.

    Returns:
        A list of ``count`` arrays as produced by ``noise_masks``.
    """
    return noise_masks(np.linspace(min_noise, max_noise, count), shape)

def flatten(matrix_array):
    """Flatten every matrix in ``matrix_array``.

    Args:
        matrix_array: Iterable of objects exposing a ``flatten()`` method
            (for example NumPy arrays).

    Returns:
        A list with the result of ``flatten()`` for each input, in order.
    """
    flattened = []
    for matrix in matrix_array:
        flattened.append(matrix.flatten())
    return flattened

def generate_dataset(max_noise, count=100, patterns_dir='patterns'):
    """Build a labelled dataset of noisy, flattened images from a pattern directory.

    Every entry of ``patterns_dir`` (in sorted name order, with no filtering)
    is loaded through ``noisy_images`` with noise levels running from 0 up to
    ``max_noise``. The label of an image is the index of its source file in
    that sorted listing.

    Args:
        max_noise: Highest noise level applied to each pattern.
        count: Number of noisy variants generated per pattern file.
        patterns_dir: Directory containing the pattern image files.

    Returns:
        A ``(samples, labels)`` tuple: ``samples`` is a list of flattened
        images and ``labels`` is a list of integer class indices of the same
        length.
    """
    import os

    files = sorted(os.listdir(patterns_dir))

    images = []
    labels = []
    for label, filename in enumerate(files):
        variants = noisy_images(os.path.join(patterns_dir, filename), 0, max_noise, count)
        images += variants
        # Size the labels from what was actually produced so samples and
        # labels can never drift out of step.
        labels += [label] * len(variants)

    return flatten(images), labels

def save_dataset(x, y, name):
    """Split a dataset into train and test parts and dump them to ``name``.

    The split is made with ``train_test_split`` using ``test_size=0.2`` and
    no fixed random state. The four parts are written with ``joblib.dump`` as
    a single list in the order ``[x_train, x_test, y_train, y_test]``.

    Args:
        x: Samples.
        y: Labels matching ``x``.
        name: Path of the output file.
    """
    from sklearn.model_selection import train_test_split

    try:
        # scikit-learn removed its vendored copy (sklearn.externals.joblib),
        # so the standalone package is the primary source.
        import joblib
    except ImportError:
        # Fallback for old environments that only have the vendored copy.
        from sklearn.externals import joblib

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    joblib.dump([x_train, x_test, y_train, y_test], name)

# Generate and save one dataset per maximum noise level; the level is part of
# the output file name.
for max_noise in (0.2, 0.4, 0.6, 0.8):
    x, y = generate_dataset(max_noise)
    save_dataset(x, y, 'dataset_{}.pkl'.format(max_noise))
