# Make Some Shapes

This notebook generates a set of circle and square images and prepares the data for streaming to TensorFlow.

In [1]:
from PIL import Image, ImageDraw
import random
from os import listdir
from os.path import isfile, join
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import pickle
import os
from tqdm import tqdm


In [2]:
def make_circles(size=28,number_of_shapes=100):
    """Generate flattened grayscale images that each contain one circle outline.

    Every image starts as a white ``size`` x ``size`` canvas in mode 'L'. A
    full 0-360 degree arc is drawn inside the square bounding box running from
    ``(x0, x0)`` to ``(x1, x1)``, where both corners are picked at random, and
    the canvas is then resized to 28 x 28 pixels regardless of ``size``.

    Args:
        size: Side length, in pixels, of the canvas the circle is drawn on.
        number_of_shapes: Number of images to generate.

    Returns:
        A ``(features, labels)`` tuple of equal-length lists. Each feature is
        a flat ``float32`` array of 784 pixel values and each label is the
        string ``'circle'``.
    """
    features = []
    labels = []
    # Valid pixel indices run from 0 to size - 1. Letting the far corner of
    # the bounding box reach `size` would push part of the outline off the
    # canvas, so the upper bound is clamped to the last valid index.
    x1_min = int(2*size/3)
    x1_max = max(size - 1, x1_min)
    for _ in tqdm(range(number_of_shapes),desc='Circle', unit='circles'):
        canvas = Image.new('L', (size, size), 'white')
        img_draw = ImageDraw.Draw(canvas)
        x0 = random.randint(0,int(size/5))
        x1 = random.randint(x1_min,x1_max)
        # NOTE(review): the canvas is single-channel ('L'), so the 'blue' ink
        # is presumably converted to a gray level by Pillow - confirm that
        # this is the intended contrast.
        img_draw.arc([x0,x0,x1,x1], start=0, end=360, fill='blue')
        canvas = canvas.resize((28,28))
        features.append(np.array(canvas, dtype=np.float32).flatten())
        labels.append('circle')
    return features, labels

In [3]:
def make_squares(size=28,number_of_shapes=100):
    """Generate flattened grayscale images that each contain one square outline.

    Every image starts as a white ``size`` x ``size`` canvas in mode 'L'. A
    rectangle outline is drawn from ``(x0, x0)`` to ``(x1, x1)``, where both
    corners are picked at random, and the canvas is then resized to 28 x 28
    pixels regardless of ``size``.

    Args:
        size: Side length, in pixels, of the canvas the square is drawn on.
        number_of_shapes: Number of images to generate.

    Returns:
        A ``(features, labels)`` tuple of equal-length lists. Each feature is
        a flat ``float32`` array of 784 pixel values and each label is the
        string ``'square'``.
    """
    features = []
    labels = []
    # Valid pixel indices run from 0 to size - 1. With the far corner at
    # `size` the right and bottom edges of the square would lie outside the
    # canvas and never be drawn, so the upper bound is clamped to size - 1.
    x1_min = int(size/3)
    x1_max = max(size - 1, x1_min)
    for _ in tqdm(range(number_of_shapes),desc='Square', unit='squares'):
        canvas = Image.new('L', (size, size), 'white')
        img_draw = ImageDraw.Draw(canvas)
        x0 = random.randint(0,int(size/5))
        x1 = random.randint(x1_min,x1_max)
        # NOTE(review): the canvas is single-channel ('L'), so the 'blue' ink
        # is presumably converted to a gray level by Pillow - confirm that
        # this is the intended contrast.
        img_draw.rectangle([x0,x0,x1,x1],  outline='blue')
        canvas = canvas.resize((28,28))
        features.append(np.array(canvas, dtype=np.float32).flatten())
        labels.append('square')
    return features, labels

In [4]:
# Seed Python's `random` module, which make_circles/make_squares draw their
# corner coordinates from, so that a fresh "Restart & Run All" rebuilds exactly
# the same dataset (the train/validation split further down is seeded too).
random.seed(832289)

# NOTE(review): the generators pick corner coordinates from small integer
# ranges, so many generated images are exact duplicates and test images are
# likely to also occur in the training set - keep this in mind when reading
# test accuracy.

# Training data: 50,000 circles followed by 50,000 squares.
circle_features, circle_labels = make_circles(size=28,number_of_shapes=50000)
square_features, square_labels = make_squares(size=28,number_of_shapes=50000)
train_features = np.array(circle_features + square_features)
train_labels = np.array(circle_labels + square_labels)

# Test data: 1,000 circles followed by 1,000 squares.
circle_features, circle_labels = make_circles(size=28,number_of_shapes=1000)
square_features, square_labels = make_squares(size=28,number_of_shapes=1000)
test_features = np.array(circle_features + square_features)
test_labels = np.array(circle_labels + square_labels)

Circle: 100%|██████████| 50000/50000 [00:04<00:00, 11039.10circles/s]
Square: 100%|██████████| 50000/50000 [00:03<00:00, 14006.89squares/s]
Circle: 100%|██████████| 1000/1000 [00:00<00:00, 11048.75circles/s]
Square: 100%|██████████| 1000/1000 [00:00<00:00, 14361.20squares/s]


### Pre-processing Data - "Normalization"

Here we scale the data to have the same "dynamic range"; see [this link](http://cs231n.github.io/neural-networks-2/#datapre) or [this video](https://www.coursera.org/learn/deep-neural-network/lecture/lXv6U/normalizing-inputs).

Note that our features are already similarly scaled, because every pixel in an image can take a similar range of values (given the synthetic creation method above). However, we still scale them (linearly) here to map our data to the same numeric range as our expected activation functions (e.g. ReLU and softmax).

In [5]:
def normalize_pixel_levels(features, lower_val=0.0, upper_val=1.0, pixel_min=0, pixel_max=255):
    """Linearly rescale pixel levels from one numeric range to another.

    A value equal to ``pixel_min`` maps to ``lower_val`` and a value equal to
    ``pixel_max`` maps to ``upper_val``; values in between are interpolated
    linearly. With the defaults, levels in 0..255 are mapped to 0.0..1.0.

    Args:
        features: Array (or array-like, e.g. a list) of pixel levels.
        lower_val: Output value that ``pixel_min`` is mapped to.
        upper_val: Output value that ``pixel_max`` is mapped to.
        pixel_min: Smallest expected input level.
        pixel_max: Largest expected input level.

    Returns:
        A new NumPy array with the same shape as ``features`` holding the
        rescaled values. The input is not modified.

    Raises:
        ValueError: If ``pixel_max`` equals ``pixel_min`` (the input range
            would have zero width, so no linear mapping exists).
    """
    if pixel_max == pixel_min:
        raise ValueError('pixel_max must differ from pixel_min')
    # asarray leaves existing arrays (and their dtype) untouched while also
    # letting plain Python sequences be passed in.
    features = np.asarray(features)
    return lower_val + (features - pixel_min)*(upper_val-lower_val)/(pixel_max-pixel_min)

In [6]:
# Rescale the training and test features with normalize_pixel_levels.
# NOTE: both names are rebound to the rescaled arrays, so running this cell a
# second time in the same kernel session would rescale the data again.
train_features, test_features = (
    normalize_pixel_levels(raw_levels)
    for raw_levels in (train_features, test_features)
)

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [8]:
# https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
# Integer-encode the string labels. The encoder is fitted on the training
# labels only and then reused (transform, not fit_transform) for the test
# labels, so both sets are guaranteed to share one label -> integer mapping.
label_encoder = LabelEncoder()
integer_encoded_train = label_encoder.fit_transform(train_labels)
integer_encoded_test = label_encoder.transform(test_labels)


# One-hot encode the integer labels as a dense array.
# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was later removed,
# so try the new name first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# OneHotEncoder expects a 2-D (n_samples, n_features) input, hence the reshape
# to a single column.
integer_encoded_train = integer_encoded_train.reshape(len(integer_encoded_train), 1)
train_labels = onehot_encoder.fit_transform(integer_encoded_train)

# Reuse the categories learned from the training labels for the test labels.
integer_encoded_test = integer_encoded_test.reshape(len(integer_encoded_test), 1)
test_labels = onehot_encoder.transform(integer_encoded_test)

In [11]:
# Carve a validation set out of the training data: 5% of the examples are held
# out, and the fixed random_state makes the random assignment repeatable.
train_features, valid_features, train_labels, valid_labels = train_test_split(
    train_features, train_labels, test_size=0.05, random_state=832289
)

In [12]:
pickle_file = 'shape_data.pickle'

# Write the training, validation and test splits to a single pickle file.
# Any failure is reported together with the target path and then re-raised so
# the notebook run still stops on the error.
try:
    with open(pickle_file, 'wb') as pf:
        shape_data = {
            'train_dataset': train_features,
            'train_labels': train_labels,
            'valid_dataset': valid_features,
            'valid_labels': valid_labels,
            'test_dataset': test_features,
            'test_labels': test_labels,
        }
        pickle.dump(shape_data, pf, protocol=pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise