# MNIST synthetic Preprocessing 
### Here, we split the MNIST synthetic dataset into a training and validation set
We will extract individual digits from the MNIST synthetic training set, together with their corresponding labels, and add them to the regular MNIST set that will be used later to train a CNN.

In [1]:
import h5py
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from keras.datasets import mnist
import torch
import numpy as np
import argparse
import torch.nn as nn
import torch.utils.data as data_utils
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

Using TensorFlow backend.


In [2]:
# Load the MNIST synthetic dataset.
# The file is opened read-only and the handle `f` is intentionally left open:
# the names below are looked up from `f` and are only converted to NumPy arrays
# in a later cell (np.array(train_dataset), np.array(train_labels)).
# NOTE(review): closing `f` before that conversion would presumably invalidate
# these handles -- confirm before wrapping this in a `with` block.
f = h5py.File('data/MNIST_synthetic.h5', 'r')
train_dataset = f['train_dataset']  # images of digit sequences used for training
train_labels = f['train_labels']  # per-image labels; the value 10 is stripped later (presumably "no digit" padding -- TODO confirm)
test_dataset= f['test_dataset']  # not used in the cells shown here

#### Useful functions to identify single digits in an image of a digit sequence

In [3]:
def extract_num(image, min_pixel=4, max_pixel=60):
    """
    Split an image containing a sequence of handwritten digits into one image per digit.

    The image is binarised, contours are located with OpenCV, and the bounding box of every
    contour that passes the size filter is cropped, resized to 24x24 and zero-padded to 28x28.
    Digits are returned in left-to-right order (sorted by the x coordinate of their bounding box).

    image: image containing the digit sequence. Assumed to be a single-channel uint8 array of
           shape (H, W) or (H, W, 1) -- TODO confirm against the dataset; the final padding
           step only works if cv2.resize yields a 2-D array.
    min_pixel: a bounding box whose width or height is <= min_pixel is discarded
               (too small to be a digit).
    max_pixel: a bounding box whose width or height is >= max_pixel is discarded
               (e.g. the box around the whole image). Defaults to 60, the limit that was
               previously hard-coded.

    Returns a NumPy array of shape (n_digits, 28, 28); when no box passes the filter the
    result is an empty array of shape (0,), so callers can test ``result.shape[0] == 0``.
    """
    # Inverse binary threshold: pixels with value <= 1 become 255, every brighter pixel becomes 0
    _, thresh = cv2.threshold(image, 1, 255, cv2.THRESH_BINARY_INV)

    # cv2.findContours returns (image, contours, hierarchy) in OpenCV 3.x but
    # (contours, hierarchy) in OpenCV 2.x/4.x; the contours are always the second-to-last item.
    contours = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]

    # Bounding boxes (x, y, w, h) sorted by x so the digits are read from left to right
    boxes = sorted((cv2.boundingRect(contour) for contour in contours), key=lambda box: box[0])

    preprocessed_digits = []  # individual digit images, in left-to-right order
    for x, y, w, h in boxes:
        if h <= min_pixel or w <= min_pixel or h >= max_pixel or w >= max_pixel:
            # Eliminate boxes that are too small (likely not a digit) or too big
            # (box around the whole image)
            continue

        # Crop the digit out of the untouched input image
        digit = image[y:y + h, x:x + w]

        # Resize the digit to (24, 24)
        resized_digit = cv2.resize(digit, (24, 24))

        # Pad with 2 pixels of zeros on each side to produce a (28, 28) image
        padded_digit = np.pad(resized_digit, ((2, 2), (2, 2)), "constant", constant_values=0)

        preprocessed_digits.append(padded_digit)

    return np.array(preprocessed_digits)

In [4]:
def extract_single_labels(labels):
    """
    Remove every occurrence of the label 10 from an array of labels.

    labels: array-like of integer labels (a NumPy array or a plain list/tuple).

    Returns a new 1-D NumPy array holding the remaining labels in their original
    (row-major) order, with the dtype of the input preserved. The input is not modified.
    """
    # Flatten first so the result is always 1-D, then keep everything that is not 10.
    # A boolean mask (rather than np.delete with np.where indices) stays correct for
    # N-D input and also works when a plain Python list is passed in.
    flat_labels = np.asarray(labels).ravel()
    return flat_labels[flat_labels != 10]
    

## Construct training set from modified MNIST set


In [5]:
# Materialise the training images and labels as in-memory NumPy arrays
train_modified_mnist = np.array(train_dataset)
train_labels_modified_mnist = np.array(train_labels)

# Size of the validation split: the first 20% of the training set
n_test = len(train_modified_mnist) // 5

# Validation split = first n_test samples
valid_modified_mnist = train_modified_mnist[:n_test]
valid_labels_modified_mnist = train_labels_modified_mnist[:n_test]
#np.save('valid_modified_mnist.npy',valid_modified_mnist)
#np.save('valid_labels_modified_mnist.npy',valid_labels_modified_mnist)

# Remaining training samples: from n_test up to index 42000 (exclusive).
# NOTE(review): 42000 is a hard-coded upper bound; samples past it are not used here -- confirm this is intended.
rest_train_modified_mnist = train_modified_mnist[n_test:42000]
rest_train_labels_modified_mnist = train_labels_modified_mnist[n_test:42000]


In [8]:

def _extract_digits_adaptive(image, start_min_pixel=4, max_digits=5, max_min_pixel=59):
    """
    Run extract_num on one image, adjusting min_pixel when the digit count is implausible.

    image: a single digit-sequence image, passed straight through to extract_num.
    start_min_pixel: initial value of min_pixel.
    max_digits: largest number of digits an image is expected to contain.
    max_min_pixel: upper bound for min_pixel. extract_num rejects boxes of 60 pixels or more
                   by default, so at 59 every box is filtered out and the search has to stop.

    Returns whatever extract_num returned for the last min_pixel tried; this can still be an
    empty array (shape[0] == 0) when no setting produced a usable result.
    """
    min_pixel = start_min_pixel
    digits = extract_num(image, min_pixel=min_pixel)

    # No digit found: relax the size filter one pixel at a time (below 0 it has no further effect)
    while digits.shape[0] == 0 and min_pixel > 0:
        min_pixel -= 1
        digits = extract_num(image, min_pixel=min_pixel)

    # Too many boxes: tighten the size filter one pixel at a time
    while digits.shape[0] > max_digits and min_pixel < max_min_pixel:
        min_pixel += 1
        digits = extract_num(image, min_pixel=min_pixel)

    return digits


# Collect per-image results in lists and concatenate once at the end
# (concatenating inside the loop copies the whole array on every iteration).
digit_image_chunks = []
digit_label_chunks = []
n_skipped = 0

#loop over all the digit sequence images in the training set
for i in range(len(rest_train_modified_mnist)):
    inp = _extract_digits_adaptive(rest_train_modified_mnist[i, :, :, :])  # extract single digit images
    ind_labels = extract_single_labels(rest_train_labels_modified_mnist[i])  # extract digit labels

    if inp.shape[0] == 0 or inp.shape[0] != len(ind_labels):
        # The number of extracted digits does not match the number of labels: keeping this
        # sample would shift every following image/label pair, so drop it instead.
        n_skipped += 1
    else:
        digit_image_chunks.append(inp)
        digit_label_chunks.append(ind_labels)

    if i % 2000 == 1999:
        print('Number of pictures processed so far: ', i)

if digit_image_chunks:
    # float64 / int match the dtypes the np.empty-seeded accumulators used to produce
    combined_rest_train_imgs = np.concatenate(digit_image_chunks, axis=0).astype(np.float64)
    combined_rest_train_labels = np.concatenate(digit_label_chunks, axis=0).astype(int)
else:
    combined_rest_train_imgs = np.empty((0, 28, 28))
    combined_rest_train_labels = np.empty((0,), dtype=int)

# Images and labels must stay aligned one-to-one
assert len(combined_rest_train_imgs) == len(combined_rest_train_labels)
print('Number of pictures skipped (digit/label count mismatch): ', n_skipped)


#np.save('combined_rest_train_imgs.npy', combined_rest_train_imgs)
#np.save('combined_rest_train_labels.npy', combined_rest_train_labels)

Number of pictures processed so far:  1999
Number of pictures processed so far:  3999
Number of pictures processed so far:  5999
Number of pictures processed so far:  7999


KeyboardInterrupt: 