

### [Create training, test and validation datasets](https://cs230-stanford.github.io/train-dev-test-split.html)

Pseudocode

* Loop through each folder `Sample001` - `Sample062`
* Split each class into 90% training set, 10% test set. 
* Save as an array of filenames for `training` and `test` datasets. 

Why?
* **Train set** - fit the model weights
* **Test set** - check the accuracy of the trained model

### [Preprocessing](http://ankivil.com/kaggle-first-steps-with-julia-chars74k-first-place-using-convolutional-neural-networks/)

Pre-processing steps:

* Read the images from the array of image paths and flatten to grayscale
* Resize the image to required size of 32*32
* Save the image to the new folder in /data
* Create the images as a numpy file for loading later

**NOTE**: This could be optimised to allow for more flexible creation of images of different sizes. 

In [2]:
import glob
import random
import pandas as pd
import numpy as np

import os
import math
from scipy.misc import imread, imsave, imresize
# ignore deprecation warnings
import warnings
warnings.filterwarnings('ignore')

# Class folders are named Sample001 ... Sample062; only the two-digit suffix
# is stored here because the "Sample0" prefix is added when building the path.
class_list = ["%02d" % class_id for class_id in range(1, 63)]
training_set = []
test_set = []
images = []  # not used by the split itself; kept in case other cells refer to it


def split_filenames(filenames, train_fraction=0.9, seed=230):
    """Deterministically shuffle *filenames* and split them in two.

    The names are sorted first so the outcome does not depend on the order
    in which they were listed, then shuffled with the global ``random``
    generator seeded with *seed*.

    Args:
        filenames: iterable of file paths belonging to a single class.
        train_fraction: share of the files that goes into the first list.
        seed: value passed to ``random.seed`` before shuffling.

    Returns:
        A ``(train_filenames, test_filenames)`` tuple of lists. The first
        list holds ``int(train_fraction * len(filenames))`` entries and the
        second list holds the remainder.
    """
    ordered = sorted(filenames)  # fixed order before shuffling
    random.seed(seed)
    # Shuffle the filenames themselves so the held-out files are a random
    # sample rather than always the last names in sorted order.
    random.shuffle(ordered)

    cut = int(train_fraction * len(ordered))
    return ordered[:cut], ordered[cut:]


for i in class_list:
    folder_name = "img/GoodImg/Bmp/Sample0" + i + "/*.png"
    filenames = glob.glob(folder_name)

    # Splitting class by class keeps the 90/10 ratio inside every class and
    # leaves both sets grouped by class, in class_list order.
    train_filenames, test_filenames = split_filenames(filenames)

    training_set.extend(train_filenames)
    test_set.extend(test_filenames)

print('There are %d training images.' % len(training_set))
print('There are %d test images.' % len(test_set))


There are 6905 training images.
There are 800 test images.


In [3]:
# Root folder for the generated data, and the height/width (in pixels) that
# every image is resized to.
path = 'data'
img_rows = 32
img_cols = 32

### Create test images

- Create the folder `data/test_preproc_32_32`

In [4]:
# Build the test image array: one grayscale img_rows x img_cols image per
# entry of test_set, in the same order as test_set.
files = test_set
test_data = np.zeros((len(files), img_rows, img_cols))

# Make sure the output folder exists before any image is written into it.
test_out_dir = 'data/test_preproc_32_32'
if not os.path.isdir(test_out_dir):
    os.makedirs(test_out_dir)

for i, filepath in enumerate(files):
    image = imread(filepath, True)  # True: flatten to grayscale

    imageResized = imresize(image, (img_rows, img_cols))

    # Add the resized image to the dataset
    test_data[i] = imageResized

    # Save the resized image under its original file name. basename() is
    # used instead of a fixed index into the split path so the result does
    # not depend on the folder depth or on the separator used in filepath.
    newName = test_out_dir + '/' + os.path.basename(filepath)
    imsave(newName, imageResized)

# Add channel/filter dimensions: (N, rows, cols) -> (N, 1, rows, cols)
test_data = test_data[:, np.newaxis, :, :]

# We rescale the images by dividing every pixel in the image by 255
test_data = test_data.astype('float32')
test_data /= 255

# Save data as a numpy file
np.save("data/test_preproc_32_32.npy", test_data)

### Create the training images
- Create the folder `data/train_preproc_32_32`

In [5]:
# Build the training image array: one grayscale img_rows x img_cols image per
# entry of training_set, in the same order as training_set.
training_files = training_set
training_data = np.zeros((len(training_files), img_rows, img_cols))

# Make sure the output folder exists before any image is written into it.
train_out_dir = 'data/train_preproc_32_32'
if not os.path.isdir(train_out_dir):
    os.makedirs(train_out_dir)

for i, filepath in enumerate(training_files):
    image = imread(filepath, True)  # True: flatten to grayscale

    imageResized = imresize(image, (img_rows, img_cols))

    # Add the resized image to the dataset
    training_data[i] = imageResized

    # Save the resized image under its original file name. basename() is
    # used instead of a fixed index into the split path so the result does
    # not depend on the folder depth or on the separator used in filepath.
    newName = train_out_dir + '/' + os.path.basename(filepath)
    imsave(newName, imageResized)

# Add channel/filter dimensions: (N, rows, cols) -> (N, 1, rows, cols)
training_data = training_data[:, np.newaxis, :, :]

# We rescale the images by dividing every pixel in the image by 255
training_data = training_data.astype('float32')
training_data /= 255

# Save data as a numpy file
np.save("data/training_preproc_32_32.npy", training_data)

### Create training labels

In [6]:
folder_name = "data/train_preproc_32_32/*.png"


def one_hot_labels_from_paths(image_paths, num_classes=62):
    """Derive class ids and one-hot labels from image file names.

    Characters 3 to 5 of each file name (the ``001`` in
    ``img001-00001.png``) are parsed as the 1-based class id.

    Args:
        image_paths: sequence of image paths; only the file name is used.
        num_classes: number of columns of the one-hot matrix.

    Returns:
        A ``(class_ids, one_hot)`` tuple: the list of integer class ids and
        a ``(len(image_paths), num_classes)`` array of zeros with a single 1
        per row, in column ``class_id - 1``.
    """
    # basename() isolates the file name whatever the folder depth or the
    # path separator, unlike a fixed index into the split path.
    class_ids = [int(os.path.basename(p)[3:6]) for p in image_paths]

    one_hot = np.zeros((len(class_ids), num_classes))
    rows = np.arange(len(class_ids))
    # dtype=int keeps the index array valid even when there are no images.
    one_hot[rows, np.asarray(class_ids, dtype=int) - 1] = 1
    return class_ids, one_hot


# Sorted so the label order follows the file names in the folder.
filenames = sorted(glob.glob(folder_name))
y_train, Y_train = one_hot_labels_from_paths(filenames)
y_train_shape = len(y_train)

# Save training labels as a numpy file
if not os.path.isdir("data"):
    os.makedirs("data")
np.save("data/trainingLabels32.npy", Y_train)

### Create test labels

In [7]:
folder_name = "data/test_preproc_32_32/*.png"


def one_hot_labels_from_paths(image_paths, num_classes=62):
    """Derive class ids and one-hot labels from image file names.

    Characters 3 to 5 of each file name (the ``001`` in
    ``img001-00001.png``) are parsed as the 1-based class id.

    Args:
        image_paths: sequence of image paths; only the file name is used.
        num_classes: number of columns of the one-hot matrix.

    Returns:
        A ``(class_ids, one_hot)`` tuple: the list of integer class ids and
        a ``(len(image_paths), num_classes)`` array of zeros with a single 1
        per row, in column ``class_id - 1``.
    """
    # basename() isolates the file name whatever the folder depth or the
    # path separator, unlike a fixed index into the split path.
    class_ids = [int(os.path.basename(p)[3:6]) for p in image_paths]

    one_hot = np.zeros((len(class_ids), num_classes))
    rows = np.arange(len(class_ids))
    # dtype=int keeps the index array valid even when there are no images.
    one_hot[rows, np.asarray(class_ids, dtype=int) - 1] = 1
    return class_ids, one_hot


# Sorted so the label order follows the file names in the folder.
filenames = sorted(glob.glob(folder_name))
y_test, Y_test = one_hot_labels_from_paths(filenames)
y_test_shape = len(y_test)

# Save test labels as a numpy file
if not os.path.isdir("data"):
    os.makedirs("data")
np.save("data/testLabels32.npy", Y_test)