In [1]:
from PIL import Image 
import os  
import pickle 
import numpy as np 

In [2]:
# Raw data directory: one level above the current working directory, under data/raw.
dst_path = os.path.join(os.path.abspath(os.pardir), 'data', 'raw')

In [3]:
dst_path

'D:\\data_analyst\\nova_ims\\dl\\dl_project\\data\\raw'

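The function below loads every image, splits each class 60/20/20 into train, validation and test sets, and stacks each split into tensors. It returns a list of three `(images, labels)` tuples: `[(X_train, y_train), (X_val, y_val), (X_test, y_test)]`.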

- img tensor dimensions: (num_images, 64, 64, 3) 

- labels tensor dimensions: (num_images, 10)

In [4]:
def _load_image(path, size):
    """Read one image file and return it as a float array rescaled by 1/255.

    Parameters
    ----------
    path : str
        Location of the image file.
    size : int
        Expected height and width of the image in pixels.

    Returns
    -------
    numpy.ndarray
        Array of shape (size, size, 3).

    Raises
    ------
    ValueError
        If the decoded image does not have shape (size, size, 3).
    """
    # The context manager releases the file handle as soon as the pixels have
    # been copied, instead of leaving one open handle per image behind.
    with Image.open(path) as img:
        # Force three channels so grayscale/CMYK files still fit the tensor.
        pixels = np.array(img.convert('RGB'))
    expected_shape = (size, size, 3)
    if pixels.shape != expected_shape:
        raise ValueError(
            'Image {} has shape {}, expected {}'.format(path, pixels.shape, expected_shape)
        )
    return 1/255 * pixels


def data_gen(src_path, size):
    """Load a directory of class folders into train/validation/test tensors.

    Every sub-directory of ``src_path`` is one class; its name is the label.
    Inside a class directory the images must be named ``<label>_<i>.jpg`` with
    ``i`` running from 1 to the number of entries in that directory. For each
    class the first 60% of the images (by index ``i``) go to the train split,
    the next 20% to the validation split and the rest to the test split.

    The output arrays are sized from the per-class split counts, so every row
    is filled even when a class size is not a multiple of 5.

    Parameters
    ----------
    src_path : str
        Directory that contains one sub-directory per class.
    size : int
        Height and width (in pixels) of every image.

    Returns
    -------
    list of tuple
        ``[(X_train, y_train), (X_val, y_val), (X_test, y_test)]`` where each
        ``X`` has shape ``(n, size, size, 3)`` with pixel values rescaled by
        1/255, and each ``y`` has shape ``(n, n_classes)`` holding one-hot
        labels in sorted label order. All arrays are float64.

    Raises
    ------
    ValueError
        If an image does not have shape ``(size, size, 3)`` after conversion
        to RGB.
    """
    # Only directories are class labels; stray files next to them are ignored.
    labels = sorted(
        entry for entry in os.listdir(src_path)
        if os.path.isdir(os.path.join(src_path, entry))
    )
    n_classes = len(labels)

    # Per-class image counts and the (inclusive) last index of the train and
    # validation portions of each class.
    class_sizes = [len(os.listdir(os.path.join(src_path, label))) for label in labels]
    train_ends = [int(n_img * 0.6) for n_img in class_sizes]
    val_ends = [int(n_img * 0.8) for n_img in class_sizes]

    # Size each split from what the loop below will actually write.
    split_sizes = (
        sum(train_ends),
        sum(val_end - train_end for train_end, val_end in zip(train_ends, val_ends)),
        sum(n_img - val_end for n_img, val_end in zip(class_sizes, val_ends)),
    )
    images = [np.zeros(shape=[count, size, size, 3]) for count in split_sizes]
    targets = [np.zeros(shape=[count, n_classes]) for count in split_sizes]
    # Next free row in the train, validation and test arrays respectively.
    cursors = [0, 0, 0]

    for class_index, label in enumerate(labels):
        cur_path = os.path.join(src_path, label)
        train_end = train_ends[class_index]
        val_end = val_ends[class_index]

        for i in range(1, class_sizes[class_index] + 1):
            img_arr = _load_image(os.path.join(cur_path, label + '_{}.jpg'.format(i)), size)

            # 0 = train, 1 = validation, 2 = test
            if i <= train_end:
                split = 0
            elif i <= val_end:
                split = 1
            else:
                split = 2

            row = cursors[split]
            images[split][row] = img_arr
            targets[split][row, class_index] = 1  # one-hot label
            cursors[split] = row + 1

    # return list of tensor tuples
    return list(zip(images, targets))

Run the function on the raw data and inspect a few data points as a quality check.

In [5]:
# Build all three splits from the raw images; 64 is the image side length in pixels.
data = data_gen(src_path=dst_path, size=64)

In [6]:
27000 * 0.6 

16200.0

In [7]:
27000 * 0.2

5400.0

In [8]:
data[0][0].shape

(16200, 64, 64, 3)

In [9]:
data[0][1].shape

(16200, 10)

In [10]:
data[0][1][0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
data[0][1][1800]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [12]:
data[0][1][3600]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [13]:
data[0][1][-1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [14]:
data[1][0].shape

(5400, 64, 64, 3)

In [15]:
data[1][1].shape

(5400, 10)

In [16]:
data[1][1][0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [17]:
data[1][1][600]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
data[1][1][1200]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
data[1][1][-1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [20]:
data[2][0].shape

(5400, 64, 64, 3)

In [21]:
data[2][1].shape

(5400, 10)

In [22]:
data[2][1][0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [23]:
data[2][1][600]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [24]:
data[2][1][1200]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [25]:
data[2][1][-1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

Save the training split in pickle format for later use, then load it back to verify the round trip.

In [26]:
# Write the training images (data[0][0]) to X_train.pickle in the working directory.
with open('X_train.pickle', mode='wb') as out_file:
    pickle.dump(data[0][0], out_file)

In [27]:
# Write the one-hot training labels (data[0][1]) to y_train.pickle in the working directory.
with open('y_train.pickle', mode='wb') as out_file:
    pickle.dump(data[0][1], out_file)

In [28]:
# Read the training images back from disk.
# NOTE: pickle.load can execute arbitrary code; only open pickle files you created yourself.
with open('X_train.pickle', mode='rb') as in_file:
    X_train = pickle.load(in_file)

In [29]:
X_train.shape

(16200, 64, 64, 3)

In [30]:
# Read the training labels back from disk.
# NOTE: pickle.load can execute arbitrary code; only open pickle files you created yourself.
with open('y_train.pickle', mode='rb') as in_file:
    y_train = pickle.load(in_file)

In [31]:
y_train.shape

(16200, 10)