In [2]:
from PIL import Image 
import os  
import pickle 
import numpy as np 

In [3]:
# Raw dataset location: <parent of the current working directory>/data/raw.
dst_path = os.path.join(
    os.path.normpath(os.path.join(os.getcwd(), os.pardir)),
    'data',
    'raw',
)

In [4]:
dst_path

'/media/migbernardo/Data/nova_ims/dl/dl_project/data/raw'

Function that loads all images together with their respective labels into a tuple of tensors (img, labels)

- img tensor dimensions: (num_images, 64, 64, 3) 

- labels tensor dimensions: (num_images, 10)

In [5]:
def data_gen(src_path, size):
    """Load a directory-per-class JPEG dataset into dense NumPy arrays.

    ``src_path`` is expected to contain one sub-directory per class. Inside
    the sub-directory ``<label>`` the images must be named
    ``<label>_1.jpg`` ... ``<label>_N.jpg``. Classes are processed in sorted
    name order and images in increasing numeric order, so the row order of
    the returned arrays is deterministic.

    Parameters
    ----------
    src_path : str
        Root directory holding one sub-directory per class.
    size : int
        Side length, in pixels, of every image in the output. Images whose
        dimensions differ from ``(size, size)`` are resized to fit.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` has shape ``(num_images, size, size, 3)`` and
        holds the RGB pixel values, and ``y`` has shape
        ``(num_images, num_labels)`` and holds one-hot encoded labels. Both
        arrays use NumPy's default float dtype.

    Raises
    ------
    FileNotFoundError
        If ``src_path`` does not exist or an expected ``<label>_<i>.jpg``
        file is missing.
    """
    # Only visible directories are class labels. Stray files or hidden
    # folders (e.g. ``.ipynb_checkpoints``) must not become label columns.
    labels = sorted(
        entry for entry in os.listdir(src_path)
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(src_path, entry))
    )

    # Count the JPEG files per class so the output arrays can be allocated
    # once; non-image entries are ignored.
    counts = [
        sum(1 for name in os.listdir(os.path.join(src_path, label))
            if name.endswith('.jpg'))
        for label in labels
    ]
    num_img = sum(counts)

    # Pre-allocate the outputs; y starts as all zeros and receives a single 1
    # per row below.
    X = np.zeros(shape=[num_img, size, size, 3])
    y = np.zeros(shape=[num_img, len(labels)])

    row = 0
    for index, (label, n_img) in enumerate(zip(labels, counts)):
        cur_path = os.path.join(src_path, label)

        # File names are 1-based: <label>_1.jpg ... <label>_<n_img>.jpg
        for i in range(1, n_img + 1):
            img_file = os.path.join(cur_path, '{}_{}.jpg'.format(label, i))
            # The context manager closes the underlying file handle, which
            # matters when tens of thousands of images are read.
            with Image.open(img_file) as img:
                rgb = img.convert('RGB')  # guarantee exactly 3 channels
                if rgb.size != (size, size):
                    rgb = rgb.resize((size, size))
                # Assign the whole (size, size, 3) slot at once.
                X[row] = np.asarray(rgb)
            y[row, index] = 1
            row += 1

    # return tensor tuple
    return (X, y)

Test the function and inspect some data points

In [6]:
# Build the (images, one-hot labels) tuple from the raw data directory at 64x64 resolution.
data = data_gen(src_path=dst_path, size=64)

In [7]:
data[0].shape

(27000, 64, 64, 3)

In [8]:
data[1].shape

(27000, 10)

In [9]:
data[1][0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [10]:
data[0][0]

array([[[149., 121., 120.],
        [147., 121., 120.],
        [146., 120., 119.],
        ...,
        [102.,  90., 100.],
        [104.,  92., 102.],
        [103.,  91., 101.]],

       [[149., 121., 120.],
        [149., 121., 120.],
        [149., 119., 119.],
        ...,
        [102.,  90., 100.],
        [104.,  92., 102.],
        [103.,  91., 101.]],

       [[143., 117., 116.],
        [144., 115., 117.],
        [147., 116., 121.],
        ...,
        [102.,  92., 101.],
        [102.,  90., 100.],
        [103.,  91., 101.]],

       ...,

       [[141., 117., 115.],
        [141., 117., 117.],
        [137., 117., 118.],
        ...,
        [103.,  93., 102.],
        [100.,  90., 101.],
        [101.,  90., 104.]],

       [[130., 111., 113.],
        [132., 112., 114.],
        [129., 110., 116.],
        ...,
        [105.,  94., 102.],
        [105.,  93., 103.],
        [107.,  95., 107.]],

       [[122., 106., 109.],
        [122., 105., 111.],
        [120., 1

In [11]:
data[1][3000]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [12]:
data[0][3000]

array([[[40., 63., 81.],
        [41., 62., 79.],
        [42., 63., 80.],
        ...,
        [34., 57., 75.],
        [37., 58., 77.],
        [39., 57., 77.]],

       [[40., 63., 79.],
        [39., 62., 78.],
        [40., 63., 79.],
        ...,
        [35., 56., 75.],
        [37., 58., 77.],
        [37., 58., 77.]],

       [[41., 64., 78.],
        [40., 63., 79.],
        [39., 62., 78.],
        ...,
        [38., 56., 78.],
        [37., 58., 77.],
        [37., 58., 77.]],

       ...,

       [[38., 60., 74.],
        [38., 59., 76.],
        [37., 58., 75.],
        ...,
        [34., 60., 75.],
        [37., 60., 78.],
        [37., 58., 77.]],

       [[35., 56., 73.],
        [35., 56., 73.],
        [40., 63., 79.],
        ...,
        [37., 60., 76.],
        [40., 63., 79.],
        [37., 58., 77.]],

       [[33., 56., 74.],
        [33., 56., 72.],
        [41., 64., 78.],
        ...,
        [35., 58., 74.],
        [36., 59., 75.],
        [35., 56., 75.]]

In [13]:
data[1][6000]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [14]:
data[0][6000]

array([[[146., 115., 121.],
        [143., 116., 123.],
        [139., 114., 120.],
        ...,
        [ 87.,  75.,  87.],
        [ 88.,  77.,  91.],
        [ 83.,  76.,  94.]],

       [[149., 115., 116.],
        [146., 115., 120.],
        [142., 113., 118.],
        ...,
        [ 86.,  75.,  89.],
        [ 88.,  77.,  91.],
        [ 86.,  75.,  92.]],

       [[169., 129., 129.],
        [165., 129., 131.],
        [161., 126., 130.],
        ...,
        [ 79.,  71.,  86.],
        [ 86.,  75.,  91.],
        [ 88.,  75.,  92.]],

       ...,

       [[187., 144., 137.],
        [187., 144., 138.],
        [185., 142., 136.],
        ...,
        [ 92.,  84.,  97.],
        [ 86.,  78.,  93.],
        [ 80.,  69.,  86.]],

       [[189., 146., 140.],
        [188., 144., 141.],
        [186., 142., 139.],
        ...,
        [ 88.,  77.,  91.],
        [ 95.,  83.,  97.],
        [ 89.,  75.,  90.]],

       [[185., 141., 138.],
        [185., 141., 140.],
        [185., 1

In [15]:
data[1][-1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [16]:
data[0][-1]

array([[[33., 53., 77.],
        [33., 53., 77.],
        [31., 53., 74.],
        ...,
        [33., 56., 74.],
        [32., 55., 73.],
        [33., 56., 74.]],

       [[33., 54., 75.],
        [33., 54., 75.],
        [31., 53., 74.],
        ...,
        [33., 56., 74.],
        [32., 54., 75.],
        [32., 54., 75.]],

       [[31., 53., 74.],
        [31., 53., 74.],
        [32., 54., 75.],
        ...,
        [32., 55., 73.],
        [33., 55., 76.],
        [32., 53., 74.]],

       ...,

       [[31., 54., 72.],
        [31., 53., 74.],
        [31., 53., 74.],
        ...,
        [32., 55., 73.],
        [32., 55., 73.],
        [32., 55., 73.]],

       [[29., 55., 72.],
        [29., 54., 74.],
        [31., 53., 74.],
        ...,
        [31., 54., 72.],
        [32., 55., 73.],
        [32., 54., 75.]],

       [[28., 55., 72.],
        [28., 55., 72.],
        [29., 54., 74.],
        ...,
        [32., 55., 73.],
        [32., 55., 73.],
        [32., 54., 75.]]

Save and load data in pickle format for later use

In [17]:
# Serialize the image array (first element of the data tuple) to X.pickle
# in the current working directory.
with open('X.pickle', 'wb') as f: 
    pickle.dump(data[0], f)

In [18]:
# Serialize the one-hot label array (second element of the data tuple) to
# y.pickle in the current working directory.
with open('y.pickle', 'wb') as f: 
    pickle.dump(data[1], f)

In [19]:
# Read the image array back from X.pickle to check the round trip.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('X.pickle', 'rb') as f: 
    X = pickle.load(f)

In [20]:
X.shape

(27000, 64, 64, 3)

In [23]:
# Read the label array back from y.pickle to check the round trip.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('y.pickle', 'rb') as f: 
    y = pickle.load(f)

In [24]:
y.shape

(27000, 10)