In [15]:
import os
import numpy as np
import cv2

In [16]:
def load_data(data_dir, IMG_WIDTH, IMG_HEIGHT, NUM_CATEGORIES):
    """Load and resize the images stored in per-category sub-folders.

    The dataset is read from ``data_dir/<category>/<file>`` where
    ``<category>`` runs over the integers ``0 .. NUM_CATEGORIES - 1``. Only
    files whose names end in ``.ppm`` or ``.jpg`` are considered; files that
    OpenCV cannot decode are reported and skipped.

    Args:
        data_dir: Path of the folder that contains the category sub-folders.
        IMG_WIDTH: Width, in pixels, every image is resized to.
        IMG_HEIGHT: Height, in pixels, every image is resized to.
        NUM_CATEGORIES: Number of category sub-folders to read.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays. ``images`` holds the
        resized images and ``labels`` holds, at the same index, the integer
        category each image was loaded from. Within a category, images are
        ordered by file name.

    Raises:
        OSError: Propagated from ``os.listdir`` when a category folder is
            missing or unreadable.
    """
    images = []
    labels = []

    # loop through category folders
    for category in range(NUM_CATEGORIES):
        category_dir = os.path.join(data_dir, str(category))
        print(f"Loading images from {category_dir}")

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # returned arrays (and any seeded split of them) reproducible.
        for filename in sorted(os.listdir(category_dir)):
            if not filename.endswith((".ppm", ".jpg")):
                continue

            img_path = os.path.join(category_dir, filename)
            img = cv2.imread(img_path)

            # cv2.imread reports an unreadable/corrupt file by returning None
            # instead of raising; skip it rather than crash in cv2.resize.
            if img is None:
                print(f"Skipping unreadable image {img_path}")
                continue

            # cv2.resize takes the target size as (width, height)
            images.append(cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT)))
            labels.append(category)

    return np.array(images), np.array(labels)
    


In [18]:
# Dataset folder, plus the target image size and category count passed to
# load_data below.
data_dir = "gtsrb-small"
IMG_WIDTH, IMG_HEIGHT = 30, 30
NUM_CATEGORIES = 3

# Show the entries of the dataset folder before loading from it.
print(os.listdir(data_dir))

# Read every category folder, then report the shapes of the resulting arrays.
images, labels = load_data(data_dir, IMG_WIDTH, IMG_HEIGHT, NUM_CATEGORIES)
print(f"images.shape={images.shape}, labels.shape={labels.shape}")

['.DS_Store', '0', '1', '2']
Loading images from gtsrb-small/0
Loading images from gtsrb-small/1
Loading images from gtsrb-small/2
images.shape=(840, 30, 30, 3), labels.shape=(840,)


In [21]:
# Collapse each image into a single row vector: one row per image, with all
# remaining axes merged into the second dimension.
reshaped_images = images.reshape(len(images), -1)
print(reshaped_images.shape)

(840, 2700)


In [22]:
# Partition the flattened data into training, cross-validation and test sets.

from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    reshaped_images,
    labels,
    test_size=0.2,
    random_state=42,
)

# Step 2: take 25% of the remaining samples as the validation set. That is
# 0.25 * 0.8 = 20% of the full data, leaving 60% for training.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.25,
    random_state=42,
)

X_train.shape=(672, 2700), y_train.shape=(672,)
