In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

In [None]:
# Input folders, one per dataset split. Each is passed as `input_dir` to the
# preprocessing function, which reads the entries of the folder as class names.
TRAIN_IMAGES_PATH = './data/images/train'
TEST_IMAGES_PATH = './data/images/test'
VAL_IMAGES_PATH = './data/images/val'

# Output archives (compressed .npz), one per dataset split.
TRAIN_NPZ_FILE = './data/npz/train_images.npz'
TEST_NPZ_FILE = './data/npz/test_images.npz'
VAL_NPZ_FILE = './data/npz/val_images.npz'

# Number of values in one flattened 224 x 224 image with 3 channels.
# NOTE(review): not referenced by any cell visible here — presumably consumed
# by a later notebook/cell; confirm, or remove if unused.
NUM_FEATURES = 224 * 224 * 3

In [None]:
def preprocess_images_to_numpy_arrays(input_dir, output_file, img_size=224, grayscale=False):
    """Pack a class-per-folder image directory into one compressed ``.npz`` archive.

    Every non-hidden sub-directory of ``input_dir`` is one class; the integer
    label of a class is its position in the alphabetically sorted class list.
    Each JPEG is converted to RGB (or single-channel ``"L"`` when ``grayscale``
    is true), resized to ``img_size`` x ``img_size`` with Lanczos resampling
    (aspect ratio is not preserved), and scaled to ``float32`` in ``[0, 1]``.

    The archive holds three arrays:
      * ``images``  - stacked image arrays, one per file,
      * ``labels``  - integer class index of each image,
      * ``classes`` - class names, indexable by label.

    Args:
        input_dir: Directory containing one sub-directory per class.
        output_file: Destination path of the ``.npz`` archive. Missing parent
            directories are created.
        img_size: Edge length, in pixels, of the square output images.
        grayscale: If true, store single-channel images instead of RGB.

    Raises:
        ValueError: If no ``.jpg``/``.jpeg`` file is found under ``input_dir``.
    """
    # Only real, non-hidden directories are classes. This skips stray files and
    # tool folders such as ``.DS_Store`` or ``.ipynb_checkpoints`` that would
    # otherwise crash the listing below or silently shift the label indices.
    class_names = sorted(
        entry for entry in os.listdir(input_dir)
        if not entry.startswith('.') and os.path.isdir(os.path.join(input_dir, entry))
    )
    class_to_idx = {class_name: idx for idx, class_name in enumerate(class_names)}
    mode = "L" if grayscale else "RGB"

    images, labels = [], []
    for class_name in tqdm(class_names, desc='Processing classes'):
        class_dir = os.path.join(input_dir, class_name)
        # Case-insensitive extension match (``.JPG`` is common), sorted because
        # os.listdir order is arbitrary and array indices should be reproducible.
        image_names = sorted(
            name for name in os.listdir(class_dir)
            if name.lower().endswith(('.jpg', '.jpeg'))
        )
        print(f'Processing {class_dir} directory')

        for img_name in image_names:
            # The context manager releases the file handle of every image;
            # without it thousands of descriptors can stay open.
            with Image.open(os.path.join(class_dir, img_name)) as img:
                resized = img.convert(mode).resize(
                    (img_size, img_size), Image.Resampling.LANCZOS
                )
                images.append(np.asarray(resized, dtype=np.float32) / 255.0)
            labels.append(class_to_idx[class_name])

    if not images:
        # np.stack([]) would raise an opaque ValueError; say what is wrong.
        raise ValueError(f"No .jpg/.jpeg images found in class folders of {input_dir!r}")

    images = np.stack(images)
    labels = np.array(labels)

    # Create the destination folder up front so the save cannot fail after all
    # images have already been processed.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    np.savez_compressed(output_file, images=images, labels=labels, classes=class_names)
    print(f"Saved preprocessed data to {output_file}")

In [None]:
# Convert the training split into its .npz archive (defaults: 224 px, RGB).
preprocess_images_to_numpy_arrays(input_dir=TRAIN_IMAGES_PATH, output_file=TRAIN_NPZ_FILE)

In [None]:
# Convert the test split into its .npz archive (defaults: 224 px, RGB).
preprocess_images_to_numpy_arrays(input_dir=TEST_IMAGES_PATH, output_file=TEST_NPZ_FILE)

In [None]:
# Convert the validation split into its .npz archive (defaults: 224 px, RGB).
preprocess_images_to_numpy_arrays(input_dir=VAL_IMAGES_PATH, output_file=VAL_NPZ_FILE)

In [None]:
def load_and_visualize_npz(file_path, dataset, idx=0):
    """Display one image from a preprocessed ``.npz`` archive with its class name.

    The archive must contain the arrays ``images``, ``labels`` and ``classes``.
    The image at position ``idx`` is shown with matplotlib, titled with the
    class name looked up through its label.

    Args:
        file_path: Path of the ``.npz`` archive to read.
        dataset: Name of the split (e.g. ``'train'``); only used in the title.
        idx: Position of the image to show.

    Raises:
        IndexError: If ``idx`` is outside the range of stored images.
    """
    # NOTE(review): allow_pickle=True lets a crafted archive execute arbitrary
    # code while loading. Archives holding only numeric/string arrays load
    # fine with allow_pickle=False - switch once no object arrays are stored.
    # The context manager closes the underlying zip file handle.
    with np.load(file_path, allow_pickle=True) as data:
        image = data['images'][idx]
        label = data['labels'][idx]
        class_name = data['classes'][label]

    imshow_kwargs = {}
    if image.ndim == 2:
        # Single-channel image: without an explicit colormap and value range
        # matplotlib renders false colours with auto-stretched contrast.
        # Assumes pixel values are scaled to [0, 1].
        imshow_kwargs = {'cmap': 'gray', 'vmin': 0.0, 'vmax': 1.0}

    plt.imshow(image, **imshow_kwargs)
    plt.title(f"Class: {class_name} in {dataset}")
    plt.axis("off")
    plt.show()

In [None]:
# Sanity check: show image 10 of the training archive with its class name.
load_and_visualize_npz(file_path=TRAIN_NPZ_FILE, dataset='train', idx=10)

In [None]:
# Sanity check: show image 5 of the test archive with its class name.
load_and_visualize_npz(file_path=TEST_NPZ_FILE, dataset='test', idx=5)

In [None]:
# Sanity check: show image 5 of the validation archive with its class name.
load_and_visualize_npz(file_path=VAL_NPZ_FILE, dataset='val', idx=5)