In [1]:
import torch
from torchvision import datasets, transforms
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import shutil
import os
from tqdm import tqdm
from PIL import Image, ImageDraw
import random

img_size = 64  # image size selector: 64 or 128 (picks the `x{img_size}` data folders)
data_folder = 'data'  # root folder where all derived datasets are saved
data_path = f'{data_folder}/x{img_size}'  # original (unsplit) data that will be divided

## Divide data into train, val, and test splits

In [2]:
# One sub-folder per class. os.listdir returns entries in arbitrary order, so the
# list is sorted to keep the processing order stable between runs and machines.
folder_list = sorted(f for f in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, f)))

train_size = 0.6
val_size = 0.2
test_size = 0.2
# The two-stage split below only makes sense when the three fractions cover everything.
assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "split fractions must sum to 1"

labeled_files = {}          # class folder -> all file names found in that folder
train_val_test_files = {}   # class folder -> {'train' | 'val' | 'test' -> file names}
for folder in folder_list:
    folder_path = os.path.join(data_path, folder)
    # Sorted so the seeded split (random_state=42) picks the same files every time;
    # with an arbitrary listing order the seed alone does not make the split reproducible.
    labeled_files[folder] = sorted(os.listdir(folder_path))

    train_val_test_files[folder] = {}

    # Stage 1: carve off the training part; stage 2: divide the remainder into val and test.
    train_files, val_test_files = train_test_split(labeled_files[folder], test_size=1-train_size, random_state=42, shuffle=True)
    val_files, test_files = train_test_split(val_test_files, test_size=test_size/(test_size+val_size), random_state=42, shuffle=True)

    train_val_test_files[folder]['train'] = train_files
    train_val_test_files[folder]['val'] = val_files
    train_val_test_files[folder]['test'] = test_files

    print(folder+":", len(labeled_files[folder]), 'train:', len(train_files), 'val:', len(val_files), 'test:', len(test_files))



City: 25318 train: 15190 val: 5064 test: 5064
Fire: 25423 train: 15253 val: 5085 test: 5085
Lake: 25236 train: 15141 val: 5047 test: 5048
Mountain: 24992 train: 14995 val: 4998 test: 4999


In [4]:
def copy_files(source_directory: str, destination_directory: str, files:list[str]):
    """
    Copy specified files from a source directory to a destination directory.

    Each name in `files` is copied with `shutil.copy2` (which also preserves file
    metadata) unless a file with that name already exists in the destination.
    Names that do not refer to a regular file in the source directory (missing
    entries, sub-directories) are skipped silently.

    The destination directory is created if it does not exist yet.

    Parameters
    ----------
    source_directory : str
        The path to the source directory from which files will be copied.
    destination_directory : str
        The path to the destination directory to which files will be copied.
    files : list of str
        File names (relative to `source_directory`) to copy.

    Returns
    -------
    None
    """
    os.makedirs(destination_directory, exist_ok=True)

    # A set gives O(1) membership checks; scanning a list for every file is
    # quadratic when both sides hold thousands of entries.
    already_present = set(os.listdir(destination_directory))

    for file in tqdm(files, desc="Copying files"):
        if file in already_present:
            continue

        src_file = os.path.join(source_directory, file)
        if os.path.isfile(src_file):
            dst_file = os.path.join(destination_directory, file)
            shutil.copy2(src_file, dst_file)

In [5]:
# Root of the split dataset; it does not depend on the split or the class,
# so it is computed once instead of on every iteration.
aug_path = f'{data_folder}/divided_x{img_size}'

for split in ['train', 'val', 'test']:
    for folder in folder_list:
        # Files come from <data_path>/<class> and land in <aug_path>/<split>/<class>.
        source_directory = os.path.join(data_path, folder)
        destination_directory = os.path.join(aug_path, split, folder)

        os.makedirs(destination_directory, exist_ok=True)
        print(f'Copying {split} files from {source_directory} to {destination_directory}')
        copy_files(source_directory, destination_directory, train_val_test_files[folder][split])
        

Copying train files from data/x64/City to data/divided_x64/train/City


Copying files: 100%|██████████| 15190/15190 [02:23<00:00, 106.06it/s]


Copying train files from data/x64/Fire to data/divided_x64/train/Fire


Copying files: 100%|██████████| 15253/15253 [02:20<00:00, 108.44it/s]


Copying train files from data/x64/Lake to data/divided_x64/train/Lake


Copying files: 100%|██████████| 15141/15141 [02:16<00:00, 111.06it/s]


Copying train files from data/x64/Mountain to data/divided_x64/train/Mountain


Copying files: 100%|██████████| 14995/14995 [02:14<00:00, 111.76it/s]


Copying val files from data/x64/City to data/divided_x64/val/City


Copying files: 100%|██████████| 5064/5064 [00:45<00:00, 112.03it/s]


Copying val files from data/x64/Fire to data/divided_x64/val/Fire


Copying files: 100%|██████████| 5085/5085 [00:45<00:00, 112.25it/s]


Copying val files from data/x64/Lake to data/divided_x64/val/Lake


Copying files: 100%|██████████| 5047/5047 [00:45<00:00, 110.03it/s]


Copying val files from data/x64/Mountain to data/divided_x64/val/Mountain


Copying files: 100%|██████████| 4998/4998 [00:45<00:00, 110.29it/s]


Copying test files from data/x64/City to data/divided_x64/test/City


Copying files: 100%|██████████| 5064/5064 [00:45<00:00, 111.40it/s]


Copying test files from data/x64/Fire to data/divided_x64/test/Fire


Copying files: 100%|██████████| 5085/5085 [00:45<00:00, 112.91it/s]


Copying test files from data/x64/Lake to data/divided_x64/test/Lake


Copying files: 100%|██████████| 5048/5048 [00:44<00:00, 112.19it/s]


Copying test files from data/x64/Mountain to data/divided_x64/test/Mountain


Copying files: 100%|██████████| 4999/4999 [00:46<00:00, 107.76it/s]


## Augmented images

### Preparing data

In [6]:
class AddRandomBlackRectangle:
    """
    Add a randomly sized, positioned, and rotated black rectangle to an image.

    This transformation simulates occlusions for data augmentation. The width and
    height of the rectangle are drawn independently from ``[min_size, max_size]``,
    its position is drawn so that the unrotated rectangle lies inside the image, and
    the rectangle mask is then rotated by a random angle between 0 and 90 degrees.

    The input image is not modified; a modified copy is returned.

    Parameters
    ----------
    min_size : int
        The minimum size of the rectangle's sides in pixels (lower bound of the random draw).
    max_size : int
        The maximum size of the rectangle's sides in pixels (upper bound of the random draw).

    Methods
    -------
    __call__(self, img):
        Returns a copy of the given PIL image with the black rectangle applied.
    """

    def __init__(self, min_size, max_size):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img):
        """
        Apply the occlusion to `img` and return the result as a new image.

        Parameters
        ----------
        img : PIL.Image.Image
            Source image; it is left untouched.

        Returns
        -------
        PIL.Image.Image
            A copy of `img` with the rotated black rectangle pasted onto it.
        """
        width, height = img.size

        # Clamp the drawn size to the image so the position draw below never
        # receives an empty range (randint raises ValueError in that case).
        rect_width = min(random.randint(self.min_size, self.max_size), width)
        rect_height = min(random.randint(self.min_size, self.max_size), height)
        x = random.randint(0, width - rect_width)
        y = random.randint(0, height - rect_height)

        # The mask marks the pixels to black out: 255 inside the rectangle, 0 elsewhere.
        mask_img = Image.new('L', img.size, 0)
        if rect_width > 0 and rect_height > 0:
            # Pillow treats both corners of the box as inclusive, hence the "- 1":
            # without it the rectangle would be one pixel too wide and too tall.
            ImageDraw.Draw(mask_img).rectangle(
                [x, y, x + rect_width - 1, y + rect_height - 1], fill=255
            )

        # Only the mask needs rotating. Image.rotate turns it about the image centre
        # (Pillow's default), so the final location may differ from (x, y).
        angle = random.randint(0, 90)
        mask_img = mask_img.rotate(angle)

        # Paste a solid black layer through the mask onto a copy, so the caller's
        # image object is never mutated.
        occluded_img = img.copy()
        black_img = Image.new('RGB', img.size, (0, 0, 0))
        occluded_img.paste(black_img, (0, 0), mask_img)

        return occluded_img

In [13]:
# Preprocessing pipeline applied to every loaded image:
#   1. occlude a random region with a black rectangle (1/5 to 1/4 of img_size per side),
#   2. convert the PIL image to a tensor,
#   3. normalize with mean 0.5 and std 0.5, i.e. map [0, 1] to [-1, 1].
transform = transforms.Compose([
    AddRandomBlackRectangle(min_size=img_size // 5, max_size=img_size // 4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# Locations of the three splits produced by the division step.
dataset_path_train, dataset_path_val, dataset_path_test = (
    f'{data_folder}/divided_x{img_size}/{split_name}'
    for split_name in ('train', 'val', 'test')
)

In [14]:
# One ImageFolder per split, all sharing the same augmentation pipeline.
train_dataset, val_dataset, test_dataset = (
    datasets.ImageFolder(split_root, transform=transform)
    for split_root in (dataset_path_train, dataset_path_val, dataset_path_test)
)

print('train', len(train_dataset), 'val:', len(val_dataset), 'test:', len(test_dataset))

train 60579 val: 20194 test: 20196


In [15]:
# Shuffled mini-batch loaders (64 images per batch) for each split.
train_dl, val_dl, test_dl = (
    DataLoader(split_dataset, batch_size=64, shuffle=True)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

### Save augmented data

In [10]:
def save_augmented_images(data_loader, dataset, mean=0.5, std=0.5):
    """
    Saves augmented images from a data loader to disk.

    Every image yielded by `data_loader` is converted back to a PIL image and written
    to the path of its source file, with the second path component replaced by
    ``augmented_x{img_size}`` (e.g. ``data/divided_x64/train/City/a.jpg`` becomes
    ``data/augmented_x64/train/City/a.jpg``). Missing target directories are created.
    Existing files with the same name are overwritten.

    Images are matched to ``dataset.samples`` by position, so the loader must iterate
    the dataset in order. Any batch size is supported; a shuffling loader is rejected.

    Parameters
    ----------
    data_loader : torch.utils.data.DataLoader
        Loader over `dataset` that yields ``(images, labels)`` batches in dataset order.
    dataset : torch.utils.data.Dataset
        The dataset behind the loader. Its ``samples`` list supplies the original
        file path of each image.
    mean, std : float or sequence of float, or None
        Normalization statistics to undo before saving (``x * std + mean``). A sequence
        is interpreted per channel. The defaults match the ``Normalize((0.5,), (0.5,))``
        step of this notebook's `transform`. Pass ``None`` for either to save the
        tensors as they are.

    Raises
    ------
    ValueError
        If `data_loader` uses a random sampler, because images could then not be
        matched to their original paths.
    """
    # With shuffling, the i-th yielded image is not dataset.samples[i]; refuse instead
    # of silently writing images under the wrong file names.
    if isinstance(getattr(data_loader, 'sampler', None), torch.utils.data.RandomSampler):
        raise ValueError("save_augmented_images requires a non-shuffling data loader")

    to_pil = transforms.ToPILImage()
    sample_index = 0  # position of the next image within dataset.samples

    for images, _labels in tqdm(data_loader, desc='Saving images', total=len(data_loader)):
        if mean is not None and std is not None:
            # ToPILImage expects float tensors in [0, 1]; normalized values in [-1, 1]
            # would be corrupted when scaled to 8-bit, so undo the normalization first.
            mean_t = torch.as_tensor(mean, dtype=images.dtype, device=images.device).view(1, -1, 1, 1)
            std_t = torch.as_tensor(std, dtype=images.dtype, device=images.device).view(1, -1, 1, 1)
            images = (images * std_t + mean_t).clamp(0, 1)

        # Save every image of the batch, not only the first one.
        for image_tensor in images:
            # Swap the dataset folder (second path component) for the augmented one.
            path_parts = dataset.samples[sample_index][0].split('/')
            path_parts[1] = f'augmented_x{img_size}'
            new_path = '/'.join(path_parts)
            sample_index += 1

            # Create directories if they don't exist
            os.makedirs(os.path.dirname(new_path), exist_ok=True)

            # Always write, overwriting any previous version of the file.
            to_pil(image_tensor).save(new_path)

In [16]:
# Unshuffled, one-image-per-batch loaders: each yielded image then lines up
# with the entry at the same position in its dataset's `samples` list.
train_dl, val_dl, test_dl = (
    DataLoader(split_dataset, batch_size=1)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

# Write the augmented version of every split to disk, in train / val / test order.
for split_loader, split_dataset in (
    (train_dl, train_dataset),
    (val_dl, val_dataset),
    (test_dl, test_dataset),
):
    save_augmented_images(split_loader, split_dataset)

Saving images: 100%|██████████| 60579/60579 [13:52<00:00, 72.80it/s]
Saving images: 100%|██████████| 20194/20194 [05:01<00:00, 66.89it/s]
Saving images: 100%|██████████| 20196/20196 [05:09<00:00, 65.15it/s]


## Save divided datasets to pickle

In [19]:
import os
from PIL import Image
import numpy as np
import pickle
from tqdm import tqdm


def load_and_serialize_images(images_path: str) -> None:
    """
    Load images from a directory and serialize them into pickle files.

    For each of the subdirectories 'val', 'train', and 'test' of `images_path`, this
    function walks the subdirectory recursively, loads every JPEG image
    (``.jpg`` / ``.jpeg``, case-insensitive) as a numpy array, and pickles the
    resulting list to ``{images_path}/{subdir}.pkl``.

    Directories and files are visited in sorted order so the content of the pickle
    is reproducible. Only pixel data is stored; the class folder an image came from
    is not recorded. A missing subdirectory produces a pickle with an empty list.

    Parameters
    ----------
    images_path : str
        The path to the directory containing the images. This directory should
        contain the subdirectories 'val', 'train', and 'test'.

    Returns
    -------
    None
    """

    subdirs = ['val', 'train', 'test']
    jpeg_suffixes = ('.jpg', '.jpeg')

    for subdir in subdirs:
        image_arrays = []
        subdir_path = os.path.join(images_path, subdir)

        for root, dirs, files in os.walk(subdir_path):
            # Sorting `dirs` in place fixes the order in which os.walk descends.
            dirs.sort()
            for file in tqdm(sorted(files), desc=f'Loading images from {subdir}'):
                if file.lower().endswith(jpeg_suffixes):
                    file_path = os.path.join(root, file)
                    # The context manager releases the file handle right after decoding.
                    with Image.open(file_path) as image:
                        image_arrays.append(np.array(image))

        # Serialize the list of numpy arrays with pickle. Joining onto images_path
        # keeps the file inside the dataset folder even when the path has no parent
        # component (building it from dirname/basename would then point at "/").
        pickle_file_path = os.path.join(images_path, f"{subdir}.pkl")
        with open(pickle_file_path, 'wb') as f:
            pickle.dump(image_arrays, f)


# Serialize both the augmented and the plain split datasets for the configured
# image size (follows `img_size` instead of a hard-coded 64).
load_and_serialize_images(f'{data_folder}/augmented_x{img_size}')
load_and_serialize_images(f'{data_folder}/divided_x{img_size}')

Loading images from val: 0it [00:00, ?it/s]
Loading images from val: 100%|██████████| 5064/5064 [00:21<00:00, 233.00it/s]
Loading images from val: 100%|██████████| 5085/5085 [00:20<00:00, 251.33it/s]
Loading images from val: 100%|██████████| 5047/5047 [00:20<00:00, 247.34it/s]
Loading images from val: 100%|██████████| 4998/4998 [00:20<00:00, 243.62it/s]
Loading images from train: 0it [00:00, ?it/s]
Loading images from train: 100%|██████████| 15190/15190 [01:01<00:00, 248.53it/s]
Loading images from train: 100%|██████████| 15253/15253 [01:01<00:00, 248.43it/s]
Loading images from train: 100%|██████████| 15141/15141 [00:58<00:00, 257.31it/s]
Loading images from train: 100%|██████████| 14995/14995 [01:01<00:00, 243.27it/s]
Loading images from test: 0it [00:00, ?it/s]
Loading images from test: 100%|██████████| 5064/5064 [00:20<00:00, 249.32it/s]
Loading images from test: 100%|██████████| 5085/5085 [00:20<00:00, 249.01it/s]
Loading images from test: 100%|██████████| 5048/5048 [00:20<00:00, 