# Initial Experimental Setup - Train / Validation / Test Split

## Implementation

The purpose of this notebook is to perform the train, validation, and test split on the GC10-DET dataset for the thesis implementation, as outlined in Section 3.2.1 of the bachelor thesis.

## Note:

- For the initial five scenarios, the validation and test sets are merged into a single test set. The resulting ratio between the train and test sets is 64 % / 36 %.

- For Scenario 6, the train / validation / test division produced by this notebook (64 % / 16 % / 20 %) is kept unchanged.

## Step 1 - Importing Dependencies

- Importing the necessary libraries to execute the code.

In [None]:
import pandas as pd
import os
import torch
from torch.utils.data import Dataset
from torch.utils.data import random_split
from PIL import Image
import torchvision.transforms as transforms
from class_labels_generator import labels_generator
from labels_generator import generate_csv

## Step 2 - Create a Custom Pytorch Dataset

- Defining a custom PyTorch dataset for the split purpose.
- The resizing transformation is already fixed inside the dataset definition.

In [None]:
class SplitDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs for one folder of images.

    Annotations are read from a header-less CSV file. Column 0 holds the
    image file name (relative to ``root_dir``) and column 2 holds the
    integer class label.

    Args:
        labels_file: Path to the header-less CSV annotation file.
        root_dir: Directory containing the images listed in the CSV.
        transform: Optional callable applied to every loaded image. When
            omitted, images are resized to 224x224.
    """

    def __init__(self, labels_file, root_dir, transform=None):
        self.annotations = pd.read_csv(labels_file, header=None)
        self.root_dir = root_dir
        # Honour a caller-supplied transform instead of silently discarding
        # it; fall back to the fixed 224x224 resize otherwise.
        if transform is None:
            transform = transforms.Resize((224, 224))
        self.transform = transform

    def __len__(self):
        """Return the number of annotated images."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load the image at ``index`` and return ``(image, label)``.

        The image is opened with PIL and passed through ``self.transform``;
        the label is returned as a 0-dim integer tensor.
        """
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path)
        label = torch.tensor(int(self.annotations.iloc[index, 2]))

        # self.transform is always set by __init__, so no guard is needed.
        image = self.transform(image)

        return image, label

## Step 3 - Splitting the Dataset
    
- Defining the data ratios.
- Defining the data paths.
- Defining the dataset classes names and labels.

In [None]:
# --- Locations --------------------------------------------------------------
# Placeholders: replace with the real dataset location and output folders.
raw_path = 'path/to/original/complete/dataset'
train_dir = 'path/to/save/raw/training/set'
validation_dir = 'path/to/save/validation/set'
test_dir = 'path/to/save/test/set'

# --- Split ratios -----------------------------------------------------------
# 20 % of the data is held out for testing; the remaining 80 % is divided
# 80 / 20 again into train and validation (64 % / 16 % of the whole).
test_fraction = 0.2
validation_fraction = (0.2*0.8)
train_fraction = (0.8*0.8)
proportions = [train_fraction, validation_fraction, test_fraction]

# NOTE(review): presumably writes the per-class label files for the raw
# dataset -- confirm in class_labels_generator.
labels_generator(raw_path)

# --- Classes ----------------------------------------------------------------
# Each entry is used both as the image sub-folder name under
# <raw_path>/images/ and as the stem of the class CSV file (<name>.csv).
classes = [
    "0_punching_hole",
    "1_welding_line",
    "2_crescent_gap",
    "3_water_spot",
    "4_oil_spot",
    "5_silk_spot",
    "6_inclusion",
    "7_rolled_pit",
    "8_crease",
    "9_waist_folding",
]

- Saving the images in a desired folder.

In [None]:
def folder_creation(dir, type):
    """Ensure ``<dir>/images/<type>`` exists and return that path.

    Args:
        dir: Root output directory of one split (train, validation or test).
        type: Class name, used as the innermost folder name.

    Returns:
        The path of the class folder as a string. Missing parent folders are
        created; an already existing folder is left untouched.
    """
    segments = (dir, 'images/', type)
    class_folder = os.path.join(*segments)
    os.makedirs(class_folder, exist_ok=True)
    return class_folder

def save_images(dataset, type, path, dataset_name):
    """Write every image of ``dataset`` into the folder ``path``.

    Each file is named ``<dataset_name><type>_<position>.jpg``, where
    ``position`` is the item's index within ``dataset``. The label returned
    alongside each image is ignored.

    Args:
        dataset: Indexable collection of ``(image, label)`` pairs whose
            images expose a ``save(path)`` method.
        type: Class name embedded in the file name.
        path: Destination folder (must already exist).
        dataset_name: File-name prefix identifying the split, e.g. "train_".
    """
    total = len(dataset)
    for position in range(total):
        picture, _label = dataset[position]
        file_name = f"{dataset_name}{type}_{position}.jpg"
        picture.save(os.path.join(path, file_name))

- Performing the dataset split.

In [None]:
# Output root and file-name prefix of each split, in the same order as
# `proportions` (train, validation, test).
split_targets = (
    (train_dir, "train_"),
    (validation_dir, "validation_"),
    (test_dir, "test_"),
)

# Every class is split on its own, so each class is divided with the same
# train / validation / test proportions.
# The loop variable is deliberately not called `type`: at module scope that
# would shadow the builtin for the rest of the session.
for class_name in classes:

    # The class annotations are read from <raw_path>/<class>.csv and the
    # images from <raw_path>/images/<class>.
    class_data = SplitDataset(labels_file=os.path.join(raw_path, class_name + '.csv'),
                              root_dir=os.path.join(raw_path, 'images/', class_name))

    # Sizes are truncated to integers; the last split (test) absorbs the
    # rounding remainder so the lengths always sum to the dataset size.
    lengths = [int(p * len(class_data)) for p in proportions]
    lengths[-1] = len(class_data) - sum(lengths[:-1])

    # A fresh generator seeded with 42 is used for every class.
    class_split = random_split(class_data, lengths,
                               generator=torch.Generator().manual_seed(42))

    # Create the three class folders first, then write the images.
    out_paths = [folder_creation(out_dir, class_name) for out_dir, _ in split_targets]

    for subset, out_path, (_, prefix) in zip(class_split, out_paths, split_targets):
        save_images(subset, class_name, out_path, dataset_name=prefix)

## Step 4 - Generating the label .csv Files

In [None]:
# Output roots of the three splits, processed in this order.
split_dirs = (train_dir, validation_dir, test_dir)

# Saving complete labels: one generate_csv call per split folder.
for split_dir in split_dirs:
    generate_csv(split_dir)

# Saving class labels: one labels_generator call per split folder.
for split_dir in split_dirs:
    labels_generator(split_dir)