In [23]:
"""
labeling:
1 --> Left Neural Foraminal Narrowing
2 --> Right Neural Foraminal Narrowing
3 --> Left Subarticular Stenosis
4 --> Right Subarticular Stenosis
5 --> Spinal Canal Stenosis

Model Creation and training will be done in this notebook.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms

In [19]:
"""
After renaming the preprocessed images to include the condition digit labeling at the end of the filename:

1) read in each image
2) separate and define label as separate variable
3) create a PyTorch-compatible image dataset to feed the CNN later
4) split dataset into train, validation, and test (80, 5, 15)
"""

# class steps were followed from here: 
# https://stackoverflow.com/questions/67406731/pytorch-import-dataset-with-images-as-labels

class Create_Dataset(Dataset):
    """Image dataset whose integer label is encoded at the end of each filename.

    Every ``.png`` file directly inside ``root_path`` is one sample. The label
    is the integer that follows the last underscore of the file stem, e.g.
    ``study_12_3.png`` -> ``3``.

    Args:
        root_path: Directory that contains the ``.png`` images.
        transform: Optional callable applied to the RGB PIL image before it
            is returned.
    """

    def __init__(self, root_path, transform=None):
        self.root_path = root_path
        # sorted() keeps the index -> file mapping stable between runs
        self.data_paths = [f for f in sorted(os.listdir(root_path)) if f.endswith(".png")]
        self.transform = transform

    @staticmethod
    def _label_from_filename(filename):
        """Return the integer label stored after the last '_' of the file stem."""
        # splitext removes only the trailing extension, unlike str.replace,
        # which would also strip a '.png' appearing in the middle of the name
        stem, _ext = os.path.splitext(filename)
        return int(stem.split('_')[-1])

    def __getitem__(self, idx):
        """Load sample ``idx`` and return it as an ``(image, label)`` pair.

        Raises:
            IndexError: if ``idx`` is outside the list of image files.
            ValueError: if the filename does not end in ``_<integer>``.
        """
        img_name = self.data_paths[idx]
        label = self._label_from_filename(img_name)

        img_path = os.path.join(self.root_path, img_name)
        # convert() loads the pixel data into a new image, so the underlying
        # file can be closed as soon as the with-block exits
        with Image.open(img_path) as raw_img:
            img = raw_img.convert('RGB')

        if self.transform:
            img = self.transform(img)

        return img, label

    def __len__(self):
        """Number of ``.png`` files found in ``root_path``."""
        return len(self.data_paths)

# create the dataset object (not normalized yet: only resize + tensor conversion)
DATA_PATH = '../../data/preprocessed_renamed'
data = Create_Dataset(
    root_path=DATA_PATH,
    transform=transforms.Compose(
        [
            # some preprocessed images are not 100 x 100, so force a common size
            transforms.Resize((100, 100)),
            transforms.ToTensor(),
        ]
    ),
)

In [21]:
"""
Get the mean and std of entire dataset for normalization.

Shawn: This ran for about 4 min
"""

# one sequential (unshuffled) pass over the whole dataset, used only to
# gather the mean and std needed for normalization
all_data_loader = DataLoader(dataset=data, shuffle=False, batch_size=500)


# steps followed from here:
# https://saturncloud.io/blog/how-to-normalize-image-dataset-using-pytorch/

def get_mean_std(loader):
    """Compute the mean and standard deviation of all pixel values in a dataset.

    The statistics are global: every value of every image (all channels) is
    pooled into a single mean and a single population standard deviation.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches where ``images``
            is a tensor of pixel values; the labels are ignored.

    Returns:
        Tuple ``(mean, std)`` of 0-dim float32 tensors.

    Raises:
        ValueError: if the loader yields no pixel values at all.
    """
    num_values = 0
    value_sum = 0.0
    squared_sum = 0.0
    for images, _ in loader:
        # accumulate in float64 so summing millions of values does not lose precision
        batch = images.double()
        num_values += batch.numel()
        value_sum += batch.sum()
        squared_sum += batch.pow(2).sum()

    if num_values == 0:
        raise ValueError("loader produced no image data; cannot compute mean/std")

    # Weighting by element count (instead of averaging per-batch statistics)
    # keeps the result exact even when the last batch is smaller.
    mean = value_sum / num_values
    # Var[x] = E[x^2] - E[x]^2; clamp guards against a tiny negative from rounding
    variance = (squared_sum / num_values - mean ** 2).clamp(min=0.0)
    std = variance.sqrt()

    return mean.float(), std.float()

# Full pass over the un-normalized dataset; mean and std are passed to
# transforms.Normalize when the dataset is rebuilt in the next cell.
# NOTE(review): the statistics cover every image, including those later
# assigned to the validation and test splits -- confirm this is acceptable.
mean, std = get_mean_std(all_data_loader)

In [26]:
"""
Normalize the entire dataset when initializing dataset object
"""

# rebuild the dataset with the same resize + tensor steps, now followed by
# normalization with the statistics computed above
data = Create_Dataset(
    root_path=DATA_PATH,
    transform=transforms.Compose(
        [
            # some preprocessed images are not 100 x 100, so force a common size
            transforms.Resize((100, 100)),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std),
        ]
    ),
)

In [28]:
"""
After normalizing, split the dataset to train, val, and testing (80, 5, 15)
"""

# split sizes: 80% train, 5% validation, remainder (about 15%) test
n_samples = len(data)
train_size = int(0.8 * n_samples)
val_size = int(0.05 * n_samples)
# taking the remainder guarantees the three sizes sum to len(data)
test_size = n_samples - train_size - val_size

# A fixed seed gives the same train/val/test membership on every run; without
# it, a kernel restart could move former training images into the test set.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train, val, test = random_split(
    data,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# create dataloaders; only the training data is reshuffled each epoch,
# evaluation order is kept deterministic
train_loader = DataLoader(train, batch_size=500, shuffle=True)
val_loader = DataLoader(val, batch_size=500, shuffle=False)
test_loader = DataLoader(test, batch_size=500, shuffle=False)