# Image Data preparator
### author: Oleksandr Lytvyn
#### Tasks
1. Upload available images
2. Create a dataset
3. Perform data augmentation
4. Save data in corresponding folder

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torchvision.transforms as transforms
from sklearn.utils import shuffle
from PIL import Image
from glob import glob
import os
torch.manual_seed(1)

<torch._C.Generator at 0x2dfea176e30>

In [14]:
# define dataset class
class BrainMRIDataset(Dataset):

    def __init__(self, data_directory, reshape=True, height=128, width=128, transform=None):
        self.data_directory = data_directory
        self.no_class = glob(self.data_directory + '/no/*')
        self.yes_class = glob(self.data_directory + '/yes/*')
        self.height = height
        self.width = width
        self.reshape=reshape
        self.transform = transform

        labels = [0 for i in range(len(self.no_class))]
        labels += [1 for i in range(len(self.yes_class))]

        image_links = self.no_class + self.yes_class
        self.dataframe = pd.DataFrame({
            'image':image_links,
            'labels': labels
        })

        self.dataframe = shuffle(self.dataframe)
        self.dataframe.reset_index(inplace=True,drop=True)

    def __len__(self):
        return len(self.no_class)+len(self.yes_class)

    def __getitem__(self,idx):

        image = self.dataframe['image'][idx]
        label = self.dataframe['labels'][idx]

        image = Image.open(image).convert("L")

        if self.reshape:
            image = image.resize((self.height,self.width))

        array = np.asarray(image)
        if self.transform:
            array = self.transform(array)

        array = array.reshape(1, self.height,self.width)
        image = torch.tensor(array)
        label = torch.tensor(label)

        return [image,label]

    def __repr__(self):
        return str(self.dataframe.head())

In [15]:
# define transformation
color_transformations = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomChoice([
        transforms.ColorJitter(brightness=0.4),
        transforms.ColorJitter(brightness=0.5),
        transforms.ColorJitter(brightness=0.6),
        transforms.ColorJitter(brightness=0.7)
    ]),
    transforms.ToTensor()
])

rotation_transformations = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomChoice([
        transforms.RandomRotation(degrees=30),
        transforms.RandomRotation(degrees=25),
        transforms.RandomRotation(degrees=20),
        transforms.RandomRotation(degrees=15)
    ]),
    transforms.ToTensor()
])

flip_transformations = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomChoice([
        transforms.RandomHorizontalFlip(p=1),
        transforms.RandomVerticalFlip(p=1),
        transforms.Compose([
            transforms.RandomHorizontalFlip(p=1),
            transforms.RandomVerticalFlip(p=1)
        ])
    ]),
    transforms.ToTensor()
])

grayscale_transformations = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomChoice([
        transforms.Grayscale(),
        transforms.RandomPerspective(distortion_scale=.1, p=1)
    ]),
    transforms.ToTensor()
])

In [16]:
#upload data, create dataset and perform transformation
path_to_project_root = '../../'
data_dir = path_to_project_root + 'data/brain_tumor_imgs'
dataset = BrainMRIDataset(data_dir, height=64, width=64, transform=None) + \
          BrainMRIDataset(data_dir, height=64, width=64, transform=rotation_transformations) + \
          BrainMRIDataset(data_dir, height=64, width=64, transform=flip_transformations) + \
          BrainMRIDataset(data_dir, height=64, width=64, transform=grayscale_transformations) + \
          BrainMRIDataset(data_dir, height=64, width=64, transform=color_transformations)


In [17]:
#split create data
datasetsize = len(dataset)
train_size = int(datasetsize * 0.8)
whole_test_size = datasetsize - train_size
single_part_size = int(whole_test_size/3) #whole test data is divided into 3 parts - for test, do1 and do2

if datasetsize != (train_size + single_part_size*3):
    train_size += 1

print(f"Whole dataset size: {datasetsize}\n"
      f"Train dataset size: {train_size}\n"
      f"Test, DO1 and DO2 sizes: {single_part_size}")

train_data, test_data, do1_data, do2_data = torch.utils.data.random_split(dataset, [train_size, single_part_size, single_part_size, single_part_size])



Whole dataset size: 1265
Train dataset size: 1013
Test, DO1 and DO2 sizes: 84


In [12]:
#create corresponding dirs for data scientist (ds), data owner 1 (do1) and data owner 2 (do2)
path_to_project_root = '../../'
main_image_dir = path_to_project_root + f'data/prepared_brain_tumor_imgs/'
all_dir_names = []
party_names = ['ds', 'do1', 'do2']

for name in party_names:
    party_dir_names = [f'{name}/imgs', f'{name}/labels']
    if name == 'ds':
        party_dir_names = [f'{name}/train/imgs',
                     f'{name}/train/labels',
                     f'{name}/test/imgs',
                     f'{name}/test/labels']

    for dir_name in party_dir_names:
        os.makedirs(main_image_dir + dir_name, exist_ok=True)

    all_dir_names += party_dir_names

['ds/train/imgs', 'ds/train/labels', 'ds/test/imgs', 'ds/test/labels', 'do1/imgs', 'do1/labels', 'do2/imgs', 'do2/labels']


In [22]:
# save created data in corresponding libraries
for i, data in enumerate(train_data):
    torch.save(data[0], main_image_dir + f'ds/train/imgs/img{i}')
    torch.save(data[1], main_image_dir + f'ds/train/labels/lbl{i}')

for i, data in enumerate(test_data):
    torch.save(data[0], main_image_dir + f'ds/test/imgs/img{i}')
    torch.save(data[1], main_image_dir + f'ds/test/labels/lbl{i}')

for i, data in enumerate(do1_data):
    torch.save(data[0], main_image_dir + f'do1/imgs/img{i}')
    torch.save(data[1], main_image_dir + f'do1/labels/lbl{i}')

for i, data in enumerate(do2_data):
    torch.save(data[0], main_image_dir + f'do2/imgs/img{i}')
    torch.save(data[1], main_image_dir + f'do2/labels/lbl{i}')

  image = torch.tensor(array)
