## Load Data
Create a PyTorch `Dataset` from the MVS10015 dataset.
See the PyTorch [documentation](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).

In [37]:
# dependencies

import os
import shutil
import math
import random
import csv
import pandas as pd
from PIL import Image
import torch
from skimage import io
import torchvision.datasets as datasets
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from torchvision.io import read_image
import skimage.io as io

In [None]:
# Create train/test split (80%)/(20%):
# Copies every image from source_folder into exactly one of train_folder / test_folder.
# NOTE: files left over from an earlier run with a different split are not removed;
# empty both destination folders before re-running if the split logic changed.

source_folder = 'Data/HAM100000_images/'
test_folder = 'Data/HAM100000_images_test/'
train_folder = 'Data/HAM100000_images_train/'

# os.listdir returns entries in arbitrary order; sort so that the seeded
# selection below picks the same files on every machine and every run.
file_list = sorted(os.listdir(source_folder))

number_test_images = math.ceil(0.2 * len(file_list))
number_train_images = len(file_list) - number_test_images

# Fixed seed so the split is reproducible.
seed_value = 42
random.seed(seed_value)

# Indices of the images that go to the TEST set: exactly number_test_images
# distinct positions in file_list (random.sample draws without replacement).
random_numbers = set(random.sample(range(len(file_list)), number_test_images))
assert len(random_numbers) == number_test_images

# shutil.copy does not create missing directories.
os.makedirs(test_folder, exist_ok=True)
os.makedirs(train_folder, exist_ok=True)

for index, file in enumerate(file_list):
    # Selected indices form the 20% test set; everything else is training data.
    destination_folder = test_folder if index in random_numbers else train_folder
    source_path = os.path.join(source_folder, file)
    destination_path = os.path.join(destination_folder, file)
    shutil.copy(source_path, destination_path)

In [45]:
# Create annotations file for given train/test split
# Reads the full label file and writes one label CSV per split. A row belongs to
# the test split if '<image_id>.jpg' exists in test_directory, otherwise to train.

test_directory = 'Data/HAM100000_images_test/'

# newline='' is what the csv module expects for files it reads or writes.
with open('Data/HAM10000_labels.csv', newline='') as file:
    csv_reader = csv.reader(file)

    # First row holds the column names; keep it so the split files get a header too.
    header = next(csv_reader)
    print(header[0])
    write_test = []
    write_train = []

    for row in csv_reader:
        # Only the first two columns are kept (image id and label, per the header).
        path_image = os.path.join(test_directory, row[0] + '.jpg')
        if os.path.isfile(path_image):
            write_test.append([row[0], row[1]])
        else:
            write_train.append([row[0], row[1]])

# The split files are read back with pd.read_csv defaults, which treat the first
# line as column names. Without a header row the first sample would be lost.
split_outputs = (
    ('Data/HAM100000_train_label.csv', write_train),
    ('Data/HAM100000_test_label.csv', write_test),
)
for label_path, rows in split_outputs:
    with open(label_path, 'w', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(header[:2])
        csv_writer.writerows(rows)

image_id


In [27]:
# subclass of Dataset class to load images and labels
class ImageDataset(Dataset):
    """Dataset of images listed in an annotation CSV.

    Column 0 of the CSV holds the image id (file name without '.jpg'),
    column 1 holds the label. Each item is a dict with keys 'image' and 'label'.
    """

    def __init__(self, csv_annotation, root_dir, transform=None):
        """
        Arguments:
            csv_annotation (string): Path to the csv file with annotations.
                Read with pandas defaults, so its first line is taken as the header.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on the image of a sample (e.g. a torchvision transform pipeline).
        """
        self.frame = pd.read_csv(csv_annotation)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.frame)

    def __getitem__(self, idx):
        """Load the image for row `idx` and return {'image': ..., 'label': ...}."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_name = os.path.join(self.root_dir, self.frame.iloc[idx, 0])
        image_name = image_name + '.jpg'
        image = io.imread(image_name)

        # Image transforms (ToTensor, Normalize, ...) operate on the image itself;
        # handing them the whole sample dict raises a TypeError.
        if self.transform is not None:
            image = self.transform(image)

        return {'image': image, 'label': self.frame.iloc[idx, 1]}

In [16]:
# show an image
# Forward slashes work on every platform and avoid the invalid '\H' escape
# sequence that the backslash form of these paths contains.
root = 'Data/HAM100000_images'
frame = pd.read_csv('Data/HAM10000_labels.csv')

# Column 0 of the label file is the image id (file name without extension).
image_path = os.path.join(root, frame.iloc[0, 0] + '.jpg')
with Image.open(image_path) as image:
    image.show()

print(image_path)

Data\HAM100000_images\ISIC_0025030.jpg


In [25]:
# calculate per-channel mean and std of the dataset
# Values are on the raw pixel scale of the loaded arrays (0-255 for 8-bit images),
# so they must be divided by 255 before being used with transforms.Normalize after
# transforms.ToTensor, which rescales uint8 images to [0, 1].
# The std reported here is the average of the per-image stds, which is only an
# approximation of the std over all pixels of the dataset.
# Assumes every file is an RGB image of 600 x 450 pixels (width x height), i.e.
# arrays of shape (450, 600, 3) and tensors of shape (3, 450, 600) - TODO confirm.
image_dir = 'Data/HAM100000_images'  # forward slashes: portable, no invalid '\H' escape
file_list = os.listdir(image_dir)
mean = [0, 0, 0]
std = [0, 0, 0]

for file_name in file_list:
    image_path = os.path.join(image_dir, file_name)
    image = io.imread(image_path)
    # Accumulate the statistics of each colour channel separately.
    for channel in range(3):
        mean[channel] += image[:, :, channel].mean()
        std[channel] += image[:, :, channel].std()

# Turn the running sums into averages over all images.
for i in range(3):
    mean[i] /= len(file_list)
    std[i] /= len(file_list)

print(mean, std)

[194.69792020561766, 139.26262746509832, 145.48524135685338] [22.855094582223956, 30.168411555547745, 33.903190491317204]


In [29]:
# load data into pytorch dataset and test ImageDataset

# Per-channel statistics measured on raw pixel values (0-255 scale).
mean_255 = [194.69792020561766, 139.26262746509832, 145.48524135685338]
std_255 = [22.855094582223956, 30.168411555547745, 33.903190491317204]

transform = transforms.Compose([
    transforms.ToTensor(),
    # ToTensor rescales uint8 images to [0, 1], so Normalize needs the statistics
    # on that same scale; using the 0-255 values would push every pixel to about -8.
    transforms.Normalize(mean=[m / 255 for m in mean_255], std=[s / 255 for s in std_255])
])

training_data = ImageDataset(csv_annotation='Data/HAM100000_train_label.csv', root_dir='Data/HAM100000_images_train/', transform=transform)
test_data = ImageDataset(csv_annotation='Data/HAM100000_test_label.csv', root_dir='Data/HAM100000_images_test/', transform=transform)

In [30]:
# wrap both datasets in DataLoaders that yield shuffled mini-batches of 32 samples

loader_options = {'batch_size': 32, 'shuffle': True}

train_dataloader = DataLoader(training_data, **loader_options)
test_dataloader = DataLoader(test_data, **loader_options)




In [55]:
# load dataset (alternative to ImageDataset, using torchvision's ImageFolder)

# Per-channel statistics measured on raw pixel values (0-255 scale).
mean_255 = [194.69792020561766, 139.26262746509832, 145.48524135685338]
std_255 = [22.855094582223956, 30.168411555547745, 33.903190491317204]

transform = transforms.Compose([
    transforms.ToTensor(),
    # ToTensor rescales images to [0, 1], so Normalize needs the statistics on
    # that same scale rather than the 0-255 values.
    transforms.Normalize(mean=[m / 255 for m in mean_255], std=[s / 255 for s in std_255])
])

# ImageFolder derives labels from sub-directory names, so to use it the images
# need to be sorted into one folder per class below each root directory.

train_dataset = datasets.ImageFolder(root='Data/HAM100000_images_train/', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = datasets.ImageFolder(root='Data/HAM100000_images_test/', transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)


## Model 0