## Load Data
create a pytorch Dataset from MVS10015 Dataset
pytorch [documentaion](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

In [71]:
# dependencies

import os
import shutil
import math
import random
import csv
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from skimage import io
import torchvision.datasets as datasets
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from torchvision.io import read_image
import skimage.io as io

In [None]:
# Create train/test split (80%)/(20%):

source_folder = 'Data/HAM100000_images/'
test_folder = 'Data/HAM100000_images_test/'
train_folder = 'Data/HAM100000_images_train/'

file_list = os.listdir(source_folder)

number_test_images = math.ceil(0.2 * len(file_list))
number_train_images = len(file_list) - number_test_images
# TODO: Set manual seed and select number_test_images in and rest in 'Data/HAM100000_images_train' 
seed_value = 42
random.seed(seed_value)

random_numbers = set()

while len(random_numbers) <= number_test_images:
    random_numbers.add(random.randint(0, len(file_list)-1))

for index, file in enumerate(file_list):
    if index in random_numbers:
        source_path = os.path.join(source_folder, file)
        destination_path = os.path.join(train_folder, file)
        shutil.copy(source_path, destination_path)
    else:
        source_path = os.path.join(source_folder, file)
        destination_path = os.path.join(test_folder, file)
        shutil.copy(source_path, destination_path)

In [45]:
# Create annotations file for given train/test split

test_directory = 'Data/HAM100000_images_test/'

with open('Data/HAM10000_labels.csv') as file:
    csv_reader = csv.reader(file)

    print(next(csv_reader)[0])
    write_test = []
    write_train = []

    for row in csv_reader:
        path_image = os.path.join(test_directory, row[0])
        path_image = path_image + '.jpg'
        if os.path.isfile(path_image):
            write_test.append([row[0], row[1]])
        else:
            write_train.append([row[0], row[1]])
    
with open('Data/HAM100000_train_label.csv', 'w', newline='') as file:
    csv_writer = csv.writer(file)
    for row in write_train:
        csv_writer.writerow(row)


with open('Data/HAM100000_test_label.csv', 'w', newline='') as file:
    csv_writer = csv.writer(file)
    for row in write_test:
        csv_writer.writerow(row)

image_id


In [16]:
# show an image
root = 'Data\HAM100000_images'
frame = pd.read_csv('Data\HAM10000_labels.csv')

image_path = os.path.join(root, frame.iloc[0,0])
image_path = image_path + '.jpg'
with Image.open(image_path) as image:
    image.show()

print(image_path)

Data\HAM100000_images\ISIC_0025030.jpg


In [25]:
# calculate mean and std of dataset, all images have size (600, 450), after transforming images to tensors they have size (3, 600, 450)
file_list = os.listdir('Data\HAM100000_images')
mean = [0, 0, 0]
std = [0, 0, 0]

for file_name in file_list:
    image_path = os.path.join('Data\HAM100000_images', file_name)
    image = io.imread(image_path)
    mean[0] += image[:, :, 0].mean()
    mean[1] += image[:, :, 1].mean()
    mean[2] += image[:, :, 2].mean()
    std[0] += image[:, :, 0].std()
    std[1] += image[:, :, 1].std()
    std[2] += image[:, :, 2].std()    
for i in [0, 1, 2]:
    mean[i] /= len(file_list)
    std[i] /= len(file_list)

print(mean, std)

[194.69792020561766, 139.26262746509832, 145.48524135685338] [22.855094582223956, 30.168411555547745, 33.903190491317204]


In [30]:
# load data

train_dataloader = DataLoader(training_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=True)




In [67]:
# load dataset
batch_size = 32
shuffle = True

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[194.69792020561766, 139.26262746509832, 145.48524135685338], std=[22.855094582223956, 30.168411555547745, 33.903190491317204])
])

# to use these data needs to be in class folders

train_dataset = datasets.ImageFolder(root='Data/HAM100000_images_train/', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)

test_dataset = datasets.ImageFolder(root='Data/HAM100000_images_test/', transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [62]:
test = next(iter(train_loader))
print(type(test))
print(len(test))
print(type(test[0]))
print(test[0].shape)
print(type(test[1]))
print(test[1].shape)

<class 'list'>
2
<class 'torch.Tensor'>
torch.Size([32, 3, 450, 600])
<class 'torch.Tensor'>
torch.Size([32])


## Model 0

In [68]:
class net(nn.Module):
    def __init__(self, dropout = False):
        super(net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        
        self.fc1 = nn.Linear(32 * 75 * 100, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 7)

        # less droupout in deeper layers
        self.dropout = False 
        self.dropout_3 = nn.Dropout(p=0.3) 
        self.dropout_5 = nn.Dropout(p=0.5)
    def forward(self, x, train=True):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = x.view(-1, 32 * 75 * 100)
        x = self.fc1(x)
        x = F.relu(x)
        if train and self.dropout:
            x = self.dropout_5(x)    
        x = self.fc2(x)
        x = F.relu(x)
        if train and self.dropout:
            x = self.dropout_3(x)
        x = self.fc3(x)
        
        return x

classes = ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc')

In [69]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [72]:
# train loop
def train_model(model, data_loader, epochs, lr=0.1, optim=None):
  if optim is None:
    optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)
  else:
    optimizer = optim
  loss_fn = nn.CrossEntropyLoss()
  
  model.to(device)
  for epoch in range(epochs):
    epoch_loss = 0
    for batch, (X, y) in enumerate(data_loader):
      X = X.to(device)
      y = y.to(device)
      y_pred = model(X)
      y_one_hot = torch.nn.functional.one_hot(y, num_classes=7)
      loss = loss_fn(y_pred, y_one_hot)
      epoch_loss += loss
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    print(f"average batch loss in {epoch+1}: {epoch_loss/batch_size}")

In [73]:
# test loop
def test_model(model, data_loader):
  model.to(device)
  with torch.no_grad():
    right, wrong = np.array([0 for i in range(10)]), np.array([0 for i in range(10)])
    for batch, (X,y) in enumerate(data_loader):
      X = X.to(device)
      y = y.to(device)
      y_pred = model(X, train=False)
      y_pred = torch.argmax(y_pred, dim=1)
      for i in range(len(y_pred)):
        if y_pred[i]==y[i]:
          right[y[i]] += 1
        else:
         wrong[y[i]] += 1

  for i in range(10):
    print(f"accuracy for {classes[i]}: {right[i]}/{right[i]+wrong[i]} | accuracy in %: {right[i]/(right[i]+wrong[i])*100}")
  
  print(f"overall accuracy: {np.sum(right)} / {np.sum(right) + np.sum(wrong)} | accuracy in percent % {100*np.sum(right)/(np.sum(right) + np.sum(wrong))}")

In [75]:
test = next(iter(train_loader))
model_0 = net()
bing = model_0(test[0])
# train_model(model_0, train_loader, epochs=3, lr=0.01)

In [77]:
print(bing.shape)
test[0].shape

torch.Size([1152, 7])


torch.Size([32, 3, 450, 600])