In [2]:
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import PIL, urllib
from google.colab import drive
import os
from PIL import Image
import pandas as pd

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be read through ordinary file paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
def get_diagnosis(image_name, file):
    """Look up the diagnosis code of a single image in the metadata table.

    Args:
        image_name: Image file name, with or without an extension
            (e.g. ``'ISIC_0024306.jpg'``).
        file: pandas DataFrame that has at least the columns ``image_id``
            and ``dx``.

    Returns:
        The ``dx`` value of the first row whose ``image_id`` equals the file
        name without its extension, or ``None`` when no row matches.
    """
    # Strip only the final extension. Splitting on the first '.' would
    # truncate ids that themselves contain a dot and make them unmatchable.
    image_id = os.path.splitext(image_name)[0]

    matching_dx = file.loc[file['image_id'] == image_id, 'dx']
    if matching_dx.empty:
        return None
    return matching_dx.iloc[0]

def create_image_diagnosis_dict(base_path, folders, file):
    """Map every labelled image file name to its diagnosis code.

    Args:
        base_path: Path prefix that each entry of ``folders`` is appended to.
        folders: Iterable of folder suffixes. Each one is concatenated to
            ``base_path`` as-is, so it must carry its own leading separator
            (e.g. ``'/HAM10000_images_part_1'``).
        file: Path of the metadata CSV; it must contain the columns
            ``image_id`` and ``dx``.

    Returns:
        dict mapping image file name (including extension) to the ``dx``
        value of the first metadata row whose ``image_id`` equals the file
        name without its extension. Files without a matching row are omitted.
    """
    metadata = pd.read_csv(file)

    # Build the id -> diagnosis lookup once instead of scanning the whole
    # DataFrame for every file. setdefault keeps the first row per id, which
    # is the row a per-file "first match" scan would have returned.
    diagnosis_by_id = {}
    for image_id, diagnosis in zip(metadata['image_id'], metadata['dx']):
        diagnosis_by_id.setdefault(image_id, diagnosis)

    image_diagnosis_dict = {}
    for folder in folders:
        path = base_path + folder
        for image_name in os.listdir(path):
            # Strip only the final extension so ids containing dots still match.
            image_id = os.path.splitext(image_name)[0]
            diagnosis = diagnosis_by_id.get(image_id)
            if diagnosis is not None:
                image_diagnosis_dict[image_name] = diagnosis
    return image_diagnosis_dict

In [29]:
class diagnoses(Dataset):
    """Dataset of ``(image_path, label)`` pairs that loads images on access.

    Each item of ``data`` is a 2-tuple ``(image_path, label)``. ``label`` may
    be a diagnosis code string listed in ``CLASSES`` or an already-encoded
    value, which is passed through unchanged.
    """

    # Diagnosis codes in label-index order. 'vasc' (the seventh HAM10000 dx
    # code) is appended last so the indices of the other six stay unchanged;
    # without it, any sample labelled 'vasc' raised ValueError on access.
    CLASSES = ('nv', 'akiec', 'bcc', 'bkl', 'df', 'mel', 'vasc')

    def __init__(self, data, transform=None):
        """Store the sample list and the optional per-image transform.

        Args:
            data: Indexable, sized collection of ``(image_path, label)`` pairs.
            transform: Optional callable applied to the loaded image.
        """
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Index into ``data``.

        Returns:
            Tuple ``(image, label_idx)``: the image after ``transform`` (or the
            RGB PIL image when no transform is set) and the label's position
            in ``CLASSES`` (or the label itself when it is not a string).

        Raises:
            ValueError: If a string label is not listed in ``CLASSES``.
        """
        image_path, label = self.data[idx]

        # Decode inside a context manager so the file handle is released
        # promptly, and force three channels so grayscale / RGBA files yield
        # the same channel count as RGB ones.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        if isinstance(label, str):
            try:
                label_idx = self.CLASSES.index(label)
            except ValueError:
                raise ValueError(
                    f"Unknown diagnosis label {label!r} for {image_path!r}; "
                    f"expected one of {self.CLASSES}"
                ) from None
        else:
            label_idx = label
        return image, label_idx

In [26]:
# Locate the raw data and build the image -> diagnosis lookup.
#########################
## CHANGE TO YOUR PATH ##
#########################

# Root data directory on the mounted Drive.
base_path = '/content/drive/MyDrive/Skin Cancer Detection - APS360/data'
# Image sub-folders. Each starts with '/' because the helper functions append
# it to base_path by plain string concatenation.
folders = ('/HAM10000_images_part_1', '/HAM10000_images_part_2')
# Metadata CSV file name; appended to base_path in the call below.
metadata = '/HAM10000_metadata.csv'

# NOTE(review): not referenced again in the visible cells; the functions that
# build datasets use their own local `dataset` variable. Confirm before removing.
dataset = []

# Dict of image file name -> diagnosis for the files found in `folders` that
# have a row in the metadata CSV.
matches = create_image_diagnosis_dict(base_path, folders, base_path + metadata)

In [30]:
def make_dataset(idx, base_path, folders):
    """Collect ``(image_path, label)`` pairs for every labelled image file.

    Args:
        idx: Mapping of image file name (including extension) to its label.
        base_path: Path prefix that each entry of ``folders`` is appended to.
        folders: Iterable of folder suffixes. Each one is concatenated to
            ``base_path`` as-is, so it must carry its own leading separator.

    Returns:
        List of ``(image_path, label)`` tuples, folder by folder in the order
        given and sorted by file name within each folder. Files that are not
        in ``idx`` (or whose label is ``None``) are skipped.
    """
    dataset = []
    for folder in folders:
        path = base_path + folder
        # os.listdir returns entries in arbitrary order; sort so the sample
        # order is the same on every run and on every filesystem.
        for img in sorted(os.listdir(path)):
            label = idx.get(img)
            if label is None:
                continue
            dataset.append((os.path.join(path, img), label))
    return dataset

In [27]:
def get_data_loader(batch_size, idx, base_path, folders,
                    train_ratio=0.7, seed=42, num_workers=1):
    """Build the training and validation DataLoaders.

    Args:
        batch_size: Number of samples per batch for both loaders.
        idx: Mapping of image file name to label, forwarded to ``make_dataset``.
        base_path: Path prefix forwarded to ``make_dataset``.
        folders: Folder suffixes forwarded to ``make_dataset``.
        train_ratio: Fraction of the samples assigned to the training set;
            must satisfy ``0 < train_ratio < 1``. Defaults to 0.7.
        seed: Seed for the shuffle applied before splitting. Pass ``None`` to
            split the samples in the order ``make_dataset`` returns them.
        num_workers: Worker processes used by each DataLoader.

    Returns:
        Tuple ``(train_loader, val_loader)``. The training loader reshuffles
        every epoch; the validation loader iterates in a fixed order.

    Raises:
        ValueError: If ``train_ratio`` is out of range or no labelled images
            were found.
    """
    if not 0.0 < train_ratio < 1.0:
        raise ValueError(
            f"train_ratio must be strictly between 0 and 1, got {train_ratio!r}")

    # Convert to a tensor, then map each of the three channels from [0, 1]
    # to [-1, 1] (mean 0.5, std 0.5).
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    dataset = make_dataset(idx, base_path, folders)
    if not dataset:
        raise ValueError(
            f"No labelled images found under {base_path!r} in folders {folders!r}")

    if seed is not None:
        # Splitting sequentially over directory order would put the tail of
        # the last folder into validation. Sort by path first so the seeded
        # permutation is reproducible regardless of listing order, then
        # shuffle before the split.
        ordered = sorted(dataset, key=lambda sample: sample[0])
        generator = torch.Generator().manual_seed(seed)
        permutation = torch.randperm(len(ordered), generator=generator).tolist()
        dataset = [ordered[position] for position in permutation]

    split = int(len(dataset) * train_ratio)
    train_set = dataset[:split]
    validation_set = dataset[split:]

    # Wrap the raw (path, label) lists in the notebook's Dataset class.
    torch_train = diagnoses(train_set, transform)
    torch_validation = diagnoses(validation_set, transform)

    train_loader = torch.utils.data.DataLoader(
        torch_train, batch_size=batch_size, num_workers=num_workers, shuffle=True)
    val_loader = torch.utils.data.DataLoader(
        torch_validation, batch_size=batch_size, num_workers=num_workers, shuffle=False)

    return train_loader, val_loader

In [32]:
# Build the training and validation loaders with a batch size of 64, using
# the file-name -> diagnosis mapping stored in `matches`.
train_loader, val_loader = get_data_loader(64, matches, base_path, folders)