In [10]:
# this notebook is to utilised transfer learning to train a model on a dataset to classify medical image data on breast cancer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import pandas as pd
import torchvision
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import matplotlib.pyplot as plt
import os
import skimage.io as io
import time
import copy

In [30]:
# get root directory
root_dir = '/media/victoru/B612CEC512CE8A37/ai50/pytorch_test/data/breatcancer_data/archive'
id_dir = os.listdir(root_dir)
label_dir = os.listdir(os.path.join(root_dir,id_dir[0]))

# create transform for data augmentation: normalize, and convert to tensor
mean = np.array([0.5, 0.5, 0.5])
std = np.array([0.5, 0.5, 0.5])
data_transforms = {
    label_dir[0]: transforms.Compose([
        transforms.RandomHorizontalFlip(), #add randomness to data augmentation
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ]),
    label_dir[1]: transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ]),
}
# define hyperparameters
batch_size = 100
num_epochs = 25
random_seed = 42
shuffle_dataset = True
validation_split = 0.2

#show image
def imshow(inp, title):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    plt.title(title)
    plt.show()

In [20]:

root_dir = '/media/victoru/B612CEC512CE8A37/ai50/pytorch_test/data/breatcancer_data/archive'
id_dir = os.listdir(root_dir)
label_dir = os.listdir(os.path.join(root_dir,id_dir[0]))
print(id_dir)
print(label_dir)
for id in id_dir:
    for label in label_dir:

        print(type(id))
        print(type(label))
        print(os.path.join(root_dir,id,label))

        break
    break

['10253', '10254', '10255', '10256', '10257', '10258', '10259', '10260', '10261', '10262', '10264', '10268', '10269', '10272', '10273', '10274', '10275', '10276', '10277', '10278', '10282', '10285', '10286', '10288', '10290', '10291', '10292', '10293', '10295', '10299', '10300', '10301', '10302', '10303', '10304', '10305', '10306', '10307', '10308', '12241', '12626', '12748', '12749', '12750', '12751', '12752', '12810', '12811', '12817', '12818', '12819', '12820', '12821', '12822', '12823', '12824', '12826', '12867', '12868', '12869', '12871', '12872', '12873', '12875', '12876', '12877', '12878', '12879', '12880', '12881', '12882', '12883', '12884', '12886', '12890', '12891', '12892', '12893', '12894', '12895', '12897', '12898', '12900', '12901', '12905', '12906', '12907', '12908', '12909', '12910', '12911', '12929', '12930', '12931', '12932', '12933', '12934', '12935', '12947', '12948', '10279', '12242', '12870', '12896', '12949', '13461', '14155', '15512', '16550', '8951', '9077', '9

In [21]:
#create dataset class for breast cancer data

class breastcancerDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        # get root directory
        self.root_dir = root_dir
        # get transform
        self.transform = transform
        # get list of label from directory under root_dir
        id_dir = os.listdir(root_dir)
        label_dir = os.listdir(os.path.join(root_dir,id_dir[0]))
        self.data = pd.DataFrame(columns=['id','label','image_name'])
        
        # loop through id in id_dir and label in label_dir to get image name and label
        for id in id_dir:
            for label in label_dir:
                tempdata = pd.DataFrame(columns=['id','label','image_name'])
                tempdata['image_name'] = os.listdir(os.path.join(root_dir, id, label))
                tempdata['id'] = id
                tempdata['label'] = label
                self.data = pd.concat([self.data, tempdata],ignore_index = True, axis = 0)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # get image name and label from dataframe
        id = self.data.iloc[idx,0]
        img_name = os.path.join(self.root_dir,self.data.iloc[idx, 0],self.data.iloc[idx, 1],self.data.iloc[idx, 2])
        image = io.imread(img_name)
        label = torch.tensor(int(self.data.iloc[idx, 1]))
                
        if self.transform:
            image = self.transform(image)
            
        return id,image,label

In [31]:
# create dataset object
dataset = breastcancerDataset(root_dir)

# create data indices for training and validation splits
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# create samplers for training and validation splits
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

# create data loaders
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
valid_loader = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)

In [34]:
# check random img from trainset
dataiter = iter(train_loader)
print(len(dataiter))
print(dataset_size)
print(len(train_indices)/len(dataiter))
print(len(train_indices))
#images, labels = dataiter.next()
#print(labels)

#show images
#imshow(torchvision.utils.make_grid(images))

2221
277524
99.963980189104
222020
