In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '1'



from sklearn.model_selection import train_test_split
import numpy as np

from PIL import Image

import torch
from torchvision.transforms import ToTensor, Resize
import ray


## Serialize images to make dataloader faster

When Image sizes are large. Dataloaders spend a lot of time loading them.
Meaning training loop.

One way to make this faster is to save images as resized torch tensors.
Then load from these serialized objects

In [2]:
img_folder = 'data/jpeg/train'
tensor_folder = 'data/tensor'

In [24]:
import multiprocessing

In [32]:
def create_paths(img_folder, tensor_folder):
    
    '''
    img_folder = folder in which images are stored
    tensor_folder = where to save the tensors
    resize_shape = resize shape of tensors
    num_cpus = num_cpus to use in parallel with ray
    
    '''

    if not os.path.exists(tensor_folder):
        os.makedirs(tensor_folder)

    image_fns = os.listdir(img_folder)
    image_paths = [os.path.join(img_folder, elem) for elem in image_fns]


    

    file_ext = image_fns[0].split('.')[-1]
    print(f'file_ext identified  = {file_ext}')

    dest_fns = [x.split('.')[0]+'.pt' for x in image_fns]
    dest_paths = [os.path.join(tensor_folder, elem) for elem in dest_fns]
    
    return image_paths, dest_paths



In [35]:
def img2tensor(a,b, resize_shape = (512,512)):
    to_tensor = ToTensor()
    resize_func = Resize(size = resize_shape)
    image = Image.open(a)
    torch.save(to_tensor(resize_func(image))
                                        , b)
    image.close()

In [36]:
a,b = serialize_images(img_folder, tensor_folder)

file_ext identified  = jpg


In [None]:
with multiprocessing.Pool(processes=10) as pool:
    results = pool.starmap(img2tensor, zip(image_paths, dest_paths))

In [117]:
from fastai2.vision.all import *

from fastai2.metrics import RocAuc
from fastai2.vision.all import *

## Custom datasets into fastai2 dataloaders

In [118]:
train_df = pd.read_csv('data/train.csv')

In [135]:
X_train, X_val, y_train, y_val = train_test_split(train_df.index,train_df.target.values, test_size=0.33)

In [139]:
set(X_train).intersection(set(X_val))

set()

In [140]:
train_df['use'] = ''

In [141]:
train_df.loc[X_train, 'use'] = 'train'
train_df.loc[X_val, 'use'] = 'val'

In [142]:
train_df.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,use
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,train
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,train
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,val
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,val
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,train


In [143]:
train_df.use.value_counts()

train    22194
val      10932
Name: use, dtype: int64

In [148]:
train_df[train_df.use=='train'].target.value_counts(normalize=True)

0    0.982878
1    0.017122
Name: target, dtype: float64

In [147]:
train_df[train_df.use=='val'].target.value_counts(normalize=True)

0    0.981339
1    0.018661
Name: target, dtype: float64

In [152]:
r = Resize(224,224)

In [161]:
class MelanomaDataset(torch.utils.data.Dataset):
    
    def __init__(self, use, tfms):
        # use = 'train' or val
        # tfms = transformations
        self.use = use
        self.df = train_df[train_df.use==use]
        self.df.reset_index(inplace=True)
        self.tfms = tfms
    
    def __len__(self):
        return len(self.df)
        
    def __getitem__(self, idx):
        img_fn = self.df.iloc[idx]['image_name'].split('.')[0] + '.pt'
        img = torch.load(f'data/tensor/{img_fn}')
        if self.tfms:  # transformation
            img = self.tfms(img)
        if self.use=='train':
            target = np.array([self.df.iloc[idx]['target']])
            target = torch.tensor(target, dtype=torch.float32)
            return {'image':img, 'class' : target}

        elif self.use=='val':
            return {'image':img}

In [162]:
train_dataset = MelanomaDataset(use='train', tfms = None)

In [168]:
ls data/jpeg/train | wc -l 

33126


In [166]:
train_dataset[10]

FileNotFoundError: [Errno 2] No such file or directory: 'data/tensor/ISIC_0079038.pt'