In [1]:
try:
    import h5py
    from numba import jit, cuda 
    import pandas as pd
    import numpy as np
    import cv2
    import timeit

    %store -r IMAGE_SIZE
    import glob
    import os

    # for reading and displaying images
    from skimage.io import imread
    import matplotlib.pyplot as plt
    %matplotlib inline

    # for creating validation set
    from sklearn.model_selection import train_test_split
    from skimage.color import rgb2gray, gray2rgb

    # for evaluating the model
    from sklearn.metrics import accuracy_score
    from tqdm import tqdm

    # PyTorch libraries and modules
    import torch
    from torch.autograd import Variable
    from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
    from torch.optim import Adam, SGD

    import torchvision
except Exception as error:
    !pip install {str(error)[17:-1]}
except:
    print("Pacote n√£o encontrado")

In [2]:
lesions = {0: "Basal Cell Carcinoma", 
    1: "Lentigo",
    2: "Malignant Melanoma", 
    3: "Melanocytic naevus",
    4: "seborrhoeic keratosis",
    5: "Wart", 
    6: "Actinic Keratosis",
    7: "Squamous Cell Carcinoma",
    8: "Intraepithelial Carcinoma", 
    9: "Pyogenic Granuloma",
    10: "Haemangioma",
    11: "Dermatofibroma"
}

In [3]:
train = pd.read_csv('train.csv',index_col=0)
test = pd.read_csv('test.csv',index_col=0)
val = pd.read_csv('val.csv',index_col=0)

train_y = train['label'].values
test_y = test['label'].values
val_y = val['label'].values

train.head()

Unnamed: 0,id,label
0,Wart_original_100.jpg_b1cf4712-7089-44ae-8b03-...,5
1,Wart_original_37.jpg_4b8537e9-0e10-4865-a920-a...,5
2,Wart_original_2.jpg_4df8344c-e516-4ed5-a5b5-dc...,5
3,Wart_original_21.jpg_3c281b36-dfec-46eb-abb8-1...,5
4,Wart_original_54.jpg_294c8aac-a465-4a29-9030-7...,5


In [17]:

df_train = pd.DataFrame(train)
print(df_train['label'].values.tolist()[50000])
df_test = pd.DataFrame(test)
df_val = pd.DataFrame(val)

def img_array(dataframe, img_type):
    train_img = []
    for index, row in tqdm(dataframe.iterrows()):
        # defining the image path
        try:
            if int(row['id']) or row['id'] == str(0):
                image_path = 'dataset-split/' + img_type + '/' + lesions[row['label']] + "/" + str(row['id']) + '.jpg'
        except:
            image_path = 'dataset-split/' + img_type + '/' + lesions[row['label']] + "/resnet_augmented/" + str(row['id']) + '.jpg'
        # reading the image
        img = cv2.imread(image_path, cv2.IMREAD_COLOR)

        # normalize
        # img = cv2.normalize(img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)

        # img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation = cv2.INTER_CUBIC)

        # appending the image into the list
        train_img.append(img)

    return train_img

2


In [32]:
save_path = './train.hdf5'
print('image size: %d bytes'%os.path.getsize('dataset-split/train'))
hf = h5py.File(save_path, 'a') # open a hdf5 file

binary_data_np = np.asarray(img_array(df_train, 'train'))
dset = hf.create_dataset('train', data=binary_data_np)  # write the data to hdf5 file
hf.close()  # close the hdf5 file
print('hdf5 file size: %d bytes'%os.path.getsize(save_path))

209it [00:00, 2087.91it/s]image size: 4096 bytes
1794it [00:00, 2322.42it/s]
uint8
hdf5 file size: 270049280 bytes


In [38]:
save_path = './test.hdf5'
print('image size: %d bytes'%os.path.getsize('dataset-split/test'))
hf = h5py.File(save_path, 'a') # open a hdf5 file

oi = img_array(df_test, 'test')

binary_data_np = np.asarray(oi)
dset = hf.create_dataset('test', data=binary_data_np)  # write the data to hdf5 file
hf.close()  # close the hdf5 file
print('hdf5 file size: %d bytes'%os.path.getsize(save_path))

235it [00:00, 2247.64it/s]image size: 4096 bytes
hdf5 file size: 35376128 bytes



In [24]:
with h5py.File("./train.hdf5", "r") as hf:
    x_train = hf['train'][:]
with h5py.File("./test.hdf5", "r") as hf:
    x_val = hf['test'][:]