# Dataset Preprocessing

In [2]:
from pandas import read_csv

from sklearn.model_selection import train_test_split

from torchvision import transforms
from torch.utils.data import DataLoader

from src.data.dataset import SkinDataset
from src.utils.config import Config
from src.utils.utils import merge_metadata_label, get_dataset_mean_std

In [3]:
# Resolve the file-system locations for this notebook through the project
# configuration. Keys are looked up in the same order as they are listed.

# Raw inputs: images, ground-truth CSV and metadata CSV.
IMAGES_PATH, GROUND_TRUTH_PATH, METADATA_PATH = (
    Config.get_path(key)
    for key in ('images', 'ground_truth_csv', 'metadata_csv')
)

# Intermediate artifacts: merged CSV and cleaned CSV.
MERGED_DATASET_PATH, CLEAN_DATASET_PATH = (
    Config.get_path(key)
    for key in ('merged_dataset_csv', 'clean_dataset_csv')
)

# Final artifacts: training CSV and test CSV.
TRAINING_PATH, TEST_PATH = (
    Config.get_path(key)
    for key in ('training_csv', 'test_csv')
)

For each metadata sample, I add a ``label`` field, taken from the corresponding row of the ground truth CSV. <br/>
The ``lesion_id`` attribute is dropped from the metadata, since it is not used afterwards. <br/>

In [4]:
# Read the inputs from the locations resolved through Config in the
# configuration cell, rather than from machine-specific absolute paths, so the
# notebook runs on any checkout of the project.
# Only the metadata columns listed in `usecols` are loaded.
metadata = read_csv(METADATA_PATH, usecols=['image', 'age_approx', 'anatom_site_general', 'sex'])
diagnoses = read_csv(GROUND_TRUTH_PATH)

# Combine metadata and diagnoses into a single frame and persist it
# (written without the index column).
dataset = merge_metadata_label(metadata, diagnoses)
dataset.to_csv(MERGED_DATASET_PATH, encoding='utf-8', index=False)

print('Size of the Initial Dataset:', len(dataset))

Size of the Initial Dataset: 25331


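Then, I remove every row that has a missing value in ``age_approx``, ``anatom_site_general`` or ``sex``.<br/>
Samples belonging to the classes we are not interested in (``BKL``, ``AK`` and ``VASC``) are also removed.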

In [5]:
# Labels that are excluded from the dataset.
EXCLUDED_LABELS = ['BKL', 'AK', 'VASC']

# Drop rows with a missing value in any of the metadata columns. A new frame
# is produced instead of mutating in place, so the cell does not depend on
# how many times it has been run.
dataset = dataset.dropna(subset=['age_approx', 'anatom_site_general', 'sex'])

# Remove every sample of an excluded class with a single boolean mask
# (replaces one in-place `drop` per label).
dataset = dataset[~dataset['label'].isin(EXCLUDED_LABELS)]

# The index is written to the CSV below; name its column explicitly.
dataset.index.name = 'index'

# Persist to the location resolved through Config rather than to a
# machine-specific absolute path.
dataset.to_csv(CLEAN_DATASET_PATH, encoding='utf-8')

print('Size of the Clean Dataset:', len(dataset))

Size of the Clean Dataset: 19080


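In order to train the classifier, it is necessary to normalize the images.<br/>
After normalization they must have zero mean and unit standard deviation, so the per-channel mean and standard deviation of the dataset are computed here.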

In [7]:
# Build the image dataset from the cleaned CSV, using the locations resolved
# through Config instead of machine-specific absolute paths.
# ToTensor is the only transform, so the statistics below are computed on
# images that have not been normalized yet.
# NOTE(review): the previous literal image directory ended with a trailing
# '/'; confirm that SkinDataset joins directory and file name rather than
# concatenating strings.
dset = SkinDataset(IMAGES_PATH, CLEAN_DATASET_PATH, transforms.ToTensor())

# Default DataLoader settings (one sample per batch, no shuffling).
dataset_loader = DataLoader(dset, pin_memory=True)

# NOTE(review): the statistics are computed over the whole clean dataset,
# i.e. before the train/test split performed later in the notebook.
mean, std_dev = get_dataset_mean_std(dataset_loader)

print('Dataset images mean:', mean)
print('Dataset images standard deviation:', std_dev)

Dataset images mean: [0.5949144093096469, 0.5940122161980812, 0.5965893571793713]
Dataset images standard deviation: [0.0806834955201987, 0.08093546973609639, 0.0811099191136179]


Now I split the dataset into a **Training Set** [85%] and a **Test Set** [15%]. <br/>
The rows are shuffled at random before being split.

In [9]:
# Fixed seed so that the shuffle, and therefore the CSV files written below,
# are identical on every run of the notebook.
SPLIT_SEED = 42

# 85% / 15% random split of the clean dataset.
tr, te = train_test_split(dataset, train_size=0.85, shuffle=True, random_state=SPLIT_SEED)

# Persist both subsets to the locations resolved through Config rather than
# to machine-specific absolute paths. The index is kept in the output.
tr.to_csv(TRAINING_PATH, encoding='utf-8')
te.to_csv(TEST_PATH, encoding='utf-8')

print('Training Set Size:', len(tr))
print('Test Set Size:', len(te))

Training Set Size: 16218
Test Set Size: 2862
