In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

In [2]:
# Read the ISIC 2019 ground-truth table and the ISIC 2020 label table,
# in that order, straight into their two notebook-level names.
labels2019, labels2020 = map(
    pd.read_csv,
    (
        '../isic2019/labels/official/ISIC_2019_Training_GroundTruth.csv',
        '../labels2020.csv',
    ),
)

In [5]:
# Preview the first five rows of the 2019 ground-truth table.
labels2019.head(n=5)

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Reduce the 2019 table to the image identifier and the `MEL` column.
columns_2019 = ['image', 'MEL']
bin_labels2019 = labels2019[columns_2019]
bin_labels2019

Unnamed: 0,image,MEL
0,ISIC_0000000,0.0
1,ISIC_0000001,0.0
2,ISIC_0000002,1.0
3,ISIC_0000003,0.0
4,ISIC_0000004,1.0
...,...,...
25274,ISIC_0073247,0.0
25275,ISIC_0073248,0.0
25276,ISIC_0073249,1.0
25277,ISIC_0073251,0.0


In [34]:
# Use the same column names as the 2020 table (`image_name`, `target`)
# and store the label as an integer instead of a float.
bin_labels2019 = (
    bin_labels2019
    .rename(columns={'image': 'image_name', 'MEL': 'target'})
    .astype({'target': int})
)
bin_labels2019

Unnamed: 0,image_name,target
0,ISIC_0000000,0
1,ISIC_0000001,0
2,ISIC_0000002,1
3,ISIC_0000003,0
4,ISIC_0000004,1
...,...,...
25274,ISIC_0073247,0
25275,ISIC_0073248,0
25276,ISIC_0073249,1
25277,ISIC_0073251,0


In [35]:
# Overwrite the MEL-derived target: rows flagged NV or BKL in the original
# 2019 table become 0, every remaining row becomes 1.
# The mask is built from `labels2019` and applied to `bin_labels2019`, so it
# relies on both frames sharing the same row index.
nv_or_bkl = labels2019['NV'].eq(1.) | labels2019['BKL'].eq(1.)
bin_labels2019.loc[nv_or_bkl, 'target'] = 0
bin_labels2019.loc[~nv_or_bkl, 'target'] = 1

# Number of rows now labelled 1.
bin_labels2019['target'].sum()

9808

In [50]:
# Show the 2019 table again to inspect the targets after the NV/BKL-based
# relabelling (compare with the display produced right after the rename).
bin_labels2019

Unnamed: 0,image_name,target
0,ISIC_0000000,0
1,ISIC_0000001,0
2,ISIC_0000002,1
3,ISIC_0000003,0
4,ISIC_0000004,1
...,...,...
25274,ISIC_0073247,1
25275,ISIC_0073248,0
25276,ISIC_0073249,1
25277,ISIC_0073251,0


In [37]:
# The 2020 table already carries `image_name` and `target`; keep only those.
columns_2020 = ['image_name', 'target']
bin_labels2020 = labels2020[columns_2020]
bin_labels2020

Unnamed: 0,image_name,target
0,ISIC_2637011,0
1,ISIC_0015719,0
2,ISIC_0052212,0
3,ISIC_0068279,0
4,ISIC_0074268,0
...,...,...
32248,ISIC_9999134,0
32249,ISIC_9999320,0
32250,ISIC_9999515,0
32251,ISIC_9999666,0


In [51]:
# Persist both binary label tables without the row index.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs('cleaned_labels', exist_ok=True)
bin_labels2019.to_csv('cleaned_labels/binary_labels2019.csv', index=False)
bin_labels2020.to_csv('cleaned_labels/binary_labels2020.csv', index=False)

In [38]:
# Map every image name to the directory holding its file. All images of one
# release share a single directory, so each dict has one repeated value.
img_names2019 = labels2019['image'].values
path_dict2019 = dict.fromkeys(img_names2019, '../isic2019/images/official/')

img_names2020 = labels2020['image_name'].values
path_dict2020 = dict.fromkeys(img_names2020, '../train/')

In [78]:
from cleanvision.dataset.base_dataset import Dataset
from cleanvision.dataset.torch_dataset import TorchDataset
import torch

class MergedDataset(torch.utils.data.Dataset):
    """ISIC 2019 and ISIC 2020 images exposed as one index-addressable dataset.

    Indices ``0 .. len(labels2019) - 1`` address the rows of the 2019 label
    file; the indices after that address the rows of the 2020 label file.
    Both label files must provide ``image_name`` and ``target`` columns.
    """

    def __init__(self, ISIC2019_labels_path, ISIC2020_labels_path):
        """Read both label CSVs and record the image directory of every image.

        Args:
            ISIC2019_labels_path: path to the CSV with the 2019 labels.
            ISIC2020_labels_path: path to the CSV with the 2020 labels.
        """
        self.labels2019 = pd.read_csv(ISIC2019_labels_path)
        self.labels2020 = pd.read_csv(ISIC2020_labels_path)

        # image name -> directory containing "<image name>.jpg"
        img_names2019 = self.labels2019.image_name.values
        self.path_dict2019 = {img_name:'../isic2019/images/official/' for img_name in img_names2019}

        img_names2020 = self.labels2020.image_name.values
        self.path_dict2020 = {img_name:'../train/' for img_name in img_names2020}

    def __getitem__(self, index):
        """Return ``(image, target, dataset_name)`` for a position in the merged set.

        Args:
            index: integer position; negative values count from the end.

        Returns:
            A tuple of the loaded PIL image, the value of the ``target``
            column for that row, and ``'ISIC2019'`` or ``'ISIC2020'``.

        Raises:
            IndexError: if ``index`` lies outside the merged dataset.
        """
        total = len(self)
        if index < 0:
            index += total
        if not 0 <= index < total:
            raise IndexError(f"index {index} is out of range for a dataset of size {total}")

        # The boundary between the two sources is the number of 2019 rows,
        # taken from the loaded table rather than a hard-coded count.
        n_2019 = self.labels2019.shape[0]
        if index < n_2019:
            labels = self.labels2019
            path_dict = self.path_dict2019
            dataset = 'ISIC2019'
        else:
            index -= n_2019
            labels = self.labels2020
            path_dict = self.path_dict2020
            dataset = 'ISIC2020'

        img_name = labels['image_name'].values[index]
        img_path = path_dict[img_name] + img_name + ".jpg"
        # Copy the pixel data out, then let the context manager close the file.
        with Image.open(img_path) as img_file:
            img = img_file.copy()
        # Positional lookup so the label does not depend on the frame's index labels.
        return img, labels['target'].iloc[index], dataset

    def __len__(self):
        """Return the combined number of rows of both label tables."""
        return self.labels2019.shape[0] + self.labels2020.shape[0]

In [79]:
# Build the merged dataset from the two cleaned label files written earlier.
dataset = MergedDataset(
    ISIC2019_labels_path='cleaned_labels/binary_labels2019.csv',
    ISIC2020_labels_path='cleaned_labels/binary_labels2020.csv',
)

In [80]:
# Total number of samples: rows of the 2019 label file plus rows of the 2020 one.
len(dataset)

57532

## CLEANVISION CLEANING

In [83]:
from cleanvision import Imagelab


# Hand the merged dataset to CleanVision through its torchvision-dataset entry point.
imagelab = Imagelab(torchvision_dataset=dataset)

# Scan all 57532 images; every image is opened from disk.
# NOTE(review): the saved output below shows this run was interrupted at 0%,
# so `imagelab` held no results for the cells that follow in this saved state.
imagelab.find_issues()

# Print the summary of the issues found.
imagelab.report()

Checking for dark, light, odd_aspect_ratio, low_information, exact_duplicates, near_duplicates, blurry, grayscale, odd_size images ...


  0%|          | 0/57532 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Save the per-image issue table produced by `find_issues()`.
# to_csv does not create missing parent directories, so create the output
# folder first (no-op when it already exists).
os.makedirs('cleanvision_results', exist_ok=True)
imagelab.issues.to_csv('cleanvision_results/merged_issues.csv', index=False)

In [None]:
# Pull the exact- and near-duplicate sets out of the Imagelab results and
# turn each collection of sets into a DataFrame.
duplicates = imagelab.info['exact_duplicates']['sets']
duplicates = pd.DataFrame(duplicates)

near_duplicates = imagelab.info['near_duplicates']['sets']
near_duplicates = pd.DataFrame(near_duplicates)

# to_csv does not create missing parent directories, so create the output
# folder first (no-op when it already exists).
os.makedirs('cleanvision_results', exist_ok=True)
duplicates.to_csv('cleanvision_results/merged_duplicates.csv', index=False)
near_duplicates.to_csv('cleanvision_results/merged_near_duplicates.csv', index=False)