In [11]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

In [12]:
labels2019 = pd.read_csv('../isic2019/labels/official/ISIC_2019_Training_GroundTruth.csv')
labels2020 = pd.read_csv('../labels2020.csv')

In [13]:
labels2019.head()

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
bin_labels2019 = labels2019[['image', 'MEL']]
bin_labels2019

Unnamed: 0,image,MEL
0,ISIC_0000000,0.0
1,ISIC_0000001,0.0
2,ISIC_0000002,1.0
3,ISIC_0000003,0.0
4,ISIC_0000004,1.0
...,...,...
25274,ISIC_0073247,0.0
25275,ISIC_0073248,0.0
25276,ISIC_0073249,1.0
25277,ISIC_0073251,0.0


In [5]:
bin_labels2019 = bin_labels2019.rename(columns={'MEL': 'target', 'image': 'image_name'})
bin_labels2019['target'] = bin_labels2019['target'].astype(int)
bin_labels2019

Unnamed: 0,image_name,target
0,ISIC_0000000,0
1,ISIC_0000001,0
2,ISIC_0000002,1
3,ISIC_0000003,0
4,ISIC_0000004,1
...,...,...
25274,ISIC_0073247,0
25275,ISIC_0073248,0
25276,ISIC_0073249,1
25277,ISIC_0073251,0


In [6]:
bin_labels2019.loc[(labels2019['NV'] == 1.) | (labels2019['BKL'] == 1.), 'target'] = 0
bin_labels2019.loc[(labels2019['NV'] != 1.) & (labels2019['BKL'] != 1.), 'target'] = 1
bin_labels2019['target'].sum()

9808

In [7]:
bin_labels2019

Unnamed: 0,image_name,target
0,ISIC_0000000,0
1,ISIC_0000001,0
2,ISIC_0000002,1
3,ISIC_0000003,0
4,ISIC_0000004,1
...,...,...
25274,ISIC_0073247,1
25275,ISIC_0073248,0
25276,ISIC_0073249,1
25277,ISIC_0073251,0


In [8]:
bin_labels2020 = labels2020[['image_name', 'target']]
bin_labels2020

Unnamed: 0,image_name,target
0,ISIC_2637011,0
1,ISIC_0015719,0
2,ISIC_0052212,0
3,ISIC_0068279,0
4,ISIC_0074268,0
...,...,...
32248,ISIC_9999134,0
32249,ISIC_9999320,0
32250,ISIC_9999515,0
32251,ISIC_9999666,0


In [9]:
bin_labels2019.to_csv('cleaned_labels/binary_labels2019.csv', index=False)
bin_labels2020.to_csv('cleaned_labels/binary_labels2020.csv', index=False)

OSError: Cannot save file into a non-existent directory: 'cleaned_labels'

In [14]:
img_names2019 = labels2019.image.values
path_dict2019 = {img_name:'../isic2019/images/official/' for img_name in img_names2019}

img_names2020 = labels2020.image_name.values
path_dict2020 = {img_name:'../train/' for img_name in img_names2020}

In [15]:
from cleanvision.dataset.base_dataset import Dataset
from cleanvision.dataset.torch_dataset import TorchDataset
import torch

class MergedDataset(torch.utils.data.Dataset):
    def __init__(self, ISIC2019_labels_path, ISIC2020_labels_path):
        self.labels2019 = pd.read_csv(ISIC2019_labels_path)
        self.labels2020 = pd.read_csv(ISIC2020_labels_path)

        img_names2019 = self.labels2019.image_name.values
        self.path_dict2019 = {img_name:'../isic2019/images/official/' for img_name in img_names2019}

        img_names2020 = self.labels2020.image_name.values
        self.path_dict2020 = {img_name:'../train/' for img_name in img_names2020}

    def __getitem__(self, index):
        if index < 25279: # the last ISIC2019 image index
            labels = self.labels2019
            path_dict = self.path_dict2019
            dataset = 'ISIC2019'
        else:
            index -= 25279
            labels = self.labels2020
            path_dict = self.path_dict2020
            dataset = 'ISIC2020'

        img_name = labels['image_name'].values[index]
        img_path = path_dict[img_name] + img_name + ".jpg"
        img = Image.open(img_path).copy()
        return img, labels.target[index], dataset

    def __len__(self):
        return self.labels2019.shape[0] + self.labels2020.shape[0]

In [16]:
dataset = MergedDataset('cleaned_labels/binary_labels2019.csv',
                        'cleaned_labels/binary_labels2020.csv')

In [17]:
len(dataset)

57532

In [23]:
mel = bin_labels2019.target.sum() + bin_labels2020.target.sum()

print('Benign: ', len(dataset) - mel)
print('Malignant: ', mel)

Benign:  47146
Malignant:  10386


## CLEANVISION CLEANING

In [None]:
from cleanvision import Imagelab


imagelab = Imagelab(torchvision_dataset=dataset)

imagelab.find_issues()

imagelab.report()

Checking for dark, light, odd_aspect_ratio, low_information, exact_duplicates, near_duplicates, blurry, grayscale, odd_size images ...


  0%|          | 0/57532 [00:00<?, ?it/s]

In [None]:
imagelab.issues.to_csv('cleanvision_results/merged_issues.csv', index=False)

In [None]:
duplicates = imagelab.info['exact_duplicates']['sets']
duplicates = pd.DataFrame(duplicates)

near_duplicates = imagelab.info['near_duplicates']['sets']
near_duplicates = pd.DataFrame(near_duplicates)

duplicates.to_csv('cleanvision_results/merged_duplicates.csv', index=False)
near_duplicates.to_csv('cleanvision_results/merged_near_duplicates.csv', index=False)

In [14]:
results = pd.read_csv('cleanvision_results/merged_issues.csv')
near_dups = pd.read_csv('cleanvision_results/merged_duplicates.csv')

EmptyDataError: No columns to parse from file

In [15]:
check_cols = ['is_odd_size_issue', 'is_odd_aspect_ratio_issue', 'is_low_information_issue', 'is_light_issue', 
              'is_grayscale_issue', 'is_dark_issue', 'is_blurry_issue', 'is_exact_duplicates_issue', 'is_near_duplicates_issue']

for col in check_cols:
    print(col, ': ', results[col].sum())

is_odd_size_issue :  0
is_odd_aspect_ratio_issue :  0
is_low_information_issue :  0
is_light_issue :  0
is_grayscale_issue :  0
is_dark_issue :  0
is_blurry_issue :  2566
is_exact_duplicates_issue :  0
is_near_duplicates_issue :  190


In [16]:
results.head(10)

Unnamed: 0,odd_size_score,is_odd_size_issue,odd_aspect_ratio_score,is_odd_aspect_ratio_issue,low_information_score,is_low_information_issue,light_score,is_light_issue,grayscale_score,is_grayscale_issue,dark_score,is_dark_issue,blurry_score,is_blurry_issue,exact_duplicates_score,is_exact_duplicates_issue,near_duplicates_score,is_near_duplicates_issue
0,0.864616,False,0.750489,False,0.839741,False,0.840544,False,1,False,0.864893,False,0.415308,False,1.0,False,1.0,False
1,0.864616,False,0.750489,False,0.695747,False,0.698366,False,1,False,0.716847,False,0.681465,False,1.0,False,1.0,False
2,0.864616,False,0.750489,False,0.843054,False,0.714519,False,1,False,0.7802,False,0.328465,False,1.0,False,1.0,False
3,0.864616,False,0.750489,False,0.811329,False,0.740138,False,1,False,0.900491,False,0.657485,False,1.0,False,1.0,False
4,0.864616,False,0.750489,False,0.735952,False,0.996078,False,1,False,0.931215,False,0.684219,False,1.0,False,1.0,False
5,0.864616,False,0.750489,False,0.785876,False,0.706048,False,1,False,0.698453,False,0.528653,False,1.0,False,1.0,False
6,0.864616,False,0.750489,False,0.788204,False,0.740844,False,1,False,0.720045,False,0.533442,False,1.0,False,1.0,False
7,0.864616,False,0.750489,False,0.823424,False,0.701208,False,1,False,0.857467,False,0.606019,False,1.0,False,1.0,False
8,0.864616,False,0.750489,False,0.825632,False,0.67893,False,1,False,0.766406,False,0.342328,False,1.0,False,1.0,False
9,0.864616,False,0.750489,False,0.71506,False,0.582975,False,1,False,0.673603,False,0.622513,False,1.0,False,1.0,False


In [17]:
results.odd_aspect_ratio_score.describe()

count    57532.000000
mean         0.750928
std          0.143700
min          0.561333
25%          0.666667
50%          0.750000
75%          0.750000
max          1.000000
Name: odd_aspect_ratio_score, dtype: float64