In [1]:
from constants import *

In [2]:
import matplotlib.pyplot as plt
import glob
import pandas as pd
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Display the value of every top-level expression statement in a cell,
# not only the last one (IPython's default setting is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 0. All shapes

In [5]:
# Collect the distinct array shapes of all .jpg files in the training input folder.
images_path = glob.glob('../ISIC-2019/ISIC_2019_Training_Input/*.jpg')
shapes = {plt.imread(img_path).shape for img_path in images_path}

In [6]:
# Number of distinct image shapes found in the training input folder
len(shapes)

101

In [8]:
# Show only the shapes whose first two dimensions are equal (square images).
for shape in shapes:
    rows, cols = shape[0], shape[1]
    if rows == cols:
        print(shape)

(1024, 1024, 3)


## 1. Check shapes by all classes

In [4]:
# Per-image metadata; supplies the lesion_id column used below.
df_2019 = pd.read_csv('../ISIC-2019/ISIC_2019_Training_Metadata.csv')

# Ground-truth labels indexed by image name, without the 'UNK' column, cast to int.
df_2019_target = (
    pd.read_csv('../ISIC-2019/ISIC_2019_Training_GroundTruth.csv', index_col='image')
    .drop(columns='UNK')
    .astype(int)
)

# Left join: every labelled image is kept and gets its lesion_id where the metadata has one.
df = df_2019_target.merge(df_2019[['image', 'lesion_id']], on='image', how='left')

In [11]:
import random
def train_test_split_on_column(df, column_to_split, test_rate=0.15, val_rate=0.15, random_seed=40):
    """Split ``df`` into train / test / validation frames by groups of a column.

    All rows sharing the same value in ``column_to_split`` are placed in the
    same subset, so no group is spread over several subsets. Each row with a
    missing value in that column is treated as its own group and labelled
    ``'missing_<k>'`` in the returned frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column_to_split : str
        Name of the column whose unique values define the groups.
    test_rate, val_rate : float
        Fraction of the unique group values assigned to the test and
        validation subsets respectively (each size is rounded).
    random_seed : int
        Seed of the private random generator used for sampling.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test, df_val)``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when more groups are requested than exist.
    """
    # Work on a copy so filling the missing values never alters the caller's frame.
    df = df.copy()

    # A private generator yields the same draws as random.seed(random_seed)
    # followed by random.sample, without reseeding the global random module.
    rng = random.Random(random_seed)

    # Give every missing entry a unique label in one positional assignment;
    # this stays correct even when the index contains duplicate labels.
    missing_mask = df[column_to_split].isna()
    num_of_missing = int(missing_mask.sum())
    if num_of_missing:
        df.loc[missing_mask, column_to_split] = ['missing_' + str(x) for x in range(num_of_missing)]

    values = df[column_to_split].unique().tolist()

    test_size = round(test_rate * len(values))
    val_size = round(val_rate * len(values))

    # Draw the combined hold-out groups first, then carve the test groups out of them.
    test_val_values = rng.sample(values, k=test_size + val_size)
    test_values = rng.sample(test_val_values, k=test_size)
    test_lookup = set(test_values)  # constant-time membership instead of a list scan
    val_values = [x for x in test_val_values if x not in test_lookup]

    in_test = df[column_to_split].isin(test_values)
    in_val = df[column_to_split].isin(val_values)

    df_train = df[~(in_test | in_val)]
    df_test = df[in_test]
    df_val = df[in_val]

    return df_train, df_test, df_val

In [12]:
# Split on lesion_id so that all rows sharing a lesion_id land in the same subset
df_train, df_test, df_val = train_test_split_on_column(df, 'lesion_id')

In [14]:
# Training rows whose MEL label is not 1, i.e. the non-melanoma images
df_train_non_mel = df_train[df_train['MEL'] != 1]

In [16]:
# Count the non-melanoma training images whose array shape is (1024, 1024, 3).
counter = sum(
    1
    for img in df_train_non_mel.image
    if plt.imread('../ISIC-2019/ISIC_2019_Training_Input/' + img + '.jpg').shape == (1024, 1024, 3)
)
print(counter)

6788


In [5]:
# Map every class name in CLASSES_2019 to the list of shapes of its images.
class_2_shapes = {}
for disease_class in CLASSES_2019:
    # Images labelled 1 in the upper-cased column of this class
    class_images = df.loc[df[disease_class.upper()] == 1, 'image']
    class_2_shapes[disease_class] = [
        plt.imread(os.path.join('..', 'ISIC-2019', 'ISIC_2019_Training_Input', img + '.' + IMG_FORMAT)).shape
        for img in class_images
    ]

In [6]:
# Number of melanoma ('mel') images whose shape is (1024, 1024, 3)
class_2_shapes['mel'].count((1024, 1024, 3))

2858

In [7]:
# Number of images with shape (1024, 1024, 3) summed over every class except 'mel'
total = sum(
    class_shapes.count((1024, 1024, 3))
    for class_name, class_shapes in class_2_shapes.items()
    if class_name != 'mel'
)
total

9556

In [8]:
# Number of images summed over every class except 'mel', regardless of shape
total = sum(
    len(class_shapes)
    for class_name, class_shapes in class_2_shapes.items()
    if class_name != 'mel'
)
total

20809

In [7]:
# Number of non-melanoma images whose shape is one of
# (450, 600, 3), (1024, 1024, 3) or (680, 1024, 3)
counter = sum(
    1
    for class_name, class_shapes in class_2_shapes.items()
    if class_name != 'mel'
    for shape in class_shapes
    if shape in {(450, 600, 3), (1024, 1024, 3), (680, 1024, 3)}
)
print(counter)

19350


In [None]:
# For every class, list each distinct shape together with how many images have it.
for key, class_shapes in class_2_shapes.items():
    print(key)
    values, counts = np.unique(np.array(class_shapes), axis=0, return_counts=True)
    for shape, n_images in zip(values, counts):
        print(shape, ' -> ', n_images)
    print('-' * 50)

## 2. Check shapes by binary classes

In [3]:
# For every file name in the 'mel' training folder, read the same-named image
# from the ISIC 2019 training input folder and record its array shape.
# os.path.basename honours the platform's path separator, whereas
# split('/')[-1] returns the whole path when glob yields backslashes.
files = [
    os.path.basename(path)
    for path in glob.glob(os.path.join(TRAIN_PATH, 'mel', '*.' + IMG_FORMAT))
]
counter = len(files)  # number of files processed

shapes_mel = [
    plt.imread(os.path.join('..', 'ISIC-2019', 'ISIC_2019_Training_Input', img)).shape
    for img in files
]

In [4]:
# For every file name in the 'other' training folder, read the matching image
# from the ISIC 2019 training input folder and record its array shape.
# os.path.basename honours the platform's path separator, whereas
# split('/')[-1] returns the whole path when glob yields backslashes.
files = [
    os.path.basename(path)
    for path in glob.glob(os.path.join(TRAIN_PATH, 'other', '*.' + IMG_FORMAT))
]
counter = len(files)  # number of files processed

shapes_non_mel = []
for img in files:
    # Strip a leading "<digit>_" prefix to recover the original file name
    # (presumably added when the files were copied into 'other' - TODO confirm).
    if img[0].isnumeric() and img[1] == '_':
        img = img[2:]
    shapes_non_mel.append(plt.imread(os.path.join('..', 'ISIC-2019', 'ISIC_2019_Training_Input', img)).shape)

In [5]:
# Distinct shapes (rows) and how often each occurs, for the melanoma and non-melanoma lists
m_values, m_counts = np.unique(shapes_mel, axis=0, return_counts=True)
nm_values, nm_counts = np.unique(shapes_non_mel, axis=0, return_counts=True)

## 3. New pre-processing

In [8]:
# Melanoma: number of images per distinct shape
for shape, n_images in zip(m_values, m_counts):
    print(shape, ' -> ', n_images)

[450 600   3]  ->  761
[ 680 1024    3]  ->  154
[1024 1024    3]  ->  2067


In [9]:
# Melanoma: share of each shape among all melanoma images
total = sum(m_counts)
for shape, n_images in zip(m_values, m_counts):
    share = n_images / total
    print(shape, ' -> ', '{:.1%}'.format(share))

[450 600   3]  ->  25.5%
[ 680 1024    3]  ->  5.2%
[1024 1024    3]  ->  69.3%


In [11]:
# Sanity check: sum of the three non-melanoma shape counts printed in this section
4793 + 636 + 5222

10651

In [10]:
# Non-melanoma: number of images per distinct shape
for shape, n_images in zip(nm_values, nm_counts):
    print(shape, ' -> ', n_images)

[450 600   3]  ->  4793
[ 680 1024    3]  ->  636
[1024 1024    3]  ->  5222


In [11]:
# Non-melanoma: share of each shape among all non-melanoma images
total = sum(nm_counts)
for shape, n_images in zip(nm_values, nm_counts):
    share = n_images / total
    print(shape, ' -> ', '{:.1%}'.format(share))

[450 600   3]  ->  45.0%
[ 680 1024    3]  ->  6.0%
[1024 1024    3]  ->  49.0%


In [1]:
# Sanity check: the rounded non-melanoma percentages printed above add up to 100
45 + 6 + 49

100

## 4. Before the new pre-processing (old results)

In [6]:
# Melanoma: number of images per distinct shape
for shape, n_images in zip(m_values, m_counts):
    print(shape, ' -> ', n_images)

[450 600   3]  ->  761
[542 722   3]  ->  2
[545 722   3]  ->  1
[576 767   3]  ->  9
[602 639   3]  ->  1
[640 964   3]  ->  1
[649 965   3]  ->  1
[674 962   3]  ->  1
[ 674 1024    3]  ->  2
[ 677 1024    3]  ->  7
[ 678 1024    3]  ->  1
[ 679 1024    3]  ->  10
[ 680 1024    3]  ->  154
[ 681 1024    3]  ->  9
[ 682 1016    3]  ->  1
[ 682 1024    3]  ->  8
[ 683 1024    3]  ->  4
[ 684 1024    3]  ->  5
[ 685 1024    3]  ->  8
[ 686 1024    3]  ->  1
[ 687 1024    3]  ->  1
[ 688 1024    3]  ->  1
[ 689 1024    3]  ->  1
[719 824   3]  ->  3
[720 964   3]  ->  1
[724 960   3]  ->  1
[ 767 1022    3]  ->  2
[ 768 1024    3]  ->  133
[802 919   3]  ->  7
[ 861 1024    3]  ->  1
[ 878 1024    3]  ->  1
[1024  857    3]  ->  1
[1024 1024    3]  ->  2067


In [7]:
# Melanoma: share of each shape among all melanoma images
total = sum(m_counts)
for shape, n_images in zip(m_values, m_counts):
    share = n_images / total
    print(shape, ' -> ', '{:.1%}'.format(share))

[450 600   3]  ->  23.7%
[542 722   3]  ->  0.1%
[545 722   3]  ->  0.0%
[576 767   3]  ->  0.3%
[602 639   3]  ->  0.0%
[640 964   3]  ->  0.0%
[649 965   3]  ->  0.0%
[674 962   3]  ->  0.0%
[ 674 1024    3]  ->  0.1%
[ 677 1024    3]  ->  0.2%
[ 678 1024    3]  ->  0.0%
[ 679 1024    3]  ->  0.3%
[ 680 1024    3]  ->  4.8%
[ 681 1024    3]  ->  0.3%
[ 682 1016    3]  ->  0.0%
[ 682 1024    3]  ->  0.2%
[ 683 1024    3]  ->  0.1%
[ 684 1024    3]  ->  0.2%
[ 685 1024    3]  ->  0.2%
[ 686 1024    3]  ->  0.0%
[ 687 1024    3]  ->  0.0%
[ 688 1024    3]  ->  0.0%
[ 689 1024    3]  ->  0.0%
[719 824   3]  ->  0.1%
[720 964   3]  ->  0.0%
[724 960   3]  ->  0.0%
[ 767 1022    3]  ->  0.1%
[ 768 1024    3]  ->  4.1%
[802 919   3]  ->  0.2%
[ 861 1024    3]  ->  0.0%
[ 878 1024    3]  ->  0.0%
[1024  857    3]  ->  0.0%
[1024 1024    3]  ->  64.5%


In [8]:
# Non-melanoma: number of images per distinct shape
for shape, n_images in zip(nm_values, nm_counts):
    print(shape, ' -> ', n_images)

[450 600   3]  ->  4793
[542 718   3]  ->  2
[542 722   3]  ->  3
[542 725   3]  ->  1
[543 722   3]  ->  1
[576 767   3]  ->  33
[638 959   3]  ->  1
[639 959   3]  ->  1
[640 957   3]  ->  1
[641 962   3]  ->  1
[ 669 1024    3]  ->  1
[671 962   3]  ->  1
[ 671 1024    3]  ->  1
[672 964   3]  ->  1
[ 672 1024    3]  ->  1
[ 673 1024    3]  ->  3
[674 962   3]  ->  5
[ 674 1024    3]  ->  2
[ 675 1024    3]  ->  1
[676 962   3]  ->  1
[ 676 1024    3]  ->  11
[ 677 1024    3]  ->  19
[ 678 1024    3]  ->  10
[ 679 1024    3]  ->  41
[680 853   3]  ->  1
[ 680 1024    3]  ->  636
[ 681 1024    3]  ->  48
[682 796   3]  ->  1
[ 682 1024    3]  ->  117
[ 683 1024    3]  ->  22
[ 684 1024    3]  ->  19
[ 685 1024    3]  ->  91
[ 686 1024    3]  ->  10
[ 689 1024    3]  ->  2
[ 690 1024    3]  ->  1
[ 692 1024    3]  ->  2
[ 704 1007    3]  ->  1
[ 710 1024    3]  ->  1
[ 711 1007    3]  ->  1
[ 711 1008    3]  ->  1
[ 711 1024    3]  ->  1
[ 713 1011    3]  ->  1
[ 715 1024    3]  ->  1

In [9]:
# Non-melanoma: share of each shape among all non-melanoma images
total = sum(nm_counts)
for shape, n_images in zip(nm_values, nm_counts):
    share = n_images / total
    print(shape, ' -> ', '{:.1%}'.format(share))

[450 600   3]  ->  41.1%
[542 718   3]  ->  0.0%
[542 722   3]  ->  0.0%
[542 725   3]  ->  0.0%
[543 722   3]  ->  0.0%
[576 767   3]  ->  0.3%
[638 959   3]  ->  0.0%
[639 959   3]  ->  0.0%
[640 957   3]  ->  0.0%
[641 962   3]  ->  0.0%
[ 669 1024    3]  ->  0.0%
[671 962   3]  ->  0.0%
[ 671 1024    3]  ->  0.0%
[672 964   3]  ->  0.0%
[ 672 1024    3]  ->  0.0%
[ 673 1024    3]  ->  0.0%
[674 962   3]  ->  0.0%
[ 674 1024    3]  ->  0.0%
[ 675 1024    3]  ->  0.0%
[676 962   3]  ->  0.0%
[ 676 1024    3]  ->  0.1%
[ 677 1024    3]  ->  0.2%
[ 678 1024    3]  ->  0.1%
[ 679 1024    3]  ->  0.4%
[680 853   3]  ->  0.0%
[ 680 1024    3]  ->  5.4%
[ 681 1024    3]  ->  0.4%
[682 796   3]  ->  0.0%
[ 682 1024    3]  ->  1.0%
[ 683 1024    3]  ->  0.2%
[ 684 1024    3]  ->  0.2%
[ 685 1024    3]  ->  0.8%
[ 686 1024    3]  ->  0.1%
[ 689 1024    3]  ->  0.0%
[ 690 1024    3]  ->  0.0%
[ 692 1024    3]  ->  0.0%
[ 704 1007    3]  ->  0.0%
[ 710 1024    3]  ->  0.0%
[ 711 1007    3]  -> 

In [2]:
# Combined non-melanoma share of the three most common shapes before the new
# pre-processing: 41.1% is (450, 600, 3) and 5.4% is (680, 1024, 3);
# 44.7% is presumably (1024, 1024, 3), which the truncated output above does not show.
41.1 + 5.4 +  44.7

91.2