## Import Libraries

In [1]:
import os
import cv2
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

## Path

In [2]:
# Absolute location of the raw HAM10000 download folder, resolved against
# the current working directory.
downloads_rel = '../../datasets/HAM10000/downloads'
downloads = os.path.abspath(downloads_rel)

In [3]:
# Path of the ground-truth metadata CSV, expected inside the download folder.
gt = os.path.join(downloads, "HAM10000_metadata.csv")

In [4]:
# Load the metadata table and preview its first rows.
data_pd = pd.read_csv(gt)
data_pd.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [5]:
# Rows per diagnosis class ('dx'). count() tallies non-null cells per column,
# so a single column ('lesion_id') is displayed as the per-class total.
samples = data_pd.groupby('dx').count()
samples['lesion_id']

dx
akiec     327
bcc       514
bkl      1099
df        115
mel      1113
nv       6705
vasc      142
Name: lesion_id, dtype: int64

In [6]:
# Per-lesion counts: how many metadata rows (one per image) share each lesion_id.
df_count = data_pd.groupby('lesion_id').count()
df_count.head()

Unnamed: 0_level_0,image_id,dx,dx_type,age,sex,localization,dataset
lesion_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HAM_0000000,2,2,2,2,2,2,2
HAM_0000001,1,1,1,1,1,1,1
HAM_0000002,3,3,3,3,3,3,3
HAM_0000003,1,1,1,1,1,1,1
HAM_0000004,1,1,1,1,1,1,1


In [7]:
# Keep only the lesions counted exactly once, then move 'lesion_id' from the
# index back into a regular column.
df_count = df_count[df_count['dx'] == 1].reset_index()

In [8]:
def duplicates(x):
    """Classify a lesion id by membership in the global ``df_count['lesion_id']``.

    Args:
        x: The lesion id to look up.

    Returns:
        'no' when ``x`` is one of the ids in ``df_count['lesion_id']``,
        otherwise 'duplicates'.

    Note:
        The lookup set is rebuilt from ``df_count`` on every call.
    """
    single_image_ids = set(df_count['lesion_id'])
    return 'duplicates' if x not in single_image_ids else 'no'

In [9]:
# Flag every metadata row: 'no' when its lesion_id is listed in df_count,
# 'duplicates' otherwise. This is a vectorised membership test that yields the
# same labels as applying duplicates() row by row, without rebuilding the
# lookup set once per row.
in_df_count = data_pd['lesion_id'].isin(df_count['lesion_id'])
data_pd['is_duplicate'] = in_df_count.map({True: 'no', False: 'duplicates'})
data_pd.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,is_duplicate
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,duplicates
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,duplicates
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,duplicates
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,duplicates
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,duplicates


In [10]:
# Metadata rows flagged 'no' in 'is_duplicate'.
not_duplicated = data_pd['is_duplicate'].eq('no')
df_count = data_pd.loc[not_duplicated]

## Train Test Split

In [11]:
# creating test_df
# Hold out 15% of the rows in df_count as the test set, stratified by
# diagnosis class and seeded for reproducibility. The other part of the split
# is discarded.
_, test_df = train_test_split(
    df_count,
    test_size=0.15,
    random_state=42,
    stratify=df_count['dx'],
)

In [12]:
def identify_train_or_test(x):
    """Assign an image id to a partition based on the global ``test_df``.

    Args:
        x: The image id; it is converted with ``str()`` before the lookup.

    Returns:
        'test' when ``str(x)`` is one of the values in ``test_df['image_id']``,
        otherwise 'train'.

    Note:
        The lookup set is rebuilt from ``test_df`` on every call.
    """
    held_out_ids = set(test_df['image_id'])
    return 'test' if str(x) in held_out_ids else 'train'

In [13]:
# Label every metadata row as 'test' (its image_id is in test_df) or 'train'
# (everything else), then keep the 'train' rows as train_df.
# The vectorised isin() replaces a per-row apply() that rebuilt the set of
# test ids for every row; astype(str) keeps the original str() coercion.
in_test = data_pd['image_id'].astype(str).isin(test_df['image_id'])
data_pd['train_test_split'] = in_test.map({True: 'test', False: 'train'})
train_df = data_pd[data_pd['train_test_split'] == 'train']

In [14]:
# Preview the first rows of the training partition.
train_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,is_duplicate,train_test_split
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,duplicates,train
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,duplicates,train
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,duplicates,train
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,duplicates,train
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,duplicates,train


In [15]:
# Preview the first rows of the held-out test partition.
test_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,is_duplicate
4966,HAM_0001445,ISIC_0029433,nv,follow_up,35.0,male,trunk,vidir_molemax,no
9184,HAM_0006105,ISIC_0032236,nv,histo,40.0,female,foot,rosendahl,no
9558,HAM_0004212,ISIC_0034114,nv,consensus,40.0,female,unknown,vidir_modern,no
5784,HAM_0002063,ISIC_0024819,nv,follow_up,70.0,female,lower extremity,vidir_molemax,no
6290,HAM_0001750,ISIC_0028832,nv,follow_up,55.0,male,trunk,vidir_molemax,no


In [16]:
# Image ids of each partition as plain Python lists.
train_list = train_df['image_id'].tolist()
test_list = test_df['image_id'].tolist()

In [17]:
# Number of images in the test partition.
len(test_list)

828

In [18]:
# Number of images in the training partition.
len(train_list)

9187

In [19]:
# Index 'data_pd' by 'image_id' so a label can be looked up with
# data_pd.loc[image_id, 'dx'].
# The guard keeps the cell re-runnable: once 'image_id' has been moved into the
# index it is no longer a column, and calling set_index again raises KeyError.
# verify_integrity=True rejects duplicate ids, which would make the .loc lookup
# return a Series instead of a single label.
if data_pd.index.name != 'image_id':
    data_pd = data_pd.set_index('image_id', verify_integrity=True)

## Create Folders for Train Test Split

In [20]:
# Output layout: <dataset_dir>/{train,test}/<class name>/
# The paths are defined unconditionally so that train_dir / test_dir exist as
# variables even when the folders were created by an earlier run.
dataset_dir = os.path.abspath('../../datasets/HAM10000/split')
train_dir = os.path.join(dataset_dir, 'train')
test_dir = os.path.join(dataset_dir, 'test')

targetnames = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

# Remember whether the root was already there, only to choose the message below.
already_present = os.path.exists(dataset_dir)

# exist_ok=True makes the cell safe to re-run and fills in any class folder
# that is missing from a partially created tree.
for split_dir in (train_dir, test_dir):
    for i in targetnames:
        os.makedirs(os.path.join(split_dir, i), exist_ok=True)

if already_present:
    print("Folders already exist!")
else:
    print("Folders successfully created!")

Folders successfully created!


## Resize and Copy Images to Train and Test Folders

In [21]:
# Source path: folder inside the download directory that the images are read from.
source_dir = os.path.join(downloads, "HAM10000_images")

In [22]:
def copy_images(source_dir, target_dir, target_list, size=(256, 256)):
    """Resize the listed images and write them into per-class sub-folders.

    For every image id, ``<source_dir>/<id>.jpg`` is read, resized to ``size``
    and written to ``<target_dir>/<label>/<id>.jpg``. The label is taken from
    the global ``data_pd`` via ``data_pd.loc[id, 'dx']``, so ``data_pd`` must
    be indexed by image id when this function is called.

    Args:
        source_dir: Folder containing the original ``.jpg`` files.
        target_dir: Root folder of the partition; the ``<label>`` sub-folder
            is not created here and must already exist.
        target_list: Iterable of image ids (file names without extension).
        size: Output size passed to ``cv2.resize`` as ``(width, height)``.
            Defaults to ``(256, 256)``.

    Raises:
        FileNotFoundError: If a source image is missing or cannot be decoded.
        OSError: If OpenCV reports that the resized image was not written.
    """
    for image in tqdm(target_list, colour="green"):

        file_name = image + '.jpg'
        label = data_pd.loc[image, 'dx']

        # path of source image
        source = os.path.join(source_dir, file_name)

        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cv2.resize.
        img = cv2.imread(source)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {source}")

        # image resize
        img = cv2.resize(img, size)

        # path of target image
        target = os.path.join(target_dir, label, file_name)

        # cv2.imwrite returns False on failure, so check it explicitly.
        if not cv2.imwrite(target, img):
            raise OSError(f"Could not write image: {target}")

In [23]:
# Resize and write the training images into train_dir/<label>/.
copy_images(
    source_dir=source_dir,
    target_dir=train_dir,
    target_list=train_list,
)

100%|[32m██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 9187/9187 [01:15<00:00, 121.69it/s][0m


In [24]:
# Resize and write the test images into test_dir/<label>/.
copy_images(
    source_dir=source_dir,
    target_dir=test_dir,
    target_list=test_list,
)

100%|[32m████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 828/828 [00:06<00:00, 122.92it/s][0m
