In [1]:
import os
import shutil
import cv2
import gc
import keras
import numpy as np
import pandas as pd
from keras.applications.mobilenet import MobileNet
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import (BatchNormalization, Dense, Dropout, Flatten)
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from matplotlib import pyplot as plt
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import train_test_split
import gradio
import gradio as gr

In [2]:
metadata = pd.read_csv(r"C:\Users\PC\Cookies\Desktop\image_classification_maysa\archive (5)\HAM10000_metadata.csv")
metadata.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [3]:
metadata["dx"].value_counts() / metadata.shape[0]

nv       0.669496
mel      0.111133
bkl      0.109735
bcc      0.051323
akiec    0.032651
vasc     0.014179
df       0.011483
Name: dx, dtype: float64

In [14]:
image_sample = cv2.imread(r"C:\Users\PC\Cookies\Desktop\image_classification\archive (5)\HAM10000_images_part_1\ISIC_0024306.jpg")
print(image_sample.shape)

(450, 600, 3)


In [5]:
lesion_id_cnt = metadata["lesion_id"].value_counts()
def check_duplicates(id):
    
    if lesion_id_cnt[id] > 1:
        return True
    else:
        return False

metadata["has_duplicate"] = metadata["lesion_id"].map(check_duplicates)

In [15]:
metadata["has_duplicate"].value_counts()

False    5514
True     4501
Name: has_duplicate, dtype: int64

In [16]:
image_folder_1 = r"C:\Users\PC\Cookies\Desktop\image_classification\archive (5)\HAM10000_images_part_1"
image_folder_2 = r"C:\Users\PC\Cookies\Desktop\image_classification\archive (5)\HAM10000_images_part_2"
metadata["folder"] = 0
metadata.set_index("image_id", drop=False, inplace=True)

for image in os.listdir(image_folder_1):
    image_id = image.split(".")[0]
    metadata.loc[image_id, "folder"] = "1"

for image in os.listdir(image_folder_2):
    image_id = image.split(".")[0]
    metadata.loc[image_id, "folder"] = "2"

In [17]:
metadata.head()

Unnamed: 0_level_0,lesion_id,image_id,dx,dx_type,age,sex,localization,has_duplicate,folder
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ISIC_0027419,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,True,1
ISIC_0025030,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,True,1
ISIC_0026769,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,True,1
ISIC_0025661,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,True,1
ISIC_0031633,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,True,2


In [18]:
data_train_no_dup, data_val = train_test_split(metadata[metadata["has_duplicate"] == False], test_size=0.36, stratify=metadata[metadata["has_duplicate"] == False]["dx"]) # 36% of the data with no duplicates is roughly 20% of the total
data_train = pd.concat((data_train_no_dup, metadata[metadata["has_duplicate"] == True]), axis=0)
data_val, data_test = train_test_split(data_val, test_size=0.5, stratify=data_val["dx"])
print("Train: " + str(data_train.shape[0] / metadata.shape[0]))
print("Validation: " + str(data_val.shape[0] / metadata.shape[0]))
print("Test: " + str(data_test.shape[0] / metadata.shape[0]))
val_len = data_val.shape[0]
test_len = data_test.shape[0]

Train: 0.801697453819271
Validation: 0.09915127309036445
Test: 0.09915127309036445


In [19]:
base_dir = r"C:\Users\PC\Cookies\Desktop\image_classification\base_dir"
#os.mkdir(base_dir)

train_dir = os.path.join(base_dir, "image_train")
#os.mkdir(train_dir)

val_dir = os.path.join(base_dir, "image_val")
#os.mkdir(val_dir)

test_dir = os.path.join(base_dir, "image_test")
#os.mkdir(test_dir)

labels = list(metadata["dx"].unique())

for label in labels:
    label_path_train = os.path.join(train_dir, label)
    #os.mkdir(label_path_train)
    label_path_val = os.path.join(val_dir, label)
    #os.mkdir(label_path_val)
    label_path_test = os.path.join(test_dir, label)
    #os.mkdir(label_path_test)

In [20]:
image_dir = r"C:\Users\PC\Cookies\Desktop\image_classification\archive (5)\HAM10000_images_part_"

for i in range(data_train.shape[0]):
    image_name = data_train["image_id"][i] + ".jpg"
    src_dir = os.path.join(image_dir + data_train["folder"][i], image_name)
    dst_dir = os.path.join(train_dir, data_train["dx"][i], image_name)
    shutil.copyfile(src_dir, dst_dir)

for i in range(data_val.shape[0]):
    image_name = data_val["image_id"][i] + ".jpg"
    src_dir = os.path.join(image_dir + data_val["folder"][i], image_name)
    dst_dir = os.path.join(val_dir, data_val["dx"][i], image_name)
    shutil.copyfile(src_dir, dst_dir)
    
for i in range(data_test.shape[0]):
    image_name = data_test["image_id"][i] + ".jpg"
    src_dir = os.path.join(image_dir + data_test["folder"][i], image_name)
    dst_dir = os.path.join(test_dir, data_test["dx"][i], image_name)
    shutil.copyfile(src_dir, dst_dir)

In [21]:
for label in labels:
    print(label + " train: " + str(len(os.listdir(os.path.join(train_dir, label)))))
print("\n")
for label in labels:
    print(label + " val: " + str(len(os.listdir(os.path.join(val_dir, label)))))
print("\n")
for label in labels:
    print(label + " val: " + str(len(os.listdir(os.path.join(test_dir, label)))))

bkl train: 5986
nv train: 6996
df train: 4754
mel train: 5937
vasc train: 5584
bcc train: 5710
akiec train: 5719


bkl val: 141
nv val: 1463
df val: 13
mel val: 71
vasc val: 20
bcc val: 58
akiec val: 51


bkl val: 141
nv val: 1444
df val: 11
mel val: 74
vasc val: 23
bcc val: 56
akiec val: 50


In [None]:
#del data_train_no_dup, metadata
#gc.collect()

In [22]:
data_gen_param = {
    "rotation_range": 180,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "zoom_range": 0.1,
    "horizontal_flip": True,
    "vertical_flip": True
}
data_generator = ImageDataGenerator(**data_gen_param)
num_images_each_label = 6000

aug_dir = os.path.join(base_dir, "aug_dir")
print(os.path.abspath(aug_dir))
#os.mkdir(aug_dir)

for label in labels:
    
    img_dir = os.path.join(aug_dir, "aug_img")
    #os.mkdir(img_dir)
    
    src_dir_label = os.path.join(train_dir, label)
    for image_name in os.listdir(src_dir_label):
        shutil.copy(os.path.join(src_dir_label, image_name), os.path.join(img_dir, image_name))
    
    batch_size = 32
    data_flow_param = {
        "directory": aug_dir,
        "color_mode": "rgb",
        "batch_size": batch_size,
        "shuffle": True,
        "save_to_dir": os.path.join(train_dir, label),
        "save_format": "jpg"
    }
    aug_data_gen = data_generator.flow_from_directory(**data_flow_param)
    
    num_img_aug = num_images_each_label - len(os.listdir(os.path.join(train_dir, label)))
    num_batch = int(num_img_aug / batch_size)
    
    for i in range(0, num_batch):
        next(aug_data_gen)
    
    shutil.rmtree(img_dir)

C:\Users\PC\Cookies\Desktop\image_classification\base_dir\aug_dir


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\PC\\Cookies\\Desktop\\image_classification\\base_dir\\aug_dir\\aug_img\\ISIC_0024312.jpg'

In [23]:
for label in labels:
    print(label + " train: " + str(len(os.listdir(os.path.join(train_dir, label)))))
print("\n")
for label in labels:
    print(label + " val: " + str(len(os.listdir(os.path.join(val_dir, label)))))

bkl train: 5986
nv train: 6996
df train: 4754
mel train: 5937
vasc train: 5584
bcc train: 5710
akiec train: 5719


bkl val: 141
nv val: 1463
df val: 13
mel val: 71
vasc val: 20
bcc val: 58
akiec val: 51


In [25]:
batch_size = 32
IMAGE_SHAPE = (224, 224, 3)
data_gen_param = {
    "samplewise_center": True,
    "samplewise_std_normalization": True,
    "rotation_range": 180,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "zoom_range": 0.1,
    "horizontal_flip": True,
    "vertical_flip": True,
    "rescale": 1.0 / 255
}
data_generator = ImageDataGenerator(**data_gen_param)

train_flow_param = {
    "directory": train_dir,
    "batch_size": batch_size,
    "target_size": IMAGE_SHAPE[:2],
    "shuffle": True
}
train_flow = data_generator.flow_from_directory(**train_flow_param)

val_flow_param = {
    "directory": val_dir,
    "batch_size": batch_size,
    "target_size": IMAGE_SHAPE[:2],
    "shuffle": False
}
val_flow = data_generator.flow_from_directory(**val_flow_param)

test_flow_param = {
    "directory": test_dir,
    "batch_size": 1,
    "target_size": IMAGE_SHAPE[:2],
    "shuffle": False
}
test_flow = data_generator.flow_from_directory(**test_flow_param)

Found 40686 images belonging to 7 classes.
Found 1817 images belonging to 7 classes.
Found 1799 images belonging to 7 classes.


In [26]:
model_dir = r'C:\Users\PC\Cookies\Desktop\image_classification\model'
model1 = os.path.join(model_dir, "model1.h5")

In [27]:
model = keras.models.load_model(model1 , compile=False)

In [28]:
model.compile()

In [29]:
y_test_true = val_flow.classes
y_test_pred = np.argmax(model.predict_generator(val_flow, steps = len(val_flow)), axis = 1)

  y_test_pred = np.argmax(model.predict_generator(val_flow, steps = len(val_flow)), axis = 1)


In [30]:
report = metrics.classification_report(y_test_true , y_test_pred  , target_names=labels)
print(report)

              precision    recall  f1-score   support

         bkl       0.71      0.33      0.45        51
          nv       0.66      0.67      0.67        58
          df       0.48      0.35      0.41       141
         mel       0.62      0.62      0.62        13
        vasc       0.26      0.52      0.34        71
         bcc       0.94      0.93      0.93      1463
       akiec       0.76      0.95      0.84        20

    accuracy                           0.84      1817
   macro avg       0.63      0.62      0.61      1817
weighted avg       0.86      0.84      0.84      1817

