# Import packages

In [2]:
import os
import re
import glob
import pathlib
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import cv2

import PIL
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

from collections import Counter

from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
SEED=123
np.random.seed(SEED)

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.layers import (
    Input, Dense, Conv2D, Flatten, Activation, 
    MaxPooling2D, AveragePooling2D, ZeroPadding2D, GlobalAveragePooling2D, GlobalMaxPooling2D, add
)

from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras.utils import plot_model

from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.models import load_model

# Set path

In [3]:
BASEPATH = '/home/ubuntu/w210_melanoma/data'

df_train_full = pd.read_csv(os.path.join(BASEPATH, 'metadata_train.csv'))

train_path = BASEPATH + '/train'
train_dir = pathlib.Path(train_path)

test_path = BASEPATH + '/ISIC_2020_Test_Input'
test_dir = pathlib.Path(test_path)

# Configurations

In [6]:
CFG = dict(
        batch_size        =  16,     
        epochs            =  50, #10,  
        verbose           =   1,   
        workers           =   4,   

        optimizer         = 'adam', 

        RANDOM_STATE      =  123,   
    
        # Path to save a model
        path_model        = '/home/ubuntu/w210_melanoma/model/',

        # Images sizes
        img_size          = 224, 
        img_height        = 224, 
        img_width         = 224, 

        # Postprocessing
        label_smooth_fac  =  0.00,      
)

# Generate Image Features by Pre-trained ResNet50

In [7]:
image_generator = ImageDataGenerator(rescale=1./255)
image_generator = image_generator.flow_from_directory(train_dir,              
                                                     target_size=(CFG['img_size'], CFG['img_size']),
                                                     batch_size = 1,
                                                     class_mode='categorical',
                                                     shuffle=False
                                                     )

Found 33126 images belonging to 2 classes.


In [8]:
image_names = []
for file in image_generator.filenames:
    image_names.append(file[-16:-4])
image_names[-5:]

['ISIC_9955163',
 'ISIC_9963177',
 'ISIC_9967383',
 'ISIC_9978107',
 'ISIC_9998682']

In [8]:
model_ResNet50 = tf.keras.Sequential([
     tf.keras.applications.ResNet50(
        input_shape=(224, 224, 3),
        weights='imagenet',
        include_top=False
    ),
    
    GlobalAveragePooling2D()    
])

Extension horovod.torch has not been built: /home/ubuntu/anaconda3/envs/tensorflow2_latest_p37/lib/python3.7/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-37m-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
[2021-10-27 21:05:58.689 ip-172-31-94-192:24765 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-10-27 21:05:58.710 ip-172-31-94-192:24765 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


In [9]:
model_ResNet50.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Functional)        (None, 7, 7, 2048)        23587712  
_________________________________________________________________
global_average_pooling2d (Gl (None, 2048)              0         
Total params: 23,587,712
Trainable params: 23,534,592
Non-trainable params: 53,120
_________________________________________________________________


In [10]:
image_features = model_ResNet50.predict(image_generator)

In [21]:
img_ftr_df = pd.read_csv('/home/ubuntu/w210_melanoma/melanoma-detection/image_features.csv')

In [22]:
img_ftr_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.0,0.0,0.0,0.0,1.119927,0.0,0.0,0.972906,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017956,0.0,0.0,1.664471,0.0,0.0
1,0.0,0.0,0.0,0.0,1.128498,0.0,0.0,0.979767,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018901,0.0,0.0,1.60345,0.0,0.0
2,0.0,0.0,0.007749,0.0,1.376424,0.0,0.0,0.909722,0.008902,0.0,...,0.0,0.0,0.0,0.0,0.0218,0.0,0.0,1.683696,0.0,0.0
3,0.0,0.0,0.0,0.0,1.127139,0.0,0.0,0.978074,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020395,0.0,0.0,1.673495,0.0,0.0
4,0.0,0.0,0.0,0.0,1.126925,0.0,0.0,0.979247,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020795,0.0,0.0,1.676136,0.0,0.0


In [23]:
img_name_df = pd.DataFrame(image_names, columns= ['image_name'])
img_name_df.head()

Unnamed: 0,image_name
0,ISIC_0015719
1,ISIC_0052212
2,ISIC_0068279
3,ISIC_0074268
4,ISIC_0074311


In [33]:
img_ftr_df = pd.concat([img_name_df, img_ftr_df], axis = 1)

In [34]:
img_ftr_df.head()

Unnamed: 0,image_name,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,ISIC_0015719,0.0,0.0,0.0,0.0,1.119927,0.0,0.0,0.972906,0.0,...,0.0,0.0,0.0,0.0,0.017956,0.0,0.0,1.664471,0.0,0.0
1,ISIC_0052212,0.0,0.0,0.0,0.0,1.128498,0.0,0.0,0.979767,0.0,...,0.0,0.0,0.0,0.0,0.018901,0.0,0.0,1.60345,0.0,0.0
2,ISIC_0068279,0.0,0.0,0.007749,0.0,1.376424,0.0,0.0,0.909722,0.008902,...,0.0,0.0,0.0,0.0,0.0218,0.0,0.0,1.683696,0.0,0.0
3,ISIC_0074268,0.0,0.0,0.0,0.0,1.127139,0.0,0.0,0.978074,0.0,...,0.0,0.0,0.0,0.0,0.020395,0.0,0.0,1.673495,0.0,0.0
4,ISIC_0074311,0.0,0.0,0.0,0.0,1.126925,0.0,0.0,0.979247,0.0,...,0.0,0.0,0.0,0.0,0.020795,0.0,0.0,1.676136,0.0,0.0


In [27]:
df_train_full.tail()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
33121,ISIC_9999134,IP_6526534,male,50.0,torso,unknown,benign,0
33122,ISIC_9999320,IP_3650745,male,65.0,torso,unknown,benign,0
33123,ISIC_9999515,IP_2026598,male,20.0,lower extremity,unknown,benign,0
33124,ISIC_9999666,IP_7702038,male,50.0,lower extremity,unknown,benign,0
33125,ISIC_9999806,IP_0046310,male,45.0,torso,nevus,benign,0


In [35]:
concatenated_whole_df = pd.merge(df_train_full, img_ftr_df, on = 'image_name')

In [37]:
concatenated_whole_df.tail()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,0,1,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
33121,ISIC_9999134,IP_6526534,male,50.0,torso,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015403,0.0,0.0,1.686957,0.0,0.0
33122,ISIC_9999320,IP_3650745,male,65.0,torso,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020238,0.0,0.0,1.669718,0.0,0.0
33123,ISIC_9999515,IP_2026598,male,20.0,lower extremity,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018359,0.0,0.0,1.640222,0.0,0.0
33124,ISIC_9999666,IP_7702038,male,50.0,lower extremity,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017546,0.0,0.0,1.641765,0.0,0.0
33125,ISIC_9999806,IP_0046310,male,45.0,torso,nevus,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01717,0.0,0.0,1.622705,0.0,0.0


In [38]:
concatenated_whole_df.isnull().sum()

image_name                         0
patient_id                         0
sex                               65
age_approx                        68
anatom_site_general_challenge    527
                                ... 
2043                               0
2044                               0
2045                               0
2046                               0
2047                               0
Length: 2056, dtype: int64

In [39]:
concatenated_whole_df.to_csv('/home/ubuntu/w210_melanoma/melanoma-detection/concatenated_whole.csv', index = False)

In [40]:
df = pd.read_csv('/home/ubuntu/w210_melanoma/melanoma-detection/concatenated_whole.csv')
df.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,0,1,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.022666,0.0,0.0,1.617575,0.0,0.0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017956,0.0,0.0,1.664471,0.0,0.0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018901,0.0,0.0,1.60345,0.0,0.0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0218,0.0,0.0,1.683696,0.0,0.0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020395,0.0,0.0,1.673495,0.0,0.0


In [41]:
df.tail()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,0,1,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
33121,ISIC_9999134,IP_6526534,male,50.0,torso,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015403,0.0,0.0,1.686957,0.0,0.0
33122,ISIC_9999320,IP_3650745,male,65.0,torso,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020238,0.0,0.0,1.669718,0.0,0.0
33123,ISIC_9999515,IP_2026598,male,20.0,lower extremity,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018359,0.0,0.0,1.640222,0.0,0.0
33124,ISIC_9999666,IP_7702038,male,50.0,lower extremity,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017546,0.0,0.0,1.641765,0.0,0.0
33125,ISIC_9999806,IP_0046310,male,45.0,torso,nevus,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01717,0.0,0.0,1.622705,0.0,0.0


In [46]:
df.to_csv('/home/ubuntu/w210_melanoma/melanoma-detection/concatenated_whole.csv.gz', index = False, compression='gzip')

In [48]:
df_gz = pd.read_csv('/home/ubuntu/w210_melanoma/melanoma-detection/concatenated_whole.csv.gz', compression='gzip', error_bad_lines=False)
df_gz.tail()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,0,1,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
33121,ISIC_9999134,IP_6526534,male,50.0,torso,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015403,0.0,0.0,1.686957,0.0,0.0
33122,ISIC_9999320,IP_3650745,male,65.0,torso,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020238,0.0,0.0,1.669718,0.0,0.0
33123,ISIC_9999515,IP_2026598,male,20.0,lower extremity,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018359,0.0,0.0,1.640222,0.0,0.0
33124,ISIC_9999666,IP_7702038,male,50.0,lower extremity,unknown,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017546,0.0,0.0,1.641765,0.0,0.0
33125,ISIC_9999806,IP_0046310,male,45.0,torso,nevus,benign,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01717,0.0,0.0,1.622705,0.0,0.0
