In [1]:
!pip install image-classifiers > /dev/null

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import os
import gc
import cv2
import pydicom
import warnings
import numpy as np 
import pandas as pd 
import seaborn as sns
import multiprocessing
import tensorflow as tf
import matplotlib.pyplot as plt

from PIL import Image
from joblib import Parallel, delayed
from tqdm import tqdm_notebook as tqdm
from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from classification_models.tfkeras import Classifiers
from IPython.core.interactiveshell import InteractiveShell
from tensorflow.compat.v1.keras.layers import CuDNNLSTM, CuDNNGRU
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import StratifiedKFold,KFold, GroupKFold
from tensorflow.keras.applications import InceptionResNetV2,InceptionV3, ResNet50
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau,ModelCheckpoint
from tensorflow.keras.layers import Flatten, Dense, Dropout, Conv2D, MaxPooling2D,GlobalAveragePooling2D

In [3]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
plt.rcParams['figure.figsize'] = (10,5)

In [4]:
NUM_CORES = multiprocessing.cpu_count()
SPLITS = 5
INPUT_PATH = './../input/siim-isic-melanoma-classification'
SUB_PATH = './../../working/'
DIM = (128,128)
AUTO = tf.data.experimental.AUTOTUNE
BATCH = 16
EPOCHS = 10
CNN_ARCH = 'resnet50'

In [5]:
class DataGenerator(Sequence):
    def __init__(self,df,image_path,batch_size,dim,n_channels,to_fit):
      self.df = df
      self.image_path = image_path
      self.batch = batch_size
      self.dim = dim
      self.n_channels = n_channels
      self.to_fit = to_fit
      
    def __len__(self):
      return int(np.floor(self.df.shape[0])/self.batch)
    
    def __getitem__(self, index):
      list_IDs = self.df['image_name'].values[index*self.batch : (index+1)*self.batch]
      X = self._generate_X(list_IDs)
      if self.to_fit:
          target_y = self._generate_y(list_IDs)
          return np.array(X), np.array(target_y)
      return np.array(X)
    
    def _generate_X(self,list_IDs):
      X = Parallel(n_jobs=self.batch)(delayed(self._load_image)(i) for i in list_IDs)
      return X
    
    def _generate_y(self,list_IDs):
      target_y = []
      for i, ids in enumerate(list_IDs):
          target_y.append(self.df[self.df['image_name'] == ids]['target'].values[0])
      return target_y
    
    def _load_image(self,file_):
      img = cv2.imread(os.path.join(self.image_path,file_)+'.jpg')
      return img

In [6]:
def scale(type_,data):
    assert type_ in ['norm','std']
    if type_ == 'std':
        return (data - np.mean(data))/np.std(data)
    elif type_ == 'norm':
        return (data - np.min(data))/(np.max(data) - np.min(data))
    
    
def corrcted_age_from_dcm(folder,file):
    return pydicom.dcmread(f'{folder}/{file}.dcm')[('0010', '1010')][1:3]

In [7]:
def show(folder,file_name,gray=False):
    img = cv2.imread(f'jpeg/{folder}/{file_name}.jpg')
    if gray:
        img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
        print(f'Shape of Gray Image : {img.shape}')
    else:
        img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
        print(f'Shape of RGB Image : {img.shape}')
    _=plt.imshow(img)
    _=plt.xticks([])
    _=plt.yticks([])

In [8]:
def color_constancy(img, power=6, gamma=None):
    img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    img_dtype = img.dtype

    if gamma is not None:
        img = img.astype('uint8')
        look_up_table = np.ones((256,1), dtype='uint8') * 0
        for i in range(256):
            look_up_table[i][0] = 255*pow(i/255, 1/gamma)
        img = cv2.LUT(img, look_up_table)

    img = img.astype('float32')
    img_power = np.power(img, power)
    rgb_vec = np.power(np.mean(img_power, (0,1)), 1/power)
    rgb_norm = np.sqrt(np.sum(np.power(rgb_vec, 2.0)))
    rgb_vec = rgb_vec/rgb_norm
    rgb_vec = 1/(rgb_vec*np.sqrt(3))
    img = np.multiply(img, rgb_vec)

    img = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2RGB)
    return img.astype(img_dtype)


In [9]:
def decode_jpeg(filename,label=None,size=(128,128)):
    if label is None:
        path = './../melanoma-128x128-jpeg/128x128/Test/'
    else:
        path = './../melanoma-128x128-jpeg/128x128/Train/'
    file_path = path + filename
    bits = tf.io.read_file(file_path)
    img = tf.image.decode_image(bits,channels=3)
    img = tf.stack((img[:,:,2],img[:,:,1],img[:,:,0]),axis=2)
    img = tf.cast(img,tf.float32)/255.0
    #img = tf.image.resize(img,size=size)
    if label is None:
        return img
    else:
        return img,label
    

def data_augment(image, label=None):
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    
    if label is None:
        return image
    else:
        return image, label

In [10]:
def get_fold_data(fold):
    train_fold = (tf.data.Dataset
                    .from_tensor_slices((train[train['stratified_folds'] != fold]['image_name'],
                                         train[train['stratified_folds'] != fold]['target']))
                    .map(decode_jpeg,num_parallel_calls=AUTO)
                    .map(data_augment, num_parallel_calls=AUTO)
                    .repeat()
                    .batch(BATCH)
                    .prefetch(AUTO))
    valid_fold = (tf.data.Dataset
                    .from_tensor_slices((train[train['stratified_folds'] == fold]['image_name'],
                                         train[train['stratified_folds'] == fold]['target']))
                    .map(decode_jpeg,num_parallel_calls=AUTO)
                    .map(data_augment, num_parallel_calls=AUTO)
                    .repeat()
                    .batch(BATCH)
                    .prefetch(AUTO))
    return train_fold,valid_fold

In [11]:
def get_model():
    ResNet18, preprocess_input = Classifiers.get('resnet18')
    base_model = ResNet18(input_shape=(*DIM,3), weights='imagenet', include_top=False)
    x = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.models.Model(inputs=[base_model.input], outputs=[output])  
    return model

In [12]:
os.chdir(INPUT_PATH)
os.listdir()

['test.csv',
 'train',
 'jpeg',
 'sample_submission.csv',
 'train.csv',
 'tfrecords',
 'test']

In [13]:
train = pd.read_csv('./../markings/marking.csv')
train['image_id'] = train['image_id'] + '.jpg'

dir_files = os.listdir('./../melanoma-128x128-jpeg/128x128/Train')
csv_files = train['image_id'].values

q = list(set(dir_files).intersection(set(csv_files)))
train = train.loc[train.image_id.isin(q)]

train['image_id'] = train['image_id'].apply(lambda x:x.split('.')[0])

train = train.rename(columns={'patient_id':'p_id',
                             'age_approx':'age',
                             'anatom_site_general_challenge':'site',
                             'image_id':'image_name'})

train = train.sort_values(by=['p_id','site','age'])

In [14]:
test = pd.read_csv('./../melanoma-age-corrected-dataset/test.csv')
submission = pd.read_csv('sample_submission.csv')
use_cols = test.columns.tolist() + ['target']
train = train[use_cols]

train['image_name'] = train['image_name'] + '.jpg'
test['image_name'] = test['image_name'] + '.jpg'
submission['image_name'] = submission['image_name'] + '.jpg'

In [15]:
folds = StratifiedKFold(n_splits=SPLITS)
train['stratified_folds'] = -1
for i, (_,val_idx) in enumerate(folds.split(X = train,y = train['target'])):
    train.loc[train.index.isin(val_idx),'stratified_folds'] = i

folds_value = []
for i in range(SPLITS):
    distribution = []
    distribution.append(i)
    distribution.append(train[(train['stratified_folds'] == i) & (train['target'] == 1)].shape[0]/train.shape[0])
    distribution.append(train[(train['stratified_folds'] == i) & (train['target'] == 0)].shape[0]/train.shape[0])
    folds_value.append(distribution)

pd.DataFrame(folds_value,columns=['stratified_folds','positive_distribution','negative_distribution'])

Unnamed: 0,stratified_folds,positive_distribution,negative_distribution
0,0,0.003253,0.196754
1,1,0.003629,0.196378
2,2,0.005975,0.159091
3,3,0.023026,0.176964
4,4,0.044956,0.154469


In [16]:
folds = np.linspace(0,SPLITS-1,SPLITS)
for i in folds:
    print(f'FOR FOLD {i}')
    for j in [x for x in folds if x not in [i]]:
        num_overlap = len(set(train[train['stratified_folds'] == i]['p_id'].values).\
                          intersection(set(train[train['stratified_folds'] == j]['p_id'].values)))
        if num_overlap > 0: 
            print(f'Overlapped Patient ids in stratified fold {j} is {num_overlap}')
    print('=============================================================')

FOR FOLD 0.0
Overlapped Patient ids in stratified fold 1.0 is 1782
Overlapped Patient ids in stratified fold 2.0 is 1672
Overlapped Patient ids in stratified fold 3.0 is 648
Overlapped Patient ids in stratified fold 4.0 is 53
FOR FOLD 1.0
Overlapped Patient ids in stratified fold 0.0 is 1782
Overlapped Patient ids in stratified fold 2.0 is 1669
Overlapped Patient ids in stratified fold 3.0 is 653
Overlapped Patient ids in stratified fold 4.0 is 54
FOR FOLD 2.0
Overlapped Patient ids in stratified fold 0.0 is 1672
Overlapped Patient ids in stratified fold 1.0 is 1669
Overlapped Patient ids in stratified fold 3.0 is 686
Overlapped Patient ids in stratified fold 4.0 is 52
FOR FOLD 3.0
Overlapped Patient ids in stratified fold 0.0 is 648
Overlapped Patient ids in stratified fold 1.0 is 653
Overlapped Patient ids in stratified fold 2.0 is 686
Overlapped Patient ids in stratified fold 4.0 is 867
FOR FOLD 4.0
Overlapped Patient ids in stratified fold 0.0 is 53
Overlapped Patient ids in strati

In [24]:
GPU = True 
TPU = False 
if(GPU):
  print('Setting GPU')
  K.clear_session()
  config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
  graph = tf.compat.v1.get_default_graph()
  sess = tf.compat.v1.Session(graph=graph,config=config)
  tf.compat.v1.keras.backend.set_session(sess)
elif((not GPU) or (TPU)):
  print('Setting TPU')
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
  except ValueError:
      tpu = None

  if tpu:
      tf.config.experimental_connect_to_cluster(tpu)
      tf.tpu.experimental.initialize_tpu_system(tpu)
      strategy = tf.distribute.experimental.TPUStrategy(tpu)
  else:
      strategy = tf.distribute.get_strategy()

  print("REPLICAS: ", strategy.num_replicas_in_sync)

Setting GPU


In [18]:
train_data_fold_0,valid_data_fold_0 = get_fold_data(0)
train_data_fold_1,valid_data_fold_1 = get_fold_data(1)
train_data_fold_2,valid_data_fold_2 = get_fold_data(2)
train_data_fold_3,valid_data_fold_3 = get_fold_data(3)
train_data_fold_4,valid_data_fold_4 = get_fold_data(4)

test_data = (tf.data.Dataset
            .from_tensor_slices(submission['image_name'])
            .map(decode_jpeg,num_parallel_calls=AUTO)
            .map(data_augment, num_parallel_calls=AUTO)
            .batch(BATCH)
            .prefetch(AUTO))
gc.collect()

22

In [25]:
model_checkpoint = ModelCheckpoint(filepath='./../../working/ResNet18_weights.{epoch:02d}.hdf5',
                                   monitor='val_auc',verbose=1,save_best_only=True,
                                   save_weights_only=True,mode='max')
lr_schedule = ReduceLROnPlateau(monitor='val_loss',factor=0.5,patience=3,min_lr=0.000008)

STEPS_PER_EPOCH = train[train['stratified_folds'] != 1].shape[0]//BATCH
STEPS_PER_EPOCH_VALID = train[train['stratified_folds'] == 1].shape[0]//BATCH

In [26]:
model_fold_0 = get_model()
model_fold_0.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(lr=0.000025),
              metrics=['accuracy',tf.keras.metrics.AUC()])
history_0 = model_fold_0.fit(train_data_fold_0,
                epochs=10,
                callbacks=[model_checkpoint,lr_schedule],
                validation_data=valid_data_fold_0,
                steps_per_epoch=STEPS_PER_EPOCH,
                validation_steps=STEPS_PER_EPOCH_VALID)

In [None]:
probs_fold_0 = model_fold_0.predict(test_data,verbose = 1)
submission['pred_fold_0'] = probs_fold_0

In [47]:
model_fold_1 = get_model()
model_fold_1.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(lr=0.000025),
              metrics=['accuracy',tf.keras.metrics.AUC()])
history_1 = model_fold_0.fit(train_data_fold_1,
                epochs=10,
                callbacks=[model_checkpoint,lr_schedule],
                validation_data=valid_data_fold_1,
                steps_per_epoch=STEPS_PER_EPOCH,
                validation_steps=STEPS_PER_EPOCH_VALID)

Train for 2920 steps, validate for 730 steps
Epoch 1/10
Epoch 00001: val_auc improved from 0.76777 to 0.89187, saving model to ./../../working/ResNet18_weights.01.hdf5
Epoch 2/10
Epoch 00002: val_auc did not improve from 0.89187
Epoch 3/10
Epoch 00003: val_auc did not improve from 0.89187
Epoch 4/10
Epoch 00004: val_auc did not improve from 0.89187
Epoch 5/10
Epoch 00005: val_auc did not improve from 0.89187
Epoch 6/10
Epoch 00006: val_auc did not improve from 0.89187
Epoch 7/10
Epoch 00007: val_auc did not improve from 0.89187
Epoch 8/10
Epoch 00008: val_auc did not improve from 0.89187
Epoch 9/10
Epoch 00009: val_auc did not improve from 0.89187
Epoch 10/10
Epoch 00010: val_auc did not improve from 0.89187


In [48]:
probs_fold_1 = model_fold_1.predict(test_data,verbose = 1)
submission['pred_fold_1'] = probs_fold_1



In [None]:
model_fold_2 = get_model()
model_fold_2.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(lr=0.000025),
              metrics=['accuracy',tf.keras.metrics.AUC()])
history_2 = model_fold_0.fit(train_data_fold_2,
                epochs=10,
                callbacks=[model_checkpoint,lr_schedule],
                validation_data=valid_data_fold_2,
                steps_per_epoch=STEPS_PER_EPOCH,
                validation_steps=STEPS_PER_EPOCH_VALID)

In [None]:
model_fold_3 = get_model()
model_fold_3.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(lr=0.000025),
              metrics=['accuracy',tf.keras.metrics.AUC()])
history_3 = model_fold_0.fit(train_data_fold_3,
                epochs=10,
                callbacks=[model_checkpoint,lr_schedule],
                validation_data=valid_data_fold_3,
                steps_per_epoch=STEPS_PER_EPOCH,
                validation_steps=STEPS_PER_EPOCH_VALID)

In [None]:
model_fold_4 = get_model()
model_fold_4.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(lr=0.000025),
              metrics=['accuracy',tf.keras.metrics.AUC()])
history_4 = model_fold_0.fit(train_data_fold_4,
                epochs=10,
                callbacks=[model_checkpoint,lr_schedule],
                validation_data=valid_data_fold_4,
                steps_per_epoch=STEPS_PER_EPOCH,
                validation_steps=STEPS_PER_EPOCH_VALID)

In [None]:
probs_fold_2 = model_fold_2.predict(test_data,verbose = 1)
submission['pred_fold_2'] = probs_fold_2

probs_fold_3 = model_fold_3.predict(test_data,verbose = 1)
submission['pred_fold_3'] = probs_fold_3

probs_fold_4 = model_fold_4.predict(test_data,verbose = 1)
submission['pred_fold_4'] = probs_fold_4

## Garbage

In [None]:
%%capture
"""#WaveNet Model
def WaveNetResidualConv1D(num_filters, kernel_size, stacked_layer):

    def build_residual_block(l_input):
        resid_input = l_input
        for dilation_rate in [2**i for i in range(stacked_layer)]:
            l_sigmoid_conv1d = Conv1D(
              num_filters, kernel_size, dilation_rate=dilation_rate,
              padding='same', activation='sigmoid')(l_input)
            l_tanh_conv1d = Conv1D(
             num_filters, kernel_size, dilation_rate=dilation_rate,
             padding='same', activation='mish')(l_input)
            l_input = Multiply()([l_sigmoid_conv1d, l_tanh_conv1d])
            l_input = Conv1D(num_filters, 1, padding='same')(l_input)
            resid_input = Add()([resid_input ,l_input])
        return resid_input
    return build_residual_block
def Classifier(shape_):
    num_filters_ = 16
    kernel_size_ = 3
    stacked_layers_ = [12, 8, 4, 1]
    l_input = Input(shape=(shape_))
    x = Conv1D(num_filters_, 1, padding='same')(l_input)
    x = WaveNetResidualConv1D(num_filters_, kernel_size_, stacked_layers_[0])(x)
    x = Conv1D(num_filters_*2, 1, padding='same')(x)
    x = WaveNetResidualConv1D(num_filters_*2, kernel_size_, stacked_layers_[1])(x)
    x = Conv1D(num_filters_*4, 1, padding='same')(x)
    x = WaveNetResidualConv1D(num_filters_*4, kernel_size_, stacked_layers_[2])(x)
    x = Conv1D(num_filters_*8, 1, padding='same')(x)
    x = WaveNetResidualConv1D(num_filters_*8, kernel_size_, stacked_layers_[3])(x)
    l_output = Dense(1, activation='linear')(x)
    model = models.Model(inputs=[l_input], outputs=[l_output])
    opt = Adam(lr=LR)
    opt = tfa.optimizers.SWA(opt)
    model.compile(loss='mse', optimizer=opt, metrics=['accuracy'])
    return model"""

"""SIIM Pneumonia Prediction Image Aug
    
    albu.Compose([
    albu.HorizontalFlip(),
    albu.VerticalFlip()
    albu.OneOf([
        albu.RandomContrast(),
        albu.RandomGamma(),
        albu.RandomBrightness(),
        ], p=0.3),
    albu.OneOf([
        albu.ElasticTransform(alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
        albu.GridDistortion(),
        albu.OpticalDistortion(distort_limit=2, shift_limit=0.5),
        ], p=0.3),
    albu.ShiftScaleRotate(),
    albu.Resize(img_size,img_size,always_apply=True),
    ])

"""

"""folds = GroupKFold(n_splits=SPLITS)
train['group_folds'] = -1
for i, (_,val_idx) in enumerate(folds.split(X = train,y = train['target'],groups=train['p_id'])):
    train.loc[train.index.isin(val_idx),'group_folds'] = i
    
for i in range(SPLITS):
    print(f'Fold {i}')
    print(train[train['group_folds'] == i]['target'].value_counts())
    print('===================')"""