In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.applications.mobilenet_v2 import preprocess_input


In [93]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

In [94]:
import itertools
from typing import List
import numpy as np
import os
import re
from typing import Tuple
import matplotlib.pyplot as plt

In [95]:
def get_score(h5_file:str) -> Tuple[float, float]:
    """ Extracts useful data from a .h5 file

    :param h5_file: File to extract the train and val accuracy scores from
    :return: train accuracy, difference between train and validation
    """
    scores = re.findall(r"\d+\.\d+",h5_file)
    train_accuracy = float(scores[0])
    val_accuracy = float(scores[1])
    difference = train_accuracy - val_accuracy
    return val_accuracy, difference



def keep_best_saved_h5(folder_relative:str, common_filename: str, maximum_difference:float) -> str:
    """ Goes through all common named .h5-files,
    deletes all from folder except for the best result.

    :param maximum_difference: max difference allowed between accuracy and val_accuracy.
    :return Best scoring .h5 file
    """
    current_directory = os.getcwd()
    os.chdir(current_directory + folder_relative)
    all_files = os.listdir()
    best_scoring_file = ""
    # try-except incase of errors: returns to current directory
    try:
        # Keep the files with a low difference between train_accuracy and validation_accuracy.
        # Deletes the rest from directory
        model_files = [file for file in all_files if file.startswith(common_filename)]
        scores = []
        not_overfitting_models = []
        best_scoring_file = ""
        for file in model_files:
            validation_accuracy, diff = get_score(file) # Uses function get_score()
            if abs(diff) > maximum_difference:
                os.remove(file)
            else:
                not_overfitting_models.append(file)
                scores.append(validation_accuracy)

        # Keep only the file with highest validation accuracy score.
        # Deletes the rest from directory
        highest_score_index = scores.index(max(scores))
        for i, file in enumerate(not_overfitting_models):
            if i == highest_score_index:
                best_scoring_file = file
            else:
                os.remove(file)
        os.chdir(current_directory)
    except:
        os.chdir(current_directory)
    print(f"Currently in directory:{os.getcwd()}")
    print(f"File coming out of the function: {best_scoring_file}")
    return best_scoring_file

In [152]:
train_path = "data/train"
test_path = "data/test"
labels = {"akiec":0, "bcc":1, "bkl":2, "df":3, "mel":4, "nv":5, "vasc":6}
classes = list(labels.keys())
size = (224,224)

In [161]:
model_generator = ImageDataGenerator(preprocessing_function=preprocess_input, rotation_range = 40,
                                        width_shift_range = 0.2, height_shift_range = 0.2, rescale = 1./255,
                                        shear_range = 0.2, zoom_range = 0.2, horizontal_flip = True,
                                        validation_split=.2)

train_batches = model_generator.flow_from_directory(train_path,target_size=(1, 28, 28, 3), classes=classes, color_mode='rgb', batch_size=10, shuffle=True, subset="training")
validation_batches = model_generator.flow_from_directory(train_path,size,classes=classes, color_mode='rgb', batch_size=20, shuffle=True, subset="validation")
test_batches = ImageDataGenerator(preprocessing_function=preprocess_input)\
    .flow_from_directory(test_path,size,classes=classes, batch_size=10, shuffle=False)

Found 21622 images belonging to 7 classes.
Found 5402 images belonging to 7 classes.
Found 2003 images belonging to 7 classes.


In [164]:
model = Sequential([
  layers.experimental.preprocessing.Rescaling(1./255, input_shape=(224, 224, 3, 3)),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(64, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(7)
])
  
model_extended_2.summary()

ValueError: Input 0 of layer max_pooling2d_95 is incompatible with the layer: expected ndim=4, found ndim=5. Full shape received: (None, 28, 28, 3, 16)

In [165]:
# Compile and fit the model (10 epochs too)
model_extended_2.compile(loss=tf.keras.losses.CategoricalCrossentropy,
              optimizer='adam',
              metrics=['accuracy'])

model_extended_2.fit(train_batches, epochs=10,  validation_data=validation_batches, batch_size=16, verbose=1)

ValueError: could not broadcast input array from shape (1,28,3) into shape (1,28,28,3,3)

# model 1


In [113]:
# Create the final version of the model
# Create the second version of the model
# pixel width and height of our images

# number of filters in the convnet layer
filters = 64

# conv net parameters
strides = (2, 2)
pool_size = (2,2)
kernel_size = (5, 5)

model = Sequential()
model.add(Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, activation='relu', padding='same', input_shape=(32, 32, 3)))
model.add(Dropout(0.3))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Conv2D(filters=128, kernel_size=kernel_size, strides=strides, activation='relu', padding='same'))
model.add(Dropout(0.3))
model.add(Conv2D(filters=256, kernel_size=kernel_size, strides=strides, activation='relu', padding='same'))
model.add(Dropout(0.3))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(7, activation='softmax'))
  
model.summary()

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_84 (Conv2D)           (None, 16, 16, 64)        4864      
_________________________________________________________________
dropout_49 (Dropout)         (None, 16, 16, 64)        0         
_________________________________________________________________
max_pooling2d_49 (MaxPooling (None, 8, 8, 64)          0         
_________________________________________________________________
conv2d_85 (Conv2D)           (None, 4, 4, 128)         204928    
_________________________________________________________________
dropout_50 (Dropout)         (None, 4, 4, 128)         0         
_________________________________________________________________
conv2d_86 (Conv2D)           (None, 2, 2, 256)         819456    
_________________________________________________________________
dropout_51 (Dropout)         (None, 2, 2, 256)       

In [114]:
# Compile and fit the model (10 epochs too)
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              optimizer='adam',
              metrics=['accuracy'])

model_name = 'model_costumized'
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
mc = ModelCheckpoint('deployment/models/'+model_name+'training{accuracy:.4f}_{val_accuracy:.4f}.h5',
                     monitor='val_accuracy', mode='max', verbose=1,
                     save_best_only=True)
#lrl = ReduceLROnPlateau(patience=10,verbose=1,monitor="loss")

model_costumized = model.fit(train_batches, epochs=10,  validation_data=validation_batches, batch_size=16, verbose=1, callbacks=[es, mc])

ValueError: could not broadcast input array from shape (32,32,3) into shape (32,32,3,3)

In [38]:
from tensorflow.keras.models import load_model
best_model = keep_best_saved_h5("/deployment/models/",model_name,0.02)
model = load_model(f"deployment/models/{best_model}")

Currently in directory:C:\Users\leono\Documents\python_projects\challenge-mole\challenge-mole
File coming out of the function: model_costumizedtraining0.4894_0.4828.h5


# Mobile_net different pre processing

In [48]:
from keras.applications.mobilenet_v2 import preprocess_input
import numpy as np
import os

def get_preprocessed_images(images_directory: str, image_size: tuple) -> list:
    images = []
    for img in os.listdir(images_directory):
        img = image.load_img(images_directory+img, target_size=image_size)
        img = image.img_to_array(img)
        img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
        img = preprocess_input(img)
        images.append(img)
    return np.vstack(images)

In [52]:
# Import the keras preprocessing method.
from tensorflow.keras.preprocessing import image

image_size = (224, 224)
# Load your images and preprocess them.
#akiec_images = get_preprocessed_images("data/train/akiec", image_size)
cancer_images = get_preprocessed_images("data", image_size)
#bkl_images = get_preprocessed_images("data/train/bkl", image_size)
#df_images = get_preprocessed_images("data/train/df", image_size)
#mel_images = get_preprocessed_images("data/train/mel", image_size)
#nv_images = get_preprocessed_images("data/train/nv", image_size)
#vasc_images = get_preprocessed_images("data/train/vasc", image_size)

# Make a numpy array for each of the class labels (one hot encoded).
akiec_labels = np.tile([1, 0], (akiec_images.shape[0], 1))
bcc_labels = np.tile([0, 1], (bcc_images.shape[0], 1))
bkl_labels = np.tile([0, 1], (bkl_images.shape[0], 1))
df_labels = np.tile([0, 1], (df_images.shape[0], 1))
mel_labels = np.tile([0, 1], (mel_labels.shape[0], 1))
nv_labels = np.tile([0, 1], (nv_labels.shape[0], 1))
vasc_labels = np.tile([0, 1], (vasc_labels.shape[0], 1))

# Concatenate your images and your labels into X and y.
X = np.concatenate([akiec_images, bcc_images, bkl_images, df_images, mel_images, nv_images, vasc_images])
y = np.concatenate([akiec_labels, bcc_labels, bkl_labels, df_labels, mel_labels, nv_labels, vasc_labels])

print(X.shape, y.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'dataHAM10000_metadata.csv'

In [None]:
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, 
    y,
    test_size=0.2, 
    random_state=42, 
    shuffle=True
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, 
    y_train_val,
    test_size=0.2, 
    random_state=42, 
    shuffle=True
)

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Determine the number of generated samples you want per original sample.
datagen_batch_size = 16

# Make a datagenerator object using ImageDataGenerator.
train_datagen = ImageDataGenerator(rotation_range=60,
                                    horizontal_flip=True)

# Feed the generator your train data.
train_generator = train_datagen.flow(X_train, y_train, batch_size=datagen_batch_size)

# Make a datagenerator object using ImageDataGenerator.
validation_datagen = ImageDataGenerator(rotation_range=60,
                                        horizontal_flip=True)

# Feed the generator your validation data.
validation_generator = validation_datagen.flow(X_val, y_val, batch_size=datagen_batch_size)

In [None]:
# Import your chosen model!
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2

# Make a model object. 
# Make sure you exclude the top part. set the input shape of the model to 224x224 pixels, with 3 color channels.
model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224,224,3))

# Freeze the imported layers so they cannot be retrained.
for layer in model.layers:
    layer.trainable = False
    
model.summary()

In [None]:
from keras import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout

new_model = Sequential()
new_model.add(model)
new_model.add(Flatten())
new_model.add(Dense(64, activation='relu'))
new_model.add(Dropout(0.5))
new_model.add(Dense(7, activation='sigmoid'))

# Summarize.
new_model.summary()

In [None]:
# Compile and fit the model. Use the Adam optimizer and crossentropical loss. 
# Use the validation data argument during fitting to include your validation data.
new_model.compile(optimizer='adam',
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                  metrics=['accuracy'])
history = new_model.fit(train_generator,
                        epochs=10, 
                        batch_size=8,
                        validation_data=validation_generator
                       )

In [None]:
import tensorflow
import matplotlib.pyplot as plt

def plot_history(history : tensorflow.keras.callbacks.History):
    """ This helper function takes the tensorflow.python.keras.callbacks.History
    that is output from your `fit` method to plot the loss and accuracy of
    the training and validation set.
    """
    fig, axs = plt.subplots(1,2, figsize=(12,6))
    axs[0].plot(history.history['accuracy'], label='training set')
    axs[0].plot(history.history['val_accuracy'], label = 'validation set')
    axs[0].set(xlabel = 'Epoch', ylabel='Accuracy', ylim=[0, 1])

    axs[1].plot(history.history['loss'], label='training set')
    axs[1].plot(history.history['val_loss'], label = 'validation set')
    axs[1].set(xlabel = 'Epoch', ylabel='Loss', ylim=[0, 10])
    
    axs[0].legend(loc='lower right')
    axs[1].legend(loc='lower right')
    
plot_history(history)

In [47]:
# Mobile_net same pre processing

In [42]:
model_generator_mobile_net = ImageDataGenerator(preprocessing_function=preprocess_input, rotation_range = 40,
                                        width_shift_range = 0.2, height_shift_range = 0.2, rescale = 1./255,
                                        shear_range = 0.2, zoom_range = 0.2, horizontal_flip = True,
                                        validation_split=.2)

train_batches_mobile_net = model_generator_mobile_net.flow_from_directory(train_path,target_size=size,classes=classes,
                                                batch_size=10, shuffle=True, subset="training")
validation_batches_mobile_net = model_generator_mobile_net.flow_from_directory(train_path,size,classes=classes,
                                                batch_size=20, shuffle=True, subset="validation")
test_batches_mobile_net = ImageDataGenerator(preprocessing_function=preprocess_input)\
    .flow_from_directory(test_path,size,classes=classes, batch_size=10, shuffle=False)

Found 21622 images belonging to 7 classes.
Found 5402 images belonging to 7 classes.
Found 2003 images belonging to 7 classes.


In [45]:
from keras import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout

new_model = Sequential()
new_model.add(model)
new_model.add(Flatten())
new_model.add(Dense(64, activation='relu'))
new_model.add(Dropout(0.3))
new_model.add(Dense(128, activation='relu'))
new_model.add(Dropout(0.3))
new_model.add(Dense(7, activation='sigmoid'))

# Summarize.
new_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_2 (Sequential)    (None, 7)                 628231    
_________________________________________________________________
flatten_5 (Flatten)          (None, 7)                 0         
_________________________________________________________________
dense_8 (Dense)              (None, 64)                512       
_________________________________________________________________
dropout_11 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 7)                

In [46]:
# Compile and fit the model (10 epochs too)
new_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              optimizer='adam',
              metrics=['accuracy'])

model_name = 'model_costumized_mobile_net'
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
mc = ModelCheckpoint('deployment/models/'+model_name+'training{accuracy:.4f}_{val_accuracy:.4f}.h5',
                     monitor='val_accuracy', mode='max', verbose=1,
                     save_best_only=True)
#lrl = ReduceLROnPlateau(patience=10,verbose=1,monitor="loss")

model_costumized_mobile_net = model.fit(train_batches, epochs=10,  validation_data=validation_batches, batch_size=16, verbose=1, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.47927, saving model to deployment/models\model_costumized_mobile_nettraining0.5000_0.4793.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.47927 to 0.48612, saving model to deployment/models\model_costumized_mobile_nettraining0.5094_0.4861.h5
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.48612
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.48612
Epoch 5/10

Epoch 00005: val_accuracy improved from 0.48612 to 0.51259, saving model to deployment/models\model_costumized_mobile_nettraining0.5433_0.5126.h5
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.51259
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.51259
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.51259
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.51259
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 0.51259
