In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
from pathlib import Path
import shutil
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.callbacks import (
    ReduceLROnPlateau,
    EarlyStopping,
    ModelCheckpoint,
    TensorBoard
)
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from utils.model import make_model, freeze_all_vgg, unfreeze_last_vgg
from utils.data import train_test_valid_split, filter_binary_labels, optimize_dataset

In [3]:
DATASET_SOURCE_PATH = Path(r'../data/dataset')
SPLITS_DESTINATION = Path(r'../data')

IMG_HEIGHT = 224
IMG_WIDTH = 224
BATCH_SIZE = 64
SEED = None

#Model parameters:
N_HIDDEN = 512

#Train parameters:
BASE_LEARNING_RATE = 0.001
FINE_TUNING_LEARNING_RATE = 0.001
INITIAL_EPOCHS = 30
FINE_TUNING_EPOCHS = 30
FINE_TUNE_AT_LAYER = 15
LOG_DIR = Path(r'../models/vgg16/logs')
SAVE_DIR = Path(r'../models/vgg16/checkpoints/trained_weights')

## Image files management -> train, test, valid splits

In [4]:
X_train, X_test, X_valid = train_test_valid_split(DATASET_SOURCE_PATH, test_size=0.15, valid_size=0.15)

In [5]:
split = 'train'
destination_path = Path(SPLITS_DESTINATION) / split
if Path(destination_path).exists():
    print('--------------DELETE ' + split.upper() + ' SPLIT------------')
    for directory in destination_path.iterdir():
        if directory.is_dir():
            shutil.rmtree(directory)
print('--------------COPY ' + split.upper() + ' SPLIT------------')
for idx, _ in tqdm(X_train.iterrows(), total=X_train.shape[0]):
    destination = (destination_path / idx.parent.name) / idx.name
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    shutil.copy(idx, destination)

split = 'test'
destination_path = Path(SPLITS_DESTINATION) / split
if Path(destination_path).exists():
    print('--------------DELETE ' + split.upper() + ' SPLIT------------')
    for directory in destination_path.iterdir():
        if directory.is_dir():
            shutil.rmtree(directory)
print('--------------COPY ' + split.upper() + ' SPLIT------------')
for idx, _ in tqdm(X_test.iterrows(), total=X_test.shape[0]):
    destination = (destination_path / idx.parent.name) / idx.name
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    shutil.copy(idx, destination)

split = 'valid'
destination_path = Path(SPLITS_DESTINATION) / split
if Path(destination_path).exists():
    print('--------------DELETE ' + split.upper() + ' SPLIT------------')
    for directory in destination_path.iterdir():
        if directory.is_dir():
            shutil.rmtree(directory)
print('--------------COPY ' + split.upper() + ' SPLIT------------')
for idx, _ in tqdm(X_valid.iterrows(), total=X_valid.shape[0]):
    destination = (destination_path / idx.parent.name) / idx.name
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    shutil.copy(idx, destination)

--------------DELETE TRAIN SPLIT------------
--------------COPY TRAIN SPLIT------------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 823/823 [00:01<00:00, 679.04it/s]


--------------DELETE TEST SPLIT------------
--------------COPY TEST SPLIT------------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:00<00:00, 675.18it/s]


--------------DELETE VALID SPLIT------------
--------------COPY VALID SPLIT------------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:00<00:00, 562.07it/s]


## Dataset loading

In [6]:
train_path = Path(r'../data/train')
train_ds = tf.keras.preprocessing.image_dataset_from_directory(train_path, image_size=(IMG_HEIGHT, IMG_WIDTH),\
                                                               batch_size=BATCH_SIZE, shuffle=True, \
                                                               label_mode='categorical', seed=SEED)

valid_path = Path(r'../data/valid')
valid_ds = tf.keras.preprocessing.image_dataset_from_directory(valid_path, image_size=(IMG_HEIGHT, IMG_WIDTH),\
                                                               batch_size=BATCH_SIZE, shuffle=True, \
                                                               label_mode='categorical', seed=SEED)

class_names = train_ds.class_names
assert class_names == valid_ds.class_names
AUTOTUNE = tf.data.AUTOTUNE

if len(class_names) == 2:  # take the one-hot-encoded matrix of labels and convert to a vector if binary classification
    train_ds = train_ds.map(filter_binary_labels, num_parallel_calls=AUTOTUNE)
    valid_ds = valid_ds.map(filter_binary_labels, num_parallel_calls=AUTOTUNE)
train_ds = optimize_dataset(train_ds)
valid_ds = optimize_dataset(valid_ds)

Found 822 files belonging to 2 classes.
Found 177 files belonging to 2 classes.


In [None]:
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(images[i].numpy().astype("uint8"))
        label_idx = labels.numpy()[i][0] if len(class_names) == 2 else np.argmax(labels.numpy()[i], axis=0)
        plt.title(class_names[label_idx])
        plt.axis("off")

## Model

In [8]:
model = make_model(n_classes=len(class_names), n_hidden=N_HIDDEN)
freeze_all_vgg(model)
loss = tf.keras.losses.CategoricalCrossentropy() if len(class_names) > 2 else tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=BASE_LEARNING_RATE),
              loss=loss, metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 sequential (Sequential)     (None, 224, 224, 3)       0         
                                                                 
 tf.__operators__.getitem (S  (None, 224, 224, 3)      0         
 licingOpLambda)                                                 
                                                                 
 tf.nn.bias_add (TFOpLambda)  (None, 224, 224, 3)      0         
                                                                 
 vgg16 (Functional)          (None, 512)               14714688  
                                                                 
 flatten (Flatten)           (None, 512)               0         
                                                             

#### Classifier initial training

In [9]:
tb = TensorBoard(log_dir=LOG_DIR)
checkpoint = ModelCheckpoint(r'../models/vgg16/checkpoints/train_{epoch}.tf', verbose=1, save_weights_only=True,\
                             save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1)

history = model.fit(train_ds, epochs=INITIAL_EPOCHS, validation_data=valid_ds, callbacks=[tb, checkpoint, reduce_lr, early_stopping])

Epoch 1/30
Epoch 00001: val_loss improved from inf to 2.30603, saving model to ../models/vgg16/checkpoints\train_1.tf
Epoch 2/30
Epoch 00002: val_loss improved from 2.30603 to 1.55796, saving model to ../models/vgg16/checkpoints\train_2.tf
Epoch 3/30
Epoch 00003: val_loss improved from 1.55796 to 0.96544, saving model to ../models/vgg16/checkpoints\train_3.tf
Epoch 4/30
Epoch 00004: val_loss improved from 0.96544 to 0.91993, saving model to ../models/vgg16/checkpoints\train_4.tf
Epoch 5/30
Epoch 00005: val_loss improved from 0.91993 to 0.75096, saving model to ../models/vgg16/checkpoints\train_5.tf
Epoch 6/30
Epoch 00006: val_loss improved from 0.75096 to 0.74665, saving model to ../models/vgg16/checkpoints\train_6.tf
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.74665
Epoch 8/30
Epoch 00008: val_loss improved from 0.74665 to 0.71698, saving model to ../models/vgg16/checkpoints\train_8.tf
Epoch 9/30
Epoch 00009: val_loss improved from 0.71698 to 0.71357, saving model to ../mo

#### Fine tuning

In [10]:
unfreeze_last_vgg(model, which_freeze=FINE_TUNE_AT_LAYER)

total_epochs = INITIAL_EPOCHS + FINE_TUNING_EPOCHS
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=FINE_TUNING_LEARNING_RATE),
              loss=loss, metrics=['accuracy'])
history = model.fit(train_ds, epochs=total_epochs, validation_data=valid_ds, callbacks=[tb, checkpoint, reduce_lr, early_stopping], \
                    initial_epoch=history.epoch[-1])

model.save_weights(SAVE_DIR)

Epoch 25/60
Epoch 00025: val_loss did not improve from 0.65999
Epoch 26/60
Epoch 00026: val_loss did not improve from 0.65999
Epoch 27/60
Epoch 00027: val_loss did not improve from 0.65999
Epoch 28/60
Epoch 00028: val_loss did not improve from 0.65999
Epoch 29/60
Epoch 00029: val_loss did not improve from 0.65999
Epoch 30/60
Epoch 00030: val_loss did not improve from 0.65999
Epoch 31/60
Epoch 00031: val_loss improved from 0.65999 to 0.61122, saving model to ../models/vgg16/checkpoints\train_31.tf
Epoch 32/60
Epoch 00032: val_loss did not improve from 0.61122
Epoch 33/60
Epoch 00033: val_loss did not improve from 0.61122
Epoch 34/60
Epoch 00034: val_loss did not improve from 0.61122
Epoch 35/60
Epoch 00035: val_loss did not improve from 0.61122

Epoch 00035: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 36/60
Epoch 00036: val_loss did not improve from 0.61122
Epoch 37/60
Epoch 00037: val_loss did not improve from 0.61122
Epoch 38/60
Epoch 00038: val_loss did 

#### Model evaluation

In [13]:
test_path = Path(r'../data/test')
test_ds = tf.keras.preprocessing.image_dataset_from_directory(test_path, image_size=(IMG_HEIGHT, IMG_WIDTH), \
                                                              batch_size=BATCH_SIZE, shuffle=False, \
                                                              label_mode='categorical')
assert class_names == test_ds.class_names

if len(class_names) == 2:  # take the one-hot-encoded matrix of labels and convert to a vector if binary classification
    test_ds = test_ds.map(filter_binary_labels, num_parallel_calls=AUTOTUNE)
test_ds = optimize_dataset(test_ds)

metrics = model.evaluate(test_ds)
print('Loss: {} --------- Accuracy: {}%'.format(metrics[0], np.round(metrics[1]*100, 2)))

y_pred = model.predict(test_ds)
y_true = tf.concat([y for x, y in test_ds], axis=0)
if len(class_names) == 2: # uses a threshold for the predictions if binary classification problem
    y_pred[y_pred >= 0.5] = 1
    y_pred[y_pred < 0.5] = 0
    y_true = y_true.numpy()
else: # uses argmax if not binary classification
    y_pred = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_true.numpy(), axis=1)

print(classification_report(y_true, y_pred, target_names=class_names, digits=2))

pred_labels = [('PRED_' + class_name) for class_name in class_names]
real_labels = [('REAL_' + class_name) for class_name in class_names]
pd.DataFrame(confusion_matrix(y_true, y_pred), columns=pred_labels, index=real_labels)

Found 177 files belonging to 2 classes.
Loss: 0.487979918718338 --------- Accuracy: 83.62%
              precision    recall  f1-score   support

           0       0.82      0.94      0.87       108
           1       0.87      0.68      0.76        69

    accuracy                           0.84       177
   macro avg       0.85      0.81      0.82       177
weighted avg       0.84      0.84      0.83       177



Unnamed: 0,PRED_0,PRED_1
REAL_0,101,7
REAL_1,22,47
