In [22]:
from google.colab import drive

# Make Google Drive visible under /content/drive so the dataset can be read from it.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Fix randomness and hide warnings
# One seed value is reused for every random number generator seeded in this cell
# (Python hash seed variable, NumPy, `random`, TensorFlow).
seed = 42

import os
# These environment variables are set before TensorFlow and matplotlib are imported below.
# TF_CPP_MIN_LOG_LEVEL='3' presumably silences TensorFlow's native (C++) log output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here likely has no effect on the already-running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
# Point matplotlib's config/cache directory at ./configs/ under the current working directory.
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
# NOTE(review): `Warning` is the base class of all warning categories, so the second filter
# hides every warning and makes the FutureWarning filter redundant.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
# Seed NumPy's legacy global random state.
np.random.seed(seed)

import logging

import random
# Seed Python's built-in random module.
random.seed(seed)

# Tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
# NOTE(review): layers are taken from the standalone `keras` package while models come from
# `tf.keras`; this relies on both resolving to the same Keras installation - confirm.
from keras import layers as tfkl
# Reduce TensorFlow's Python-side logging to errors only.
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow's global random generator (TF2 API and the v1 compatibility API).
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
# Record the TensorFlow version in the cell output.
print(tf.__version__)

# Some libraries
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns

#Load dataset
# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code. Only load archives that come from a trusted source.
# The context manager closes the .npz file handle as soon as both arrays have been read into
# memory; `input_file` stays bound afterwards but is closed.
with np.load('/content/drive/My Drive/public_data.npz', allow_pickle=True) as input_file:
    data = input_file['data']
    labels = input_file['labels']

#Normalize data
# Assumes the stored pixel values are in the 0..255 range - TODO confirm against the dataset.
data = data / 255.0

# Map the string class names to integer class ids.
label_dict = {'unhealthy': 0, 'healthy': 1}
labels = np.array([label_dict[label] for label in labels])

2.14.0


A function to show images

In [24]:
def show_images(X_val, y_val, num_img=10):
    """Display the first images of a set in one row, each titled with its label.

    Args:
        X_val: Indexable collection of images accepted by ``imshow``. Pixel values may be
            scaled to [0, 1] or to [0, 255]; only images whose maximum exceeds 1 are
            divided by 255, so already-normalized data is not darkened a second time.
        y_val: Labels aligned with ``X_val``. The first element of each entry is used as
            the title (a scalar label is used as is).
        num_img: Maximum number of images to show. Fewer are shown when ``X_val`` is shorter.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    count = min(num_img, len(X_val))
    if count <= 0:
        # Nothing to draw; plt.subplots cannot create a grid with zero columns.
        return

    # figsize is expressed in inches: about 2 inches per thumbnail keeps the row readable.
    # squeeze=False keeps `axes` two-dimensional even when a single image is shown.
    fig, axes = plt.subplots(1, count, figsize=(2 * count, 2.5), squeeze=False)

    # Iterate through the selected number of images
    for i in range(count):
        ax = axes[0, i]
        image = np.asarray(X_val[i], dtype=float)
        if image.max() > 1.0:
            # Data still in the 0..255 range: bring it into [0, 1] for float display.
            image = image / 255
        ax.imshow(image)  # Show the image
        ax.set_title(f'{np.ravel(y_val[i])[0]}')  # Show the corresponding class label

    # Adjust layout and display the images
    plt.tight_layout()
    plt.show()

Remove irrelevant data (images identical to the two reference outlier images at indices 506 and 338)

In [25]:

# Reference outlier images, captured once before the loop.
# NOTE: this cell overwrites `data` and `labels`, so indices 506 and 338 refer to the freshly
# loaded dataset; re-run the loading cell before re-running this one.
shrek_reference = data[506]
trol_reference = data[338]

shrek_indices = []
trol_indices = []
new_data = []
new_labels = []
for i, image in enumerate(data):
  # Compare element-wise: a zero *sum* of differences does not prove two images are equal,
  # because positive and negative differences can cancel each other out.
  if np.array_equal(image, shrek_reference):
    shrek_indices.append(i)
  elif np.array_equal(image, trol_reference):
    trol_indices.append(i)
  else:
    new_data.append(image)
    new_labels.append(labels[i])

# Keep only the images (and their labels) that match neither reference outlier.
data = np.array(new_data)
labels = np.array(new_labels)

In [26]:


# Integer class id of every image. `labels` is replaced by its one-hot form further down,
# so when this cell is re-run the ids are recovered with argmax instead of failing.
if labels.ndim == 2 and labels.shape[1] > 1:
    class_ids = np.argmax(labels, axis=1)
else:
    class_ids = np.ravel(labels)

#split into train+validation and test
# stratify keeps the class proportions the same in both parts (the classes are imbalanced).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    data, class_ids, test_size=0.4, random_state=seed, stratify=class_ids)
train_val_ids = y_train_val  # 1-D ids, reused below to stratify the train/validation split

# Column-vector form of the integer labels, shape (N, 1)
y_train_val = np.expand_dims(y_train_val, axis=1)
y_test = np.expand_dims(y_test, axis=1)

# One-hot version of the full label set
labels = tfk.utils.to_categorical(class_ids)
num_classes = labels.shape[1]

# NOTE: the "train" shapes printed here are those of the combined train+validation set.
print("X_train shape: ", X_train_val.shape)
print("X_test shape: ", X_test.shape)

print("y_train shape: ", y_train_val.shape)
print("y_test shape: ", y_test.shape)

# Show the first 10 images
#show_images(X_train_val, y_train_val)

# One-hot encode with an explicit class count so every split gets the same number of columns.
y_train_val = tfk.utils.to_categorical(y_train_val, num_classes=num_classes)
y_test = tfk.utils.to_categorical(y_test, num_classes=num_classes)

#split into train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=seed, stratify=train_val_ids)

# Compare categorical label and "default" label representation
print('Categorical label:', y_train[0])           # Display the categorical label
print('"Default" label:', np.argmax(y_train[0]))   # Display the equivalent numeric label


# Define key model parameters
input_shape = X_train.shape[1:]  # Input shape for the model
output_shape = y_train.shape[1]  # Output shape for the model
batch_size = 32               # Batch size for training
epochs = 100                   # Number of training epochs

# Print the defined parameters
print("Epochs:", epochs)
print("Batch Size:", batch_size)
print("Input Shape:", input_shape)
print("Output Shape:", output_shape)

X_train shape:  (3002, 96, 96, 3)
X_test shape:  (2002, 96, 96, 3)
y_train shape:  (3002, 1)
y_test shape:  (2002, 1)
Categorical label: [0. 1.]
"Default" label: 1
Epochs: 100
Batch Size: 32
Input Shape: (96, 96, 3)
Output Shape: 2


In [27]:
def buildCNN(input_shape, output_shape):
    """Build and compile the convolutional classifier.

    The network applies a block of random augmentation layers to the input, then four
    Conv2D -> MaxPooling2D -> Dropout stages with 32, 64, 128 and 256 filters, and finally
    a flatten layer, two dense layers (256 and 64 units) and a softmax output.

    Args:
        input_shape: Shape of one input sample, passed to the Keras Input layer.
        output_shape: Number of units of the softmax output layer.

    Returns:
        The model, compiled with the Adam optimizer, categorical cross-entropy loss and
        the accuracy metric.
    """
    # Random augmentation layers, grouped in their own named sub-model
    augmentation_layers = [
        tfkl.RandomFlip("horizontal"),
        tfkl.RandomRotation(0.2),
        tfkl.RandomBrightness(0.2, value_range=(0,1)),
        tfkl.RandomTranslation(0.2,0.2),
    ]
    preprocessing = tf.keras.Sequential(augmentation_layers, name='preprocessing')

    input_layer = tfkl.Input(shape=input_shape)
    features = preprocessing(input_layer)

    # Four identical stages that differ only in the number of filters
    for stage, filters in enumerate((32, 64, 128, 256), start=1):
        features = tfkl.Conv2D(
            filters,
            kernel_size=(3, 3),
            activation='relu',
            padding='same',
            name=f'conv_{stage}',
        )(features)
        features = tfkl.MaxPooling2D(pool_size=(2, 2), name=f'pool_{stage}')(features)
        features = tfkl.Dropout(0.25, name=f'dropout{stage}')(features)

    # Classification head
    features = tfkl.Flatten(name='flatten')(features)
    features = tfkl.Dense(256, activation='relu', name='dense_1')(features)
    features = tfkl.Dense(64, activation='relu', name='dense_2')(features)
    output_layer = tfkl.Dense(output_shape, activation='softmax', name='output')(features)

    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [28]:
# Early-stopping callback: watches the validation loss (lower is better) with a patience of
# 10 epochs and is configured to restore the best weights.
early_stopping = tfk.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [29]:
# Checkpoint callback: writes to 'best_model_finetune.h5', tracking validation accuracy
# (higher is better) and keeping only the best model.
model_checkpoint = tfk.callbacks.ModelCheckpoint(
    'best_model_finetune.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
)

In [30]:
from sklearn.utils.class_weight import compute_class_weight
# Balanced class weights derived from the training labels, so they stay correct when the
# dataset or the split changes (previously hard-coded as {0: 0.81, 1: 1.29}).
# y_train is one-hot encoded; compute_class_weight expects 1-D class ids, hence the argmax.
train_class_ids = np.argmax(y_train, axis=1)
present_classes = np.unique(train_class_ids)
balanced_weights = compute_class_weight('balanced', classes=present_classes, y=train_class_ids)

# Plain int keys / float values, rounded to two decimals to match the previous format.
class_weights_dict = {
    int(class_id): round(float(weight), 2)
    for class_id, weight in zip(present_classes, balanced_weights)
}

print(f"Class weights: {class_weights_dict}")

Class weights: {0: 0.81, 1: 1.29}


In [31]:
def scheduler(epoch, lr, hold_epochs=50, decay_rate=0.1):
  """Learning-rate schedule: constant at first, then exponential decay.

  Args:
    epoch: Index of the current epoch.
    lr: Current learning rate.
    hold_epochs: Number of initial epochs during which the rate is left unchanged.
    decay_rate: From ``hold_epochs`` on, the rate is multiplied by ``exp(-decay_rate)``
      on every call.

  Returns:
    ``lr`` unchanged while ``epoch < hold_epochs``; otherwise the decayed rate as a plain
    Python float (not a tensor), so any caller that requires a float accepts it.
  """
  if epoch < hold_epochs:
    return lr
  return float(lr * np.exp(-decay_rate))
# Keras callback that obtains the learning rate from `scheduler`.
lrscheduler = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)

In [None]:
# Build a fresh model and print its architecture.
cnn_model = buildCNN(input_shape, output_shape)
cnn_model.summary()

# Train with early stopping, best-model checkpointing and the learning-rate schedule.
# class_weights_dict was defined in an earlier cell but never handed to fit(); passing it
# makes the loss weight the two classes according to those values.
history = cnn_model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        callbacks=[early_stopping, model_checkpoint, lrscheduler],
                        validation_data=(X_val, y_val),
                        class_weight=class_weights_dict)