In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from glob import glob
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, Callback
from itertools import cycle
import tensorflow as tf

In [None]:
# Training configuration shared by the cells below
NUM_CLASSES = 7    # number of target classes (width of the softmax output)
SIZE = 128         # images are resized to SIZE x SIZE pixels
EPOCHS = 100       # upper bound on epochs per fit call
BATCH_SIZE = 128   # mini-batch size used by the augmentation generator

In [None]:
# Paths
# The dataset root can be overridden with the HAM10000_DIR environment variable so
# the notebook is not tied to a single machine; when the variable is unset the
# original local location is used.
DATA_ROOT = os.environ.get("HAM10000_DIR", r"C:\Users\guita\Desktop\miniproject1\HAM10000")
train_dir = os.path.join(DATA_ROOT, "train")
val_dir = os.path.join(DATA_ROOT, "val")
test_dir = os.path.join(DATA_ROOT, "test")

In [None]:
# Load images and labels
def load_images_and_labels(image_dir, label_encoder):
    """Load every ``*.jpg`` found one level below ``image_dir``.

    ``image_dir`` is expected to contain one sub-directory per class; the
    name of the sub-directory is used as the label of the images inside it.

    Args:
        image_dir: Directory whose immediate sub-directories hold the images.
        label_encoder: Fitted encoder whose ``transform`` maps the class
            (directory) names to integer class ids.

    Returns:
        A tuple ``(images, labels)``. ``images`` is a float32 array of shape
        ``(n, SIZE, SIZE, 3)`` scaled to the range [0, 1]; ``labels`` is the
        one-hot encoding of the class ids with ``NUM_CLASSES`` columns.
    """
    # glob order depends on the filesystem; sort so the sample order is reproducible.
    image_paths = sorted(glob(os.path.join(image_dir, '*', '*.jpg')))
    if not image_paths:
        # Keep the documented shapes even when the directory holds no images.
        return (np.zeros((0, SIZE, SIZE, 3), dtype=np.float32),
                np.zeros((0, NUM_CLASSES), dtype=np.float32))

    # Pre-allocate in float32: stacking a list and dividing afterwards would
    # create a float64 tensor twice the size that Keras casts down anyway.
    images = np.empty((len(image_paths), SIZE, SIZE, 3), dtype=np.float32)
    class_names = []

    for index, path in enumerate(image_paths):
        # The context manager closes the file handle; convert('RGB') guarantees
        # three channels even for grayscale, palette or RGBA files.
        with Image.open(path) as img:
            images[index] = np.asarray(img.convert('RGB').resize((SIZE, SIZE)))
        class_names.append(os.path.basename(os.path.dirname(path)))

    images /= 255.0  # Normalize images

    # Encode all labels in one call instead of one transform() per image.
    class_ids = label_encoder.transform(class_names)
    labels = to_categorical(class_ids, num_classes=NUM_CLASSES)

    return images, labels

In [None]:
# Label encoding to numeric values
# Only sub-directories are class folders; stray files in the training directory
# (e.g. Thumbs.db, .DS_Store) must not be turned into classes.
labels = sorted(
    entry for entry in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, entry))
)
le = LabelEncoder()
le.fit(labels)

# Load training, validation, and test data
x_train, y_train = load_images_and_labels(train_dir, le)
x_val, y_val = load_images_and_labels(val_dir, le)
x_test, y_test = load_images_and_labels(test_dir, le)

# The images are loaded folder by folder, i.e. grouped by class. Shuffle them
# (with a fixed seed, applying the same permutation to images and labels) so
# that every subset below contains a mix of all classes instead of mostly one.
shuffle_order = np.random.default_rng(42).permutation(len(x_train))
x_train = x_train[shuffle_order]
y_train = y_train[shuffle_order]

# Split training data into 5 subsets
x_train_splits = np.array_split(x_train, 5)
y_train_splits = np.array_split(y_train, 5)

In [None]:
# Model definition
# Three conv -> max-pool -> dropout stages with a decreasing number of filters,
# followed by a small dense head and a softmax over the NUM_CLASSES classes.
model = Sequential([
    Conv2D(256, (3, 3), activation="relu", input_shape=(SIZE, SIZE, 3)),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.3),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.3),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.3),
    Flatten(),
    # The hidden layer needs a non-linearity: a linear Dense(32) followed by the
    # softmax layer is equivalent to a single linear layer and adds no capacity.
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

# One-hot targets -> categorical cross-entropy. The metric is named 'acc', so the
# training history is keyed by 'acc' / 'val_acc'.
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])
model.summary()

In [None]:
# Metrics callback for precision, recall, F1 score
class MetricsCallback(Callback):
    """Print weighted precision, recall and F1 on a validation set after each epoch.

    Args:
        validation_data: Tuple ``(x_val, y_val)`` where ``y_val`` is one-hot
            encoded (it is reduced with ``argmax`` along axis 1).
    """

    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and print a one-line metric summary.

        Args:
            epoch: Zero-based epoch index (printed one-based).
            logs: Metric dict supplied by Keras; may be ``None`` or lack the
                validation entries, in which case fallbacks are used.
        """
        logs = logs or {}
        x_val, y_val = self.validation_data
        # verbose=0: a progress bar per epoch would bury the summary line.
        y_pred = self.model.predict(x_val, verbose=0)
        y_pred_classes = np.argmax(y_pred, axis=1)
        y_true = np.argmax(y_val, axis=1)

        val_loss = logs.get('val_loss', float('nan'))
        val_accuracy = logs.get('val_accuracy', logs.get('val_acc'))  # Handle different versions
        if val_accuracy is None:
            # No accuracy metric was logged; derive it from the predictions
            # instead of failing while formatting None.
            val_accuracy = float(np.mean(y_pred_classes == y_true))

        # zero_division=0 silences the undefined-metric warning raised for
        # classes the model never predicts (common in early epochs).
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred_classes, average='weighted', zero_division=0
        )

        print(f'Epoch {epoch + 1} - '
              f'Validation Loss: {val_loss:.4f}, '
              f'Validation Accuracy: {val_accuracy:.4f}, '
              f'Validation Error: {1 - val_accuracy:.4f}, '
              f'Validation Precision: {precision:.4f}, '
              f'Validation Recall: {recall:.4f}, '
              f'Validation F1 Score: {f1:.4f}')

In [None]:
# Data augmentation: random geometric perturbations applied to the training batches
augmentation_options = dict(
    rotation_range=20,        # random rotations (degrees)
    width_shift_range=0.2,    # horizontal shift, fraction of the width
    height_shift_range=0.2,   # vertical shift, fraction of the height
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',      # how pixels exposed by a transform are filled
)
datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Stop a fit call once val_loss has not improved for 10 epochs and roll the
# model back to the best weights seen during that call.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Per-epoch precision / recall / F1 report on the validation set
validation_pair = (x_val, y_val)
metrics_callback = MetricsCallback(validation_data=validation_pair)

In [None]:
# Train the model on each subset sequentially
# Every fit call returns a fresh History, so the per-subset curves are collected
# here; otherwise only the last subset's epochs would survive the loop.
subset_histories = []
for i, (x_subset, y_subset) in enumerate(zip(x_train_splits, y_train_splits)):
    print(f"Training on subset {i+1}...")
    history = model.fit(
        datagen.flow(x_subset, y_subset, batch_size=BATCH_SIZE),
        epochs=EPOCHS,
        validation_data=(x_val, y_val),
        callbacks=[early_stopping, metrics_callback],
        verbose=2
    )
    # Copy the lists so later fit calls cannot alter what was recorded.
    subset_histories.append(
        {name: list(values) for name, values in history.history.items()}
    )

# Concatenate the per-subset curves and expose them through `history`, so the
# cells that read `history.history[...]` see the whole training run.
if subset_histories:
    merged_history = {}
    for subset_history in subset_histories:
        for name, values in subset_history.items():
            merged_history.setdefault(name, []).extend(values)
    history.history = merged_history
    history.epoch = list(range(len(merged_history.get('loss', []))))

In [None]:
# Score the trained model on the validation images; evaluate() returns
# [loss, accuracy] for the metrics the model was compiled with.
val_score = model.evaluate(x=x_val, y=y_val)
print('Validation accuracy:', val_score[1])

# Same measurement on the held-out test images
test_score = model.evaluate(x=x_test, y=y_test)
print('Test accuracy:', test_score[1])

# 2x2 grid of axes that the training-history plots are drawn on
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

In [None]:
# Training-history plots.
# The figure is created in this cell: with the inline notebook backend a figure
# is rendered and closed at the end of the cell that created it, so drawing on
# axes from an earlier cell and calling plt.show() here would display nothing.
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

recorded = history.history
# The accuracy key follows the metric name passed to compile(): 'acc' or 'accuracy'.
acc_key = 'acc' if 'acc' in recorded else 'accuracy'
val_acc_key = 'val_' + acc_key

# Plot Training & Validation Loss
axs[0, 0].plot(recorded['loss'], label='Train Loss')
axs[0, 0].plot(recorded['val_loss'], label='Validation Loss')
axs[0, 0].set_xlabel('Epoch')
axs[0, 0].set_ylabel('Loss')
axs[0, 0].legend()
axs[0, 0].set_title('Training & Validation Loss')

# Plot Training & Validation Accuracy
axs[0, 1].plot(recorded[acc_key], label='Train Accuracy')
axs[0, 1].plot(recorded[val_acc_key], label='Validation Accuracy')
axs[0, 1].set_xlabel('Epoch')
axs[0, 1].set_ylabel('Accuracy')
axs[0, 1].legend()
axs[0, 1].set_title('Training & Validation Accuracy')

# Plot Validation Error
axs[1, 0].plot(1 - np.array(recorded[val_acc_key]), label='Validation Error')
axs[1, 0].set_xlabel('Epoch')
axs[1, 0].set_ylabel('Error')
axs[1, 0].legend()
axs[1, 0].set_title('Validation Error')

# Only three panels are used; hide the empty fourth one.
axs[1, 1].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Class probabilities for the validation images, plus the predicted and true
# class ids obtained by taking the arg-max over the class axis
y_val_pred = model.predict(x_val)
y_val_true = y_val.argmax(axis=1)
y_val_pred_classes = y_val_pred.argmax(axis=1)

In [None]:
# Confusion matrix for validation data
# Passing labels= keeps the matrix NUM_CLASSES x NUM_CLASSES even when a class
# does not occur in the validation data or is never predicted.
cm_val = confusion_matrix(y_val_true, y_val_pred_classes, labels=np.arange(NUM_CLASSES))

# Use the class names known to the label encoder as tick labels when they line
# up with the matrix; otherwise fall back to the numeric class ids.
if len(le.classes_) == NUM_CLASSES:
    cm_val_ticks = list(le.classes_)
else:
    cm_val_ticks = list(range(NUM_CLASSES))

sns.set(font_scale=1.2)
fig_cm_val, ax_cm_val = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_val, annot=True, fmt='d', cmap='Blues',
            xticklabels=cm_val_ticks, yticklabels=cm_val_ticks, ax=ax_cm_val)
# Rows are the true classes, columns the predicted ones.
ax_cm_val.set_xlabel('Predicted label')
ax_cm_val.set_ylabel('True label')
ax_cm_val.set_title('Confusion Matrix - Validation Data')
plt.show()

In [None]:
# AUC-ROC curve for validation data (one-vs-rest, one curve per class)
# Binarize against the full class list so column i always belongs to class i,
# even if a class is missing from the validation data.
y_val_true_binary = label_binarize(y_val_true, classes=np.arange(NUM_CLASSES))

fpr_val = dict()
tpr_val = dict()
roc_auc_val = dict()

for i in range(NUM_CLASSES):
    if np.unique(y_val_true_binary[:, i]).size < 2:
        # ROC/AUC is undefined when the class has no positives (or no negatives).
        continue
    # ROC needs a continuous score: use the predicted probability of class i.
    # Hard arg-max labels would collapse each curve to a single operating point.
    fpr_val[i], tpr_val[i], _ = roc_curve(y_val_true_binary[:, i], y_val_pred[:, i])
    roc_auc_val[i] = roc_auc_score(y_val_true_binary[:, i], y_val_pred[:, i])

plt.figure(figsize=(8, 8))
# tab10 offers ten distinct colours, so every class gets its own.
roc_colors_val = plt.get_cmap('tab10')

for i in fpr_val:
    plt.plot(fpr_val[i], tpr_val[i], color=roc_colors_val(i % 10), lw=2,
             label=f'Class {i} (AUC = {roc_auc_val[i]:.2f})')

# Chance level
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Validation Data')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Class probabilities for the test images, plus the predicted and true class
# ids obtained by taking the arg-max over the class axis
y_test_pred = model.predict(x_test)
y_test_true = y_test.argmax(axis=1)
y_test_pred_classes = y_test_pred.argmax(axis=1)

In [None]:
# Confusion matrix for test data
# Passing labels= keeps the matrix NUM_CLASSES x NUM_CLASSES even when a class
# does not occur in the test data or is never predicted.
cm_test = confusion_matrix(y_test_true, y_test_pred_classes, labels=np.arange(NUM_CLASSES))

# Use the class names known to the label encoder as tick labels when they line
# up with the matrix; otherwise fall back to the numeric class ids.
if len(le.classes_) == NUM_CLASSES:
    cm_test_ticks = list(le.classes_)
else:
    cm_test_ticks = list(range(NUM_CLASSES))

sns.set(font_scale=1.2)
fig_cm_test, ax_cm_test = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues',
            xticklabels=cm_test_ticks, yticklabels=cm_test_ticks, ax=ax_cm_test)
# Rows are the true classes, columns the predicted ones.
ax_cm_test.set_xlabel('Predicted label')
ax_cm_test.set_ylabel('True label')
ax_cm_test.set_title('Confusion Matrix - Test Data')
plt.show()

# AUC-ROC curve for test data (one-vs-rest, one curve per class)
# Binarize against the full class list so column i always belongs to class i,
# even if a class is missing from the test data.
y_test_true_binary = label_binarize(y_test_true, classes=np.arange(NUM_CLASSES))

fpr_test = dict()
tpr_test = dict()
roc_auc_test = dict()

for i in range(NUM_CLASSES):
    if np.unique(y_test_true_binary[:, i]).size < 2:
        # ROC/AUC is undefined when the class has no positives (or no negatives).
        continue
    # ROC needs a continuous score: use the predicted probability of class i.
    # Hard arg-max labels would collapse each curve to a single operating point.
    fpr_test[i], tpr_test[i], _ = roc_curve(y_test_true_binary[:, i], y_test_pred[:, i])
    roc_auc_test[i] = roc_auc_score(y_test_true_binary[:, i], y_test_pred[:, i])

plt.figure(figsize=(8, 8))
# tab10 offers ten distinct colours, so every class gets its own.
roc_colors_test = plt.get_cmap('tab10')

for i in fpr_test:
    plt.plot(fpr_test[i], tpr_test[i], color=roc_colors_test(i % 10), lw=2,
             label=f'Class {i} (AUC = {roc_auc_test[i]:.2f})')

# Chance level
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Test Data')
plt.legend(loc='lower right')
plt.show()