In [1]:
# First we need to install Augmentor
!pip install Augmentor

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import os
import Augmentor

from google.colab import drive

# Locations of the train and test images on the mounted Google Drive.
# - Local runs: this notebook is heavy on memory and time; Google Colab is
#   recommended and running locally is not.
# - Mount folder: the dataset folder was renamed from
#   'Skin cancer ISIC The International Skin Imaging Collaboration' to
#   'Cancer_Dataset' to meet Google Drive requirements.
# - Coding standards: the sub-folders were renamed from 'Train' to 'train'
#   and from 'Test' to 'test'.
test_dir = "/content/drive/MyDrive/Cancer_Dataset/test/"
train_dir = "/content/drive/MyDrive/Cancer_Dataset/train/"

# Mount the Google account's Drive so the paths above can be read
drive.mount('/content/drive')

# Dataset Creation with ImageDataGenerator
# Images are read from train_dir in batches, resized, and rescaled to the 0-1 range.
img_height, img_width = 180, 180  # Every image is resized to 180x180 pixels
batch_size = 32  # Images per batch

# One generator object serves both subsets: 80% training, 20% validation
train_datagen = ImageDataGenerator(validation_split=0.2, rescale=1.0 / 255)

# Settings shared by the training and validation directory iterators
flow_options = dict(
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='sparse',  # Integer-encoded labels
)

# Training subset of train_dir
train_generator = train_datagen.flow_from_directory(
    train_dir, subset='training', **flow_options
)

# Validation subset of train_dir
validation_generator = train_datagen.flow_from_directory(
    train_dir, subset='validation', **flow_options
)

# Visualizing the Dataset
# Show a 3x3 grid of images taken from a single training batch, each titled with its class.
class_names = list(train_generator.class_indices)  # Class (disease category) names in index order

# Pull exactly one batch from the training generator
sample_images, sample_labels = next(iter(train_generator))

fig = plt.figure(figsize=(10, 10))
for position in range(1, 10):
    ax = fig.add_subplot(3, 3, position)
    ax.imshow(sample_images[position - 1])  # Image pixels are already rescaled by the generator
    ax.set_title(class_names[int(sample_labels[position - 1])])  # Label index -> class name
    ax.axis('off')  # No ticks or frame around the picture
plt.show()

# Model Building
def create_model(input_shape=(img_height, img_width, 3), num_classes=9):
    """Build (but do not compile) the CNN used for image classification.

    The network is three Conv2D(3x3, ReLU) + MaxPooling2D(2x2) blocks with
    32, 64 and 128 filters, followed by Flatten -> Dense(128, ReLU) ->
    Dropout(0.5) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of a single input image as (height, width, channels).
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``models.Sequential`` model.
    """
    return models.Sequential([
        # Declare the input explicitly; passing `input_shape=` to the first
        # layer is the deprecated way of doing this in current Keras.
        layers.Input(shape=input_shape),

        # First convolutional block
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),

        # Second convolutional block
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),

        # Third convolutional block
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),

        # Classifier head
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),  # Drop 50% of the units during training
        layers.Dense(num_classes, activation='softmax'),  # One probability per class
    ])

# Model Compilation
# Size the output layer from the classes the training generator actually found,
# so the softmax width always matches the integer labels it yields instead of
# relying on create_model's hard-coded default.
cnn_model = create_model(num_classes=len(train_generator.class_indices))

# Adam optimizer, sparse categorical crossentropy (labels are integer encoded), accuracy metric
cnn_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Dynamic Augmentation for Under-represented Classes using Augmentor
parent_directory = "/content/drive/MyDrive/Cancer_Dataset/train"

# Number of Augmentor-generated images each class folder should end up with
samples_per_class = 1000

# List of all 9 classes
classes = [
    'actinic keratosis',
    'basal cell carcinoma',
    'dermatofibroma',
    'melanoma',
    'nevus',
    'pigmented benign keratosis',
    'seborrheic keratosis',
    'squamous cell carcinoma',
    'vascular lesion'
]

# Apply augmentation for each class
for class_name in classes:
    class_path = os.path.join(parent_directory, class_name)

    if not os.path.isdir(class_path):
        print(f"Directory does NOT exist: {class_path}")
        continue

    # Augmentor saves generated images into an "output" sub-folder of the
    # source directory by default. The data lives on persistent Drive storage,
    # so count what an earlier run already produced and only generate the
    # shortfall; otherwise every rerun would add another full set of images.
    output_path = os.path.join(class_path, "output")
    if os.path.isdir(output_path):
        already_generated = sum(1 for entry in os.scandir(output_path) if entry.is_file())
    else:
        already_generated = 0

    remaining = samples_per_class - already_generated
    if remaining <= 0:
        print(f"Skipping class ({already_generated} augmented images already present): {class_path}")
        continue

    print(f"Processing class: {class_path}")

    # Apply augmentation using Augmentor
    p = Augmentor.Pipeline(class_path)
    p.rotate(probability=0.7, max_left_rotation=10, max_right_rotation=10)
    p.zoom_random(probability=0.5, percentage_area=0.8)
    p.flip_left_right(probability=0.5)
    p.sample(remaining)  # Top the class up to samples_per_class generated images

# Model Training
# Fit cnn_model on train_generator and score it on validation_generator after every epoch.
epochs = 20  # Full passes over the training data
history = cnn_model.fit(
    x=train_generator,
    epochs=epochs,
    validation_data=validation_generator,
)

# Evaluate Overfitting or Underfitting
# Draw training vs. validation curves side by side: accuracy on the left, loss on the right.
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']  # Per-epoch accuracy
loss, val_loss = curves['loss'], curves['val_loss']  # Per-epoch loss

epochs_range = range(epochs)

fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(8, 8))

accuracy_ax.plot(epochs_range, acc, label='Training Accuracy')
accuracy_ax.plot(epochs_range, val_acc, label='Validation Accuracy')
accuracy_ax.legend(loc='lower right')
accuracy_ax.set_title('Training and Validation Accuracy')

loss_ax.plot(epochs_range, loss, label='Training Loss')
loss_ax.plot(epochs_range, val_loss, label='Validation Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_title('Training and Validation Loss')

plt.show()

# Data Augmentation for Overfitting
# Augmenting data (rotation, shifting, zooming, flipping) to prevent overfitting
train_datagen_augmented = ImageDataGenerator(
    rescale=1./255,  # Rescale pixel values between 0 and 1
    rotation_range=40,  # Randomly rotate images by up to 40 degrees
    width_shift_range=0.2,  # Randomly shift images horizontally
    height_shift_range=0.2,  # Randomly shift images vertically
    shear_range=0.2,  # Shear the images
    zoom_range=0.2,  # Randomly zoom in on images
    horizontal_flip=True,  # Randomly flip images horizontally
    fill_mode='nearest',  # Fill in missing pixels after transformations
    validation_split=0.2  # Reserve 20% for validation
)

# Creating augmented training data
# NOTE(review): this directory scan runs after the Augmentor step, so any images
# Augmentor saved under train_dir are presumably picked up here too - confirm
# that this is intended.
train_generator_augmented = train_datagen_augmented.flow_from_directory(
    train_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='sparse',
    subset='training'  # Only for training
)

# Start this experiment from freshly initialised weights. Continuing to fit the
# model that was already trained in the previous step would make
# history_augmented a continuation of that run rather than a measurement of
# what augmentation does on its own. cnn_model is rebound so it still refers to
# the most recently trained model.
cnn_model = create_model(num_classes=len(train_generator_augmented.class_indices))
cnn_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Training the model on augmented data
history_augmented = cnn_model.fit(
    train_generator_augmented,  # Augmented training data
    validation_data=validation_generator,  # Validation data
    epochs=20  # Number of epochs
)

# Step 4: Train the model on rectified class imbalance data for 30 epochs
# Re-scan train_dir with a plain (rescale-only) generator so the file list
# reflects whatever the Augmentor step added to the class folders.
train_datagen_balanced = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator_balanced = train_datagen_balanced.flow_from_directory(
    train_dir,  # The directory with rectified (augmented) data
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='sparse',
    subset='training'
)

# Start from freshly initialised weights so history_balanced reflects training
# on the rebalanced data only, not a continuation of the earlier fit() runs.
# cnn_model is rebound so it still refers to the most recently trained model.
cnn_model = create_model(num_classes=len(train_generator_balanced.class_indices))
cnn_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Train the model on the rectified dataset for 30 epochs
# NOTE(review): validation_generator was built before the Augmentor step, and
# Augmentor samples from every original image in a class folder, so augmented
# variants of validation images may be present in this training set - verify
# whether the split should be made before augmenting.
history_balanced = cnn_model.fit(
    train_generator_balanced,  # Balanced training data
    validation_data=validation_generator,  # Validation data
    epochs=30  # Train for 30 epochs
)


Collecting Augmentor
  Obtaining dependency information for Augmentor from https://files.pythonhosted.org/packages/f3/86/5a91176650eb229ea2cd95551c34c36fba6cd95da3bdc4a5c73fbb1536ca/Augmentor-0.2.12-py2.py3-none-any.whl.metadata
  Downloading Augmentor-0.2.12-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading Augmentor-0.2.12-py2.py3-none-any.whl (38 kB)
Installing collected packages: Augmentor
Successfully installed Augmentor-0.2.12


ModuleNotFoundError: No module named 'tensorflow'