# Practice Lab: Chest Cancer Detection

Welcome to the Practice Lab! You will be using the `Chest CT-Scan Dataset` from [Kaggle](https://www.kaggle.com/datasets/mohamedhanyyy/chest-ctscan-images) to train a model that can detect chest cancer from CT-scan images. For this, you will use the Keras `Functional API` and `Transfer Learning` with `MobileNetV2` as the base model.

Let's get started!

Note: 
1. The dataset is slightly modified to fit this practice lab
2. Try to use data augmentation with tools like TensorFlow's `ImageDataGenerator`
3. Try to modify the networks to your own preference
4. You can also use another pre-trained base model from [TensorFlow Hub](https://tfhub.dev/) or [Model Zoo](https://modelzoo.co/framework/tensorflow)

In [None]:
# Import the necessary libraries
import os
import zipfile
import random
import shutil
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from sklearn.model_selection import train_test_split 
from random import randint
import cv2

Download the dataset by running the cell below. 

Note that the `zip` file that contains the images is unzipped into the current working directory (`/content` on Google Colab).

In [None]:
# Download the dataset archive from Google Drive with gdown (the argument is the Drive file id).
# If the download doesn't work, visit https://drive.google.com/uc?id=1CRuANxqfiSqUAGxlqoxkZWjnINHg5fUC,
# download the file manually and place it in the current working directory.

!gdown "1CRuANxqfiSqUAGxlqoxkZWjnINHg5fUC&confirm=t"

In [None]:
# Extract the downloaded archive into the current working directory
!unzip chest_ct-scan.zip

Now the images are stored in the `/content/chest_ct-scan` directory. The directory is split into train, valid, and test. 
There is a subdirectory for each class.

In [None]:
source_path = '/content/chest_ct-scan'

# One sub-directory per class is expected directly under `source_path`
class_names = ['adenocarcinoma', 'squamous-cell-carcinoma', 'large-cell-carcinoma', 'normal']

# Delete image files that cannot be opened, so later cells only see readable images
for _class in class_names:
  folder_path = os.path.join(source_path, _class)
  for img_file in os.listdir(folder_path):
    path = os.path.join(folder_path, img_file)
    if not os.path.isfile(path):
      # Nested directories are not images and os.remove cannot delete them
      continue
    try:
      tf.keras.preprocessing.image.load_img(path)
    except (OSError, ValueError):
      # Only read/decode failures count as "corrupt". A bare `except` would also
      # delete files on unrelated errors (e.g. a missing Pillow install) or on
      # a keyboard interrupt.
      print(f'Removing {img_file} in {_class}')
      os.remove(path)

# os.listdir returns a list containing all files under the given path
for _class in class_names:
  print(f"There are {len(os.listdir(os.path.join(source_path, _class)))} images of {_class}.")

In [None]:
def preview_sample(SOURCE_DIR, NUM_OF_IMGS,
                   CLASS_NAMES=('adenocarcinoma', 'squamous-cell-carcinoma',
                                'large-cell-carcinoma', 'normal')):
  """
  Preview sample images from directories

  For each preview a class is picked at random, then a random file from that
  class directory is read and displayed with matplotlib.

  Args:
    SOURCE_DIR (string): directory path containing one sub-directory per class
    NUM_OF_IMGS (int): number of sample images to preview
    CLASS_NAMES (array_like): class sub-directory names to sample from
      (defaults to the four classes of this lab)

  Returns:
    None
  """
  for _ in range(NUM_OF_IMGS):
    img_class = random.choice(CLASS_NAMES)
    folder_path = os.path.join(SOURCE_DIR, img_class)
    img_file = random.choice(os.listdir(folder_path))
    img = cv2.imread(os.path.join(folder_path, img_file))
    if img is None:
      # cv2.imread signals an unreadable file by returning None instead of raising
      print(f'Could not read {img_file} in {img_class}')
      continue
    # OpenCV loads channels as BGR while matplotlib expects RGB
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.title(img_class)
    plt.show()


Displaying random sample images from dataset

In [None]:
# Show three random images. preview_sample has no return value, so `img_path`
# is always None; the call is made only for the plots it displays.
img_path = preview_sample(source_path, 3)

Read the images and their labels

In [None]:
def load_dataset(SOURCE_DIR, CLASS_NAME, TARGET_SIZE):
  """
  Load images and labels from directories

  Every file in `SOURCE_DIR/<class>` is loaded as an RGB image resized to
  `TARGET_SIZE`. Files that cannot be read as images are reported and skipped.

  Args:
    SOURCE_DIR (string): directory path containing one sub-directory per class
    CLASS_NAME (array_like): Array of class names; the position of a name in
      this array is its integer label
    TARGET_SIZE (array_like): Dimension (height, width) of the images

  Returns:
    array_like: Images data, stacked along the first axis
    array_like: Images label, one-hot encoded (int) when there are more than
      two classes, otherwise the integer class indices

  Raises:
    ValueError: if no readable image is found under `SOURCE_DIR`
  """

  images = []
  class_indices = []

  for class_index, _class in enumerate(CLASS_NAME):
    folder_path = os.path.join(SOURCE_DIR, _class)
    # Sorted so the sample order does not depend on the file system
    for img_file in sorted(os.listdir(folder_path)):
      path = os.path.join(folder_path, img_file)
      try:
        # Decode each file once; the loaded image is reused below
        image = tf.keras.preprocessing.image.load_img(path, color_mode='rgb',
                                                      target_size=TARGET_SIZE)
      except (OSError, ValueError):
        print(f'File {img_file} in {_class} is corrupted')
        continue
      images.append(np.array(image))
      class_indices.append(class_index)

  if not images:
    raise ValueError(f'No readable images found in {SOURCE_DIR}')

  # A single stack over all samples also works when a class has no readable image
  dataset = np.stack(images, axis=0)
  del images

  labels = np.asarray(class_indices)

  # One hot encoder for multiple classes. num_classes is passed explicitly so
  # the encoding width stays len(CLASS_NAME) even if the last class is empty.
  if len(CLASS_NAME) > 2:
    labels = tf.keras.utils.to_categorical(
        labels, num_classes=len(CLASS_NAME)).astype(int)

  return dataset, labels

In [None]:
# Images are resized to 160x160 while loading
img_size = (160, 160)

# The position of each name in this list is the integer label of that class
class_name = [
    'adenocarcinoma',
    'squamous-cell-carcinoma',
    'large-cell-carcinoma',
    'normal',
]

dataset, labels = load_dataset(source_path, class_name, img_size)

In [None]:
# Sanity check: print the shape of the image array, then of the label array
for _array in (dataset, labels):
  print(_array.shape)

In [None]:
# Train test split: keep 90% of the samples for training and 10% for testing.
# `stratify` keeps the class proportions equal in both parts and `random_state`
# makes the shuffled split reproducible between runs.
train_data, test_data, label_train, label_test = train_test_split(
    dataset, labels, train_size=0.9, shuffle=True, stratify=labels, random_state=42)

In [None]:
# Print the shapes of the train data, train labels, test data and test labels
for _array in (train_data, label_train, test_data, label_test):
  print(_array.shape)

Defining our transfer learning model using MobileNetV2

In [None]:
def create_model(input_shape=(160, 160, 3), num_classes=4, dropout_rate=0.1,
                 learning_rate=0.001):
  """
  Build and compile a transfer learning classifier on top of MobileNetV2

  Args:
    input_shape (array_like): shape (height, width, channels) of the input images
    num_classes (int): number of output classes of the softmax layer
    dropout_rate (float): dropout rate applied before the classifier
    learning_rate (float): learning rate of the Adam optimizer

  Returns:
    tf.keras.Model: compiled model; the MobileNetV2 base is frozen, only the
      classifier on top of it is trainable
  """
  # Define our base model
  base_model = tf.keras.applications.MobileNetV2(input_shape=input_shape,
                                               include_top=False,
                                               weights='imagenet')
  base_model.trainable = False # Freeze the base model

  # We need to preprocess our input the way MobileNetV2 expects it
  preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input

  # Add classifier
  global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
  prediction_layer = tf.keras.layers.Dense(num_classes, activation='softmax')

  # Let's combine our model with the Functional API
  inputs = tf.keras.Input(shape=input_shape)
  x = preprocess_input(inputs)
  # training=False keeps the base model in inference mode
  x = base_model(x, training=False)
  x = global_average_layer(x)
  x = tf.keras.layers.Dropout(dropout_rate)(x) # Adding the dropout to prevent overfitting
  outputs = prediction_layer(x)
  model = tf.keras.Model(inputs, outputs)

  # Compile our model; the loss and metric expect one-hot encoded labels
  model.compile(loss='categorical_crossentropy',
              metrics=['categorical_accuracy'],
              optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

  return model

In [None]:
# Build the compiled model: a frozen, ImageNet-pretrained MobileNetV2 base
# with a new classifier on top that has not been trained yet
model = create_model()

In [None]:
# Display model architecture (printed as a summary of the model's layers)
model.summary()

In [None]:
# Train the model for 50 epochs with batches of 64 images.
# validation_split=0.1 makes Keras hold out the last 10% of the training arrays
# for validation; those samples are not used for weight updates.
# `history` keeps the per-epoch metrics that are plotted afterwards.
# Note that this may take some time.
history = model.fit(train_data, label_train,batch_size=64, validation_split=0.1, epochs=50)

Once training has finished, you can run the following cell to check the training and validation accuracy achieved at the end of each epoch.

In [None]:
# Pull the per-epoch curves for the training and validation data out of the
# History object returned by model.fit
_curves = history.history
acc, val_acc = _curves['categorical_accuracy'], _curves['val_categorical_accuracy']
loss, val_loss = _curves['loss'], _curves['val_loss']

epochs = range(len(acc)) # One x position per recorded epoch

# One figure per metric: training curve in red, validation curve in blue
_figures = (
    ('Training and validation accuracy', (acc, 'acc'), (val_acc, 'val_acc')),
    ('Training and validation loss', (loss, 'loss'), (val_loss, 'val_loss')),
)
for _fig_no, (_title, _train, _val) in enumerate(_figures):
  if _fig_no:
    print("") # Blank line between the two figures
  for (_values, _name), _fmt in zip((_train, _val), ('r', 'b')):
    plt.plot(epochs, _values, _fmt, label=_name)
  plt.title(_title)
  plt.legend()
  plt.show()

Test our model

In [None]:
def make_prediction(model, data, label, CLASS_NAME):
  """
  Predict the class of one random sample and display it with its true class

  Args:
    model: trained model
    data (array-like): data to predict
    label (array-like) : true label of data, either one-hot encoded rows or
      integer class indices
    CLASS_NAME (array_like): Array of class names

  Returns:
    None

  Raises:
    ValueError: if `label` is empty
  """

  if label.shape[0] == 0:
    raise ValueError('Cannot make a prediction on an empty dataset')

  idx = randint(0, label.shape[0]-1)
  sample = data[idx]

  # The model predicts on batches, so wrap the single image in a batch of one
  x = np.expand_dims(sample, axis=0)
  y_pred = np.argmax(model.predict(x))

  # One-hot rows are decoded with argmax. Integer labels are used as they are,
  # because argmax of a scalar is always 0.
  true_label = np.asarray(label[idx])
  y = int(np.argmax(true_label)) if true_label.ndim else int(true_label)

  plt.imshow(sample)
  plt.xlabel(f'Predicted class: {CLASS_NAME[y_pred]}\nActual class: {CLASS_NAME[y]}')
  plt.show()


In [None]:
# Show one random test image together with its predicted and actual class
make_prediction(model, test_data, label_test, class_name)