<a href="https://colab.research.google.com/github/louisgraham333/cancer_image_detection_kaggle/blob/main/Image_Classification_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Histopathologic Cancer Detection
*This notebook was created for the Kaggle Histopathologic Cancer Detection Challenge. It pulls the data from Kaggle, trains an image classifier on it, checks the classifier on a held-out validation split, and produces a set of predictions for the Kaggle test set.*

---



## Chapters


1. Prepare the script and set up the connection to Kaggle
2. Download the data, and move it to the correct folders
3. Create and fit the model (Keras/TensorFlow)
4. Evaluate on the validation set and predict on the test set

## Chapter 1: Prepare the script
Import packages, set up the file directory, and set up the link to Kaggle

In [None]:
###Install and import packages
!pip install kaggle
!pip install -U -q kaggle
import json
import zipfile
import os
import pandas as pd
import numpy as np
from glob import glob
import time
import shutil
import skimage.io as io
from skimage.io import imread
import matplotlib.pyplot as plt
import random
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow import set_random_seed

In [None]:
#Authenticate Google Drive (to allow saving)
# Sign the Colab user in first; the application-default credentials fetched below
# are then attached to the PyDrive auth object that backs the `drive` client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()                       
# `drive` is the client used later to list, download and upload model files
drive = GoogleDrive(gauth)

In [None]:
#Seed every random number generator used in the notebook
# np.random.seed() returns None, so its return value must not be passed on as a seed:
# random.seed(None) falls back to OS entropy / the clock, which makes every run differ.
# Keep the seed as a plain integer and hand the same value to both generators.
seed_object = 7654
np.random.seed(seed_object)
random.seed(seed_object)
# TensorFlow graph-level seed (kept at its original value)
set_random_seed(101)

In [None]:
#Create directory structure for image generator to work off
# Layout (relative to the current working directory):
#   base_dir/train_all_dir                      - all unzipped training images
#   base_dir/train_dir/{a_no..., b_has...}      - training split, one folder per class
#   base_dir/val_dir/{a_no..., b_has...}        - validation split, one folder per class
#   base_dir/test_dir/test_images               - unzipped test images
# os.makedirs(..., exist_ok=True) keeps the cell re-runnable; plain os.mkdir raises
# FileExistsError as soon as the cell is executed a second time in the same runtime.
base_dir = 'base_dir'
train_all_dir = os.path.join(base_dir, 'train_all_dir')
train_dir = os.path.join(base_dir, 'train_dir')
val_dir = os.path.join(base_dir, 'val_dir')
test_dir = os.path.join(base_dir, 'test_dir')
test_images = os.path.join(test_dir, 'test_images')

# One sub-folder per class inside each of the train and validation splits
class_dirs = [os.path.join(split_dir, class_name)
              for split_dir in (train_dir, val_dir)
              for class_name in ('a_no_tumor_tissue', 'b_has_tumor_tissue')]

# makedirs also creates the missing parents (base_dir, train_dir, val_dir, test_dir)
for directory in [train_all_dir] + class_dirs + [test_images]:
  os.makedirs(directory, exist_ok=True)

# The original cell left these two names pointing at the validation class folders;
# keep that final state for anything that still reads them.
no_tumor_tissue = os.path.join(val_dir, 'a_no_tumor_tissue')
has_tumor_tissue = os.path.join(val_dir, 'b_has_tumor_tissue')

In [None]:
###Set up connection to Kaggle and upload API file
# Create the Kaggle config folder in the home directory (-p: no error if it already exists)
!mkdir -p ~/.kaggle
from google.colab import files
# Opens Colab's file-upload prompt; the next cell expects a file named kaggle.json
# to have been uploaded into the current working directory.
files.upload()

In [None]:
###Store API file, make private and view API
# Copy the uploaded token into the Kaggle config folder
!cp kaggle.json ~/.kaggle/
# Restrict the token to owner read/write.
# NOTE(review): this path hard-codes /root while the copy above uses ~ - the two only
# agree when the notebook runs as root; confirm for non-Colab runtimes.
!chmod 600 /root/.kaggle/kaggle.json
# Listing competitions doubles as a check that the token is accepted
!kaggle competitions list   

In [None]:
###List the files in the competition
# Shows the downloadable files of the histopathologic-cancer-detection competition
!kaggle competitions files histopathologic-cancer-detection

## Chapter 2: Download the data, and move to correct folders
Pull the data from the Kaggle API into Colab, then copy the images into the per-class folders that the Keras data generators read from.

In [None]:
###Download sample submission, unzip and delete zip
# Work inside base_dir so the download lands there; later cells in this chapter
# rely on the working directory set here.
os.chdir('/content/base_dir')
!kaggle competitions download -f sample_submission.csv histopathologic-cancer-detection
# The archive on disk is sample_submission.csv.zip (see the os.remove below);
# presumably unzip resolves the bare name by trying the .zip suffix - confirm if this step fails.
!unzip -q sample_submission.csv
# Remove the archive once extracted
os.remove("sample_submission.csv.zip")

In [None]:
###Download train labels, unzip and delete zip
# Runs in whatever directory the previous cell left active (it chdir'd to /content/base_dir)
!kaggle competitions download -f train_labels.csv histopathologic-cancer-detection
# The archive on disk is train_labels.csv.zip (see the os.remove below);
# presumably unzip resolves the bare name by trying the .zip suffix - confirm if this step fails.
!unzip -q train_labels.csv
# Remove the archive once extracted
os.remove("train_labels.csv.zip")

In [None]:
###Download train, unzip and delete zip
# Extract every training image into train_all_dir; the copy cells below read from it
os.chdir('/content/base_dir/train_all_dir')
!kaggle competitions download -f train.zip histopathologic-cancer-detection
# -q: quiet; the archive on disk is train.zip (see the os.remove below)
!unzip -q train
# Remove the archive once extracted
os.remove("train.zip")

In [None]:
###Download test, unzip and delete zip
# Test images go into test_dir/test_images; the test data generator later points at test_dir
os.chdir('/content/base_dir/test_dir/test_images')
!kaggle competitions download -f test.zip histopathologic-cancer-detection
# -q: quiet; the archive on disk is test.zip (see the os.remove below)
!unzip -q test
# Remove the archive once extracted
os.remove("test.zip")

In [None]:
#Check number of objects in each
# Print the recursive file count of the training folder, then of the test folder
for counted_dir in ("/content/base_dir/train_all_dir", "/content/base_dir/test_dir"):
  print(sum(len(names) for _, _, names in os.walk(counted_dir)))

In [None]:
#Open and inspect labels
# Switch to base_dir: train_labels.csv lives there, and the copy cells below use
# paths relative to this directory.
os.chdir('/content/base_dir')
# Later cells read the 'id' and 'label' columns of this frame
labels = pd.read_csv("train_labels.csv")
# Last expression of the cell: displays the first 10 rows
labels.head(10)

In [None]:
#Split labels into train and validation
# Draw one uniform random number per row: rows that draw below 0.9 go to training,
# the remaining rows to validation. The split sizes are therefore only approximately
# 90/10 and depend on the NumPy random state.
msk = np.random.rand(labels.shape[0]) < 0.9
train_labels = labels.loc[msk]
val_labels = labels.loc[np.logical_not(msk)]

In [None]:
#Split labels into has tumor and no tumor
# For each split keep only the image ids, grouped by label value (1 = has tumor, 0 = no tumor)
train_has_labels = train_labels.loc[train_labels['label'] == 1, 'id']
train_no_labels = train_labels.loc[train_labels['label'] == 0, 'id']
val_has_labels = val_labels.loc[val_labels['label'] == 1, 'id']
val_no_labels = val_labels.loc[val_labels['label'] == 0, 'id']

In [None]:
#Copy training images labelled "has tumor" into their class folder
# Paths are relative, so the working directory must still be /content/base_dir
for sample_id in train_has_labels:
  tif_name = str(sample_id) + ".tif"
  shutil.copyfile("train_all_dir/" + tif_name, "train_dir/b_has_tumor_tissue/" + tif_name)

In [None]:
#Copy training images labelled "no tumor" into their class folder
# Paths are relative, so the working directory must still be /content/base_dir
for sample_id in train_no_labels:
  tif_name = str(sample_id) + ".tif"
  shutil.copyfile("train_all_dir/" + tif_name, "train_dir/a_no_tumor_tissue/" + tif_name)

In [None]:
#Copy validation images labelled "has tumor" into their class folder
# Paths are relative, so the working directory must still be /content/base_dir
for sample_id in val_has_labels:
  tif_name = str(sample_id) + ".tif"
  shutil.copyfile("train_all_dir/" + tif_name, "val_dir/b_has_tumor_tissue/" + tif_name)

In [None]:
#Copy validation images labelled "no tumor" into their class folder
# Paths are relative, so the working directory must still be /content/base_dir
for sample_id in val_no_labels:
  tif_name = str(sample_id) + ".tif"
  shutil.copyfile("train_all_dir/" + tif_name, "val_dir/a_no_tumor_tissue/" + tif_name)

In [None]:
#Check numbers to make sure that they add up
# The four class-folder counts should sum to the "All train" count
count_targets = [
    ("All train: ", "/content/base_dir/train_all_dir"),
    ("Train no tumour: ", "/content/base_dir/train_dir/a_no_tumor_tissue"),
    ("Train with tumour: ", "/content/base_dir/train_dir/b_has_tumor_tissue"),
    ("Val no tumour: ", "/content/base_dir/val_dir/a_no_tumor_tissue/"),
    ("Val with tumour: ", "/content/base_dir/val_dir/b_has_tumor_tissue"),
]
for caption, counted_dir in count_targets:
  # Recursive file count of the folder, printed after its caption
  print(caption + str(sum(len(names) for _, _, names in os.walk(counted_dir))))

## Chapter 3: Create and fit the model
The model is built with Keras. Images are fed to it through Keras's built-in data generators, because the full dataset cannot be held in memory at once.

In [None]:
#Inspect images
# Show four random training images per class: top row "has tumor", bottom row "no tumor".
# The original cell drew positions with random.randint(1, <hard-coded class size>):
#   - the class sizes come from a random split, so the constants can be wrong;
#   - randint includes its upper bound, so the draw could land one past the last row;
#   - position 0 could never be picked.
# random.randrange(len(ids)) always yields a valid position for the actual series.
preview_rows = [
    ("train_dir/b_has_tumor_tissue/", train_has_labels),
    ("train_dir/a_no_tumor_tissue/", train_no_labels),
]
f, axarr = plt.subplots(2,4)
for row, (class_folder, ids) in enumerate(preview_rows):
  for col in range(4):
    picked_id = ids.iloc[random.randrange(len(ids))]
    axarr[row, col].imshow(io.imread(class_folder + str(picked_id) + ".tif"))

In [None]:
#Variables needed for data generators
# Folder names are relative to /content/base_dir (the generators cell chdir's there)
train_path = "train_dir"
val_path = "val_dir"
image_size = 96
train_batch_size = 32
# Number of training images = files found in the two training class folders
train_length = sum([len(files) for r, d, files in os.walk("/content/base_dir/train_dir/a_no_tumor_tissue")]) + sum([len(files) for r, d, files in os.walk("/content/base_dir/train_dir/b_has_tumor_tissue")])
# Batches needed to see every image once. np.ceil returns a float, but Keras documents
# steps_per_epoch / validation_steps as integers, so cast explicitly.
train_steps = int(np.ceil(train_length / train_batch_size))
val_batch_size = 1024
# Number of validation images = files found in the two validation class folders
val_length = sum([len(files) for r, d, files in os.walk("/content/base_dir/val_dir/a_no_tumor_tissue")]) + sum([len(files) for r, d, files in os.walk("/content/base_dir/val_dir/b_has_tumor_tissue")])
val_steps = int(np.ceil(val_length / val_batch_size))

In [None]:
#Set up the data generators
# The folder names in train_path / val_path are relative to base_dir
os.chdir('/content/base_dir')
# Both generators only rescale: every pixel value is multiplied by 1/255
datagen_train = ImageDataGenerator(rescale=1.0/255)
datagen_val = ImageDataGenerator(rescale=1.0/255)
# Settings shared by the training and validation streams
shared_flow_kwargs = dict(target_size=(image_size,image_size),
                          class_mode='categorical')
train_gen = datagen_train.flow_from_directory(train_path,
                                              batch_size=train_batch_size,
                                              **shared_flow_kwargs)
val_gen = datagen_val.flow_from_directory(val_path,
                                          batch_size=val_batch_size,
                                          **shared_flow_kwargs)

In [None]:
#Set up the model (3 blocks of 3 conv layers each, then fully connected)
kernel_size = (3,3)
pool_size = (2,2)
first_filters = 32
second_filters = 64
third_filters = 128
dropout = 0.3

model = Sequential()

# Only the very first layer declares the input shape (96x96 images, 3 channels)
first_layer_kwargs = {'input_shape': (96, 96, 3)}
for block_filters in (first_filters, second_filters, third_filters):
  # Each block: three 3x3 ReLU convolutions, 2x2 max-pooling, then dropout
  for repeat in range(3):
    model.add(Conv2D(block_filters, kernel_size, activation = 'relu', **first_layer_kwargs))
    first_layer_kwargs = {}
  model.add(MaxPooling2D(pool_size = pool_size))
  model.add(Dropout(dropout))

# Classifier head: flatten, one hidden dense layer, two-way softmax output
model.add(Flatten())
model.add(Dense(256, activation = "relu"))
model.add(Dropout(dropout))
model.add(Dense(2, activation = "softmax"))

model.summary()

In [None]:
#Compile the model
# Adam optimiser with a learning rate of 1e-4; accuracy is tracked as the metric.
# NOTE(review): the loss is binary_crossentropy while the output layer is a 2-unit
# softmax and the generators use class_mode='categorical' - confirm this pairing is
# intended rather than categorical_crossentropy.
model.compile(Adam(lr=0.0001), loss='binary_crossentropy', 
              metrics=['accuracy'])

In [None]:
#Save files, and allow the learning rate to adjust if results plateau
filepath = "model.h5"
# Both callbacks watch validation accuracy, where higher is better
val_acc_watch = dict(monitor='val_acc', mode='max', verbose=1)
# Overwrite model.h5 only when validation accuracy improves
checkpoint = ModelCheckpoint(filepath, save_best_only=True, **val_acc_watch)
# Halve the learning rate after 2 epochs without improvement, never below 1e-5
reduce_lr = ReduceLROnPlateau(factor=0.5, patience=2, min_lr=0.00001,
                              **val_acc_watch)
callbacks_list = [checkpoint, reduce_lr]

In [None]:
#Get IDs of the files in the folder
# Query: non-trashed files whose parent is the hard-coded Drive folder id below
file_list = drive.ListFile({'q': "'1jplSxJOM7qGPlGjt0gw_WgQrN7QJQhq_' in parents and trashed=false"}).GetList()
# Print title and id of each file; the ids are what the download cell needs
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))

In [None]:
#Download models from Google Drive
# Fetch two previously saved files by their hard-coded Drive ids and write them into the
# current working directory as model_end.h5 and model.h5.
# NOTE(review): on a first run these ids may not be accessible - presumably this cell
# is only meant for resuming earlier training; confirm.
file_obj = drive.CreateFile({'id': '1ctcFJi_zmAArhtc5bBJpq6qJq_z15TSZ'})                       
file_obj.GetContentFile('model_end.h5')
file_obj = drive.CreateFile({'id': '13SZS14FnMm_VJAD4WF16P7FHSW7y_PHO'})                       
file_obj.GetContentFile('model.h5')

In [None]:
#Load model
os.chdir('/content/base_dir')
# Load the weights stored in model_end.h5 into the model built above, so the
# training cell continues from them instead of from a fresh initialisation.
model.load_weights("model_end.h5")

In [None]:
#Train the model
# 5 epochs over the training generator, validating on the validation generator.
# The callbacks checkpoint the best weights to model.h5 and lower the learning
# rate when validation accuracy plateaus.
history = model.fit_generator(train_gen, steps_per_epoch=train_steps, 
                    validation_data=val_gen,
                    validation_steps=val_steps,
                    epochs=5, verbose=1,
                    callbacks=callbacks_list)

In [None]:
#Save the final model
# State after the last epoch; the best-by-validation-accuracy weights are kept
# separately in model.h5 by the checkpoint callback.
model.save('model_end.h5')

In [None]:
#Refresh tokens
# Refresh the Drive credentials and rebuild the client before uploading
# (presumably because the token can expire during a long training run - confirm)
gauth.Refresh()
drive = GoogleDrive(gauth)

In [None]:
#Save model to Google Drive (best, and current)
# (Drive title, local file) pairs: the end-of-training model first, then the best checkpoint
drive_uploads = (
    ('Base_Model_end_32_5.h5', 'model_end.h5'),
    ('Base_Model_best_32_5.h5', 'model.h5'),
)
for drive_title, local_path in drive_uploads:
  # NOTE(review): the parent folder id is an empty string - fill in the target
  # Drive folder id if the files should land in a specific folder.
  model_file = drive.CreateFile({'title' : drive_title, "parents": [{
    "kind": "drive#childList",
    "id": ""}]})
  model_file.SetContentFile(local_path)
  model_file.Upload()

## Chapter 4: Evaluate on the validation set and predict on the test set

In [None]:
#Pull best model and check accuracy
os.chdir("/content/base_dir")
# model.h5 holds the weights with the best validation accuracy (written by the checkpoint callback)
model.load_weights('model.h5')
# `steps` counts batches, not images. Passing val_length (the image count) made the
# generator loop over the validation set roughly val_batch_size times; val_steps
# covers every validation image once. int() guards against val_steps being a float.
val_loss, val_acc = model.evaluate_generator(val_gen, steps=int(val_steps))
print('val_loss:', val_loss)
print('val_acc:', val_acc)

In [None]:
#Set up generator for the test data
# test_dir contains the single sub-folder test_images created at the top of the notebook
test_path ="test_dir"
datagen = ImageDataGenerator(rescale=1.0/255)
# shuffle=False keeps the batches in a fixed order, so the prediction rows can later
# be paired with test_gen.filenames
test_gen = datagen.flow_from_directory(test_path,
                                        target_size=(image_size,image_size),
                                        batch_size=1024,
                                        class_mode='categorical',
                                        shuffle=False)

In [None]:
#Make predictions
# `steps` counts batches, not images. The original passed the hard-coded image count
# (57458) as steps, which makes the generator cycle through the test set many times
# and returns far more rows than there are files - the filename join in the next
# cell then fails on the length mismatch.
# len(test_gen) is the number of batches needed to cover every test image exactly once.
test_gen.reset()  # start from the first file so rows line up with test_gen.filenames
num_test_images = test_gen.samples
predictions = model.predict_generator(test_gen, steps=len(test_gen), verbose=1)
assert len(predictions) == num_test_images, "expected one prediction row per test image"
len(predictions)

In [None]:
#Clean
# One row per prediction: the two softmax outputs plus the file the row belongs to
test_filenames = test_gen.filenames
df_preds = pd.DataFrame(
    predictions, columns=['no_tumor_tissue', 'has_tumor_tissue']
).assign(file_names=test_filenames)

In [None]:
#Add the ID column
def extract_id(x):
    """Return the image id encoded in a generator file path.

    The id is the file name up to its first '.', e.g.
    'test_images/abc123.tif' -> 'abc123'.

    Unlike the original implementation, which always took the second
    '/'-separated component, this uses the last path component. It therefore
    also works for bare file names, deeper nesting and backslash separators.

    Parameters
    ----------
    x : str
        Relative file path as listed in the generator's `filenames`.

    Returns
    -------
    str
        The file name with everything from the first '.' onwards removed.
    """
    # Normalise separators, then keep only the final path component
    file_name = x.replace('\\', '/').rsplit('/', 1)[-1]
    # Drop the extension (everything after the first dot, as before)
    extracted_id = file_name.split('.')[0]
    return extracted_id
# Derive the image id of every prediction row from its file path
df_preds['id'] = df_preds['file_names'].apply(extract_id)

In [None]:
#Create final file
# Submission = the "has tumor" output per image id, with the id as the index
y_pred = df_preds['has_tumor_tissue']
image_id = df_preds['id']
submission = pd.concat(
    [image_id.rename('id'), y_pred.rename('label')], axis=1
).set_index('id')
submission.head()

In [None]:
#Export predictions
os.chdir("/content/base_dir")
# Writes the index (id) plus the label column to preds.csv
submission.to_csv('preds.csv', columns=['label'])
# `files` is the google.colab module imported in the Kaggle-upload cell;
# this triggers a browser download of the CSV
files.download('preds.csv')