In [1]:
# Import required libraries and modules

import os
import zipfile
import random
import shutil
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile
import matplotlib.pyplot as plt

import keras,os
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D , Flatten
from keras.preprocessing.image import ImageDataGenerator
import numpy as np


# Mount Google Drive so the image folders stored there are reachable from this Colab session.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Root folder on Google Drive holding one sub-folder per image class
source_path =  '/content/drive/MyDrive/image_data'

source_path_0 = os.path.join(source_path, 'no_defect')
source_path_1 = os.path.join(source_path, 'bubble_defect')
source_path_2 = os.path.join(source_path, 'burn_defect')


# os.listdir returns a list containing all entries under the given path,
# so its length is the number of images stored for that class.
# The label is taken from the folder name so the printed text cannot drift from it.
for class_name, class_path in (('no_defect', source_path_0),
                               ('bubble_defect', source_path_1),
                               ('burn_defect', source_path_2)):
  print(f"There are {len(os.listdir(class_path))} images of {class_name}.")

There are 1000 images of no_defect.
There are 1000 images of bubble_defect.
There are 1000 images of burn-_defect.


In [3]:
# Root directory that will hold the training/validation folder tree
root_dir = '/content/drive/MyDrive/augmented_images'

# Remove any tree left over from a previous run, to prevent a FileExistsError
# if the directory-creation function is run several times
root_dir_already_present = os.path.exists(root_dir)
if root_dir_already_present:
  shutil.rmtree(root_dir)

# Create the directories for training and validation datasets
def create_train_val_dirs(root_path, splits=('training', 'validation'),
                          classes=('no_defect', 'bubble_defect', 'burn_defect')):
  """Create one folder per dataset split, each holding one sub-folder per class.

  Args:
    root_path: directory under which the split folders are created.
    splits: names of the split folders to create directly under root_path.
    classes: names of the class sub-folders to create inside every split folder.

  Raises:
    FileExistsError: if any of the folders already exists (os.makedirs is
      deliberately called without exist_ok so a stale tree is not reused silently).
  """
  # Build every path with os.path.join so the split folder and its class
  # sub-folders are always derived from the same parent path.
  split_dirs = [os.path.join(root_path, split_name) for split_name in splits]

  # Split folders first, then their class sub-folders
  for split_dir in split_dirs:
    os.makedirs(split_dir)

  for split_dir in split_dirs:
    for class_name in classes:
      os.makedirs(os.path.join(split_dir, class_name))

# Build the folder tree; a FileExistsError here would mean the clean-up above did not run
try:
  create_train_val_dirs(root_dir)
except FileExistsError:
  unexpected_message = "You should not be seeing this since the upper directory is removed beforehand"
  print(unexpected_message)

In [4]:
# Testing the create_train_val_dirs function:
# walk the tree under root_dir and print the full path of every directory found

for parent_dir, child_dir_names, _ in os.walk(root_dir):
    for child_name in child_dir_names:
        child_path = os.path.join(parent_dir, child_name)
        print(child_path)

/content/drive/MyDrive/augmented_images/training
/content/drive/MyDrive/augmented_images/validation
/content/drive/MyDrive/augmented_images/training/no_defect
/content/drive/MyDrive/augmented_images/training/bubble_defect
/content/drive/MyDrive/augmented_images/training/burn_defect
/content/drive/MyDrive/augmented_images/validation/no_defect
/content/drive/MyDrive/augmented_images/validation/bubble_defect
/content/drive/MyDrive/augmented_images/validation/burn_defect


In [5]:
# Split the data into the training and validation directories
def split_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, SPLIT_SIZE):
  """Randomly copy the files of SOURCE_DIR into a training and a validation folder.

  Zero-length files are reported on stdout and left out of both folders.
  The split proportion is applied to the remaining (usable) files only, so
  skipped files do not skew the training/validation ratio.

  Args:
    SOURCE_DIR: directory containing the files to split.
    TRAINING_DIR: existing directory that receives the training share.
    VALIDATION_DIR: existing directory that receives the remaining files.
    SPLIT_SIZE: fraction of usable files copied to TRAINING_DIR
      (e.g. 0.9 sends int(0.9 * n) files to training and the rest to validation).
  """
  # List the directory once, then shuffle the file names
  source_items = os.listdir(SOURCE_DIR)
  shuffled_source = random.sample(source_items, len(source_items))

  # Drop empty files before computing the split so the ratio refers to usable files
  usable_items = []
  for item in shuffled_source:
    if os.path.getsize(os.path.join(SOURCE_DIR, item)) == 0:
      print(f'{item} is zero length, so ignoring.')
    else:
      usable_items.append(item)

  training_number = int(len(usable_items) * SPLIT_SIZE)

  # Choose the target from the item's position rather than switching after a copy,
  # so a training share of zero correctly sends every file to validation.
  for position, item in enumerate(usable_items):
    target = TRAINING_DIR if position < training_number else VALIDATION_DIR
    copyfile(os.path.join(SOURCE_DIR, item), os.path.join(target, item))



In [6]:
# Testing the split_data function

# Source folders, one per image class
NO_DEFECT_SOURCE_DIR = "/content/drive/MyDrive/image_data/no_defect/"
BUBBLE_SOURCE_DIR = "/content/drive/MyDrive/image_data/bubble_defect/"
BURN_SOURCE_DIR = "/content/drive/MyDrive/image_data/burn_defect/"

# Destination roots for the two splits
TRAINING_DIR = "/content/drive/MyDrive/augmented_images/training/"
VALIDATION_DIR = "/content/drive/MyDrive/augmented_images/validation/"

# Per-class destination folders inside the training split
TRAINING_NO_DEFECT_DIR = os.path.join(TRAINING_DIR, "no_defect/")
TRAINING_BUBBLE_DEFECT_DIR = os.path.join(TRAINING_DIR, "bubble_defect/")
TRAINING_BURN_DEFECT_DIR = os.path.join(TRAINING_DIR, "burn_defect/")

# Per-class destination folders inside the validation split
VALIDATION_NO_DEFECT_DIR = os.path.join(VALIDATION_DIR, "no_defect/")
VALIDATION_BUBBLE_DEFECT_DIR = os.path.join(VALIDATION_DIR, "bubble_defect/")
VALIDATION_BURN_DEFECT_DIR = os.path.join(VALIDATION_DIR, "burn_defect/")

# Empty the destination folders in case this cell is run multiple times
# (an already-empty folder simply yields nothing to remove)
for stale_dir in (TRAINING_NO_DEFECT_DIR,
                  TRAINING_BUBBLE_DEFECT_DIR,
                  TRAINING_BURN_DEFECT_DIR,
                  VALIDATION_NO_DEFECT_DIR,
                  VALIDATION_BUBBLE_DEFECT_DIR,
                  VALIDATION_BURN_DEFECT_DIR):
  for stale_file in os.scandir(stale_dir):
    os.remove(stale_file.path)


# Proportion of images used for training; the remainder goes to validation
split_size = .9

# Split every class with the same proportion
for class_source_dir, class_training_dir, class_validation_dir in (
    (NO_DEFECT_SOURCE_DIR, TRAINING_NO_DEFECT_DIR, VALIDATION_NO_DEFECT_DIR),
    (BUBBLE_SOURCE_DIR, TRAINING_BUBBLE_DEFECT_DIR, VALIDATION_BUBBLE_DEFECT_DIR),
    (BURN_SOURCE_DIR, TRAINING_BURN_DEFECT_DIR, VALIDATION_BURN_DEFECT_DIR),
):
  split_data(class_source_dir, class_training_dir, class_validation_dir, split_size)

# Report the image counts so they can be compared with the expected output

# Source images
print(f"\n\nOriginal no_defect's directory has {len(os.listdir(NO_DEFECT_SOURCE_DIR))} images")
print(f"Original bubble_defect's directory has {len(os.listdir(BUBBLE_SOURCE_DIR))} images\n")
print(f"Original burn_defect's directory has {len(os.listdir(BURN_SOURCE_DIR))} images\n")

# Training images
print(f"There are {len(os.listdir(TRAINING_NO_DEFECT_DIR))} images of no_defect for training")
print(f"There are {len(os.listdir(TRAINING_BUBBLE_DEFECT_DIR))} images of bubble_defect for training")
print(f"There are {len(os.listdir(TRAINING_BURN_DEFECT_DIR))} images of burn_defect for training")

# Validation images
print(f"There are {len(os.listdir(VALIDATION_NO_DEFECT_DIR))} images of no_defect for validation")
print(f"There are {len(os.listdir(VALIDATION_BUBBLE_DEFECT_DIR))} images of bubble_defect for validation")
print(f"There are {len(os.listdir(VALIDATION_BURN_DEFECT_DIR))} images of burn_defect for validation")



Original no_defect's directory has 1000 images
Original bubble_defect's directory has 1000 images

Original burn_defect's directory has 1000 images

There are 900 images of no_defect for training
There are 900 images of bubble_defect for training
There are 900 images of burn_defect for training
There are 100 images of no_defect for validation
There are 100 images of bubble_defect for validation
There are 100 images of burn_defect for validation


In [7]:
# Create generators to flow images from directory into VGG16 network

def train_val_generators(TRAINING_DIR, VALIDATION_DIR):
  """Build the directory-backed image generators for training and validation.

  Both generators rescale pixel values by 1/255, resize images to 224 x 224px
  and use class_mode='categorical'. Training batches hold 45 images and
  validation batches hold 5.

  Args:
    TRAINING_DIR: directory with one sub-folder of images per class, used for training.
    VALIDATION_DIR: directory with one sub-folder of images per class, used for validation.

  Returns:
    A (train_generator, validation_generator) tuple.
  """
  # Settings common to both flows: label mode and the 224 x 224px resize
  common_flow_args = {'class_mode': 'categorical', 'target_size': (224, 224)}

  # Training flow: normalise pixel values and read images from the training directory
  train_generator = ImageDataGenerator(rescale = 1/255.).flow_from_directory(
      directory=TRAINING_DIR,
      batch_size=45,
      **common_flow_args)

  # Validation flow: same normalisation, read from the validation directory
  validation_generator = ImageDataGenerator(rescale = 1/255.).flow_from_directory(
      directory=VALIDATION_DIR,
      batch_size=5,
      **common_flow_args)

  return train_generator, validation_generator

In [8]:
# Testing the generators output the correct quantity of images
train_generator, validation_generator = train_val_generators(TRAINING_DIR, VALIDATION_DIR)

Found 2700 images belonging to 3 classes.
Found 300 images belonging to 3 classes.


In [11]:
# Create the VGG16 Model

def VGG16(num_classes=3, input_shape=(224, 224, 3), learning_rate=0.00001):
  """Build and compile a VGG16-style convolutional network.

  The network stacks five stages of 3x3 same-padded ReLU convolutions
  (2, 2, 3, 3 and 3 layers with 64, 128, 256, 512 and 512 filters), each
  followed by 2x2 max pooling, then two 4096-unit dense layers and a
  softmax output layer.

  Args:
    num_classes: number of units in the softmax output layer.
    input_shape: shape of one input image passed to the first convolution.
    learning_rate: learning rate given to the Adam optimizer.

  Returns:
    The compiled Sequential model. It reports the metrics 'accuracy',
    precision and recall, with categorical cross-entropy as the loss.
  """
  # Initialise the dense-layer weights
  # NOTE(review): stddev=1. is unusually large for layers this wide and may
  # hamper training - confirm that it is intentional.
  initializer = tf.keras.initializers.RandomNormal(mean=0., stddev=1.)

  # (filters, number of stacked convolutions) for each of the five stages
  conv_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

  # Build the VGG16 Network
  model = Sequential()
  is_first_layer = True
  for filters, conv_count in conv_stages:
    for _ in range(conv_count):
      conv_args = dict(filters=filters, kernel_size=(3,3), padding="same", activation="relu")
      if is_first_layer:
        # Only the very first layer declares the input shape
        conv_args['input_shape'] = input_shape
        is_first_layer = False
      model.add(Conv2D(**conv_args))
    model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))

  model.add(Flatten())
  model.add(Dense(units=4096,activation="relu", kernel_initializer=initializer))
  model.add(Dense(units=4096,activation="relu", kernel_initializer=initializer))
  model.add(Dense(units=num_classes, activation="softmax", kernel_initializer=initializer))

  # Compile the network.
  # CategoricalAccuracy compares the arg-max of the softmax output with the
  # one-hot label; the plain Accuracy metric tests raw values for equality and
  # is therefore meaningless for probability outputs. The metric is named
  # 'accuracy' so the training history keeps the 'accuracy'/'val_accuracy' keys.
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                loss=tf.keras.losses.CategoricalCrossentropy(),
                metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
                         tf.keras.metrics.Precision(),
                         tf.keras.metrics.Recall()
                        ]
               )

  return model
   
  

In [12]:
# Instantiate the VGG16 model
model = VGG16()

# Training schedule: 20 epochs of 20 steps each, with per-epoch progress output,
# evaluated against the validation generator
training_options = dict(steps_per_epoch=20,
                        epochs=20,
                        verbose=1,
                        validation_data=validation_generator)

# Train the model and keep the per-epoch metrics
history = model.fit(train_generator, **training_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Collect performance metrics from the VGG16 model

acc=history.history['accuracy']
val_acc=history.history['val_accuracy']
loss=history.history['loss']
val_loss=history.history['val_loss']

epochs=range(len(acc)) # Get number of epochs

# Plot training and validation accuracy per epoch
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'r', label="Training Accuracy")
ax.plot(epochs, val_acc, 'b', label="Validation Accuracy")
ax.set_title("Training and validation accuracy")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.legend(loc='lower right')
ax.set_yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_ylim(0,1)

# Save only after the axis limits are applied so the file matches the figure shown
fig.savefig('img.svg')
plt.show()



# Plot training and validation loss per epoch
#plt.plot(epochs, loss, 'r', "Training Loss")
#plt.plot(epochs, val_loss, 'b', "Validation Loss")
#plt.show()


In [None]:
import pandas as pd
# Turn the training history (a dict mapping each metric name to its per-epoch values)
# into a pandas DataFrame with one column per metric
metrics_by_name = history.history
hist_df = pd.DataFrame(metrics_by_name)
hist_df


## Confusion Matrix

In [None]:
TEST_DIR = "/content/drive/MyDrive/test_data"

# Instantiate the ImageDataGenerator and normalise image pixel values for the test dataset.
test_datagen = ImageDataGenerator(rescale = 1/255.)

# Flow images from the test image directory and resize to 224 x 224px.
# shuffle=False keeps the batches in the same order as test_generator.filenames,
# so each row of `predictions` can be matched to its file name afterwards.
test_generator = test_datagen.flow_from_directory(directory=TEST_DIR,
                                                  batch_size=5,
                                                  class_mode='categorical',
                                                  target_size=(224, 224),
                                                  shuffle=False)

# One row of class probabilities per test image
predictions = model.predict(test_generator)

In [None]:
import numpy as np

# Index of the highest-scoring class for every prediction row
preds_cls_idx = np.argmax(predictions, axis=1)

# Invert the training generator's {class name: index} mapping to look names up by index
idx_to_cls = dict((class_index, class_name)
                  for class_name, class_index in train_generator.class_indices.items())
preds_cls = np.vectorize(idx_to_cls.get)(preds_cls_idx)

# Pair each test file name with its predicted class name
filenames_to_cls = [pair for pair in zip(test_generator.filenames, preds_cls)]

filenames_to_cls[:300]

In [None]:
import pandas as pd
# Tabulate the (file name, predicted class) pairs, one row per pair
results = pd.DataFrame(data=filenames_to_cls)
results