# Data Exploration

## Mounting the Pipeline Project Directory from Google Drive in Google Colab

In [4]:
from google.colab import drive #importing the Google Colab module and drive function
import os # importing os module
# Expose Google Drive to this Colab runtime under /content/drive.
drive.mount('/content/drive')

# Switch the working directory to the pipeline project folder; the dataset
# paths built later in the notebook are relative to it.
os.chdir(
    '/content/drive/MyDrive/University_of_Kent/PhD/Research/Experiments/Pipeline_project'
)

# Show the working directory to confirm the change took effect.
print(os.getcwd())

Mounted at /content/drive
/content/drive/MyDrive/University_of_Kent/PhD/Research/Experiments/Pipeline_project


## Importing modules and libraries

In [5]:
import glob # importing the glob module which is a path traversing dir. library
import numpy as np
import tensorflow as tf

from os import path, listdir
from PIL import Image # Python image library
from sklearn.model_selection import train_test_split
from tensorflow.keras import datasets, layers, models
from tqdm import tqdm # provides progress details for for-loops

from skimage.io import imread # we call the imread function from the skimage module. imread function reads images.
from skimage.transform import resize # we call the resize function from the skimage module. resize resizes images.
from skimage.color import rgb2gray  # we call the rgb2gray function from the skimage.color module. rgb2gray function converts an RGB (colours) to greyscale (black and white).

## Accessing the Data

In [6]:
# Accessing the data in (colour or) greyscale

def images_ds2features(pathin, label, samples=0, greyscale=False):
  """
  Read the image files in a directory and attach the same label to each one.

  Files are selected by extension (.jpg, .jpeg, .png; case-insensitive) and
  processed in sorted filename order, so repeated runs return the images in
  the same order. Every image is resized to 176 rows x 320 columns and
  converted to float32.

  Arguments (or Parameters):
      pathin: string
              Directory containing the image files.
      label: int
             Label assigned to every image read from the directory.
      samples: int
               Maximum number of images to read. This is NOT a random sample:
               the first `samples` image files in sorted order are taken.
               If 0, every image file is read. A negative value reads nothing.
      greyscale: bool
                 If True, images are read as greyscale and have shape (176, 320);
                 otherwise they have shape (176, 320, 3).

  Returns:
      images: list
              One float32 numpy array per image read.
      targets: list
               `label` repeated once per image, so len(targets) == len(images).

  Raises:
      Nothing explicitly; errors from listdir, imread or resize propagate.
  """

  image_extensions = (".jpg", ".jpeg", ".png")
  out_shape = (176, 320) if greyscale else (176, 320, 3) # (rows, columns[, channels])

  # listdir returns names in arbitrary order; sorting makes both the selected
  # subset (when samples > 0) and the output order reproducible.
  img_files_list = sorted(
      file_name for file_name in listdir(pathin)
      if file_name.lower().endswith(image_extensions)
  )

  # Trim the work list up front so the loop (and the progress bar) only covers
  # files that are actually read.
  if samples > 0:
    img_files_list = img_files_list[:samples]
  elif samples < 0:
    img_files_list = []

  images = []
  for img_file_name in tqdm(img_files_list, total=len(img_files_list)):
    _image = imread(path.join(pathin, img_file_name), as_gray=greyscale)

    # An RGBA file has 4 channels; resizing it straight to a 3-channel shape
    # would interpolate across the channel axis and blend alpha into the
    # colours, so the alpha channel is dropped first.
    if not greyscale and _image.ndim == 3 and _image.shape[-1] == 4:
      _image = _image[..., :3]

    _image = resize(_image, output_shape=out_shape, mode='constant', anti_aliasing=True)
    images.append(_image.astype('float32')) # float32 instead of resize's float64 output

  targets = [label] * len(images) # one label per image keeps features and labels aligned
  return images, targets

In [7]:
# Access the image data in colour (RGB): deprecated (obsolete function - it has been replaced by the images_ds2features(pathin, label, samples=0, greyscale=False) function)
# Function is not called in program.

def read_imgs_from(pathin, samples=0):
  """
  Load the files in a directory as images and stack them into one numpy array.

  Arguments (or Parameters):
      pathin: string
              Directory whose files are opened as images.
      samples: int
               Maximum number of files to read, taken in sorted path order.
               If 0, every file is read. A negative value reads nothing.

  Returns:
      images: numpy array
              The result of np.array() over the per-image arrays.

  Side effects:
      Prints `pathin` and the shape of the returned array.
  """
  # Sort for a reproducible order (glob order is arbitrary) and skip
  # sub-directories, which cannot be opened as images.
  img_paths = sorted(p for p in glob.iglob(path.join(pathin, '*')) if path.isfile(p))
  if samples > 0:
    img_paths = img_paths[:samples]
  elif samples < 0:
    img_paths = []

  images = []
  for img_path in img_paths:
    # The context manager closes the file handle as soon as the pixel data
    # has been converted to an array.
    with Image.open(img_path) as img:
      images.append(np.asarray(img))
  images = np.array(images)

  print(pathin)
  print(images.shape)
  return images

In [None]:
# Build the directory path of each corrosion class under the training data root.
path_train = path.join('pipeline_corrosion_dataset', 'Train_Data')
paths_high, paths_med, paths_low, paths_no = (
    path.join(path_train, class_dir) for class_dir in ('high', 'medium', 'low', 'no')
)

# Read each class directory into its own list of images plus a matching list of labels.
# Labels: high = 3, medium = 2, low = 1, no = 0.
# samples = 0 means every image file in each directory is read (no sample limit).
samples = 0
X_high, y_high = images_ds2features(paths_high, 3, samples=samples, greyscale=False)
X_med, y_med = images_ds2features(paths_med, 2, samples=samples, greyscale=False)
X_low, y_low = images_ds2features(paths_low, 1, samples=samples, greyscale=False)
X_no, y_no = images_ds2features(paths_no, 0, samples=samples, greyscale=False)

# Feature Preparation / Engineering

In [None]:
# Feature preparation
#
# Labels and images are concatenated in the same class order (high, medium, low, no),
# so that X[i] and y[i] always refer to the same image (one-to-one mapping).

# Label vector: join the per-class label lists and convert the result to a numpy array.
y = np.array(y_high + y_med + y_low + y_no)
print(y.shape)

# Feature array: join the per-class image lists in the same order and convert to a numpy array.
X = np.array(X_high + X_med + X_low + X_no)
print(X.shape)

# Data Augmentation

In [None]:
# Data Augmentation (if necessary)

# Data Splitting

In [None]:
# Hold out 25% of the data as a test set. The split is shuffled, stratified on y
# so both parts keep the same class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
    shuffle=True,
)

# Defining Model Architecture

In [None]:
# Model architecture
#
# Convolutional part: three Conv2D layers (32, 64 and 64 filters of size 3x3, ReLU),
# with 2x2 max pooling after the first two. Input images are 176 x 320 with 3 channels.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(176, 320, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
])

model.summary() # architecture so far (convolutional part only)

# Fully connected part: flatten the last convolutional output, pass it through one
# hidden Dense layer, then a 4-unit output layer (one unit per class label 0-3).
for dense_stage in (
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(4),
):
  model.add(dense_stage)

model.summary() # complete architecture

# Optimization: the output layer has no activation, so the loss is told to expect
# raw logits; labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)




# Model Training

In [None]:
# Train the model on the training split for 7 epochs; the returned History
# object is kept in `history`.
history = model.fit(x=X_train, y=y_train, epochs=7)

# Performance Evaluation

In [None]:
# Performance evaluation: score the trained model on the held-out test split
# and print the result (the loss and the accuracy metric set in compile()).
scores = model.evaluate(X_test, y_test)
print(scores)