**Data augmentation for Esca dataset**

---



# STEP 1 - Download dataset from Mendeley repository

In [1]:
dataset_name = "esca_dataset"
# Url to repo (repo temporary saved in Google Drive but intended to Mendeley repo)
dataset_url =  "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/89cnxc58kj-1.zip"   # Google Drive -> to change with Mendely Link
# Trick to use wget with gDrive: use 'https://docs.google.com/uc?export=download&id=FILEID' 
# where FILEID is extracted from the virtual link provided from Google drive
dataset_url4wget = "https://docs.google.com/uc?export=download&id=1qO997Wy5drvRpVbAOCL20w82FEGiDpmV"
# Download the archive directly from url
!wget -r --no-check-certificate "$dataset_url" -O $dataset_name".zip"
!ls
# Unzip data
!unzip  $dataset_name".zip"
!ls

'wget' n'est pas reconnu en tant que commande interne
ou externe, un programme ex�cutable ou un fichier de commandes.
'ls' n'est pas reconnu en tant que commande interne
ou externe, un programme ex�cutable ou un fichier de commandes.
'unzip' n'est pas reconnu en tant que commande interne
ou externe, un programme ex�cutable ou un fichier de commandes.
'ls' n'est pas reconnu en tant que commande interne
ou externe, un programme ex�cutable ou un fichier de commandes.


# STEP 2 - Data augmentation

In [2]:
# Input pameters choosen by user
# transformation_array -> User can choose what transformations apply simply comment/uncomment (#) the row corresponding to the transformation name.
# enable_show -> User can choose to display (True) or none (False) the transformations applied to original images. Enabling this option, the execution speed will be reduced.

transformation_array = [
                        "horizontalFlip",
                        "verticalFlip", 
                        "rotation", 
                        "widthShift", 
                        "heightShift",  
                        "shearRange",
                        "zoom", 
                        "blur",
                        "brightness", 
                        "contrast",
                        "saturation",
                        "hue",
                        "gamma"
                        ];
enable_show = False;

In [None]:
# The new dataset 'augmented_esca_dataset' will be created.
# This dataset contains the augmented images create by the ImageGenerator class and the orginal images, 
# in order to obtain an expanded version of the orginal dataset ready-to-use

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img 
import tensorflow as tf
import os
from numpy import expand_dims
import cv2
import matplotlib.pyplot as plt
from pathlib import Path



def blur(img):
    return (cv2.blur(img,(30,30)))

def horizontal_flip(img):
    return (tf.image.flip_left_right(img))

def vertical_flip(img):
    return (tf.image.flip_up_down(img))
 
def contrast(img):
    return (tf.image.adjust_contrast(img, 0.5))

def saturation(img):
    return (tf.image.adjust_saturation(img, 3))

def hue(img):
    return (tf.image.adjust_hue(img, 0.1)) 

def gamma(img):
    return (tf.image.adjust_gamma(img, 2))



new_dataset = 'augmented_esca_dataset'
classes = ['esca', 'healthy']
for class_tag in classes:
  input_path = '/content/' + dataset_name + '/' + class_tag + '/'
  output_path = '/content/' + dataset_name + '/' + new_dataset + '/' + class_tag + '/'
  print(input_path)
  print(output_path)
  # TMP
  !rm -rf $output_path
  # END TMP
  try:
    if not os.path.exists(output_path):
      os.makedirs(output_path)
  except OSError:
      print ("Creation of the directory %s failed\n\n" % output_path)
  else:
      print ("Successfully created the directory %s\n\n" % output_path)

  for filename in os.listdir(input_path):
    if filename.endswith(".jpg"):
      # Copy the original image in the new dataset
      original_file_path = input_path + filename
      original_newname_file_path = output_path + Path(filename).stem + "_original.jpg"
      %cp $original_file_path $original_newname_file_path
      # Initialising the ImageDataGenerator class. 
      # We will pass in the augmentation parameters in the constructor. 
      for transformation in transformation_array:
        if transformation == "horizontalFlip":
              #datagen = ImageDataGenerator(horizontal_flip = True)                 # for random flip
              datagen = ImageDataGenerator(preprocessing_function=horizontal_flip)  # all imgs flipped
        elif transformation == "verticalFlip":
              #datagen = ImageDataGenerator(vertical_flip = True)                   # for random flip
              datagen = ImageDataGenerator(preprocessing_function=vertical_flip)    # all imgs flipped
        elif transformation == "rotation":
              datagen = ImageDataGenerator(rotation_range = 40, fill_mode='nearest') 
        elif transformation == "widthShift":
              datagen = ImageDataGenerator(width_shift_range = 0.2, fill_mode='nearest')
        elif transformation == "heightShift":
              datagen = ImageDataGenerator(height_shift_range = 0.2, fill_mode='nearest')         
        elif transformation == "shearRange":
              datagen = ImageDataGenerator(shear_range = 0.2)   
        elif transformation == "zoom":
              datagen = ImageDataGenerator(zoom_range = [0.5, 1.0])
        elif transformation == "blur":
              datagen = ImageDataGenerator(preprocessing_function=blur)        
        elif transformation == "brightness":
              #Values less than 1.0 darken the image, e.g. [0.5, 1.0], 
              #whereas values larger than 1.0 brighten the image, e.g. [1.0, 1.5], 
              #where 1.0 has no effect on brightness.
              datagen = ImageDataGenerator(brightness_range = [1.1, 1.5])
        elif transformation == "contrast": 
              datagen = ImageDataGenerator(preprocessing_function=contrast)
        elif transformation == "saturation": 
              datagen = ImageDataGenerator(preprocessing_function=saturation)      
        elif transformation == "hue": 
              datagen = ImageDataGenerator(preprocessing_function=hue)    
        elif transformation == "gamma": 
              datagen = ImageDataGenerator(preprocessing_function=gamma)      

        # Loading a sample image 
        img = load_img(input_path + filename) 
        # Converting the input sample image to an array 
        data = img_to_array(img) 
        # Reshaping the input image expand dimension to one sample
        samples = expand_dims(data, 0) 
        # Plot original image
        print("Original image:")
        print(filename)
        if enable_show:
          plt.imshow(img)
          plt.show()
          print("\n\n")

        # Generating and saving n_augmented_images augmented samples
        print("Apply " + transformation + ".")
        # prepare iterator
        it = datagen.flow(samples, batch_size = 1, 
                    save_to_dir = output_path, 
                    save_prefix = Path(filename).stem + "_" + transformation,
                    save_format ='jpg')
        batch = it.next()
        # Plot trasnformed image
        image = batch[0].astype('uint8')
        if enable_show:
          print("Transformed image:")
          plt.imshow(image)
          plt.show()
        print("\n\n")

print("Done!\n\n")

# [Opzional STEP] - Visualize some images generated from data augmentation

In [None]:
# Visualize N images with data aumentation applied, 
# where N = n_images_shown can be chosen by the user.

import os
import glob
from numpy import expand_dims
import cv2
import matplotlib.pyplot as plt
from pathlib import Path


n_images_shown = 4              # This parameter can be modified by the user

for class_tag in classes:
  input_path = '/content/' + dataset_name + '/' + class_tag + '/'
  output_path = '/content/' + dataset_name + '/' + new_dataset + '/' + class_tag + '/'

  counter = 0
  for filename in os.listdir(input_path):
    if filename.endswith(".jpg"):

      if counter > n_images_shown:
        break

      # Show the obtained AUGMENTATED IMAGES (plus ORIGINAL IMAGE) for the specific orginal image
      print(filename)
      transformation_array_size = len(transformation_array)
      # Define subplot
      fig = plt.figure(figsize=(30,20))
      columns = 3
      rows = transformation_array_size / columns + 1  
      # Sort images by creation date to obtain "original" as first image
      files_sorted = list(filter(os.path.isfile, glob.glob(output_path + Path(filename).stem + "*.jpg")))
      files_sorted.sort(key=lambda x: os.path.getmtime(x))
      #print(files_sorted)
      index = 0
      for filename_out in files_sorted:
        #print(filename_out)
        # Load image
        aug_img = load_img(filename_out) 
        # Converting the input sample image to an array 
        data = img_to_array(aug_img) 
        # Reshaping the input image expand dimension to one sample
        samples = expand_dims(data, 0) 
        # Plot augmented image
        ax1 = fig.add_subplot(rows, columns, index + 1)
        ax1.title.set_text((Path(filename_out).stem).split('_')[3])
        plt.imshow(aug_img)
        index = index + 1
      print("Augmented images:")
      plt.show()
      print("\n\n")
    counter = counter + 1
    


# Step 3 - Save augmented dataset

In [None]:
#@title Save your data to Google drive
filename = "augmented_esca_dataset" #@param {type:"string"}
folders_to_save = "/content/esca_dataset/augmented_esca_dataset" #@param {type:"string"}

from google.colab import files
from google.colab import auth
from googleapiclient.http import MediaFileUpload
from googleapiclient.discovery import build


def save_file_to_drive(name, path):
    file_metadata = {
    'name': name,
    'mimeType': 'application/octet-stream'
    }
    media = MediaFileUpload(path, 
                  mimetype='application/octet-stream',
                  resumable=True)
    created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    print('File ID: {}'.format(created.get('id')))
    return created


# Create archive
extension_zip = ".zip"
zip_file = filename + extension_zip
!zip -r $zip_file {folders_to_save}

# Save archive to your Google Drive
auth.authenticate_user()
drive_service = build('drive', 'v3')
destination_name = zip_file
path_to_file = zip_file
save_file_to_drive(destination_name, path_to_file)
!ls

print("Done!")


# BACKUP

- EXTRACT DATA FROM ARCHIVE


In [None]:
'''
import zipfile
from google.colab import drive

drive.mount('/content/drive/')
!ls

zip_ref = zipfile.ZipFile("/content/drive/My Drive/Colab Notebooks/esca_dataset.zip", 'r')
zip_ref.extractall("/content/drive/My Drive/Colab Notebooks/esca_dataset")
zip_ref.close()
'''

- ARCHIVE DATA AND SAVE ZIP FILE TO YOUR PC

In [None]:
'''
# An archive file of the augmented dataset will be created 
# and automatically downloaded to the path selected by the user in the browse window that will appear at the end of the script

from google.colab import files
import requests
from multiprocessing.pool import ThreadPool
from pathlib import Path


# Create archive
data_folder_tozip = "/content/" + dataset_name + "/" + new_dataset
archive_name = new_dataset + ".zip"
print(data_folder_tozip)

#!zip -r $archive_name $data_folder_tozip
#files.download(archive_name)

# We have a large archive, so we create a split zip archive, that is multiple archives of the same folder (each part of 100 MB)
!zip -r -s 100m "archive.zip" $data_folder_tozip

# Download the split archive, so we have multiple archives on the chosen path 
def download_archive(archive):
  print("Downloading: ",archive)
  files.download(archive)
 
archive_array = []
for path in Path('/content/').rglob('archive.z*'):
    print(path.name)
    string = path.name
    archive_array.append(string)
print(archive_array)
print(len(archive_array))

# Run multiple threads to download the multiple archives in parallel
ThreadPool(len(archive_array)).imap_unordered(download_archive, archive_array)


# NOTE: After download the multiple archives, the use must reconstruct the archive:
# - For Linux users:
# To unzip the file, first convert a split archive to a single-file archive:
# zip -s 0 archive.zip --out unsplit.zip
# Then you can unzip the "unsplit" file
# unzip unsplit.zip
#
# - For Windows:
# 
'''