# **Preprocessing Dataset**

**UNZIP DATASET**

In [None]:
# Mount Google Drive at /content/drive so that files stored there are
# reachable under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
dataset_name = "/content/drive/MyDrive/augmented_esca_dataset"
dataset_destination = "/content/augmented_esca_dataset"

!unzip  $dataset_name".zip" -d $dataset_destination
!ls

**LIBRARIES**

In [None]:
from PIL import Image
import numpy as np
from numpy import random  # NOTE: rebound by `import random` below, so this binding is not the one left in scope


import os
import pathlib
import random  # standard-library module; this is what the name `random` refers to after the cell has run

**DIRECTORY**

In [None]:
# Output root: name of the new (split) dataset.
dir_processed = "/content/augmented_esca_dataset_splited"

# Input root: directory of the original dataset.
dir_original = "/content/augmented_esca_dataset/content/esca_dataset/augmented_esca_dataset"


**PARAMETERS**

In [None]:
# Target size of the new images, as a (1280, 720) pair.
size = (1280, 720)

**EXTRACTION OF DATASET INFORMATION**

In [None]:
# Root folder of the original dataset: one sub-directory per class.
data_dir = pathlib.Path(dir_original)

# Names of the output splits, in the same order as the columns of N_samples.
set_samples = ['train', 'validation', 'test']
print("set_samples: ", set_samples, "\n")

# Only directories are classes. Plain files sitting next to them would
# otherwise be reported as (empty) classes. The listing is done once and reused
# so that CLASS_NAMES and N_IMAGES are guaranteed to be aligned.
class_dirs = sorted(item for item in data_dir.glob('*') if item.is_dir())

CLASS_NAMES = np.array([item.name for item in class_dirs])
print("class: ", CLASS_NAMES, "\n")

# number of .jpg images for each class
N_IMAGES = np.array([len(list(item.glob('*.jpg'))) for item in class_dirs])
print("number of images for class: ", N_IMAGES, "\n")


def _split_counts(n_images):
    """Return (train, validation, test) image counts for one class.

    Train and validation take 60% and 15% of ``n_images`` (rounded); test
    receives whatever is left (nominally 25%). Computing the last share as a
    remainder guarantees that the three counts always add up to ``n_images``,
    which three independent roundings do not.
    """
    n_images = int(n_images)
    n_train = int(np.around(n_images * 60 / 100))
    # never hand out more images than are left after the train share
    n_validation = min(int(np.around(n_images * 15 / 100)), n_images - n_train)
    return n_train, n_validation, n_images - n_train - n_validation


# number of images for each set (train, validation, test), one row per class
N_samples = np.array([_split_counts(n) for n in N_IMAGES])
print("split of dataset: \n ", N_samples, "\n")



set_samples:  ['train', 'validation', 'test'] 

class:  ['esca' 'healthy'] 

number of images for class:  [12432 12348] 

split of dataset: 
  [[7459 1865 3108]
 [7409 1852 3087]] 



**PREPROCESSING DATASET**

In [None]:
# Create the new dataset.
# Split the original images into train / validation / test folders
# (also resize and rotate).


# create the dataset folder tree    ***********************************
# exist_ok=True keeps the cell re-runnable after a partial or complete run.
os.makedirs(dir_processed, exist_ok=True)

for set_tag in set_samples:
    os.makedirs(os.path.join(dir_processed, set_tag), exist_ok=True)

    for class_name in CLASS_NAMES:
        os.makedirs(os.path.join(dir_processed, set_tag, class_name), exist_ok=True)


# SPLIT DATASET (and resize)        *************************************
print("Split dataset.....")

last_split = len(set_samples) - 1       # index of the final split ('test')
image_counter = 0                       # images processed so far, over all classes

# class_index selects the row of N_samples that belongs to the current class
for class_index, class_name in enumerate(CLASS_NAMES):

    print("class name: ", class_name)

    source_dir = os.path.join(dir_original, class_name)

    # Only .jpg files are handled, matching the files counted in N_IMAGES.
    # Sorted (not shuffled) so that the split is identical on every run.
    image_names = sorted(name for name in os.listdir(source_dir) if name.endswith('.jpg'))

    split_index = 0                     # position in set_samples: train -> validation -> test
    images_in_split = 0                 # images already written to the current split

    for image_name in image_names:

        # Move to the next split once the current one holds its quota. The
        # loop also skips splits whose quota is zero, and the bound keeps the
        # index on the last split if there are more files than planned.
        while (split_index < last_split
               and images_in_split >= N_samples[class_index][split_index]):
            split_index += 1
            images_in_split = 0

        destination = os.path.join(dir_processed, set_samples[split_index],
                                   class_name, image_name)

        # The context manager releases the source file as soon as the image
        # has been written.
        with Image.open(os.path.join(source_dir, image_name)) as original:
            width, _ = original.size

            # Images that are 1080 or 720 pixels wide are rotated by 90 degrees
            # before resizing (presumably portrait shots - TODO confirm).
            if width == 1080 or width == 720:
                picture = original.transpose(Image.ROTATE_90)
            else:
                picture = original

            # thumbnail() shrinks in place so the image fits inside `size`.
            picture.thumbnail(size)
            picture.save(destination)

        images_in_split += 1
        image_counter += 1

        # periodic progress report instead of one output line per image
        if image_counter % 500 == 0:
            print("image: ", image_counter)

print("image: ", image_counter)




# **MODEL for ESCA DATASET**



LIBRARY

In [None]:
import tensorflow as tf
 
from tensorflow import keras
 
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense
 
from tensorflow.keras.preprocessing import image_dataset_from_directory
 
import numpy as np
import matplotlib.pyplot as plt
import os
import time

DIRECTORY

In [None]:
# folders containing the dataset (one sub-folder per split)

PATH_DATASET = '/content/augmented_esca_dataset_splited'

train_data_dir, validation_data_dir, test_data_dir = (
    os.path.join(PATH_DATASET, split_name)
    for split_name in ('train', 'validation', 'test')
)


# names of the files to create (one per model)

PATH_MODELS = '/content/Colab Notebooks/PAPER_1'

name_model_small, name_model_medium, name_model_large = (
    os.path.join(PATH_MODELS, file_name)
    for file_name in ('model_small_b32.h5', 'model_medium_b32.h5', 'model_large_b32.h5')
)

PARAMETERS

In [None]:
# Mini-batch size.
batch_size = 32

# Number of training epochs and number of classes.
epochs = 50
n_class = 2

# Number of images in the train / validation / test splits.
nb_train_samples, nb_validation_samples, nb_test_samples = 14868, 3717, 6195

# **MODEL LARGE**

In [None]:
start = time.time()

# image size (Model Large)
img_width, img_height = 1280, 720

# NOTE(review): Keras documents `image_size` as (height, width), so passing
# (img_width, img_height) yields tensors with img_width rows and img_height
# columns. `input_shape` below is built the same way, so the pipeline is
# self-consistent. The ordering is deliberately left untouched because the
# saved model's input shape depends on it - confirm whether it is intended.
image_size = (img_width, img_height)

# input shape
if keras.backend.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)

# Create the output folder before training, so that a bad PATH_MODELS shows up
# now and not after the whole training run.
os.makedirs(os.path.dirname(name_model_large), exist_ok=True)


# ***********************************************************************
# ************        DATASET       *************************************
# ***********************************************************************

# The three splits are loaded and preprocessed in exactly the same way.
loaded_splits = []
for split_dir in (train_data_dir, validation_data_dir, test_data_dir):
    split_dataset = image_dataset_from_directory(split_dir,
                                                 shuffle=True,
                                                 batch_size=batch_size,
                                                 image_size=image_size,
                                                 label_mode='categorical')
    # preprocessing: input scaling (./255)
    loaded_splits.append(split_dataset.map(lambda images, labels: (images/255, labels)))

train_dataset, validation_dataset, test_dataset = loaded_splits


# ***********************************************************************
# **************        MODEL       *************************************
# ***********************************************************************

# Five identical stages (3x3 convolution -> ReLU -> 2x2 max-pooling) that only
# differ in the number of filters.
model_large = Sequential()
for stage_index, n_filters in enumerate((32, 32, 64, 64, 32)):
    # only the very first layer declares the input shape
    first_layer_kwargs = {'input_shape': input_shape} if stage_index == 0 else {}
    model_large.add(Conv2D(n_filters, (3, 3), padding='same', **first_layer_kwargs))
    model_large.add(Activation('relu'))
    model_large.add(MaxPooling2D(pool_size=(2, 2)))

model_large.add(Flatten())
model_large.add(Dense(64))
model_large.add(Activation('relu'))
model_large.add(Dropout(0.5))
model_large.add(Dense(n_class))         # one output unit per class
model_large.add(Activation('softmax'))

model_large.summary()


# ***********************************************************************
# *******************        COMPILATION       **************************
# ***********************************************************************

model_large.compile(loss='categorical_crossentropy',
                    optimizer=keras.optimizers.Adadelta(learning_rate=1, name='Adadelta'),
                    metrics=['accuracy'])


# ***********************************************************************
# *******************        TRAINING       *****************************
# ***********************************************************************

with tf.device('/device:GPU:0'):

    history = model_large.fit(
        train_dataset,
        epochs=epochs,
        validation_data=validation_dataset)


# ***********************************************************************
# *****************        SAVE MODEL        ****************************
# ***********************************************************************

model_large.save(name_model_large)


# ***********************************************************************
# ********************        PLOT RESULTS        ***********************
# ***********************************************************************

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Follow the number of epochs actually recorded in the history, so the x axis
# always has the same length as the curves.
epochs_range = range(len(acc))

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_accuracy.plot(epochs_range, acc, label='Training Accuracy')
ax_accuracy.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_accuracy.legend(loc='lower right')
ax_accuracy.set_xlabel('Epoch')
ax_accuracy.set_title('Training and Validation Accuracy_'+str(img_width)+' x '+str(img_height))

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_xlabel('Epoch')
ax_loss.set_title('Training and Validation Loss_'+str(img_width)+' x '+str(img_height))
plt.show()


# ***********************************************************************
# ***********************        TEST        ****************************
# ***********************************************************************

with tf.device('/device:GPU:0'):

    test_result = model_large.evaluate(test_dataset)


print("size of images: ", img_width,img_height)
print("test_result: ", test_result)


print ('Time taken for development model large {} sec\n'.format(time.time() - start))

# **MODEL SMALL**

In [None]:
start = time.time()

# image size (Model Small)
img_width, img_height = 80, 45

# NOTE(review): Keras documents `image_size` as (height, width), so passing
# (img_width, img_height) yields tensors with img_width rows and img_height
# columns. `input_shape` below is built the same way, so the pipeline is
# self-consistent. The ordering is deliberately left untouched because the
# saved model's input shape depends on it - confirm whether it is intended.
image_size = (img_width, img_height)

# input shape
if keras.backend.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)

# Create the output folder before training, so that a bad PATH_MODELS shows up
# now and not after the whole training run.
os.makedirs(os.path.dirname(name_model_small), exist_ok=True)


# ***********************************************************************
# ************        DATASET       *************************************
# ***********************************************************************

# The three splits are loaded and preprocessed in exactly the same way.
loaded_splits = []
for split_dir in (train_data_dir, validation_data_dir, test_data_dir):
    split_dataset = image_dataset_from_directory(split_dir,
                                                 shuffle=True,
                                                 batch_size=batch_size,
                                                 image_size=image_size,
                                                 label_mode='categorical')
    # preprocessing: input scaling (./255)
    loaded_splits.append(split_dataset.map(lambda images, labels: (images/255, labels)))

train_dataset, validation_dataset, test_dataset = loaded_splits


# ***********************************************************************
# **************        MODEL       *************************************
# ***********************************************************************

# Five identical stages (3x3 convolution -> ReLU -> 2x2 max-pooling) that only
# differ in the number of filters.
model_small = Sequential()
for stage_index, n_filters in enumerate((32, 32, 64, 64, 32)):
    # only the very first layer declares the input shape
    first_layer_kwargs = {'input_shape': input_shape} if stage_index == 0 else {}
    model_small.add(Conv2D(n_filters, (3, 3), padding='same', **first_layer_kwargs))
    model_small.add(Activation('relu'))
    model_small.add(MaxPooling2D(pool_size=(2, 2)))

model_small.add(Flatten())
model_small.add(Dense(64))
model_small.add(Activation('relu'))
model_small.add(Dropout(0.5))
model_small.add(Dense(n_class))         # one output unit per class
model_small.add(Activation('softmax'))

model_small.summary()


# ***********************************************************************
# *******************        COMPILATION       **************************
# ***********************************************************************

model_small.compile(loss='categorical_crossentropy',
                    optimizer=keras.optimizers.Adadelta(learning_rate=1, name='Adadelta'),
                    metrics=['accuracy'])


# ***********************************************************************
# *******************        TRAINING       *****************************
# ***********************************************************************

with tf.device('/device:GPU:0'):

    history = model_small.fit(
        train_dataset,
        epochs=epochs,
        validation_data=validation_dataset)


# ***********************************************************************
# *****************        SAVE MODEL        ****************************
# ***********************************************************************

model_small.save(name_model_small)


# ***********************************************************************
# ********************        PLOT RESULTS        ***********************
# ***********************************************************************

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Follow the number of epochs actually recorded in the history, so the x axis
# always has the same length as the curves.
epochs_range = range(len(acc))

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_accuracy.plot(epochs_range, acc, label='Training Accuracy')
ax_accuracy.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_accuracy.legend(loc='lower right')
ax_accuracy.set_xlabel('Epoch')
ax_accuracy.set_title('Training and Validation Accuracy_'+str(img_width)+' x '+str(img_height))

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_xlabel('Epoch')
ax_loss.set_title('Training and Validation Loss_'+str(img_width)+' x '+str(img_height))
plt.show()


# ***********************************************************************
# ***********************        TEST        ****************************
# ***********************************************************************

with tf.device('/device:GPU:0'):

    test_result = model_small.evaluate(test_dataset)


print("size of images: ", img_width,img_height)
print("test_result: ", test_result)


print ('Time taken for development model small {} sec\n'.format(time.time() - start))

Export the dataset (validation split) to `.npy` files

In [None]:
# image size (Model Small)
img_width, img_height = 80, 45


# ***********************************************************************
# ************        DATASET       *************************************
# ***********************************************************************

# Only the validation split is exported, so it is the only one loaded here.
# NOTE: unlike the training cells, no ./255 scaling is applied; the arrays
# hold the pixel values exactly as the loader returns them.
validation_dataset = image_dataset_from_directory(validation_data_dir,
                                                  shuffle=True,
                                                  batch_size=batch_size,
                                                  image_size=(img_width, img_height),
                                                  label_mode='categorical')


# Collect every batch. Re-assigning the arrays inside the loop would leave only
# the last (partial) batch of the split in the saved files.
image_batches = []
label_batches = []
for images, labels in validation_dataset:
    image_batches.append(images.numpy())
    label_batches.append(labels.numpy())

# Stack the batches along the sample axis: one row per validation image.
numpy_images = np.concatenate(image_batches, axis=0)
numpy_labels = np.concatenate(label_batches, axis=0)

# np.save appends the ".npy" extension to these names.
np.save("validation_x_set", numpy_images)
np.save("validation_y_set", numpy_labels)

Found 14868 files belonging to 2 classes.
Found 3717 files belonging to 2 classes.
Found 6195 files belonging to 2 classes.


In [None]:
# Mount Google Drive at /content/drive, then copy the two exported validation
# arrays from /content into "My Drive".
from google.colab import drive
drive.mount('/content/drive')

# The paths are quoted because the Drive folder name contains a space.
!cp "/content/validation_x_set.npy" "/content/drive/My Drive/validation_x_set.npy"
!cp "/content/validation_y_set.npy" "/content/drive/My Drive/validation_y_set.npy"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **MODEL MEDIUM**

In [None]:
start = time.time()

# image size (Model Medium)
img_width, img_height = 320, 180

# NOTE(review): Keras documents `image_size` as (height, width), so passing
# (img_width, img_height) yields tensors with img_width rows and img_height
# columns. `input_shape` below is built the same way, so the pipeline is
# self-consistent. The ordering is deliberately left untouched because the
# saved model's input shape depends on it - confirm whether it is intended.
image_size = (img_width, img_height)

# input shape
if keras.backend.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)

# Create the output folder before training, so that a bad PATH_MODELS shows up
# now and not after the whole training run.
os.makedirs(os.path.dirname(name_model_medium), exist_ok=True)


# ***********************************************************************
# ************        DATASET       *************************************
# ***********************************************************************

# The three splits are loaded and preprocessed in exactly the same way.
loaded_splits = []
for split_dir in (train_data_dir, validation_data_dir, test_data_dir):
    split_dataset = image_dataset_from_directory(split_dir,
                                                 shuffle=True,
                                                 batch_size=batch_size,
                                                 image_size=image_size,
                                                 label_mode='categorical')
    # preprocessing: input scaling (./255)
    loaded_splits.append(split_dataset.map(lambda images, labels: (images/255, labels)))

train_dataset, validation_dataset, test_dataset = loaded_splits


# ***********************************************************************
# **************        MODEL       *************************************
# ***********************************************************************

# Five identical stages (3x3 convolution -> ReLU -> 2x2 max-pooling) that only
# differ in the number of filters.
model_medium = Sequential()
for stage_index, n_filters in enumerate((32, 32, 64, 64, 32)):
    # only the very first layer declares the input shape
    first_layer_kwargs = {'input_shape': input_shape} if stage_index == 0 else {}
    model_medium.add(Conv2D(n_filters, (3, 3), padding='same', **first_layer_kwargs))
    model_medium.add(Activation('relu'))
    model_medium.add(MaxPooling2D(pool_size=(2, 2)))

model_medium.add(Flatten())
model_medium.add(Dense(64))
model_medium.add(Activation('relu'))
model_medium.add(Dropout(0.5))
model_medium.add(Dense(n_class))        # one output unit per class
model_medium.add(Activation('softmax'))

model_medium.summary()


# ***********************************************************************
# *******************        COMPILATION       **************************
# ***********************************************************************

model_medium.compile(loss='categorical_crossentropy',
                     optimizer=keras.optimizers.Adadelta(learning_rate=1, name='Adadelta'),
                     metrics=['accuracy'])


# ***********************************************************************
# *******************        TRAINING       *****************************
# ***********************************************************************

with tf.device('/device:GPU:0'):

    history = model_medium.fit(
        train_dataset,
        epochs=epochs,
        validation_data=validation_dataset)


# ***********************************************************************
# *****************        SAVE MODEL        ****************************
# ***********************************************************************

model_medium.save(name_model_medium)


# ***********************************************************************
# ********************        PLOT RESULTS        ***********************
# ***********************************************************************

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Follow the number of epochs actually recorded in the history, so the x axis
# always has the same length as the curves.
epochs_range = range(len(acc))

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_accuracy.plot(epochs_range, acc, label='Training Accuracy')
ax_accuracy.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_accuracy.legend(loc='lower right')
ax_accuracy.set_xlabel('Epoch')
ax_accuracy.set_title('Training and Validation Accuracy_'+str(img_width)+' x '+str(img_height))

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_xlabel('Epoch')
ax_loss.set_title('Training and Validation Loss_'+str(img_width)+' x '+str(img_height))
plt.show()


# ***********************************************************************
# ***********************        TEST        ****************************
# ***********************************************************************

with tf.device('/device:GPU:0'):

    test_result = model_medium.evaluate(test_dataset)


print("size of images: ", img_width,img_height)
print("test_result: ", test_result)


print ('Time taken for development model medium {} sec\n'.format(time.time() - start))