In [4]:
import numpy as np
import os
import PIL
import PIL.Image
import tensorflow as tf
import shutil
import pathlib
import re  # Regex
import time

2023-05-10 14:53:00.054209: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-10 14:53:00.098057: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preprocessing

### Fabi implementation

This part uses the train/val/test splits provided with the dataset.

In [1]:
import scipy.io
# Split definition file shipped with the 17flowers data
mat = scipy.io.loadmat('./data/17flowers/datasplits.mat')

# Hyper-parameters shared by the cells below
image_size, batch_size, epochs = 224, 32, 300

# The image ids for each split (element 0 of the matching .mat entry)
train_ids, val_ids, test_ids = (mat[split_key][0] for split_key in ("trn1", "val1", "tst1"))

# The amount of images in each split
train_size, val_size, test_size = map(len, (train_ids, val_ids, test_ids))

print(
    f"Training  : {train_size} images.",
    f"Validation: {val_size} images.",
    f"Testing   : {test_size} images.",
    sep="\n",
)

Training  : 680 images.
Validation: 340 images.
Testing   : 340 images.


Create the 17 label subfolders (one per flower class) inside each split folder.

In [3]:
import os
# Class labels: one per line of the labels file, surrounding whitespace removed
with open("17flowers_labels.txt", "r") as labels_file:
    flower_labels = list(map(str.strip, labels_file))

# Root folder that contains the images
path_to_data = "data/17flowers/"

def create_subfolders(path):
    """
    Create train/val/test subfolders, each with one subfolder per flower label.

    Args:
        path: Root directory under which the split folders are created. A
            trailing path separator is optional.

    Labels are read from the module-level `flower_labels` list. Folders that
    already exist are left untouched.
    """
    for split_name in ("train", "val", "test"):
        for label in flower_labels:
            # os.path.join supplies the separators, so `path` works with or
            # without a trailing slash (plain concatenation would silently
            # create e.g. "data/17flowerstrain/<label>" when it is missing).
            os.makedirs(os.path.join(path, split_name, label), exist_ok=True)
    print(f"All subfolders created at {path}.")

# Create the split/label folder tree under the main data folder
create_subfolders(path_to_data)

All subfolders created at data/17flowers/.


In [5]:
# move the images into the subfolders
def move_images_to_subfolders(path):
    """
    Copy images from `path/jpg` into the train/val/test label subfolders of `path`.

    Despite the name, files are copied (not moved), so `path/jpg` is left intact.
    The split of an image is looked up from the module-level `train_ids`,
    `val_ids` and `test_ids`; its label folder is derived from the numeric id
    found in the filename.

    Args:
        path: Root directory that contains the `jpg` folder and the split
            folders. A trailing path separator is optional.

    Files whose id is in none of the splits (or whose name contains no number)
    are reported and skipped.
    """
    src_path = os.path.join(path, "jpg")

    # Build one id -> split lookup instead of scanning the id arrays for every
    # file. setdefault keeps the original precedence: train, then val, then test.
    split_of_id = {}
    for split_name, ids in (("train", train_ids), ("val", val_ids), ("test", test_ids)):
        for image_id in ids:
            split_of_id.setdefault(int(image_id), split_name)

    for filename in os.listdir(src_path):
        if not filename.endswith(".jpg"):
            continue

        # Get the id of the image from the first run of digits in its filename
        id_match = re.search(r'\d+', filename)
        file_id = int(id_match.group()) if id_match else None
        split = split_of_id.get(file_id)

        if split is None:
            print(f"{filename} isn't associated with any splits.")
            # Skip the file: falling through would either raise NameError or
            # reuse the split of the previously processed image.
            continue

        # File ids start from 1, so subtract one. 80 images per label
        subfolder_id = (file_id - 1) // 80
        subfolder_name = os.path.join(path, split, flower_labels[subfolder_id])

        # Copy the image into its split/label subfolder
        shutil.copy(os.path.join(src_path, filename), os.path.join(subfolder_name, filename))
    print(f"Images copied successfully to {path} test/train/val subfolders.")


# Copy every image from the jpg folder into its split/label subfolder
move_images_to_subfolders(path_to_data)

Images copied successfully to data/17flowers/ test/train/val subfolders.


In [6]:
# Split directories
data_root = pathlib.Path(path_to_data)
train_dir = data_root / "train"
val_dir = data_root / "val"
test_dir = data_root / "test"

# Image counts (one level of label folders below each split directory)
train_count = len(list(train_dir.glob('*/*.jpg')))
val_count = len(list(val_dir.glob('*/*.jpg')))
test_count = len(list(test_dir.glob('*/*.jpg')))

print(f"Number of images at {path_to_data}: {train_count}/{train_size} (train), {val_count}/{val_size} (val), {test_count}/{test_size} (test)")

# The number of images in each folder should be the same as the amount of ids provided by the datasplits.mat file.
# Each message reports the expected size of its own split.
assert train_count == train_size, f"Expected {train_size} images, but {train_dir} has {train_count}"
assert val_count == val_size, f"Expected {val_size} images, but {val_dir} has {val_count}"
assert test_count == test_size, f"Expected {test_size} images, but {test_dir} has {test_count}"

Number of images at data/17flowers/: 680/680 (train), 340/340 (val), 340/340 (test)


In [11]:
# Loader settings shared by the three splits
loader_kwargs = dict(
    seed=123,
    image_size=(image_size, image_size),
    batch_size=batch_size,
)

# One dataset per split directory; labels are inferred from the subfolder names
train_ds = tf.keras.utils.image_dataset_from_directory(train_dir, **loader_kwargs)
val_ds = tf.keras.utils.image_dataset_from_directory(val_dir, **loader_kwargs)
test_ds = tf.keras.utils.image_dataset_from_directory(test_dir, **loader_kwargs)

Found 680 images belonging to 17 classes.
Found 340 images belonging to 17 classes.
Found 340 images belonging to 17 classes.


### cc dataset

In [None]:
# Root folder of the "cc" copy of the dataset; make sure its folder tree exists
path_to_data_cc = "data/17flowers/cc/"
create_subfolders(path_to_data_cc)

# Split directories
data_root_cc = pathlib.Path(path_to_data_cc)
train_dir_cc = data_root_cc / "train"
val_dir_cc = data_root_cc / "val"
test_dir_cc = data_root_cc / "test"

# Image counts (one level of label folders below each split directory)
train_count_cc = len(list(train_dir_cc.glob('*/*.jpg')))
val_count_cc = len(list(val_dir_cc.glob('*/*.jpg')))
test_count_cc = len(list(test_dir_cc.glob('*/*.jpg')))

print(f"Number of images at {path_to_data_cc}: {train_count_cc}/{train_size} (train), {val_count_cc}/{val_size} (val), {test_count_cc}/{test_size} (test)")

# The cc folders must hold as many images as the original splits.
# Each message reports the expected size of its own split.
assert train_count_cc == train_size, f"Expected {train_size} images, but {train_dir_cc} has {train_count_cc}"
assert val_count_cc == val_size, f"Expected {val_size} images, but {val_dir_cc} has {val_count_cc}"
assert test_count_cc == test_size, f"Expected {test_size} images, but {test_dir_cc} has {test_count_cc}"

In [None]:
# Loader settings shared by the three cc splits
cc_loader_kwargs = dict(
    seed=123,
    image_size=(image_size, image_size),
    batch_size=batch_size,
)

# One dataset per cc split directory; labels are inferred from the subfolder names
train_ds_cc = tf.keras.utils.image_dataset_from_directory(train_dir_cc, **cc_loader_kwargs)
val_ds_cc = tf.keras.utils.image_dataset_from_directory(val_dir_cc, **cc_loader_kwargs)
test_ds_cc = tf.keras.utils.image_dataset_from_directory(test_dir_cc, **cc_loader_kwargs)

In [None]:
# Multiplies its input by 1/255; the same instance is added to every model built below
normalization_layer = tf.keras.layers.Rescaling(scale=1.0 / 255)

In [None]:
# Scale the learning rate by 0.2 after 5 epochs without val_loss improvement, never below 1e-5
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-5,
)

In [None]:
# Stop training after 25 epochs without val_loss improvement
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=25,
)

## Base model function

### base model for cc models

In [None]:
from keras.applications import VGG16
from processing.grey_world.cc_layers import WhitePatch
from keras import models
from keras import layers
from keras import optimizers
 

def ccModel(cc_layers=None):
    """
    Build and compile a VGG16-based classifier with 17 output classes.

    The model is: shared rescaling layer -> optional `cc_layers` -> VGG16
    convolutional base (ImageNet weights, no top) -> Flatten -> Dense(1024, relu)
    -> Dropout(0.5) -> Dense(17, softmax).

    Args:
        cc_layers: Optional Keras layer inserted between the rescaling layer and
            the VGG16 base (presumably a colour-constancy layer such as
            GreyWorld/WhitePatch/GreyEdge). When None, no extra layer is added.

    Returns:
        The compiled and built `Sequential` model.
    """
    # Load the VGG model
    vgg_conv = VGG16(weights='imagenet', include_top=False, input_shape=(image_size, image_size, 3))

    # Freeze every layer of the base except its last four
    for layer in vgg_conv.layers[:-4]:
        layer.trainable = False

    # Create the model
    model = models.Sequential()
    model.add(normalization_layer)

    if cc_layers is not None:
        # Add cc layers
        model.add(cc_layers)

    # Add the vgg convolutional base model
    model.add(vgg_conv)

    # Add new layers
    model.add(layers.Flatten())
    model.add(layers.Dense(1024, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(17, activation='softmax'))

    # Compile the model. `learning_rate` is the supported argument name; the old
    # `lr` alias is deprecated and newer Keras releases ignore or reject it.
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=optimizers.RMSprop(learning_rate=2e-4),
                  metrics=['accuracy'])

    # Build the model
    model.build((None, image_size, image_size, 3))

    return model

cc model variant that uses a Conv2D + BatchNormalization + ReLU front end in place of a cc layer

In [None]:
def ccBachModel():
    """
    Build and compile the VGG16-based classifier with a learned
    Conv2D + BatchNormalization + ReLU front end.

    The model is: shared rescaling layer -> Conv2D(3 filters, 5x5, same padding)
    -> BatchNormalization -> ReLU -> VGG16 convolutional base (ImageNet weights,
    no top) -> Flatten -> Dense(1024, relu) -> Dropout(0.5) -> Dense(17, softmax).

    Returns:
        The compiled and built `Sequential` model.
    """
    # Load the VGG model
    vgg_conv = VGG16(weights='imagenet', include_top=False, input_shape=(image_size, image_size, 3))

    # Freeze every layer of the base except its last four
    for layer in vgg_conv.layers[:-4]:
        layer.trainable = False

    # Create the model
    model = models.Sequential()
    model.add(normalization_layer)

    # Learned front end; 3 output channels keep the shape the VGG base expects
    model.add(tf.keras.layers.Conv2D(3, 5, padding="same", input_shape=(image_size, image_size, 3)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.ReLU())

    # Add the vgg convolutional base model
    model.add(vgg_conv)

    # Add new layers
    model.add(layers.Flatten())
    model.add(layers.Dense(1024, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(17, activation='softmax'))

    # Compile the model. `learning_rate` is the supported argument name; the old
    # `lr` alias is deprecated and newer Keras releases ignore or reject it.
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=optimizers.RMSprop(learning_rate=2e-4),
                  metrics=['accuracy'])

    # Build the model
    model.build((None, image_size, image_size, 3))

    return model

In [None]:
import time

def experiment(model, train_ds, val_ds, test_ds, n_trials=10):
    """
    Train and evaluate `model` `n_trials` times and collect per-trial metrics.

    Each trial restores the weights the model had when this function was called,
    recompiles it (fresh RMSprop optimizer), fits on `train_ds` with `val_ds` as
    validation data using the module-level `epochs`, `reduce_lr` and `early_stop`,
    then evaluates on `test_ds`.

    Args:
        model: Keras model to train; it is modified in place and keeps the
            weights of the last trial.
        train_ds: Training dataset passed to `model.fit`.
        val_ds: Validation dataset passed to `model.fit`.
        test_ds: Test dataset passed to `model.evaluate`.
        n_trials: Number of train/evaluate repetitions.

    Returns:
        Dict of lists with one entry per trial: "train_time" and "test_time"
        (seconds), last-epoch "train_acc", "train_loss", "val_acc", "val_loss",
        the "test_acc" and "test_loss" from evaluation, and the Keras "history"
        object returned by `fit`.
    """
    metrics = {
        "train_time": [],
        "test_time": [],
        "train_acc": [],
        "train_loss": [],
        "val_acc": [],
        "val_loss": [],
        "test_acc": [],
        "test_loss": [],
        "history": []
    }

    # Snapshot the starting weights so that every trial begins from the same
    # state; otherwise trial k+1 would simply continue training trial k's model.
    # An unbuilt model has no weights yet, in which case nothing is restored.
    initial_weights = model.get_weights()

    for _ in range(n_trials):
        if initial_weights:
            model.set_weights(initial_weights)

        # Recompiling gives each trial a fresh optimizer state. `learning_rate`
        # is the supported argument name; the old `lr` alias is deprecated and
        # newer Keras releases ignore or reject it.
        model.compile(
            optimizer=tf.keras.optimizers.RMSprop(learning_rate=2e-4),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=['accuracy'])

        start_time = time.perf_counter()
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            callbacks=[reduce_lr, early_stop],
            verbose=0)
        training_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        test_loss, test_acc = model.evaluate(test_ds, verbose=0)
        test_time = time.perf_counter() - start_time

        # Train/val figures are those of the last epoch that ran
        metrics["train_time"].append(training_time)
        metrics["test_time"].append(test_time)
        metrics["train_acc"].append(history.history["accuracy"][-1])
        metrics["train_loss"].append(history.history["loss"][-1])
        metrics["val_acc"].append(history.history["val_accuracy"][-1])
        metrics["val_loss"].append(history.history["val_loss"][-1])
        metrics["test_acc"].append(test_acc)
        metrics["test_loss"].append(test_loss)
        metrics["history"].append(history)
    return metrics

### Experiments

In [None]:
from processing.grey_world.cc_layers import GreyWorld, WhitePatch, GreyEdge

# cc layer instances, one per algorithm
grey_world_layer, white_patch_layer, grey_edge_layer = GreyWorld(), WhitePatch(), GreyEdge()

# Create new models
model_base = ccModel()
model_batch = ccBachModel()
model_gw = ccModel(grey_world_layer)
model_ge = ccModel(grey_edge_layer)
model_wp = ccModel(white_patch_layer)
model_fc4 = ccModel()

# Dataset triples: the original images and the cc copy
plain_splits = (train_ds, val_ds, test_ds)
cc_splits = (train_ds_cc, val_ds_cc, test_ds_cc)

# (result key, model, datasets) in the order the experiments are run
experiment_plan = [
    ("Base", model_base, plain_splits),
    ("BatchNorm", model_batch, plain_splits),
    ("GreyWorld", model_gw, plain_splits),
    ("GreyEdge", model_ge, plain_splits),
    ("WhitePatch", model_wp, plain_splits),
    ("FC4", model_fc4, cc_splits),
]

# Run experiments
n_trials = 1
metrics = {}
for run_name, run_model, run_splits in experiment_plan:
    metrics[run_name] = experiment(run_model, *run_splits, n_trials=n_trials)

### Saving the results

In [None]:
import pandas as pd
from datetime import datetime
# Get the current timestamp
timestamp = datetime.now().strftime("%Y-%m-%d %H-%M-%S")

# Export data to Excel sheet
dst_path = f"./out/{timestamp}_experiments_17flowers.xlsx"

# Make sure the output folder exists so the (expensive) results are not lost
# to a missing directory when the writer opens the file.
os.makedirs(os.path.dirname(dst_path), exist_ok=True)

with pd.ExcelWriter(dst_path, engine='xlsxwriter') as writer:
    # One row per (algorithm, trial) with the final metrics; the Keras history
    # objects are written to their own sheets below instead.
    end_data = pd.concat(
        {k: pd.DataFrame(v) for k, v in metrics.items()},
        axis=0,
        names=["Algorithm", "Trial"],
    ).drop(columns="history")
    end_data.to_excel(writer, sheet_name="Final Data", merge_cells=False)

    # One sheet per algorithm holding the per-epoch curves of every trial
    for k, metric in metrics.items():
        histories = metric["history"]
        if not histories:
            # Nothing was trained for this entry; pd.concat rejects empty input
            continue
        algo_data = pd.concat({f"{i}": pd.DataFrame(history.history) for i, history in enumerate(histories)}, axis=1)
        algo_data.to_excel(writer, sheet_name=f"{k} History", merge_cells=False)

print(f"Data saved to {dst_path}")
