In [1]:
import pathlib
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.applications import EfficientNetB0
from keras_preprocessing.image import ImageDataGenerator
from keras.callbacks import LearningRateScheduler, ModelCheckpoint, TensorBoard
from keras.engine import base_layer

from augmentation import RandomColorDistortion
from callbacks import scheduler, TimeStopping

In [2]:
plt.rcParams["figure.dpi"] = 200

In [3]:
DATASET_PATH = "../../dataset/"

In [4]:
IMAGE_SIZE = (224, 224)
COLOR_MODE = "rgb"
BATCH_SIZE = 32

In [5]:
data = []
for file in pathlib.Path(DATASET_PATH).glob("*/*"):
    data.append({"filename": file.resolve().as_posix(), "label": file.resolve().parent.name})

In [6]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,filename,label
0,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger
1,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger
2,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger
3,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger
4,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger


In [7]:
df[["dataset", "time"]] = df["filename"].str.split("/").str[-1].str.extract(r"^([^_]+)_[^_]+_([^_]+)")
df["time"].unique()

array(['unspecified', 'day', 'night'], dtype=object)

In [8]:
df.head()

Unnamed: 0,filename,label,dataset,time
0,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger,CCT20,unspecified
1,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger,CCT20,unspecified
2,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger,CCT20,unspecified
3,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger,CCT20,unspecified
4,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,badger,CCT20,unspecified


In [9]:
df.groupby("label")["filename"].nunique()

label
None_of_the_above    3400
badger               2447
bear                 2184
bird                 2777
boar                 2737
cat                  6739
chicken               680
cow                  2592
deer                 6652
dog                  4775
fox                  2736
hare                 6392
horse                  62
human                2980
squirrel             2775
vehicle              2829
weasel               3026
Name: filename, dtype: int64

In [10]:
df.groupby(["label", "time"]).agg(count = ("filename", "count"))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
label,time,Unnamed: 2_level_1
None_of_the_above,day,3000
None_of_the_above,night,400
badger,day,955
badger,night,1474
badger,unspecified,18
bear,day,985
bear,night,420
bear,unspecified,779
bird,unspecified,2777
boar,day,1287


In [11]:
#df.query("label != 'horse'", inplace = True)
df.loc[df["label"].isin(["chicken", "horse"]), "label"] = "None_of_the_above"

In [12]:
df.groupby("label")["filename"].nunique()

label
None_of_the_above    4142
badger               2447
bear                 2184
bird                 2777
boar                 2737
cat                  6739
cow                  2592
deer                 6652
dog                  4775
fox                  2736
hare                 6392
human                2980
squirrel             2775
vehicle              2829
weasel               3026
Name: filename, dtype: int64

In [13]:
df_classes = df.query("label != 'None_of_the_above'")

In [14]:
MIN_COUNT = df_classes.groupby("label")["filename"].nunique().min()  # 3000
MIN_COUNT

2184

In [15]:
def balance_dataset(x):
    return resample(x, replace = False, n_samples = min(MIN_COUNT, len(x)), random_state = 42, stratify = x[["dataset", "time"]])

df_balanced = df_classes.groupby("label", group_keys = False).apply(balance_dataset)
df_balanced = df_balanced.sample(frac = 1, random_state = 42)
df_balanced.reset_index(drop = True, inplace = True)
df_balanced.head()

Unnamed: 0,filename,label,dataset,time
0,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,hare,CCT20,unspecified
1,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,deer,NTLNP,night
2,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,bear,NTLNP,night
3,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,bear,NTLNP,day
4,C:/Users/Ulixe/Desktop/ml-piane-di-sopra/src/d...,dog,ENA24,unspecified


In [16]:
df_balanced = pd.concat([df_balanced, df.query("label == 'None_of_the_above'")], axis = 0, ignore_index = True)

In [17]:
df_balanced.groupby("label")["filename"].nunique()

label
None_of_the_above    4142
badger               2184
bear                 2184
bird                 2184
boar                 2184
cat                  2184
cow                  2184
deer                 2184
dog                  2184
fox                  2184
hare                 2184
human                2184
squirrel             2184
vehicle              2184
weasel               2184
Name: filename, dtype: int64

In [18]:
df_balanced.groupby(["label", "time", "dataset"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,filename
label,time,dataset,Unnamed: 3_level_1
None_of_the_above,day,UPENN,1000
None_of_the_above,day,casanova,1000
None_of_the_above,day,google,1000
None_of_the_above,night,casanova,400
None_of_the_above,unspecified,ENA24,742
badger,day,NTLNP,852
badger,night,NTLNP,354
badger,night,Sheffield,962
badger,unspecified,CCT20,16
bear,day,NTLNP,985


In [19]:
df_train, df_valid = train_test_split(df_balanced, 
                                      test_size = 0.2, 
                                      shuffle = True, 
                                      random_state = 42, 
                                      stratify = df_balanced[["label", "dataset", "time"]])

In [20]:
df_valid.groupby(["label", "time", "dataset"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,filename
label,time,dataset,Unnamed: 3_level_1
None_of_the_above,day,UPENN,200
None_of_the_above,day,casanova,200
None_of_the_above,day,google,200
None_of_the_above,night,casanova,80
None_of_the_above,unspecified,ENA24,148
badger,day,NTLNP,170
badger,night,NTLNP,71
badger,night,Sheffield,192
badger,unspecified,CCT20,3
bear,day,NTLNP,197


In [21]:
df_train.groupby("label")["filename"].nunique()

label
None_of_the_above    3314
badger               1748
bear                 1747
bird                 1747
boar                 1746
cat                  1747
cow                  1747
deer                 1747
dog                  1748
fox                  1747
hare                 1748
human                1747
squirrel             1747
vehicle              1747
weasel               1747
Name: filename, dtype: int64

In [22]:
df_valid.groupby("label")["filename"].nunique()

label
None_of_the_above    828
badger               436
bear                 437
bird                 437
boar                 438
cat                  437
cow                  437
deer                 437
dog                  436
fox                  437
hare                 436
human                437
squirrel             437
vehicle              437
weasel               437
Name: filename, dtype: int64

In [23]:
datagen = ImageDataGenerator()
valid_datagen = ImageDataGenerator()

train_generator = datagen.flow_from_dataframe(dataframe = df_train,
                                              directory = None, 
                                              x_col = "filename",
                                              y_col = "label",
                                              batch_size = BATCH_SIZE,
                                              seed = 42,
                                              shuffle = True,
                                              class_mode = "categorical",
                                              target_size = IMAGE_SIZE)

valid_generator = valid_datagen.flow_from_dataframe(dataframe = df_valid,
                                                    directory = None, 
                                                    x_col = "filename",
                                                    y_col = "label",
                                                    batch_size = BATCH_SIZE,
                                                    seed = 42,
                                                    shuffle = True,
                                                    class_mode = "categorical",
                                                    target_size = IMAGE_SIZE)

Found 27774 validated image filenames belonging to 15 classes.
Found 6944 validated image filenames belonging to 15 classes.


In [None]:
# Define labels ordered according to dataset storage.
LABELS = list(train_generator.class_indices.keys())
NUM_CLASSES = len(LABELS)

In [None]:
plt.figure(figsize = (10, 10))
for i in range(6):
    batch_index = 0
    images, labels = next(train_generator)
    preprocessed_image = images
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(preprocessed_image[batch_index].astype("uint8"))
    plt.title(np.array(LABELS)[labels[batch_index] == 1][0])
    plt.tight_layout()
    plt.axis("off")

In [None]:
# Data augmentation layers.
data_augmentation = keras.Sequential([
                                      #layers.RandomRotation(factor = (-0.1, 0.1), fill_mode = "wrap"),
                                      #layers.RandomTranslation(height_factor = (-0.1, 0.1), width_factor = (-0.1, 0.1), fill_mode = "wrap"),
                                      layers.RandomFlip(),
                                      #RandomColorDistortion(brightness_max_delta = 0.2, 
                                      #                      saturation_delta = (0.5, 0.9),
                                      #                      hue_max_delta = 0.2, 
                                      #                      contrast_delta = (0.5, 0.9)),
                                     ], name = "data_augmentation")

In [None]:
plt.figure(figsize = (10, 10))
for i in range(6):
    batch_index = 0
    images, labels = next(train_generator)
    preprocessed_image = images
    augmented_images = data_augmentation(preprocessed_image)
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(augmented_images[batch_index].numpy().astype("uint8"))
    plt.title(np.array(LABELS)[labels[batch_index] == 1][0])
    plt.tight_layout()
    plt.axis("off")

In [None]:
def scheduler(epoch, lr):
    """Learning scheduler."""
    if epoch <= 200:
        return 0.01
    elif epoch > 200 and epoch <= 300:
        return 0.001
    else:
        return 0.0001

In [None]:
def build_model(num_classes):
    inputs = layers.Input(shape = (IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
    
    inputs_augmented = data_augmentation(inputs)
    
    model = EfficientNetB0(include_top = False, input_tensor = inputs_augmented, weights = "imagenet")
    
    # Freeze the pretrained weights.
    model.trainable = False

    # Unfreeze the top layers while leaving BatchNorm layers frozen.
    for layer in model.layers[-3:]:
        if not isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = True

    # Rebuild top.
    x = layers.GlobalAveragePooling2D(name = "avg_pool")(model.output)
    x = layers.BatchNormalization()(x)

    top_dropout_rate = 0.5
    x = layers.Dropout(top_dropout_rate, name = "top_dropout")(x)
    outputs = layers.Dense(NUM_CLASSES, activation = "softmax", name = "prediction")(x)

    # Compile.
    model = tf.keras.Model(inputs, outputs, name = "EfficientNet")
    optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-2)
    model.compile(optimizer = optimizer, loss = "categorical_crossentropy", metrics = ["accuracy"])
    
    return model

In [None]:
model = build_model(NUM_CLASSES)
model.summary()

In [None]:
epochs = 300

lr_scheduler = LearningRateScheduler(scheduler)
cp_callback = ModelCheckpoint(filepath = "./checkpoints/weights.h5",
                              save_weights_only = True,
                              save_best_only = True,
                              verbose = 1)
tb_callback = TensorBoard("./logs")
history = model.fit(train_generator, 
                    validation_data = valid_generator, 
                    epochs = epochs, 
                    validation_freq = 5,
                    callbacks = [lr_scheduler, tb_callback])

In [None]:
plt.plot(history.history["accuracy"], label = "accuracy")
plt.plot(history.history["val_accuracy"], label = "val_accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.ylim([0, 1])
plt.legend(loc = "best")

In [None]:
model.save_weights("weights.h5")

In [None]:
with open("labels", "wb") as fp:   
    pickle.dump(LABELS, fp)

In [None]:
with open("shape", "wb") as fp:   
    pickle.dump(IMAGE_SIZE, fp)