In [2]:
# Basic python packages
import datetime
import glob
import os
import shutil
from collections import defaultdict
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# General machine learning packages
from sklearn.model_selection import train_test_split

# Packages related to images
from PIL import Image
import PIL

# Packages for neural networks
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten, Embedding
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [3]:
# Check if GPU works: list the GPU devices TensorFlow can see (shown as the cell output)
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [22]:
# Paths to the different folders/files
image_dir = "../Data/Rijksmuseum/jpg2"
train_dir = "../Data/Rijksmuseum/train_set"
# Fixed: this path used to start with "..Data", i.e. the "/" after ".." was missing.
resized_train_dir = "../Data/Rijksmuseum/resized_jpg2"
test_image_dir = "../Data/Rijksmuseum/test_set"
labels_file = "../Data/Rijksmuseum/xml_files.csv"
training_path = "../Data/Rijksmuseum/training_data/"
validation_path = "../Data/Rijksmuseum/validation_data/"

img_size = (200, 200)  # Size of the input of the neural networks
IMG_SHAPE = img_size + (3,)  # img_size extended with a channel dimension of 3
batch_size = 32
# Must be a plain int: `6,622` is parsed by Python as the tuple (6, 622).
# Presumably the number of distinct creator labels -- confirm against the training folders.
n_labels = 6622

In [23]:
# Read the metadata table and keep only the image identifier and its creator.
labels = pd.read_csv(labels_file)[['Identifier', 'Creator']]
labels

Unnamed: 0,Identifier,Creator
0,SK-A-4878,"Everdingen, Caesar Boëtius van"
1,SK-A-4877,"Maris, Matthijs"
2,SK-A-4881,"Maes, Nicolaes"
3,RP-P-1992-35,"Coornhert, Dirck Volckertsz"
4,RP-P-1992-36,"Coornhert, Dirck Volckertsz"
...,...,...
112034,AK-RBK-14763-A-2,anoniem
112035,RP-P-OB-86.512,"Bos, Maarten"
112036,NG-NM-7753,anoniem
112037,NG-NM-8358,"Coenraads, Jacobus (Senior)"


# Image size
Most machine learning models require inputs of a fixed size. Because our images do not all have the same size, we have to resize them.

In [17]:
def resize_images(img_names, img_dir, new_img_dir):
    """
    Writes a resized copy of every listed image into another folder.

    Deprecated, replaced by the flow from directory.

    :img_names: Iterable with the names of the images, without the ".jpg" extension
    :img_dir: Path to the folder with the original images
    :new_img_dir: Path to the folder where the resized images are saved (created if missing)
    :return: Nothing
    """
    # Saving into a folder that does not exist yet would fail on the first image.
    os.makedirs(new_img_dir, exist_ok=True)
    for img in img_names:
        file_name = img + ".jpg"
        # The context manager closes the source file, so a long run does not
        # accumulate open file handles.
        with Image.open(os.path.join(img_dir, file_name)) as source:
            # img_size is the module-level target size.
            source.resize(img_size).save(os.path.join(new_img_dir, file_name))

# Resize every image listed in `labels` from image_dir into resized_train_dir.
# NOTE(review): resize_images is marked as deprecated in favour of flow_from_directory -- confirm this call is still needed.
resize_images(labels["Identifier"], image_dir, resized_train_dir)

# Split

In [21]:
# Hold out 20% of the (identifier, creator) pairs; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    labels["Identifier"].to_numpy(),
    labels["Creator"].to_numpy(),
    random_state=42,
    test_size=0.2,
)
X_train

array(['RP-P-BI-250', 'RP-P-BI-6678', 'RP-P-1880-A-4126', ...,
       'RP-P-1944-663', 'SK-C-221', 'RP-P-1918-2049'], dtype=object)

# Reformat data for learning
To be able to load the data with a generator, we must split the training and validation data and place the images into folders based on their labels.

In [None]:
def generate_label_folders(image_path, image_names, image_labels, destination_path):
    """
    Splits a single folder with images into multiple folders where images are placed based on their labels.

    :image_path: path to the folder with the images
    :image_names: A numpy array with the names of all images (without the ".jpg" extension)
    :image_labels: A numpy array with the labels of all images, aligned with image_names
    :destination_path: Path of the folder where the images are placed into (with or without a trailing slash)
    :return: Nothing
    :raises ValueError: If image_names and image_labels differ in length
    """
    if len(image_names) != len(image_labels):
        raise ValueError("image_names and image_labels must have the same length")

    for name, label in zip(image_names, image_labels):
        # Build the label folder with os.path.join so that the folder that is
        # created and the folder that is written to are always the same one,
        # whether or not destination_path ends with a separator.
        label_dir = os.path.join(destination_path, str(label))
        os.makedirs(label_dir, exist_ok=True)

        # Copy the image byte-for-byte; decoding and re-saving a JPEG would
        # recompress it and lose quality.
        file_name = name + ".jpg"
        shutil.copyfile(os.path.join(image_path, file_name),
                        os.path.join(label_dir, file_name))
        

# generate_label_folders(image_dir, X_train, y_train, training_path)
# generate_label_folders(image_dir, X_test, y_test, test_image_dir)

# Image Loading
Because the dataset is so large, we can't just load it into memory. Instead, we generate batches of images. The training images are then altered slightly to create higher variance between images and to artificially increase the size of our training data.

In [None]:
def normalize(image):
    """
    Casts an image to float32 and maps its values with x / 127.5 - 1.

    For pixel values in the [0, 255] range this yields values in [-1, 1].
    The result is a numpy array, matching the numpy-in / numpy-out contract
    that ImageDataGenerator documents for its `preprocessing_function`.

    :image: Array-like with the pixel values of one image
    :return: A float32 numpy array with the same shape as the input
    """
    image = np.asarray(image, dtype=np.float32)
    return (image / 127.5) - 1

# Options shared by all three directory iterators below.
flow_options = dict(target_size=img_size, batch_size=batch_size, class_mode='categorical')

# Training images are normalized and randomly sheared, zoomed, rotated and flipped.
train_datagen = ImageDataGenerator(
    horizontal_flip=True,
    rotation_range=2,
    zoom_range=0.2,
    shear_range=2,
    preprocessing_function=normalize,
)

# Validation and test images are only normalized, never augmented.
test_datagen = ImageDataGenerator(preprocessing_function=normalize)

# NOTE(review): each iterator presumably infers its class list from its own
# directory -- confirm all three directories contain the same label folders so
# that the class indices line up.
train_generator = train_datagen.flow_from_directory(training_path, **flow_options)

validation_generator = test_datagen.flow_from_directory(validation_path, shuffle=True, **flow_options)

# The test iterator keeps a fixed order (shuffle=False).
test_generator = test_datagen.flow_from_directory(test_image_dir, shuffle=False, **flow_options)

In [None]:
def train_model(model, steps_per_epoch=150, epochs=3, validation_steps=20, workers=7, checkpoint_loc=""):
    """
    Trains a given model on the module-level train_generator and validates it on validation_generator.

    :model: The compiled Keras model to train
    :steps_per_epoch: Amount of batches uploaded per epoch. Cant be higher than +- 200
    :epochs: Amount of times the model trains on the data
    :validation_steps: Amount of batches used for validation. Cant be higher than +- 50
    :workers: Amount of processes used to load the data
    :checkpoint_loc: Place for the model checkpoints to be saved. When empty (the default), no checkpoints are written
    :return: The trained model and some training data
    """
    callbacks = []
    if checkpoint_loc:
        # Create a callback that saves the model's weights. It is only attached
        # when a location is given: an empty filepath is not a usable target.
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_loc,
                                                            save_weights_only=True,
                                                            verbose=1))

    begin_time = datetime.datetime.now()
    # NOTE(review): `workers` is forwarded to Model.fit unchanged -- confirm the
    # installed Keras version still accepts it.
    history = model.fit(train_generator,
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        validation_data=validation_generator,
                        validation_steps=validation_steps,
                        workers=workers,
                        callbacks=callbacks)
    # Report how long the training took.
    print(datetime.datetime.now() - begin_time)
    return (model, history)

In [None]:
def plot_history(history):
    """
    Plots the training and validation accuracy per epoch.

    :history: Object whose `history` dict holds the 'accuracy' and 'val_accuracy' series
    :return: Nothing, the figure is shown directly
    """
    curves = history.history
    # One line per metric, labelled with the metric name.
    for metric in ('accuracy', 'val_accuracy'):
        plt.plot(curves[metric], label=metric)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()

In [None]:
def make_model(n_labels):
    """
    Builds and compiles a classifier on top of a frozen, ImageNet-pretrained EfficientNetV2L backbone.

    :n_labels: Number of classes, i.e. the number of units of the final (logit) layer
    :return: The compiled Keras model
    """
    # The data generators already scale the pixels with x / 127.5 - 1, so the
    # backbone's built-in rescaling is switched off; leaving it on would
    # normalize the inputs a second time.
    base_model = tf.keras.applications.EfficientNetV2L(input_shape=IMG_SHAPE,
                          include_top=False,
                          include_preprocessing=False,
                          weights='imagenet')
    # Freeze the backbone so that only the new head is trained.
    base_model.trainable = False

    model = tf.keras.Sequential([
      base_model,
      tf.keras.layers.GlobalAveragePooling2D(),
      tf.keras.layers.Dense(n_labels)  # no activation: outputs raw logits
    ])

    base_learning_rate = 0.0001
    # The generators use class_mode='categorical' (one-hot, one class per
    # image), so the matching loss is categorical cross-entropy on logits.
    # With this loss the 'accuracy' metric measures top-1 class accuracy rather
    # than per-output binary accuracy.
    model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=base_learning_rate),
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    return model

# Build the model with one output unit per label.
# NOTE(review): the variable is named after InceptionResNetV2, but make_model builds on EfficientNetV2L.
InceptionResNetV2model = make_model(n_labels)
# InceptionResNetV2model.summary()

In [None]:
# Train for 5 epochs of 100 batches each, validating on 50 batches; weights are checkpointed under ../Model_weights/InceptionResNetV2/
InceptionResNetV2model, history_InceptionResNetV2 = train_model(InceptionResNetV2model, steps_per_epoch=100, epochs=5, validation_steps=50, checkpoint_loc="../Model_weights/InceptionResNetV2/")


In [None]:
# Plot the training and validation accuracy of the training run above
plot_history(history_InceptionResNetV2)

In [None]:
# Save the trained model to disk
InceptionResNetV2model.save("models/_InceptionResNetV2model")

# Load model: uncomment the next line to restore the saved model
# InceptionResNetV2model = tf.keras.models.load_model('./models/_InceptionResNetV2model')
# Epochs trained: 5, 5, 15