In [44]:
# Basic python packages
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from collections import defaultdict
import glob
import splitfolders
import cv2
import random

# General machine learning packages
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Packages related to images
from PIL import Image
import PIL

# Packages for neural networks
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten, Embedding
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Dense,GlobalAveragePooling2D,Convolution2D,BatchNormalization
from tensorflow.keras.layers import Flatten,MaxPooling2D,Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image

In [45]:
# !pip install --upgrade tensorflow_hub

In [46]:
# List the GPU devices TensorFlow can see (an empty list would mean none is visible)
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [47]:
def clean_data(path):
    """Read the metadata CSV at ``path`` and keep only rows with a usable creator.

    Rows are dropped when ``Creator`` is missing, empty, ``'anoniem'`` or
    ``'onbekend'``. Afterwards every cell equal to
    ``'Koning, Cornelis (?-1671)'`` is rewritten to ``'Koning, Cornelis'``.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain a ``Creator`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, still carrying their original index labels.
    """
    raw = pd.read_csv(path)
    creator = raw['Creator']
    # Values that stand in for "no known artist"
    placeholder_names = ['', 'anoniem', 'onbekend']
    has_named_creator = creator.notna() & ~creator.isin(placeholder_names)
    named = raw[has_named_creator]
    return named.replace('Koning, Cornelis (?-1671)', 'Koning, Cornelis')

def preprocess_data(df, num_artworks = 10):
    """Keep prolific creators and add integer-encoded label columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata containing at least the columns ``Creator`` and ``Type``.
    num_artworks : int, default 10
        Minimum number of rows a creator must have to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to creators with at least ``num_artworks``
        rows, extended with ``Creator_cat`` and ``Type_cat`` columns holding
        the ``LabelEncoder`` codes of ``Creator`` and ``Type``.
    """
    # Take an explicit copy: adding columns to the result of groupby.filter
    # can trigger pandas' SettingWithCopyWarning (chained assignment), and the
    # copy guarantees the caller's frame is never written to.
    df = df.groupby("Creator").filter(lambda x: len(x) >= num_artworks).copy()
    # A separate encoder per column keeps the two label spaces independent.
    df['Creator_cat'] = preprocessing.LabelEncoder().fit_transform(df.Creator)
    df['Type_cat'] = preprocessing.LabelEncoder().fit_transform(df.Type)
    return df

# Load the raw metadata CSV and drop rows without a usable creator name
df = clean_data('../Data/Rijksmuseum/xml_files.csv')
# Keep only creators with at least 100 artworks and add the encoded label columns
rijksdata = preprocess_data(df, 100)

In [48]:
# Inspect the filtered metadata together with the encoded Creator_cat / Type_cat columns
rijksdata

Unnamed: 0,Identifier,Creator,Title,Date,Type,Creator_cat,Type_cat
3,RP-P-1992-35,"Coornhert, Dirck Volckertsz",Keizer Karel V omgeven door zijn overwonnen te...,1555 - 1555,prent,50,77
4,RP-P-1992-36,"Coornhert, Dirck Volckertsz",Gevangename van Frans I tijdens de slag van Pa...,1555 - 1556,prent,50,77
7,RP-P-1992-70,"Hondius, Hendrick (I)",Vanitasstilleven met een schedel met lauwerkra...,1626 - 1626,prent,105,77
100,RP-T-1905-158(R),"Fokke, Simon","Jonge edelman, naar links in gebogen houding",1710 - 1779,tekening,76,104
101,RP-T-1905-158(V),"Fokke, Simon",Zittende man,ca. 1710 - ca. 1779,tekening,76,104
...,...,...,...,...,...,...,...
111961,RP-P-OB-103.253,"Galle, Cornelis (I)",Maria met Kind en de H. Bernardus van Siena en...,1612 - 1633,prent,82,77
111962,RP-P-OB-103.254,"Galle, Cornelis (I)",Petrus doopt de H. Prisca van Rome,ca. 1586 - ca. 1610,prent,82,77
111965,RP-P-OB-103.259,"Natalis, Michel",H. Franciscus van Assisi,1620 - 1668,prent,140,77
111966,RP-P-OB-103.260,"Sadeler, Johann (I)",Heilige Apollonia (),1583 - 1587,prent,166,77


In [17]:
# Locations of the source images and of the train / validation / test splits
image_dir = "../Data/Rijksmuseum/jpg2/"
split_image_dir = "../Data/Rijksmuseum/jpg2_split_100/"
training_path = "../Data/Rijksmuseum/output_100/train/"
validation_path = "../Data/Rijksmuseum/output_100/val/"
testing_path = "../Data/Rijksmuseum/output_100/test/"

# Spatial size of the network input; IMG_SHAPE appends the three colour channels
img_size = (200, 200)
IMG_SHAPE = (*img_size, 3)
batch_size = 256

# Number of distinct classes for each of the two prediction targets
n_labels_c1 = rijksdata['Creator'].nunique()
n_labels_c2 = rijksdata['Type'].nunique()

In [49]:
# Identifier plus the two encoded targets is all the label lookup needs
labels = rijksdata.loc[:, ['Identifier', 'Creator_cat', 'Type_cat']]
# Spot-check the encoded labels of a single artwork
labels.loc[labels['Identifier'] == 'RP-P-1878-A-550']

Unnamed: 0,Identifier,Creator_cat,Type_cat
29196,RP-P-1878-A-550,18,77


In [57]:
def _build_label_dict(root_dir, label_lookup):
    """Map every image file below ``root_dir`` to its (Creator_cat, Type_cat) pair.

    Parameters
    ----------
    root_dir : str
        Directory that holds one sub-folder per class, each containing image files.
    label_lookup : dict
        Maps an artwork identifier (file name without extension) to its label pair.

    Returns
    -------
    dict
        ``{image_path: (creator_label, type_label)}``.

    Raises
    ------
    KeyError
        If an image's identifier has no entry in ``label_lookup``.
    """
    path_to_labels = {}
    for folder in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, folder)
        # Ignore stray files (e.g. OS metadata) that sit next to the class folders
        if not os.path.isdir(folder_path):
            continue
        for file_name in os.listdir(folder_path):
            # splitext removes only the real extension, so identifiers that
            # themselves contain dots (e.g. 'RP-P-OB-103.253') stay intact
            identifier = os.path.splitext(file_name)[0]
            if identifier not in label_lookup:
                raise KeyError(
                    "No label found for image %r (identifier %r)"
                    % (os.path.join(folder_path, file_name), identifier))
            path_to_labels[os.path.join(folder_path, file_name)] = label_lookup[identifier]
    return path_to_labels


# Build the identifier -> labels lookup once instead of scanning the whole
# DataFrame twice for every single image file.
label_lookup = {}
for _identifier, _creator_cat, _type_cat in zip(labels['Identifier'].to_numpy(),
                                                labels['Creator_cat'].to_numpy(),
                                                labels['Type_cat'].to_numpy()):
    # setdefault keeps the first occurrence when an identifier appears more than once
    label_lookup.setdefault(_identifier, (_creator_cat, _type_cat))

train_dict = _build_label_dict(training_path, label_lookup)
val_dict = _build_label_dict(validation_path, label_lookup)

In [67]:
def create_model_inception(c_1, c_2):
    """Build a two-headed classifier on top of a frozen TF-Hub Inception V3 module.

    Parameters
    ----------
    c_1 : int
        Number of units (classes) of the ``c1_output`` softmax head.
    c_2 : int
        Number of units (classes) of the ``c2_output`` softmax head.

    Returns
    -------
    tf.keras.Model
        Uncompiled model mapping ``main_input`` to ``[c1_output, c2_output]``.
    """
    inputs = tf.keras.layers.Input(shape=[img_size[0], img_size[1], 3], name='main_input')
    # Only the pretrained backbone is frozen. The hub module is a single Keras
    # layer, so this model has just a handful of layers; the former
    # ``model.layers[:126]`` slice therefore froze every layer, including the
    # new Dense heads, and left the model without any trainable weights.
    # NOTE(review): this is the "classification" variant of the module, so the
    # heads are fed its class scores; the "feature_vector" variant may be the
    # better fit for transfer learning - confirm before switching.
    backbone = hub.KerasLayer(
        "https://tfhub.dev/google/tf2-preview/inception_v3/classification/4",
        trainable=False)
    main_branch = backbone(inputs)
    main_branch = tf.keras.layers.Flatten()(main_branch)
    main_branch = tf.keras.layers.Dense(1024, activation='relu')(main_branch)

    # Both heads share the same 1024-unit representation
    class1_pred = Dense(c_1, activation='softmax', name='c1_output')(main_branch)
    class2_pred = Dense(c_2, activation='softmax', name='c2_output')(main_branch)

    # The Dense layers keep Keras' default trainable=True
    model = Model(inputs = inputs, outputs = [class1_pred, class2_pred])

    return model

In [70]:
def create_model_xception(c_1, c_2):
    """Build a two-headed classifier on an ImageNet-initialised Xception backbone.

    The backbone is created without its classification top, its output is
    global-average-pooled, and two softmax heads (``c1_output`` with ``c_1``
    units, ``c2_output`` with ``c_2`` units) are attached to the pooled features.
    The first 126 layers of the assembled model are frozen; all later layers
    are trainable.

    Returns the uncompiled ``Model`` with outputs ``[c1_output, c2_output]``.
    """
    backbone = tf.keras.applications.Xception(
        input_shape=IMG_SHAPE,
        weights='imagenet',
        include_top=False,
    )

    pooled = GlobalAveragePooling2D()(backbone.output)

    heads = [
        Dense(c_1, activation='softmax', name='c1_output')(pooled),
        Dense(c_2, activation='softmax', name='c2_output')(pooled),
    ]
    model = Model(inputs=backbone.input, outputs=heads)

    # Layers before this position stay frozen, the rest is fine-tuned
    first_trainable = 126
    for position, layer in enumerate(model.layers):
        layer.trainable = position >= first_trainable

    return model

In [65]:
def train_gen(training_dict, batch_size=10):
    """
    Endless batch generator for the two-output model.
    Adapted from: https://github.com/daveboat/multitask-image-classification-keras-example

    ``training_dict`` maps an image path to a pair of labels (first output,
    second output). Its entries are shuffled once up front and then cycled
    through indefinitely, wrapping around at the end of the list.
    Every image is loaded at ``img_size``, converted to an array and divided
    by 255.

    Yields ``([image_batch], [targets_c1, targets_c2])`` where ``image_batch``
    stacks ``batch_size`` images along a new first axis.
    """
    samples = list(training_dict.items())
    n_samples = len(samples)
    random.shuffle(samples)
    cursor = 0

    while True:
        batch_images, labels_c1, labels_c2 = [], [], []
        while len(batch_images) < batch_size:
            img_path, label_pair = samples[cursor]
            pixels = image.img_to_array(image.load_img(img_path, target_size=img_size))
            batch_images.append(pixels / 255.0)
            labels_c1.append(label_pair[0])
            labels_c2.append(label_pair[1])
            # Advance, and start over once the end of the list is reached
            cursor += 1
            if cursor >= n_samples:
                cursor = 0

        yield [np.stack(batch_images, axis=0)], [np.array(labels_c1), np.array(labels_c2)]

In [73]:
batch_size = 128  # replaces the batch_size = 256 set in the path/size configuration cell
epochs = 10

# initialize training and validation generators
# (train_gen shuffles its entries once and then loops forever, so both objects are stateful)
gen = train_gen(train_dict, batch_size)
val_gen = train_gen(val_dict, batch_size)
# Whole batches per pass; the integer division ignores a partial last batch
steps = len(train_dict) // batch_size
val_steps = len(val_dict) // batch_size

# initialize and compile model
model = create_model_inception(n_labels_c1, n_labels_c2)

# Both heads use sparse categorical cross-entropy on the label values yielded by train_gen.
# The loss weights let c1_output dominate the total loss (1.0 versus 0.001 for c2_output).
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss={'c1_output': 'sparse_categorical_crossentropy', 'c2_output': 'sparse_categorical_crossentropy'},
              loss_weights={'c1_output': 1., 'c2_output': .001},
              metrics={'c1_output': 'accuracy', 'c2_output': 'accuracy'})

# fit model
model.fit(gen, steps_per_epoch=steps, epochs=epochs, validation_data=val_gen, validation_steps=val_steps)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b193df1640>

In [72]:
# initialize and compile model
model2 = create_model_xception(n_labels_c1, n_labels_c2)

# Same optimizer, losses, loss weights and metrics as the Inception model's compile call.
model2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss={'c1_output': 'sparse_categorical_crossentropy', 'c2_output': 'sparse_categorical_crossentropy'},
              loss_weights={'c1_output': 1., 'c2_output': .001},
              metrics={'c1_output': 'accuracy', 'c2_output': 'accuracy'})

# fit model
# Reuses gen, val_gen, steps, val_steps and epochs from the Inception training cell; the
# generators are not re-created here, so they continue from wherever they were left.
# NOTE(review): train_gen scales pixels by 1/255; Keras' Xception weights presumably expect
# the scaling done by its own preprocess_input - confirm.
model2.fit(gen, steps_per_epoch=steps, epochs=epochs, validation_data=val_gen, validation_steps=val_steps)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b048b4f070>

In [None]:
# Save model
# model.save("models/_Xception_adam")

# Load model
# model = tf.keras.models.load_model('./models/_ResNet152V2model')
# model.load_weights('../Model_weights/ResNet152V2model/')


# Predict the test set
We also need to generate predictions for the held-out test set.

In [None]:
# def predict_test(model, labels):
#     preds = model.predict(test_generator)
#     preds_cls_idx = preds.argmax(axis=-1)
#     idx_to_cls = {v: k for k, v in train_generator.class_indices.items()}
#     preds_cls = np.vectorize(idx_to_cls.get)(preds_cls_idx)
#     filenames_to_cls = list(zip(test_generator.filenames, preds_cls))
    
#     l = []
#     n = []
#     t = []
#     for p in filenames_to_cls:
#         n.append(p[0].split("\\")[-1][:-4])
#         l.append(p[1])
#         t.append(labels[labels['Identifier'] == p[0].split("\\")[-1][:-4]]['Creator'].values[0])
#     return pd.DataFrame(list(zip(n, l, t)), columns=['img_name', 'predicted label', 'true label'])

# res = predict_test(model, labels)

In [None]:
# res

In [None]:
# from sklearn.metrics import accuracy_score
# accuracy = accuracy_score(res['true label'], res['predicted label'])
# accuracy