# **Classification of Paintings**
*   Authors: Nils Berns, John Kimani
*   Version date: Feb 2, 2021
*   Project name: Classification of Paintings
*   Done as the final project of the course "Deep Learning" at opencampus.sh

In [None]:
import os
import requests
import time
import csv
import numpy as np
import tensorflow as tf
import random
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from random import uniform, randint

from google.colab import drive
drive.mount('/content/gdrive')

# Set up General Parameters

In [None]:
# Global settings read by the cells below.
image_size = 128 # edge length in pixels of the (square) images fed to the network
min_paintings = 200 # threshold on the paintings per artist; the filter below keeps artists with MORE than this number
re_size = 300 # edge length in pixels of the square images written to the fake_detector data set folders, should not exceed 512
download_BestArtworks = False # set to True to download the "best artworks of all time" data set from Kaggle, makes sense on the first run
download_MonetStylised = False # set to True to download the Monet-stylised data set from Kaggle, makes sense on the first run
use_fakes = False # False: classify only true paintings; True: add one extra class holding the artificial (fake) paintings
use_default_model = False # set to True if you want to use our best model architecture (the fixed branch of create_model)

# best results were obtained with
# data augmentation
# image_size = 480
# min_paintings = 300
# use_default_model = True

# Download the Datasets
From Kaggle.

In [None]:
# Download the datasets from Kaggle
# follow these instructions to set up the Kaggle API token: https://medium.com/analytics-vidhya/how-to-fetch-kaggle-datasets-into-google-colab-ea682569851a

# create the working directory on Google Drive; any OSError (e.g. the folder already exists) is ignored
try:
  os.mkdir('/content/gdrive/My Drive/Kaggle')
except OSError:
  pass
# changing the working directory (IPython magic), so that the shell commands below run inside the Kaggle folder
%cd /content/gdrive/My Drive/Kaggle/

if download_BestArtworks == True:
  # /content/gdrive/My Drive/Kaggle is the path where kaggle.json is present in the Google Drive
  os.environ['KAGGLE_CONFIG_DIR'] = '/content/gdrive/My Drive/Kaggle'
  # download the data from Kaggle into the current working directory
  !kaggle datasets download -d ikarus777/best-artworks-of-all-time
  # unzip every zip file in the working directory, then delete the zip files (only if unzipping succeeded)
  !unzip \*.zip  && rm *.zip

if download_MonetStylised == True:
  # /content/gdrive/My Drive/Kaggle is the path where kaggle.json is present in the Google Drive
  os.environ['KAGGLE_CONFIG_DIR'] = '/content/gdrive/My Drive/Kaggle'
  # download the data from Kaggle into the current working directory
  !kaggle datasets download -d shcsteven/paired-landscape-and-monetstylised-image
  # unzip every zip file in the working directory, then delete the zip files (only if unzipping succeeded)
  !unzip \*.zip  && rm *.zip
  # collect the stylised images of both sub-datasets (A and B) in one flat folder
  destination = '/content/gdrive/My Drive/Kaggle/Monet_stylised/'
  try:
    os.mkdir(destination)
  except OSError:
    pass
  for source in ['/content/gdrive/My Drive/Kaggle/monet_style_dataset/monet_style_dataset_A/stylized_A/', '/content/gdrive/My Drive/Kaggle/monet_style_dataset/monet_style_dataset_B/stylized_B/']:
    for fname in os.listdir(source):
      # NOTE(review): files with the same name in A and B would overwrite each other - confirm the names are unique
      shutil.copyfile(source + fname, destination + fname)
  # remove the extracted data set folder; only the copies in Monet_stylised/ are kept
  shutil.rmtree('/content/gdrive/My Drive/Kaggle/monet_style_dataset/')

From https://thisartworkdoesnotexist.com/.

In [None]:
# Download artificial paintings from https://thisartworkdoesnotexist.com/
# The download is resumable: files are numbered consecutively and a rerun
# continues at the number of files already present in the destination folder.

destination = '/content/gdrive/My Drive/Kaggle/thisartworkdoesnotexist/'
# create the folder (and missing parents); an existing folder is fine,
# any other failure is reported here instead of at the listdir below
os.makedirs(destination, exist_ok=True)
nof = len(os.listdir(destination)) # number of images in the directory
desired_nof = 1000 # desired number of images from this source
url = 'https://thisartworkdoesnotexist.com/'
for ii in range(nof, desired_nof): # empty range if enough images are present
    r = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of storing the error page as a .jpg file.
    # Aborting (rather than skipping) keeps the file numbering gap-free, which
    # the resume logic above relies on.
    r.raise_for_status()
    fname = f'thisdoesnotexist_fake_painting_{ii}.jpg'
    with open(destination + fname, 'wb') as f:
        f.write(r.content)
    time.sleep(1) # time-out of ~1s required for requesting a novel image

From https://boredhumans.b-cdn.net/art/.

In [None]:
# Download artificial paintings from https://boredhumans.b-cdn.net/art/
# Images are addressed by number; numbers are drawn at random (seeded) and
# numbers that are already present in the destination folder are skipped.

destination = '/content/gdrive/My Drive/Kaggle/boredhumans_stylegan2/'
random.seed(100)
# create the folder (and missing parents); an existing folder is fine
os.makedirs(destination, exist_ok=True)

prefix = 'bored_fake_painting_'
max_number = 5001 # upper bound (inclusive) of the image numbers that are requested
# image numbers that were downloaded in earlier runs; files that do not follow
# the naming scheme '<prefix><number>.jpg' are ignored instead of crashing int()
filenumbers = []
for existing in os.listdir(destination):
    stem = existing[len(prefix):-4]
    if existing.startswith(prefix) and existing.endswith('.jpg') and stem.isdigit():
        filenumbers.append(int(stem))
nof = len(filenumbers) # number of images in the directory
desired_nof = 1000 # desired number of images from this source
if nof < desired_nof:
    randomlist = [] # numbers downloaded in this run
    tried = set(filenumbers) # every number that must not be drawn (again)
    missing = desired_nof - nof
    # the second condition prevents an endless loop once all numbers are used up
    while missing > 0 and len(tried) < max_number:
        n = random.randint(1, max_number)
        if n in tried:
            continue
        tried.add(n)
        url = f'https://boredhumans.b-cdn.net/art/{n}.jpg'
        r = requests.get(url, timeout=30)
        if not r.ok:
            # do not store an error page as an image; draw another number instead
            print(f'Skipping image {n}: HTTP status {r.status_code}')
            continue
        fname = f'bored_fake_painting_{n}.jpg'
        with open(destination + fname, 'wb') as f:
            f.write(r.content)
        randomlist.append(n)
        missing -= 1

# Pre-processing the Data
Read data about the artists from csv-file.

In [None]:
def read_artists():
  """Read artist names, painting counts and genres from 'artists.csv'.

  The file is read from the current working directory. The first row is
  treated as a header and skipped. The name is taken from column 1, the genre
  from column 3 and the number of paintings from the last column.

  Two names are replaced by the spelling used for the class folders:
  'Albrecht Dürer' -> 'Albrecht Durer' and
  'Vasiliy Kandinskiy' -> 'Wassily Kandinsky'.

  Returns:
    Tuple (artists_names, num_of_paintings, genres) of three lists of equal
    length: names (str), painting counts (int) and genres (str).

  Raises:
    OSError: if 'artists.csv' cannot be opened.
    ValueError: if the last column of a row is not an integer.
  """
  import unicodedata

  # csv spelling (NFC-normalised) -> spelling used for the class folders
  renamed = {
      'Albrecht D\u00fcrer': 'Albrecht Durer',
      'Vasiliy Kandinskiy': 'Wassily Kandinsky',
  }
  artists_names = []
  num_of_paintings = []
  genres = []
  # explicit encoding: the match on 'Dürer' must not depend on the platform default;
  # newline='' is what the csv module expects for files passed to csv.reader
  with open('artists.csv', encoding='utf-8', newline='') as artists_file:
    csv_reader = csv.reader(artists_file, delimiter=',') # delimiter is comma
    next(csv_reader, None) # skip header; an empty file yields empty lists
    for row in csv_reader:
      if not row: # blank line
        continue
      # normalise so that a decomposed 'u' + combining diaeresis matches as well
      lookup_name = unicodedata.normalize('NFC', row[1])
      artists_names.append(renamed.get(lookup_name, row[1]))
      genres.append(row[3])
      num_of_paintings.append(int(row[-1]))
  return artists_names, num_of_paintings, genres

# unfiltered data of all artists listed in artists.csv (three parallel lists)
artists_names_csv, num_of_paintings_csv, genre_csv = read_artists()

Choose the data directory that matches the classification problem (with or without the class of fake paintings).

In [None]:
# Root folder of the data set used for training: the plain artist data set,
# or the fake_detector variant that additionally holds the fake paintings.
if not (use_fakes == True):
  data_directory = '/content/gdrive/My Drive/Kaggle/painting_classification/'
  print('Only true paintings data')
else:
  data_directory = '/content/gdrive/My Drive/Kaggle/painting_classification/fake_detector/'
  print('Include fake paintings!')

Truncate the list of artists with respect to the desired minimum number of paintings per artist.

In [None]:
# Keep only artists with more than min_paintings paintings to improve validation accuracy.
# NOTE(review): the comparison is strict, so an artist with exactly min_paintings
# paintings is excluded - confirm that this is intended.
artists_names = []
num_of_paintings = []
genre = []
for name, count, style in zip(artists_names_csv, num_of_paintings_csv, genre_csv):
  if count > min_paintings:
    artists_names.append(name)
    num_of_paintings.append(count)
    genre.append(style)

# one output class per remaining artist, plus one class for the fake paintings if they are used
if use_fakes == True:
  num_of_classes = len(artists_names) + 1
else:
  num_of_classes = len(artists_names)

# report against the actual number of artists in the csv file instead of a hard-coded total
print(f'{len(artists_names)} out of {len(artists_names_csv)} artists have more than {min_paintings} painting(s).')

for name, style in zip(artists_names, genre):
  print(f'{name}: {style}')
# authors' note on their run: 4 out of 11 are from the Renaissance

Create required directories for sorting the data.

In [None]:
def create_directory(DIRs):
  """Create one or several directories; existing directories are left untouched.

  Missing parent directories are created as well. A directory that cannot be
  created is reported with a printed message and skipped, so the remaining
  directories are still processed.

  Args:
    DIRs: a single path (str) or a list of paths. For any other type a
      message is printed and nothing is created.

  Returns:
    None.
  """
  if isinstance(DIRs, str):
    DIRs = [DIRs]
  elif not isinstance(DIRs, list):
    print('No directory created. Input type neither list nor string.')
    return
  for DIR in DIRs:
    try:
      # exist_ok: an already existing directory is not an error
      os.makedirs(DIR, exist_ok=True)
    except OSError as err:
      # best effort: report the problem instead of hiding it, then continue
      print(f'Could not create directory {DIR}: {err}')
  return

# Folders that must exist before the data is sorted: the image source folders ...
make_directories = [
    '/content/gdrive/My Drive/Kaggle/images/',
    '/content/gdrive/My Drive/Kaggle/images/images/',
]

# ... and, for both data set roots, one folder per split and per class.
directories = [
    '/content/gdrive/My Drive/Kaggle/painting_classification/',
    '/content/gdrive/My Drive/Kaggle/painting_classification/fake_detector/',
]

for directory in directories:
  make_directories.append(directory)
  # the fake_detector data set has the additional class 'Fake'
  names = artists_names.copy()
  if directory.endswith('fake_detector/'):
    names.append('Fake')
  for folder in ('training/', 'development/', 'testing/'):
    split_root = directory + folder
    make_directories.append(split_root)
    for artist in names:
      make_directories.append(split_root + artist + '/')

create_directory(make_directories)

Define the size of the training set etc.

In [None]:
# seed Python's global random number generator
random.seed(1)
# fraction of the images of each class that goes into the training set;
# the remainder is shared equally between development and test set
train_size = 0.8
dev_size = test_size = (1 - train_size)*0.5

Move the images of the desired artists to the training, development, and testing folders. If the class of fake paintings is used, resized copies of the artists' images are additionally written to the corresponding folders within the fake_detector directory, together with resized copies of the fake paintings.

In [None]:
# folder with one sub-folder of images per artist, and root of the sorted data set
source = '/content/gdrive/My Drive/Kaggle/images/images/'
destination = '/content/gdrive/My Drive/Kaggle/painting_classification/'

# Find all folders whose name starts with 'Albrecht' and delete all but the first one.
# NOTE(review): presumably the data set contains the Duerer folder more than once - confirm
artists_folder = os.listdir(source)
index = []
for ii in range(len(artists_folder)):
  if artists_folder[ii][:8] == 'Albrecht':
    index.append(ii)
if len(index) > 1:
  for ii in index[1:]:
    shutil.rmtree(source + artists_folder[ii] + '/')

# Split the images of every selected artist into training, development and test set.
artists_folder = os.listdir(source)
for ii in range(len(artists_folder)):
  # translate the folder name into the artist name used in artists_names
  if artists_folder[ii][:8] == 'Albrecht':
    artist = 'Albrecht Durer'
  elif artists_folder[ii] == 'Vasiliy_Kandinskiy':
    artist = 'Wassily Kandinsky'
  else:
    artist = artists_folder[ii]
    artist = artist.replace('_', ' ')
  if artist in artists_names:
    print(f'{artist}: copy data to generator folder')
    source_artist = source + artists_folder[ii] + '/'
    # the seed is the position of the folder in the directory listing
    random.seed(ii)
    images = os.listdir(source_artist)
    # sizes of test and development set are rounded down, the training set gets the rest
    test_length = int(test_size*len(images))
    dev_length = int(dev_size*len(images))
    train_length = len(images)- test_length - dev_length
    # random permutation of all images, cut into three consecutive parts
    shuffled_set = random.sample(images, len(images))
    train_set = shuffled_set[:train_length]
    dev_set = shuffled_set[train_length:train_length+dev_length]
    test_set = shuffled_set[train_length+dev_length:]
    sets = {
        'training/': train_set,
        'development/': dev_set,
        'testing/': test_set
    }
    for folder in ['training/', 'development/', 'testing/']:
      for fname in sets[folder]:
        destination_set = destination + folder + artist + '/'
        shutil.copyfile(source_artist + fname, destination_set + fname)
    # the source folder is deleted after copying, i.e. the images are effectively moved
    shutil.rmtree(source_artist)

# Artists that have folders in the sorted data set but are not in artists_names
# are moved back: their images from all three splits are copied into one
# folder below the source directory and the split folders are deleted.
# NOTE(review): presumably left-overs of an earlier run with a different min_paintings - confirm
for artist in os.listdir(destination + 'training/'):
  if artist not in artists_names:
    print(f'{artist}: copy data to source folder')
    create_directory(source + artist + '/')
    for folder in ['training/', 'development/', 'testing/']:
      for fname in os.listdir(destination + folder + artist + '/'):
        shutil.copyfile(destination + folder + artist + '/' + fname, source + artist + '/' + fname)
      shutil.rmtree(destination + folder + artist + '/')

# Build the fake_detector data set: resized copies of the artists' images plus the class 'Fake'.
if use_fakes == True:
  source = '/content/gdrive/My Drive/Kaggle/painting_classification/'
  destination = '/content/gdrive/My Drive/Kaggle/painting_classification/fake_detector/'
  for folder in ['training/', 'development/', 'testing/']:
    for artist in artists_names:
      # only fill folders that are still empty, so a rerun does not copy again
      if len(os.listdir(destination + folder + artist + '/')) == 0:
        if folder == 'training/':
          print(f'{artist}: copy data to fake_detector generator folder')
        for fname in os.listdir(source + folder + artist + '/'):
          # store a square copy with an edge length of re_size pixels
          img = Image.open(source + folder + artist + '/' + fname)
          img = img.resize((re_size, re_size))
          img.save(destination + folder + artist + '/' + fname)
  # delete the folders of artists that are no longer selected (the class 'Fake' is kept)
  for artist in os.listdir(destination + 'training/'):
    if (artist not in artists_names) and (artist != 'Fake'):
      for folder in ['training/', 'development/', 'testing/']:
        shutil.rmtree(destination + folder + artist + '/')
  # fill the class 'Fake' only if its training folder is still empty
  if len(os.listdir(destination + 'training/Fake/')) == 0:
    random.seed(75)
    sources_fake = ['/content/gdrive/My Drive/Kaggle/boredhumans_stylegan2/',
                    '/content/gdrive/My Drive/Kaggle/Monet_stylised/',
                    '/content/gdrive/My Drive/Kaggle/thisartworkdoesnotexist/'
                   ]
    for source_fake in sources_fake:
      print(f'{source_fake}: copy data to fake_detector generator fake folder')
      images = os.listdir(source_fake)
      # each source is split separately with the same proportions as the artists' images
      test_length = int(test_size*len(images))
      dev_length = int(dev_size*len(images))
      train_length = len(images)- test_length - dev_length
      shuffled_set = random.sample(images, len(images))
      train_set = shuffled_set[:train_length]
      dev_set = shuffled_set[train_length:train_length+dev_length]
      test_set = shuffled_set[train_length+dev_length:]
      sets = {
          'training/': train_set,
          'development/': dev_set,
          'testing/': test_set 
      }
      for folder in ['training/', 'development/', 'testing/']:
        for fname in sets[folder]:
          # the fake source folders are only read; resized copies are written to the data set
          img = Image.open(source_fake + fname)
          img = img.resize((re_size, re_size))
          img.save(destination + folder + 'Fake/' + fname)

Count the number of images used for the classification problem.

In [None]:
# Print the number of images per class (training set only), report empty class
# folders of every split and print the total number of images per split.
for folder in ['training', 'development', 'testing']:
  counter = 0
  folder_path = data_directory + folder + '/'
  for artist in os.listdir(folder_path):
    # list each class folder only once; listings on the mounted Drive are slow
    n_images = len(os.listdir(folder_path + artist + '/'))
    counter += n_images
    if folder == 'training':
      print(artist, n_images)
    if n_images == 0:
      print(f'{folder} - {artist} is empty')
  print(f'Paintings in {folder}-set: {counter}')

Function used to generate the training and validation sets from directories. Data augmentation is included.

In [None]:
def create_generators(batch_size=128, image_size=128, DIR=data_directory):
  """Create the image generators for the training and the development set.

  Both generators read the images from class sub-folders, scale the pixel
  values by 1/255, resize the images to image_size x image_size and yield
  one-hot ('categorical') labels. Random data augmentation is applied to the
  training images only; the development images are only rescaled, so the
  validation metrics are computed on unmodified paintings and stay
  comparable between epochs and runs.

  Args:
    batch_size: number of images per batch, used for both generators.
    image_size: edge length in pixels of the square images that are yielded.
    DIR: data set root containing the folders 'training/' and 'development/'.
      The default is the value of data_directory at the time the function is
      defined.

  Returns:
    Tuple (train_generator, validation_generator).
  """
  TRAINING_DIR = DIR + 'training/'
  train_datagen = ImageDataGenerator(rescale=1./255,
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
        )
  train_generator = train_datagen.flow_from_directory(TRAINING_DIR,
                                                      batch_size=batch_size,
                                                      class_mode='categorical',
                                                      shuffle=True,
                                                      target_size=(image_size, image_size))

  VALIDATION_DIR = DIR + 'development/'
  # no augmentation here: validation data must only be preprocessed like at inference time
  validation_datagen = ImageDataGenerator(rescale=1./255)
  validation_generator = validation_datagen.flow_from_directory(VALIDATION_DIR,
                                                                batch_size=batch_size,
                                                                class_mode='categorical',
                                                                shuffle=True,
                                                                target_size=(image_size, image_size))
  return train_generator, validation_generator

# Model Architecture
Function that creates a model based on the input arguments or default values. A default model architecture is included which gave us the best results.

In [None]:
def create_model(default_model=use_default_model, nDense=2, minNeurons=256, nConLay=4, minCons=32, con_size=3, pool_size=2, batch_norm=False, drop_rate=0.0, nOutputs=num_of_classes, image_size=image_size):
  """Build and compile a sequential CNN for image classification.

  Args:
    default_model: if True, the fixed architecture of the authors is built and
      the architecture arguments below are ignored (except nOutputs and
      image_size).
    nDense: number of hidden dense layers after the convolutional part.
    minNeurons: number of neurons of the last hidden dense layer; every
      earlier dense layer has twice the neurons of the following one.
    nConLay: number of convolution + max-pooling stages.
    minCons: number of filters of the first two convolutions; the number
      doubles after every second convolution.
    con_size: edge length of the square convolution kernels.
    pool_size: pool size and stride of the max-pooling layers.
    batch_norm: if True, batch normalisation follows every convolution.
    drop_rate: dropout rate applied after every second convolution stage;
      with 0 no dropout layers are added.
    nOutputs: number of classes (neurons of the softmax output layer).
    image_size: edge length in pixels of the square RGB input images.

  Returns:
    The model, compiled with the Adam optimizer, categorical cross-entropy
    loss and the accuracy metric.
  """
  # -> CONV/FC -> BatchNorm -> ReLu(or other activation) -> Dropout -> CONV/FC -> (Ioffe and Szegedy 2015)
  # However Szegedy is known to now prefer applying the batch normalisation after the activation.
  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Input(shape=(image_size, image_size, 3)))
  if not default_model:
    for ii in range(nConLay):
      # the number of filters doubles after every second convolution: factor 1, 1, 2, 2, 4, 4, ...
      n_filters = minCons*2**(ii // 2)
      model.add(tf.keras.layers.Conv2D(n_filters, (con_size, con_size), activation='relu', padding='same'))
      if batch_norm:
        model.add(tf.keras.layers.BatchNormalization())
      model.add(tf.keras.layers.MaxPooling2D(pool_size, pool_size))
      # Dropout after every second stage. The parentheses are essential:
      # 'ii+1 % 2 == 0' is evaluated as 'ii + (1 % 2) == 0' and is never true.
      if (ii + 1) % 2 == 0 and drop_rate > 0:
        model.add(tf.keras.layers.Dropout(drop_rate))
    model.add(tf.keras.layers.Flatten())
    for ii in range(nDense):
      # kernel_/activity_regularizer arguments (e.g. tf.keras.regularizers.L1(0.01)) can be
      # added here; regularisation works the same with Conv2D layers
      model.add(tf.keras.layers.Dense(2**(nDense-1-ii)*minNeurons, activation='relu'))
  else: # use our default architecture
    model.add(tf.keras.layers.Conv2D(16, (3,3), padding = "same", strides = 2, activation='relu'))
    model.add(tf.keras.layers.Conv2D(16, (3,3), padding = "same", strides = 2, activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(3,2))
    model.add(tf.keras.layers.Conv2D(32, (3,3), padding = "same", activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(3,2))
    model.add(tf.keras.layers.Conv2D(48, (3,3), padding = "same", activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(3,2))
    model.add(tf.keras.layers.Conv2D(64, (3,3), padding = "same", activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(3,2))
    model.add(tf.keras.layers.Conv2D(80, (3,3), padding = "same", activation='relu'))
    # authors' note: a further MaxPooling2D(3,2) here leads to an error due to size,
    # even with an image size of 512 by 512
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(128, activation='relu'))

  model.add(tf.keras.layers.Dense(nOutputs, activation='softmax'))

  model.compile(optimizer=tf.optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

  return model

# Random Grid Search for Hyperparameters
Define the parameters of the random grid search

In [None]:
# Draw len_arrays random values for every hyperparameter of the search.
# The order of the draws is fixed, so the seed determines all values.
random.seed(85)
len_arrays = 10 # number of sampled configurations, not very important
batch_size_r = [randint(64, 256) for _ in range(len_arrays)]
epochs = [randint(20, 100) for _ in range(len_arrays)] # fixed values are possible as well, e.g. [10, 50, 100]
minNeurons = [randint(128, 512) for _ in range(len_arrays)]
nDense = [randint(1, 4) for _ in range(len_arrays)]
nConLay = [randint(2, 6) for _ in range(len_arrays)]
minCons = [randint(16, 64) for _ in range(len_arrays)]
con_size = [randint(3, 5) for _ in range(len_arrays)]
pool_size = [randint(2, 4) for _ in range(len_arrays)]
# batch_norm is not part of the search (False)
drop_rate = [uniform(0., 0.3) for _ in range(len_arrays)]
# this is important
num_of_parameters_that_we_want_to_use = 9
param_random_grid = {
    'batch_size': batch_size_r,
    'epochs': epochs,
    'minNeurons': minNeurons,
    'nDense': nDense,
    'nConLay': nConLay,
    'minCons': minCons,
    'con_size': con_size,
    'pool_size': pool_size,
    'drop_rate': drop_rate,
}
# what does the dictionary look like?
print(param_random_grid)

Run the random search for hyperparameters. Unfortunately, this approach runs the search iterations one after another rather than in parallel.

In [None]:
# Train one model per sampled configuration and plot its learning curves.
# Pooling size and batch normalisation are fixed (2, False) and not taken from the sampled values.
history = []
for ii in range(len_arrays):
  train_generator, validation_generator = create_generators(
      batch_size=batch_size_r[ii],
      image_size=image_size)
  random_model = create_model(
      default_model=use_default_model,
      nDense=nDense[ii],
      minNeurons=minNeurons[ii],
      nConLay=nConLay[ii],
      minCons=minCons[ii],
      con_size=con_size[ii],
      pool_size=2,
      batch_norm=False,
      drop_rate=drop_rate[ii],
      nOutputs=num_of_classes,
      image_size=image_size)
  random_model.summary()

  history.append(
      random_model.fit(train_generator,
                       epochs=epochs[ii],
                       verbose=1,
                       validation_data=validation_generator))

  # metrics per epoch of the run that has just finished
  metrics = history[-1].history
  acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
  loss, val_loss = metrics['loss'], metrics['val_loss']

  epochs_plot = range(len(acc))

  # first figure: accuracy
  plt.plot(epochs_plot, acc, 'r', label='Training accuracy')
  plt.plot(epochs_plot, val_acc, 'b', label='Validation accuracy')
  plt.title(f'Training and validation accuracy of model {ii}')
  plt.legend()

  # second figure: loss
  plt.figure()
  plt.plot(epochs_plot, loss, 'r', label='Training Loss')
  plt.plot(epochs_plot, val_loss, 'b', label='Validation Loss')
  plt.title(f'Training and validation loss of model {ii}')
  plt.legend()

  plt.show()

# Training the Model the Conservative Way

In [None]:
# Train a single model with fixed settings and plot its learning curves.
train_generator, validation_generator = create_generators(batch_size=128, image_size=image_size)
model = create_model() # the architecture follows from the default arguments of create_model
model.summary()

history = model.fit(train_generator, epochs=80, verbose=1, validation_data=validation_generator)

# metrics per epoch
metrics = history.history
acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs_plot = range(len(acc))

# first figure: accuracy
plt.plot(epochs_plot, acc, 'r', label='Training accuracy')
plt.plot(epochs_plot, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()

# second figure: loss
plt.figure()
plt.plot(epochs_plot, loss, 'r', label='Training Loss')
plt.plot(epochs_plot, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()