# Data preparation

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import pickle
from shutil import copy2
import tensorflow
import IPython

In [2]:
def mkdir_if_not_exist(path):
    """Create the directory `path` unless something already exists there.

    Prints 'Created: <path>' when the directory was created and
    'Exists: <path>' when the path was already present.

    Args:
        path: Directory path to create. Missing parent directories are
            created as well.

    Returns:
        None.
    """
    if os.path.exists(path):
        print('Exists:', path)
        return
    # makedirs (rather than mkdir) also creates missing parents; exist_ok
    # tolerates the directory appearing between the check above and this call.
    os.makedirs(path, exist_ok=True)
    print('Created:', path)

In [3]:
# Load the image metadata and pick the 10 artists with the most training rows
df = pd.read_csv('../all_data_info.csv')

# Keep only the rows flagged as belonging to the training set.
mask = df['in_train']
train_3_df = df[mask]

# Per-artist non-null counts of every column; the ranking below uses 'title'.
# NOTE(review): rows with a missing title are not counted towards an artist - confirm this is intended.
artist_counts = train_3_df.groupby('artist').count()
top10 = artist_counts.sort_values(by='title', ascending=False).head(10)

# Display the selected artist names.
top10.index

Index(['Paul Cezanne', 'Giovanni Battista Piranesi', 'Martiros Saryan',
       'Ilya Repin', 'Camille Pissarro', 'Vincent van Gogh',
       'Theophile Steinlen', 'Pyotr Konchalovsky', 'Pierre-Auguste Renoir',
       'Boris Kustodiev'],
      dtype='object', name='artist')

In [11]:
# Copy pictures of top 10 artists to a separate folder
# NOTE(review): both paths are machine-specific absolute locations; adjust before running elsewhere.
input_dir = '/media/mo/My Passport/painters/train/'
output_dir = '/home/mo/Downloads/painters/top10_/'
mkdir_if_not_exist(output_dir)

# Maximum number of images copied per artist (same for every artist, so set once).
num = 2

for artist in top10.index:
    # File names of all training images attributed to this artist.
    files = list(train_3_df[train_3_df['artist'] == artist]['new_filename'])
    count = 0
    for f in files:
        if count >= num:
            # Quota reached; no need to look at the remaining files.
            break
        src = os.path.join(input_dir, f)
        # Files listed in the CSV but missing on disk are skipped silently.
        if os.path.exists(src):
            copy2(src, output_dir)
            count += 1
    print('Copied %d images of %s' % (count, artist))

Copied 2 images of Paul Cezanne
Copied 2 images of Giovanni Battista Piranesi
Copied 2 images of Martiros Saryan
Copied 2 images of Ilya Repin
Copied 2 images of Camille Pissarro
Copied 2 images of Vincent van Gogh
Copied 2 images of Theophile Steinlen
Copied 2 images of Pyotr Konchalovsky
Copied 2 images of Pierre-Auguste Renoir
Copied 2 images of Boris Kustodiev


In [23]:
# Define the working directory layout and make sure each folder is present
data_dir = '/home/mo/Downloads/painters/'
train_dir = data_dir + 'top10_x448_/'        # destination of the resized images

working_train_dir = data_dir + "train_/"     # parent of the per-artist train folders
working_test_dir = data_dir + "test_/"       # parent of the per-artist test folders

for folder in (train_dir, working_train_dir, working_test_dir):
    mkdir_if_not_exist(folder)

Exists: /home/mo/Downloads/painters/top10_x448_/
Created: /home/mo/Downloads/painters/train_/
Created: /home/mo/Downloads/painters/test_/


In [16]:
# Resize images to train_dir
!mogrify -resize "448^>" -path {train_dir} {output_dir}*.jpg

In [24]:
# Create train and test folders for each painter

# One folder per artist under each working directory; the artist name is
# appended directly to the (slash-terminated) parent path.
artist_dirs = [working_train_dir + name for name in top10.index]
test_artist_dirs = [working_test_dir + name for name in top10.index]

# All train folders are created first, followed by all test folders.
for folder in artist_dirs + test_artist_dirs:
    mkdir_if_not_exist(folder)

Created: /home/mo/Downloads/painters/train_/Paul Cezanne
Created: /home/mo/Downloads/painters/train_/Giovanni Battista Piranesi
Created: /home/mo/Downloads/painters/train_/Martiros Saryan
Created: /home/mo/Downloads/painters/train_/Ilya Repin
Created: /home/mo/Downloads/painters/train_/Camille Pissarro
Created: /home/mo/Downloads/painters/train_/Vincent van Gogh
Created: /home/mo/Downloads/painters/train_/Theophile Steinlen
Created: /home/mo/Downloads/painters/train_/Pyotr Konchalovsky
Created: /home/mo/Downloads/painters/train_/Pierre-Auguste Renoir
Created: /home/mo/Downloads/painters/train_/Boris Kustodiev
Created: /home/mo/Downloads/painters/test_/Paul Cezanne
Created: /home/mo/Downloads/painters/test_/Giovanni Battista Piranesi
Created: /home/mo/Downloads/painters/test_/Martiros Saryan
Created: /home/mo/Downloads/painters/test_/Ilya Repin
Created: /home/mo/Downloads/painters/test_/Camille Pissarro
Created: /home/mo/Downloads/painters/test_/Vincent van Gogh
Created: /home/mo/Downlo

In [25]:
# Copy train and test images
#
# For every top-10 artist, shuffle that artist's resized images and copy up to
# `test_num` of them into the artist's test folder, then up to `train_num` into
# the artist's train folder. The printed counts are the images copied by this run.

train_num = 1 #Number of train images (366)
test_num = 1   #Number of test images (20)
# NOTE(review): the numbers in parentheses look like the full-run values - confirm.
# NOTE(review): the shuffle is unseeded, so the split differs between runs.

for artist, train_dst, test_dst in zip(top10.index, artist_dirs, test_artist_dirs):
    test_count = 0
    train_count = 0
    files = list(train_3_df[train_3_df['artist'] == artist]['new_filename'])
    random.shuffle(files)
    for f in files:
        if test_count >= test_num and train_count >= train_num:
            # Both quotas are filled; skip the rest of the list.
            break
        src = os.path.join(train_dir, f)
        if not os.path.exists(src):
            # Image was never resized into train_dir (e.g. not copied earlier).
            continue
        # The artist folders have no trailing slash, so the destination path must
        # be joined, not concatenated, for the "already copied" check to work.
        already_in_test = os.path.exists(os.path.join(test_dst, f))
        already_in_train = os.path.exists(os.path.join(train_dst, f))
        if already_in_test or already_in_train:
            # Placed by a previous run: do not copy again, and never put the same
            # image into the other split.
            continue
        if test_count < test_num:
            copy2(src, test_dst)
            test_count += 1
        elif train_count < train_num:
            copy2(src, train_dst)
            train_count += 1
    print('Copied to %s : %s images' % (str(test_dst), str(test_count)))
    print('Copied to %s : %s images' % (str(train_dst), str(train_count)))

Copied to /home/mo/Downloads/painters/test_/Paul Cezanne : 1 images
Copied to /home/mo/Downloads/painters/train_/Paul Cezanne : 1 images
Copied to /home/mo/Downloads/painters/test_/Giovanni Battista Piranesi : 1 images
Copied to /home/mo/Downloads/painters/train_/Giovanni Battista Piranesi : 1 images
Copied to /home/mo/Downloads/painters/test_/Martiros Saryan : 1 images
Copied to /home/mo/Downloads/painters/train_/Martiros Saryan : 1 images
Copied to /home/mo/Downloads/painters/test_/Ilya Repin : 1 images
Copied to /home/mo/Downloads/painters/train_/Ilya Repin : 1 images
Copied to /home/mo/Downloads/painters/test_/Camille Pissarro : 1 images
Copied to /home/mo/Downloads/painters/train_/Camille Pissarro : 1 images
Copied to /home/mo/Downloads/painters/test_/Vincent van Gogh : 1 images
Copied to /home/mo/Downloads/painters/train_/Vincent van Gogh : 1 images
Copied to /home/mo/Downloads/painters/test_/Theophile Steinlen : 1 images
Copied to /home/mo/Downloads/painters/train_/Theophile Ste