In [29]:
import re
import os
import keras
import numpy as np
import pandas as pd

from PIL import Image

from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from itertools import combinations

%matplotlib inline

from sklearn.feature_extraction import DictVectorizer

In [30]:
# Seed NumPy's global random generator so that random draws made through
# it later in the notebook are repeatable from a fresh kernel.
np.random.seed(100500)

In [31]:
# directory configured for the training data
TRAIN_DATA_DIR = '/datasets/kaggle/painters/train/'

# directories with the encoded per-image files, keyed by the kind of data
ENCODED_DATA_DIR = dict(
    bottlenecks='out/bottlenecks',
    dimensions='out/dimensions',
)

# requested number of samples
SAMPLES_N = 78668

In [32]:
# load the encoded bottleneck vector and dimensions stored for one image
def get_sample(fid):
    """Read the encoded data stored on disk for a single image id.

    Parameters
    ----------
    fid : int or str
        Image id. ``str(fid)`` is used as the file name inside each of the
        directories listed in ``ENCODED_DATA_DIR``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(bottleneck, dims)``: the raw float32 contents of the file in the
        ``bottlenecks`` directory and of the file in the ``dimensions``
        directory, in that order.
    """
    name = str(fid)
    # os.path.join instead of manual '/' concatenation
    bottleneck_path = os.path.join(ENCODED_DATA_DIR['bottlenecks'], name)
    dims_path = os.path.join(ENCODED_DATA_DIR['dimensions'], name)
    bottleneck = np.fromfile(bottleneck_path, dtype=np.float32)
    dims = np.fromfile(dims_path, dtype=np.float32)
    return bottleneck, dims

In [33]:
# get training info
info_df = pd.read_csv('train_info.csv')

# fid is the run of digits directly in front of the '.jpg' extension.
# The dot is escaped so that only a literal '.' is accepted there; with a
# bare '.' any character would match and could swallow the last digit.
fid_pattern = re.compile(r'(\d+)\.jpg$')
info_df['fid'] = [int(fid_pattern.findall(name)[0]) for name in info_df['filename']]

# index the frame by fid while keeping fid available as a regular column
info_df = info_df.set_index('fid', drop=False)

In [34]:
# Drop the entries that have no style: missing values (in every column)
# are first replaced by the sentinel -1, then rows whose style equals the
# sentinel are removed.
info_df = info_df.fillna(-1)
has_style = info_df['style'] != -1
info_df = info_df[has_style]

# normalise the style names to lower case
info_df['style'] = info_df['style'].str.lower()

In [35]:
# Sorted array of the distinct style names.
# The builtin `str` is used instead of the `np.str` alias, which is
# deprecated and no longer available in recent NumPy releases.
styles = np.unique(info_df['style'].values).astype(str)
# parenthesised single-argument form prints the same text on Python 2 and 3
print("# styles: %d" % len(styles))

# styles: 135


In [36]:
# fids in the training set (the index of info_df)
avail_fids = np.array(info_df.index.values, dtype=np.int32)
# parenthesised single-argument form prints the same text on Python 2 and 3
print("# fids in training set: %d" % len(avail_fids))
# never request more samples than there are rows available
SAMPLES_N = min(SAMPLES_N, len(avail_fids))

# fids in training set: 78668


In [37]:
# 1h-vectorize styles
# Only the 'style' column is handed to the vectorizer. Selecting it
# explicitly (instead of deleting a fixed list of other columns) keeps the
# cell re-runnable: an already existing 'style_1h' column, or any other
# extra column, no longer leaks into the encoding.
vec = DictVectorizer()
g = info_df[['style']].copy()
styles_1h = vec.fit_transform(g.to_dict('records')).toarray()
styles_1h = np.array(styles_1h, dtype=np.float32)
# one float32 row vector per entry, stored as an object column
# NOTE(review): the column order presumably matches the sorted `styles`
# array -- confirm against vec.feature_names_ before decoding predictions.
info_df['style_1h'] = list(styles_1h)

In [39]:
# inspect the per-row one-hot style vectors stored in the 'style_1h' column
info_df['style_1h'].values

array([ array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.], dtype=float32),
       array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,


In [40]:
# Random draw of SAMPLES_N rows from info_df; every drawn row becomes one
# dict keyed by column name.
samples_d = (
    info_df
    .sample(n=SAMPLES_N)
    .to_dict('records')
)

In [41]:
# Collect one bottleneck vector (X) and one one-hot style vector (y) per
# sampled row, in the order of samples_d.
X = []
y = []

for x in tqdm(samples_d):
    fid = x['fid']
    style_1h = x['style_1h']
    # Only the bottleneck vector is used as model input. The dimensions
    # returned by get_sample() were previously turned into an aspect ratio
    # that nothing consumed, so that dead computation has been removed.
    features = get_sample(fid)[0]
    y.append(style_1h)
    X.append(features)

100%|██████████| 78668/78668 [00:05<00:00, 13968.01it/s]


In [42]:
# pack the collected per-sample lists into float32 arrays
X, y = (np.asarray(rows, dtype=np.float32) for rows in (X, y))

In [43]:
# persist the feature matrix and the targets as .npy files in the working directory
for npy_name, matrix in (('X.npy', X), ('y.npy', y)):
    np.save(npy_name, matrix)

In [44]:
# sanity check: (number of samples, length of one bottleneck vector)
X.shape

(78668, 2048)

In [45]:
# feature vector of the first sample
X[0]

array([ 0.3012237 ,  0.46022657,  0.18604535, ...,  0.88865691,
        0.03298956,  0.3192586 ], dtype=float32)

In [46]:
# sanity check: (number of samples, length of one one-hot style vector)
y.shape

(78668, 135)

In [47]:
# one-hot style target of the second sample
y[1]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.], dtype=float32)

In [48]:
# store the array of style names next to X.npy / y.npy
np.save(file='styles.npy', arr=styles)

In [27]:
# read the saved style names back from disk to check the file
np.load('styles.npy')

array(['abstract art', 'abstract expressionism', 'academicism',
       'action painting', 'american realism', 'analytical cubism',
       'analytical\xc2\xa0realism', 'art brut', 'art deco', 'art informel',
       'art nouveau (modern)', 'automatic painting', 'baroque',
       'biedermeier', 'byzantine', 'cartographic art', 'classicism',
       'cloisonnism', 'color field painting', 'conceptual art',
       'concretism', 'constructivism', 'contemporary realism',
       'costumbrismo', 'cubism', 'cubo-expressionism', 'cubo-futurism',
       'dada', 'divisionism', 'early renaissance',
       'environmental (land) art', 'existential art', 'expressionism',
       'fantastic realism', 'fauvism', 'feminist art',
       'figurative expressionism', 'futurism', 'gongbi', 'gothic',
       'hard edge painting', 'high renaissance', 'hyper-realism',
       'ilkhanid', 'impressionism', 'indian space painting',
       'ink and wash painting', 'international gothic', 'intimism',
       'japonism', 'jo