In [1]:
import numpy as np
import pandas as pd

# Read database from disk

In [2]:
# Path to the CSV dump, relative to the notebook.
db_fname = "../data/db.csv"
# Row limit for quick experiments.
nrows = None  # None to get all of them
# Entries equal to "?" are read as missing values (NaN).
db = pd.read_csv(
    db_fname,
    nrows=nrows,
    na_values="?",
)

In [3]:
# Number of non-missing entries in each column.
db.count()

_id                152014
artistname         152014
genre              149142
image              152014
image_size_data    152014
style              148475
title              152011
year               118672
dtype: int64

In [4]:
# Peek at the first rows of the table.
db.head()

Unnamed: 0,_id,artistname,genre,image,image_size_data,style,title,year
0,57727b22edc2cb3880e0d820,giovanni battista piranesi,design,https://uploads1.wikiart.org/images/giovanni-b...,"[{'sizekb': 13, 'width': 210, 'height': 266, '...",neoclassicism,"plan, elevation and details of doric temples i...",
1,57727b22edc2cb3880e0d830,giovanni battista piranesi,design,https://uploads0.wikiart.org/images/giovanni-b...,"[{'sizekb': 8, 'width': 210, 'height': 149, 'u...",neoclassicism,plans of elevations and sections of thermopolium,
2,57727b22edc2cb3880e0d840,giovanni battista piranesi,design,https://uploads3.wikiart.org/images/giovanni-b...,"[{'sizekb': 7, 'width': 210, 'height': 148, 'u...",neoclassicism,plants related to the houses opposite to that ...,
3,57727b22edc2cb3880e0d850,giovanni battista piranesi,sketch and study,https://uploads2.wikiart.org/images/giovanni-b...,"[{'sizekb': 6, 'width': 210, 'height': 161, 'u...",neoclassicism,pluto,
4,57727b23edc2cb3880e0d860,giovanni battista piranesi,sketch and study,https://uploads8.wikiart.org/images/giovanni-b...,"[{'sizekb': 10, 'width': 210, 'height': 296, '...",neoclassicism,pluto and proserpina,


The `image_size_data` column needs some preprocessing: in the raw csv file every entry is a string representation of a list of dictionaries, which is not easy to manipulate. Let's parse each entry and convert it into a dictionary keyed by image size name:

In [5]:
from ast import literal_eval

In [6]:
def parse_size_data(s):
    """Parse one raw `image_size_data` entry into a dictionary.

    Parameters
    ----------
    s : str
        Python-literal text of a list of dicts, each carrying a "name" key
        (e.g. "[{'name': 'blog', 'sizekb': 64, ...}, ...]").

    Returns
    -------
    dict
        Maps each entry's "name" to a dict of its remaining fields.

    Raises
    ------
    KeyError
        If one of the listed dicts has no "name" key.
    """
    versions = {}
    for entry in literal_eval(s):
        # Remove the name from the record itself; it becomes the lookup key.
        label = entry.pop("name")
        versions[label] = entry
    return versions

In [7]:
# Store the parsed dictionaries in a new column; the raw string column is left untouched.
db["size_data"] = db["image_size_data"].map(parse_size_data)

So now we have a `size_data` column with a sensible data structure. For example, to see all the available sizes for a given artwork, you just have to look at the keys:

In [8]:
# Names of the image versions available for the artwork with index label 0.
db["size_data"][0].keys()

dict_keys(['pinterestsmall', 'pinterestlarge', 'portrait', 'blog', 'large', 'halfhd', 'hd', 'original'])

# Database size estimate

The `size_data` column contains information about all the available versions of any given file, so we can use it to estimate how large the database would be if we decided to download all of them.

In [9]:
# Preview the parsed column.
db["size_data"].head()

0    {'pinterestsmall': {'sizekb': 13, 'width': 210...
1    {'pinterestsmall': {'sizekb': 8, 'width': 210,...
2    {'pinterestsmall': {'sizekb': 7, 'width': 210,...
3    {'pinterestsmall': {'sizekb': 6, 'width': 210,...
4    {'pinterestsmall': {'sizekb': 10, 'width': 210...
Name: size_data, dtype: object

We create a table with the size in kB of every available image version (one row per artwork, one column per version), so that the sum of each column is an estimate of how much disk space we need if we download that image size.

In [11]:
# One record per artwork: {version name: file size in kB}.
slist = [
    {version: info["sizekb"] for version, info in d.items()}
    for d in db["size_data"]
]
# Use every version name seen in any artwork (in first-seen order) as the
# columns. Taking them from the first row alone would silently drop any
# version that the first artwork happens not to offer.
size_columns = list(dict.fromkeys(version for rec in slist for version in rec))
# Artworks that lack a given version get NaN in that column.
sizes = pd.DataFrame(data=slist, columns=size_columns)

In [16]:
# Summary statistics of the file sizes (kB) for each image version.
sizes.describe()

Unnamed: 0,pinterestsmall,pinterestlarge,portrait,blog,large,halfhd,hd,original
count,152014.0,152014.0,149943.0,137759.0,111673.0,72022.0,29415.0,152014.0
mean,10.328062,17.017702,23.260759,35.34638,60.607774,109.214157,222.0684,384.345179
std,6.630793,11.447309,14.369315,21.72622,39.831075,76.549911,147.65305,929.690996
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,11.0,17.0,25.0,40.0,68.0,139.0,76.0
50%,9.0,15.0,22.0,33.0,55.0,96.0,194.0,149.0
75%,13.0,21.0,28.0,43.0,75.0,137.0,276.0,314.0
max,365.0,641.0,438.0,670.0,1355.0,2680.0,5761.0,31682.0


In [65]:
# Back-of-the-envelope check for the smallest version:
# ~10 kB mean size x ~152,000 images, converted from kB to GB (1024**2 kB per GB).
10*152000/1024**2

1.4495849609375

The total disk space needed to download every image at each size will be, in GB:

In [66]:
# Column totals are in kB; dividing by 1024**2 converts them to GB.
sizes.sum() / 1024 ** 2

pinterestsmall     1.497278
pinterestlarge     2.467088
portrait           3.326214
blog               4.643709
large              6.454708
halfhd             7.501432
hd                 6.229536
original          55.719231
dtype: float64

In [34]:
# Spot-check a single entry: the "blog" version of the artwork at row position 90000.
# NOTE: needs at least 90001 rows loaded, and that artwork must offer a "blog" version.
db.iloc[90000]["size_data"]["blog"]

{'sizekb': 64,
 'width': 500,
 'height': 401,
 'url': 'https://uploads0.wikiart.org/images/ozdemir-atlan/family-tree-2000.jpg!blog.jpg'}

# Download images

At first we'll download the smallest available images; it's easy to replace them with a larger size later if we see that the neural network (NN) is not performing well.

So we build a list of urls we are interested in and use `wget` to download them.

In [60]:
# Image version to download; must be one of the keys of the `size_data` dictionaries
# (e.g. "pinterestsmall", "blog", "large", "original").
size = "pinterestsmall"

In [61]:
# URL of the chosen version for every artwork. Not every artwork offers every
# version, so look it up with .get(): a missing version yields None instead
# of raising KeyError.
version_urls = db["size_data"].map(lambda d: d.get(size, {}).get("url"))
# Pair each artwork id with its URL and drop artworks that lack this version,
# so the download list only contains real URLs.
urls = pd.concat([db["_id"], version_urls], axis=1).dropna(subset=["size_data"])

In [62]:
# Write one "<_id> <url>" line per artwork: space-separated, no header row, no index column.
urls.to_csv(
    "img_urls.txt",
    sep=" ",
    header=False,
    index=False,
)

And then we use the `download_images.sh` script to actually get them.

# Keras