In [4]:
# change into the project's Python/ directory (IPython automagic `cd`);
# presumably this is what makes the local packages imported in the next
# cell (readPBNData, PBNFeatures, PBNPCA) importable -- TODO confirm
cd ../Python/

/home/mfruman/DataSci/PBN/Python


In [5]:
import readPBNData.description as rd
import readPBNData.images as ri
import PBNFeatures.paletteTools as pt
import PBNPCA.pca as pbnpca
import numpy as np

# Loading and sorting image data and metadata

In [6]:
# open the zipped training metadata; of the two values returned by
# rd.openZip only the second is used below (indexed like a list,
# presumably of file-like objects -- TODO confirm)
csvs, fileLike = rd.openZip('../Data/train_info.csv.zip')
# parse the first entry; presumably `lines` are the data rows and
# `head` the header row -- TODO confirm against rd.readCSV
lines, head = rd.readCSV(fileLike[0])

In [7]:
# reorganise the parsed CSV into a dict-like object keyed by column name
# (the keys are displayed in the output below)
cols = rd.columns(lines,head)
cols.keys()

['style', 'title', 'artist', 'filename', 'date', 'genre']

In [8]:
# tabulate the 'artist' column: the output below shows (artist id, count)
# pairs in descending order of count; only the first five are displayed here
# (the ten leading artists are selected later, in loadPortfolios)
artistTable = rd.table(cols['artist'])
artistTable[:5]

[('3cc9a44380296d93e68b71a27643c25f', 413),
 ('96e7b1bc8d52e18caf0af34fec2e9bcb', 402),
 ('d8a3c897c506be7de91d8f892f14f934', 401),
 ('3f8dc381ccfe9d5cc88b75970262715b', 400),
 ('10bc951c2eb4a2f05fa773bdaace4e3b', 397)]

In [9]:
# images in train_1.zip and train_2.zip file
# (passed later as the imageList argument of rd.sameArtist)
train_1_names = rd.imagesInZip('../Data/train_1.zip')
train_2_names = rd.imagesInZip('../Data/train_2.zip')

In [10]:
def loadPortfolios(artistTable):
    """Collect the paintings of the ten artists best represented in the archives.

    For every artist in ``artistTable`` the paintings found in train_1.zip and
    in train_2.zip are looked up with ``rd.sameArtist``; the artists are then
    ranked by the combined number of paintings and the first ten are kept.

    Relies on the notebook globals ``rd``, ``cols``, ``train_1_names`` and
    ``train_2_names`` defined in earlier cells.

    Parameters
    ----------
    artistTable : sequence
        Entries whose first element is the artist id; any further elements
        are ignored.

    Returns
    -------
    list of (artist, paintings1, paintings2) tuples
        One tuple per leading artist, ordered by decreasing combined count.
        ``paintings1`` / ``paintings2`` are the values returned by
        ``rd.sameArtist`` for train_1.zip / train_2.zip. A real list is
        returned (not a lazy ``zip`` object) because callers iterate over the
        result more than once.
    """
    distinctArtists = [entry[0] for entry in artistTable]

    # One scan per artist and archive (slowish). The results are kept so that
    # the leaders' portfolios need not be scanned a second time further down
    # (assumes rd.sameArtist gives the same answer for the same arguments).
    print('scanning for paintings by top ten in full csv file . . .')
    inTrain1 = {}
    inTrain2 = {}
    paintingsIn12 = []
    for artist in distinctArtists:
        # print('    scanning artist ' + artist + ' . . . ')
        inTrain1[artist] = rd.sameArtist(artist, cols, imageList=train_1_names)
        inTrain2[artist] = rd.sameArtist(artist, cols, imageList=train_2_names)
        paintingsIn12.append(len(inTrain1[artist]) + len(inTrain2[artist]))

    # rank artists by the number of paintings present in the two archives
    artistTable12 = sorted(zip(distinctArtists, paintingsIn12),
                           key=lambda item: item[1], reverse=True)
    print('top 10 artists . . . ')
    print(artistTable12[:10])

    # assemble (artist, paintings in train_1, paintings in train_2) for the
    # ten leaders from the lists gathered above
    print('\nscanning for paintings by top ten in zip files . . .')
    leaders = [entry[0] for entry in artistTable12[:10]]
    portfolios = []
    for artist in leaders:
        print('    scanning artist ' + artist + ' . . . ')
        portfolios.append((artist, inTrain1[artist], inTrain2[artist]))
    return portfolios

In [11]:
# Cached index of the 10 most prolific artists and the filenames of
# their paintings contained in train_1.zip and train_2.zip:
#
#   index file missing -> scan the zip files and write the json index
#   index file present -> read it back
#
import os.path
import json
dataDir = '../Data/'
portfoliosFile = dataDir + 'portfolios_top10.json'
if not os.path.isfile(portfoliosFile):
    # per artist: the paintings from both archives merged into a single list
    portfolios = loadPortfolios(artistTable)
    portfolios_uni = [(artist, p1 + p2) for (artist,p1,p2) in portfolios]
    # store the artists and the filenames of their paintings as json text
    print('\nwriting json index file . . .')
    with open(portfoliosFile,'w') as jsonfile:
        json.dump(portfolios_uni, jsonfile, indent=2)
else:
    with open(portfoliosFile,'r') as jsonfile:
        portfolios_uni = json.load(jsonfile)
    print('reading reading reading!!!')

reading reading reading!!!


## Create miniatures and cutout samples

In [12]:
# Optional (re)generation of 100 x 100 miniatures for the paintings of the
# 10 most prolific artists in train_1.zip and train_2.zip.
#
# Nothing is done unless `force` is set to True; with force = False the
# miniatures are assumed to exist on disk already (can refactor later to check).
#
import os.path
import json
dataDir = '../Data/'
miniDir = dataDir + 'Mini/'
force = False
if force:
    # per artist: separate painting lists for the two archives, plus the
    # merged single list
    portfolios = loadPortfolios(artistTable)
    portfolios_uni = [(artist, p1 + p2) for (artist,p1,p2) in portfolios]

    # one ri.miniatures call per artist and archive, writing into miniDir
    # (per the original note, files that already exist are left alone --
    # not verifiable from this notebook)
    print('\ncreating miniatures (if necessary) . . . ')
    for portfolio in portfolios:
        print('    artist ' + portfolio[0] + ' . . . ')
        minifiles_1 = ri.miniatures('../Data/train_1.zip', portfolio[1], miniDir,
                                    prefix='train_1', size=(100,100))
        minifiles_2 = ri.miniatures('../Data/train_2.zip', portfolio[2], miniDir,
                                    prefix='train_2', size=(100,100))

In [13]:
# Cut several 100 x 100 samples out of each painting in the portfolios
# list (cf. previous cell); runs only while `force` is True
#
import os.path
import json
dataDir = '../Data/'
cutoutDir = dataDir + 'Cutout/'
force = True
if force:
    # per artist: separate painting lists for the two archives, plus the
    # merged single list
    portfolios = loadPortfolios(artistTable)
    portfolios_uni = [(artist, p1 + p2) for (artist,p1,p2) in portfolios]

    # one ri.cutouts call per artist, archive and top-left position
    # (per the original note, files that already exist are left alone --
    # not verifiable from this notebook)
    print('\ncreating cutouts . . . ')
    for portfolio in portfolios:
        print('    artist ' + portfolio[0] + ' . . . ')

        # 2 x 2 grid of top-left positions: 0.25 and 0.75 along each axis
        # (presumably fractions of the image size -- TODO confirm in ri.cutouts)
        for i in xrange(2):
            xstart = 0.25 + 0.5*i
            for j in xrange(2):
                ystart = 0.25 + 0.5*j
                cutoutfiles_1 = ri.cutouts('../Data/train_1.zip', portfolio[1], cutoutDir,
                                           prefix='train_1', size=(100,100),
                                           topleft=(xstart,ystart))
                cutoutfiles_2 = ri.cutouts('../Data/train_2.zip', portfolio[2], cutoutDir,
                                           prefix='train_2', size=(100,100),
                                           topleft=(xstart,ystart))

scanning for paintings by top ten in full csv file . . .
top 10 artists . . . 
[('0eeac4ecff259dc515be795e1a76019a', 119), ('dd4989789d310581024ae2b9203d5439', 111), ('121fffad1eb6f7dff228b8a71b6aec72', 105), ('1a8d67dbb446bdc4298cc0be56932a38', 104), ('ce3d8977aae5986601232aa58d15282a', 103), ('c16781c4321948227193214b68477a5c', 101), ('3cc9a44380296d93e68b71a27643c25f', 100), ('d09f796f2b0aa11dffc88badd9806119', 100), ('5aabfc58470d01bb2362795a44a2603b', 99), ('db1318d32df7428076e03513ebf762bb', 98)]

scanning for paintings by top ten in zip files . . .
    scanning artist 0eeac4ecff259dc515be795e1a76019a . . . 
    scanning artist dd4989789d310581024ae2b9203d5439 . . . 
    scanning artist 121fffad1eb6f7dff228b8a71b6aec72 . . . 
    scanning artist 1a8d67dbb446bdc4298cc0be56932a38 . . . 
    scanning artist ce3d8977aae5986601232aa58d15282a . . . 
    scanning artist c16781c4321948227193214b68477a5c . . . 
    scanning artist 3cc9a44380296d93e68b71a27643c25f . . . 
    scanning artis

# Generate colour features

## 16-colour CGA palette distributions

In [23]:
# Reduce every miniature to the 16-colour CGA palette and keep, per painting,
# the filename and the pixel-counts by colour (two parallel lists).
#
# The results are cached as json text files; the cache is used only when
# BOTH files exist, otherwise everything is recomputed and both are rewritten.
import PBNFeatures.paletteTools as pt
import os.path
import json
from PIL import Image
featureDir = '../Data/'
nc = 16
palette = pt.CGApalette(ncolours=nc)
cgaColoursFile = featureDir + 'cgacolours_top10.json'
namesFile = featureDir + 'names_top10.json'
# both files are read below, so both must be present for the cache to be usable
if (os.path.isfile(cgaColoursFile) and os.path.isfile(namesFile)):
    with open(cgaColoursFile,'r') as jsonfile:
        data = json.load(jsonfile)
        print('reading CGA Colours file . . .')
    with open(namesFile,'r') as jsonfile:
        names = json.load(jsonfile)
        print('reading names file . . .')
else:
    # load all miniatures and convert each to the CGA palette (slow),
    # save filenames and pixel-counts by colour in two lists
    data = []
    names = []
    for portfolio in portfolios_uni:
        print('processing paintings by artist ' + portfolio[0] + ' . . .')
        for painting in portfolio[1]:
            names.append(painting)
            # miniature of <stem>.jpg is stored as <stem>_mini_100_x_100.jpg
            mininame = os.path.splitext(painting)[0] + '_mini_100_x_100.jpg'
            mini = Image.open(featureDir + 'Mini/' + mininame)
            minip = pt.paletteConvert(mini,palette)
            # first nc entries of the completed colour list; presumably
            # completeColours fills in palette colours absent from the
            # image -- TODO confirm
            colours = pt.completeColours(minip.getcolors(),nc)[:nc]
            # keep only the first field of each entry (the pixel count)
            data.append([c[0] for c in colours])
    # write cga colour data and the matching filenames to json files
    print('\nwriting json index file . . .')
    with open(cgaColoursFile,'w') as jsonfile:
        print('writing CGA colours file . . .')
        jsonfile.write(json.dumps(data,indent=2))
    with open(namesFile,'w') as jsonfile:
        print('writing names file . . .')
        jsonfile.write(json.dumps(names,indent=2))

processing paintings by artist 0eeac4ecff259dc515be795e1a76019a . . .
processing paintings by artist dd4989789d310581024ae2b9203d5439 . . .
processing paintings by artist 121fffad1eb6f7dff228b8a71b6aec72 . . .
processing paintings by artist 1a8d67dbb446bdc4298cc0be56932a38 . . .
processing paintings by artist ce3d8977aae5986601232aa58d15282a . . .
processing paintings by artist c16781c4321948227193214b68477a5c . . .
processing paintings by artist 3cc9a44380296d93e68b71a27643c25f . . .
processing paintings by artist d09f796f2b0aa11dffc88badd9806119 . . .
processing paintings by artist 5aabfc58470d01bb2362795a44a2603b . . .
processing paintings by artist db1318d32df7428076e03513ebf762bb . . .

writing json index file . . .
writing CGA colours file . . .
writing names file . . .


## HSV distributions

In [24]:
# if they exist, load the json text files containing the hsv stats of the
# paintings by the 10 most prolific artists (contained in train_1.zip and
# train_2.zip) and the matching list of filenames
#
# else, process all miniatures and write both json files
#
import PBNFeatures.cylindrical as cyl
from PIL import Image
import os.path
import json
featureDir = '../Data/'
hsvstatsFile = featureDir + 'hsvstats_top10.json'
namesFile = featureDir + 'names_top10.json'
# both files are read below, so both must be present for the cache to be usable
if (os.path.isfile(hsvstatsFile) and os.path.isfile(namesFile)):
    with open(hsvstatsFile,'r') as jsonfile:
        hsvstats = json.load(jsonfile)
        print('reading hsvstats file . . .')
    with open(namesFile,'r') as jsonfile:
        names = json.load(jsonfile)
        print('reading names file . . .')
else:
    # load all miniatures and compute saturation, value and hue
    # statistics for each (slow),
    # save filenames and hsv stats in two lists
    hsvstats = []
    names = []
    for portfolio in portfolios_uni:
        print('processing paintings by artist ' + portfolio[0] + ' . . .')
        for painting in portfolio[1]:
            names.append(painting)
            # miniature of <stem>.jpg is stored as <stem>_mini_100_x_100.jpg
            mininame = os.path.splitext(painting)[0] + '_mini_100_x_100.jpg'
            mini = Image.open(featureDir + 'Mini/' + mininame)
            hsvstats.append(cyl.hsv_stats(cyl.jpg_to_hsv(mini),hue_bins=16))
    # write the hsv stats and the matching filenames to json text files
    print('\nwriting json index file . . .')
    with open(hsvstatsFile,'w') as jsonfile:
        print('writing hsvstats file . . .')
        jsonfile.write(json.dumps(hsvstats,indent=2))
    with open(namesFile,'w') as jsonfile:
        print('writing names file . . .')
        jsonfile.write(json.dumps(names,indent=2))

processing paintings by artist 0eeac4ecff259dc515be795e1a76019a . . .
processing paintings by artist dd4989789d310581024ae2b9203d5439 . . .
processing paintings by artist 121fffad1eb6f7dff228b8a71b6aec72 . . .
processing paintings by artist 1a8d67dbb446bdc4298cc0be56932a38 . . .
processing paintings by artist ce3d8977aae5986601232aa58d15282a . . .
processing paintings by artist c16781c4321948227193214b68477a5c . . .
processing paintings by artist 3cc9a44380296d93e68b71a27643c25f . . .
processing paintings by artist d09f796f2b0aa11dffc88badd9806119 . . .
processing paintings by artist 5aabfc58470d01bb2362795a44a2603b . . .
processing paintings by artist db1318d32df7428076e03513ebf762bb . . .

writing json index file . . .
writing hsvstats file . . .
writing names file . . .


# Generate texture features

In [14]:
# if they exist, load the json text files containing the focus stats of the
# paintings by the 10 most prolific artists (contained in train_1.zip and
# train_2.zip) and the matching list of filenames
#
# else, process all cutouts and write both json files
#
import PBNFeatures.cylindrical as cyl
import PBNFeatures.textureTools as tt
from PIL import Image
import os
import sys
import re
import json
featureDir = '../Data/'
focusstatsFile = featureDir + 'focusstats_top10.json'
namesFile = featureDir + 'names_top10.json'
# both files are read below, so both must be present for the cache to be usable
if (os.path.isfile(focusstatsFile) and os.path.isfile(namesFile)):
    with open(focusstatsFile,'r') as jsonfile:
        focusstats = json.load(jsonfile)
        print('reading focusstats file . . .')
    with open(namesFile,'r') as jsonfile:
        names = json.load(jsonfile)
        print('reading names file . . .')
else:
    # load all cutouts and compute, for each, the sharpness of the cutout
    # as stored and after one and two smoothing passes with a 2-pixel window
    #
    # save filenames and focus stats in two lists; focusstats holds, per
    # painting, one [sharp, sharp after 1 pass, sharp after 2 passes] triple
    # for every cutout of that painting
    #
    cutoutDir = featureDir + 'Cutout/'
    allCutouts = os.listdir(cutoutDir)
    #
    focusstats = []
    names = []
    for portfolio in portfolios_uni:
        print('\nprocessing paintings by artist ' + portfolio[0] + ' . . .')
        for painting in portfolio[1]:
            names.append(painting)
            # loop over various cutouts of each painting
            #
            # A cutout belongs to this painting when its filename starts with
            # the painting's stem and ends in '.jpg'. The stem is escaped so it
            # is matched literally, and it must not be followed by another
            # digit: otherwise stem '12' would also collect the cutouts of
            # '123'. (Assumes the cutout suffix appended to the stem does not
            # itself begin with a digit -- TODO confirm against ri.cutouts.)
            filePattern = re.escape(os.path.splitext(painting)[0]) + r'(?!\d).*\.jpg$'
            cutoutFiles = sorted([f for f in allCutouts if re.match(filePattern,f)])
            sharpList = []
            for cutoutname in cutoutFiles:
                cutout = Image.open(cutoutDir + cutoutname)
                # convert image data to hsvlist
                hsvlist = cyl.jpg_to_hsv(cutout)
                # compute sharpness
                sharp2 = [tt.focusDetect(hsvlist,size=cutout.size)]
                # smooth with a 2-pixel window twice and append each to list
                hsvlist_s = tt.defocusValue(hsvlist,size=cutout.size,window=2)
                sharp2.append(tt.focusDetect(hsvlist_s,size=cutout.size))
                hsvlist_s = tt.defocusValue(hsvlist_s,size=cutout.size,window=2)
                sharp2.append(tt.focusDetect(hsvlist_s,size=cutout.size))
                # append to list of sharpnesses of each cutout from this image
                sharpList.append(sharp2)
            # add focus stats to list
            focusstats.append(sharpList)
            # one progress dot per painting
            sys.stdout.write('.')
    # write the focus stats and the matching filenames to json text files
    print('\nwriting json index file . . .')
    with open(focusstatsFile,'w') as jsonfile:
        print('writing focusstats file . . .')
        jsonfile.write(json.dumps(focusstats,indent=2))
    with open(namesFile,'w') as jsonfile:
        print('writing names file . . .')
        jsonfile.write(json.dumps(names,indent=2))


processing paintings by artist 0eeac4ecff259dc515be795e1a76019a . . .
.......................................................................................................................
processing paintings by artist dd4989789d310581024ae2b9203d5439 . . .
...............................................................................................................
processing paintings by artist 121fffad1eb6f7dff228b8a71b6aec72 . . .
.........................................................................................................
processing paintings by artist 1a8d67dbb446bdc4298cc0be56932a38 . . .
........................................................................................................
processing paintings by artist ce3d8977aae5986601232aa58d15282a . . .
.......................................................................................................
processing paintings by artist c16781c4321948227193214b68477a5c . . .
................................