In [None]:
cd ../Python/

In [None]:
import readPBNData.description as rd
import readPBNData.images as ri

In [None]:
# open the zipped training metadata; openZip returns two values and only the
# first element of the second one is read here
csvs, fileLike = rd.openZip('../Data/train_info.csv.zip')
# parse it into rows plus header (presumably -- confirm in readPBNData.description)
lines, head = rd.readCSV(fileLike[0])

In [None]:
# rearrange the parsed rows by column; cols is looked up by column name
# further down (e.g. cols['artist'])
cols = rd.columns(lines,head)
# show which columns are available
cols.keys()

In [None]:
# tabulate the 'artist' column and preview the first five entries
# (the first element of each entry is the artist; the rest is presumably a
# painting count -- confirm in rd.table)
artistTable = rd.table(cols['artist'])
artistTable[:5]

In [None]:
# image names contained in the train_1.zip and train_2.zip archives;
# both lists are passed as imageList filters to rd.sameArtist below
train_1_names = rd.imagesInZip('../Data/train_1.zip')
train_2_names = rd.imagesInZip('../Data/train_2.zip')

In [None]:
# For every distinct artist, count the paintings found in train_1 plus those
# found in train_2, then rank the artists by that combined count (slowish).
distinctArtists = [entry[0] for entry in artistTable]
paintingsIn12 = [
    len(rd.sameArtist(artistName, cols, imageList=train_1_names))
    + len(rd.sameArtist(artistName, cols, imageList=train_2_names))
    for artistName in distinctArtists
]
# (artist, count) rows, largest count first; ties keep their original order
artistTable12 = list(zip(distinctArtists, paintingsIn12))
artistTable12.sort(key=lambda row: row[1], reverse=True)
artistTable12[:10]

In [None]:
# lists of paintings by the 10 "leading" artists in train_1 and train_2
leaders = [a[0] for a in artistTable12[:10]]
portfolios1 = []
portfolios2 = []
for artist in leaders:
    portfolios1.append(rd.sameArtist(artist, cols, imageList=train_1_names))
    portfolios2.append(rd.sameArtist(artist, cols, imageList=train_2_names))
# (artist, train_1 paintings, train_2 paintings) triples. Materialised as a
# list because several later cells iterate over portfolios; a one-shot zip
# iterator (Python 3) would be empty after the first of them.
portfolios = list(zip(leaders,portfolios1,portfolios2))

In [None]:
# Build 100x100 miniatures for every leading artist's paintings (the helper
# is expected to save them to disk) and keep, per artist, what it returns
# for the train_1 and the train_2 archive.
miniportfolios = []
for leader, paintings1, paintings2 in portfolios:
    fromTrain1 = ri.miniatures('../Data/train_1.zip', paintings1,
                               prefix='train_1', size=(100,100))
    fromTrain2 = ri.miniatures('../Data/train_2.zip', paintings2,
                               prefix='train_2', size=(100,100))
    miniportfolios.append((leader, fromTrain1, fromTrain2))

In [None]:
# Build 100x100 cutouts for every leading artist's paintings (the helper is
# expected to save them to disk) and keep, per artist, what it returns for
# the train_1 and the train_2 archive.
cutoutportfolios = []
for leader, paintings1, paintings2 in portfolios:
    fromTrain1 = ri.cutouts('../Data/train_1.zip', paintings1,
                            prefix='train_1', size=(100,100))
    fromTrain2 = ri.cutouts('../Data/train_2.zip', paintings2,
                            prefix='train_2', size=(100,100))
    cutoutportfolios.append((leader, fromTrain1, fromTrain2))

In [None]:
# prepare npairs pairs of paintings by the same artist (label 1),
# followed by npairs pairs by different artists (label 0);
# each entry of pairs is [painting, painting, label]
import random
npairs = 200
# fixed seed so the sampled pairs are reproducible; the order of the random
# calls below therefore matters
randomSeed = 666
random.seed(randomSeed)
# merge each artist's train_1 and train_2 paintings into one list
portfolios_uni = [(artist, p1 + p2) for (artist,p1,p2) in portfolios]
pairs = []
for isample in xrange(npairs):
    # randomly choose an artist
    portfolio = random.choice(portfolios_uni)
    # randomly choose two paintings
    paintings = random.sample(portfolio[1],2)
    pairs.append([paintings[0],paintings[1],1])
for isample in xrange(npairs):
    # randomly choose two artists
    portfoliopair = random.sample(portfolios_uni,2)
    # randomly choose a painting from each artist
    paintings = random.choice(portfoliopair[0][1]), random.choice(portfoliopair[1][1])
    pairs.append([paintings[0],paintings[1],0])

In [None]:
# preview the first eight entries: [painting, painting, label], with the
# same-artist pairs (label 1) stored first
pairs[:8]

In [None]:
# load the two feature images (miniatures or cutouts) belonging to one pair
def loadPair(pair, mc = 'mini', featureDir = '../Data/FeatureData/', size = (100,100)):
    """Open the pre-computed feature images for both paintings of a pair.

    Parameters
    ----------
    pair : sequence
        The first two elements are painting file names; any further
        elements (such as a label) are ignored.
    mc : str
        Feature kind embedded in the file name, e.g. 'mini' or 'cutout'.
    featureDir : str
        Directory holding the feature images.
    size : tuple of two ints
        Size embedded in the file name. Assumes files are named
        <stem>_<mc>_<size[0]>_x_<size[1]>.jpg -- confirm the order for
        non-square sizes against the code that writes them.

    Returns
    -------
    tuple
        (image0, image1) as returned by PIL.Image.open.
    """
    import os.path
    import PIL.Image as Image
    suffix = '_{0}_{1}_x_{2}.jpg'.format(mc, size[0], size[1])
    images = []
    # index explicitly so that a too-short pair still raises IndexError
    for painting in (pair[0], pair[1]):
        stem = os.path.splitext(painting)[0]
        images.append(Image.open(os.path.join(featureDir, stem + suffix)))
    return images[0], images[1]

In [None]:
# Load one example pair and show its first miniature. pairs holds 2*npairs
# entries with the same-artist pairs first, so npairs-1 is the last
# same-artist pair and is always a valid index.
mini1, mini2 = loadPair(pairs[npairs-1])
mini1

In [None]:
# show the second miniature of the example pair
mini2

In [None]:
%matplotlib inline
reload(pt)
import matplotlib.pyplot as plt
import PBNFeatures.paletteTools as pt
palette = pt.CGApalette(ncolours=16)
mini1p = pt.paletteConvert(mini1,palette)
mini2p = pt.paletteConvert(mini2,palette)
pt.plotColourDistribution(mini1p.getcolors(),pt.unflatten(palette))
pt.plotColourDistribution(mini2p.getcolors(),pt.unflatten(palette))
plt.show()

In [None]:
# apply paletteTools.projectBW to the first example miniature and display the
# result (presumably a black-and-white projection -- confirm in paletteTools)
pt.projectBW(mini1)

In [None]:
# same projection for the second example miniature
pt.projectBW(mini2)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import PBNFeatures.paletteTools as pt
#import math
palette = pt.CGApalette(ncolours=16)
nc = len(palette)/3
# for each pair in pairs, compute the log-difference between
# each colour in the palette in the miniature and in the cutout
diffs = []
for pair in pairs:
    mini1, mini2 = loadPair(pair,'mini')
    cutout1, cutout2 = loadPair(pair,'cutout')
    # convert to CGA palette
    mini1p, mini2p = pt.paletteConvert(mini1,palette), pt.paletteConvert(mini2,palette)
    cutout1p, cutout2p = pt.paletteConvert(cutout1,palette), pt.paletteConvert(cutout2,palette)
    colours1 = pt.completeColours(mini1p.getcolors(),nc)[:nc] \
               + pt.completeColours(cutout1p.getcolors(),nc)[:nc]
    #print([c[0] for c in colours1[:nc]])
    colours2 = pt.completeColours(mini2p.getcolors(),nc)[:nc] \
               + pt.completeColours(cutout2p.getcolors(),nc)[:nc]
    #print([c[0] for c in colours2[:nc]])
    diffs.append([abs(col[0]-colours2[i][0]) for i,col in enumerate(colours1)])
    #logdiff = [math.log(col[0]+1)-math.log(colours2[i][0]+1) for i,col in enumerate(colours1)]
    #print('\n')
    #print(diff)
    #print('\n')
    #print(logdiff)
                       


In [None]:
# compute PCA for 16-colour CGA colour distributions in miniatures
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import PBNFeatures.paletteTools as pt
import PBNPCA.pca as pbnpca
import os.path
import PIL.Image as Image
#import math
palette = pt.CGApalette(ncolours=16)
# number of palette colours (palette is presumably a flat list with three
# values per colour -- confirm in paletteTools); floor division keeps nc an
# int, as required where it is used as a slice bound
nc = len(palette)//3
# directory holding the pre-computed miniature / cutout images
featureDir = '../Data/FeatureData/'

In [None]:
# Build one row per miniature, over all leading artists: the first field of
# each of the first nc palette-colour entries of that miniature.
data = []
for artistName, paintingNames in portfolios_uni:
    for paintingName in paintingNames:
        stem = os.path.splitext(paintingName)[0]
        miniature = Image.open(featureDir + stem + '_mini_100_x_100.jpg')
        converted = pt.paletteConvert(miniature, palette)
        entries = pt.completeColours(converted.getcolors(), nc)[:nc]
        data.append([entry[0] for entry in entries])


In [None]:
# run the PCA on the colour data (reloading the module first picks up edits
# made to it since it was imported)
reload(pbnpca)
ncomp = 15
pobj = pbnpca.pca(np.array(data),ncomp)

# Turn every eigenvector column into a "colours"-style list of
# (value, index) tuples so it can be drawn by pt.plotColourDistribution.
# Columns are visited from ncomp-1 down to 0, so pcs[0] is built from the
# last column. The values are used as they are: no rescaling by
# pobj['stds'] and no pobj['meanvec'] offset is applied.
pcs = [
    [(row[col], idx) for idx, row in enumerate(pobj['eigvecs'])]
    for col in reversed(xrange(ncomp))
]

In [None]:
# inspect the 'stds' and 'meanvec' entries of the PCA result, separated by
# blank lines
print(pobj['stds'])
print('\n')
print(pobj['meanvec'])

In [None]:
# draw the first three entries of pcs (the leading eigenvectors) as colour
# distributions over the palette
for rank in (0, 1, 2):
    pt.plotColourDistribution(pcs[rank], pt.unflatten(palette))
plt.show()

In [None]:
# scratch check: len() of a 2-D array is the size of its first axis, so this
# evaluates to 1 for a (1, 16) array
len(np.zeros((1,16)))

In [None]:
# reload the PCA helpers (picks up edits made since the import), then project
# the first data row with pcaProject and keep the last three values returned
reload(pbnpca)
comps = pbnpca.pcaProject(data[0],pobj)[-3:]

In [None]:
# compare one data row with the output of pcaTrunc for it (third argument 9,
# presumably the number of components kept -- confirm in PBNPCA.pca)
# NOTE(review): row index 750 is hard-coded and assumes data has more than
# 750 rows
print(data[750])
print('\n')
print(pbnpca.pcaTrunc(data[750],pobj,9))

# scratch expressions kept for reference:
#np.zeros((len(data[0]),1))
#np.array(pobj['meanvec']).reshape((len(data[0]),1))

In [None]:
# scratch: scale column 10 of the eigenvector matrix by the last value in
# comps and shape the result as a 16x1 column; adding the zero column leaves
# the values numerically unchanged
# NOTE(review): comps holds the last three projection values, yet column 10
# is used here -- confirm the intended column; the length 16 is hard-coded
(comps[2]*pobj['eigvecs'][:,10]).reshape((16,1)) + np.zeros((16,1))