In [None]:
# Change the working directory to ../Python/ (presumably so that the project
# packages imported in the next cell can be found -- confirm). All '../Data/'
# paths used further down are relative to this directory.
cd ../Python/

In [None]:
import readPBNData.description as rd
import readPBNData.images as ri
import PBNFeatures.paletteTools as pt
import PBNPCA.pca as pbnpca
import numpy as np

# Loading and sorting image data and metadata

In [None]:
# Open the zipped metadata file and parse the first file-like object it yields.
# `lines` and `head` are passed on to rd.columns below (presumably the data
# rows and the header row -- see readPBNData.description to confirm).
csvs, fileLike = rd.openZip('../Data/train_info.csv.zip')
lines, head = rd.readCSV(fileLike[0])

In [None]:
# Re-arrange the parsed CSV into a mapping keyed by column name
# (cols['artist'] is used below) and display the available keys.
cols = rd.columns(lines,head)
cols.keys()

In [None]:
# Tabulate the 'artist' column and preview the first five entries.
# Later cells read entry[0] as the artist identifier; the table is presumably
# a list of (artist, count) pairs sorted by count -- confirm in rd.table.
artistTable = rd.table(cols['artist'])
artistTable[:5]

In [None]:
# Names of the images contained in train_1.zip and in train_2.zip.
# These lists are used below to decide which archive holds a given painting
# and to restrict artist portfolios to the images that are actually available.
train_1_names = rd.imagesInZip('../Data/train_1.zip')
train_2_names = rd.imagesInZip('../Data/train_2.zip')

In [None]:
# show (full) image
def showImage(imgName):
    """Open the full-size painting `imgName` from its zip archive and show it.

    The image is looked up in train_1.zip when its name is listed in
    `train_1_names`; every other name is assumed to live in train_2.zip.
    """
    if imgName in train_1_names:
        zipPath, zipPrefix = '../Data/train_1.zip', 'train_1'
    else:
        zipPath, zipPrefix = '../Data/train_2.zip', 'train_2'
    ri.openZipImage(zipPath, imgName, prefix=zipPrefix).show()

## Create miniatures and cutout samples

In [None]:
# If it exists, load the json text file containing the list of the
# 10 most prolific artists and the filenames of their paintings
# contained in train_1.zip and train_2.zip. In that case the miniatures
# and cutouts are assumed to exist already (can refactor later to check).
#
# Otherwise scan the zip files, create the miniatures and cutouts and
# write the json index file.
#
# Result: portfolios_uni, a list of (artist, [painting filenames]) entries.
#
import os.path
import json
featureDir = '../Data/'
portfoliosFile = featureDir + 'portfolios_top10.json'
if (os.path.isfile(portfoliosFile)):
    with open(portfoliosFile,'r') as jsonfile:
        portfolios_uni = json.load(jsonfile)
        print('reading reading reading!!!')
else:
    # the json file doesn't exist: create the miniatures and cutouts
    # (if necessary) and then create the json file

    # number of paintings per distinct artist that are actually present
    # in train_1 or train_2 (slowish)
    distinctArtists = [a[0] for a in artistTable]
    paintingsIn12 = []
    print('scanning for paintings by top ten in full csv file . . .')
    for artist in distinctArtists:
        # print('    scanning artist ' + artist + ' . . . ')
        paintingsIn12.append(len(rd.sameArtist(artist, cols, imageList=train_1_names))
                            +len(rd.sameArtist(artist, cols, imageList=train_2_names)))
    # artists ranked by the number of available paintings, most prolific first
    artistTable12 = sorted(zip(distinctArtists, paintingsIn12),
                           key=lambda item: item[1], reverse=True)
    print('top 10 artists . . . ')
    print(artistTable12[:10])

    # lists of paintings by the 10 most prolific artists in train_1 and train_2
    print('\nscanning for paintings by top ten in zip files . . .')
    leaders = [a[0] for a in artistTable12[:10]]
    portfolios1 = []
    portfolios2 = []
    for artist in leaders:
        print('    scanning artist ' + artist + ' . . . ')
        # paintings in train_1.zip
        portfolios1.append(rd.sameArtist(artist, cols, imageList=train_1_names))
        # paintings in train_2.zip
        portfolios2.append(rd.sameArtist(artist, cols, imageList=train_2_names))
    # list of artists and (separate) lists of paintings in train_1 and train_2.
    # Materialised as a list because it is iterated three times below; a bare
    # zip() is a one-shot iterator on Python 3 and the miniature/cutout loops
    # would then silently do nothing.
    portfolios = list(zip(leaders,portfolios1,portfolios2))
    # list of artists and single (merged) list of paintings
    portfolios_uni = [(artist, p1 + p2) for (artist,p1,p2) in portfolios]

    # create miniatures of the paintings by most prolific artists and save them to disk
    # (if file exists, does nothing)
    print('\ncreating miniatures (if necessary) . . . ')
    for portfolio in portfolios:
        print('    artist ' + portfolio[0] + ' . . . ')
        ri.miniatures('../Data/train_1.zip',
                      portfolio[1],prefix='train_1',size=(100,100))
        ri.miniatures('../Data/train_2.zip',
                      portfolio[2],prefix='train_2',size=(100,100))
    # create cutouts of the paintings by most prolific artists and save them to disk
    # (if file exists, does nothing)
    print('\ncreating cutouts (if necessary) . . . ')
    for portfolio in portfolios:
        print('    artist ' + portfolio[0] + ' . . . ')
        ri.cutouts('../Data/train_1.zip',
                   portfolio[1],prefix='train_1',size=(100,100))
        ri.cutouts('../Data/train_2.zip',
                   portfolio[2],prefix='train_2',size=(100,100))

    # create json text file with list of artists and filenames of paintings
    print('\nwriting json index file . . .')
    with open(portfoliosFile,'w') as jsonfile:
        jsonfile.write(json.dumps(portfolios_uni,indent=2))

## Generate random pairs of paintings by the same and by different artists

In [None]:
# Build the labelled sample of painting pairs:
#   entries 0 .. npairs-1         two paintings by the same artist   (label 1)
#   entries npairs .. 2*npairs-1  paintings by two different artists (label 0)
# Later cells rely on this ordering (sames first, diffs last).
import random
npairs = 1000
randomSeed = 667
random.seed(randomSeed)
pairs = []
# same artist: pick an artist at random, then two of that artist's paintings
for isample in xrange(npairs):
    first, second = random.sample(random.choice(portfolios_uni)[1], 2)
    pairs.append([first, second, 1])
# different artists: pick two artists at random, then one painting from each
for isample in xrange(npairs):
    artistA, artistB = random.sample(portfolios_uni, 2)
    pairs.append([random.choice(artistA[1]), random.choice(artistB[1]), 0])

In [None]:
# preview the first eight pairs: [painting, painting, label]
# (the first npairs entries are same-artist pairs, label 1)
pairs[:8]

# Generate colour features

In [None]:
# Reduce every miniature to the 16-colour CGA palette and record, for each
# painting, the pixel count of every palette colour.
# Results are cached in two json files: the pixel counts (`data`) and the
# matching list of painting filenames (`names`); data[i] belongs to names[i].
import PBNFeatures.paletteTools as pt
import os.path
import json
from PIL import Image
featureDir = '../Data/'
nc = 16
palette = pt.CGApalette(ncolours=nc)
cgaColoursFile = featureDir + 'cgacolours_top10.json'
namesFile = featureDir + 'names_top10.json'
# Use the cache only when BOTH files are present: the branch below reads both,
# so checking the colours file alone would crash on a missing names file
# instead of simply recomputing.
if (os.path.isfile(cgaColoursFile) and os.path.isfile(namesFile)):
    with open(cgaColoursFile,'r') as jsonfile:
        data = json.load(jsonfile)
        print('reading CGA Colours file . . .')
    with open(namesFile,'r') as jsonfile:
        names = json.load(jsonfile)
        print('reading names file . . .')
else:
    # load all miniatures and convert each to the CGA palette (slow),
    # save filenames and pixel-counts by colour in two lists
    data = []
    names = []
    for portfolio in portfolios_uni:
        print('processing paintings by artist ' + portfolio[0] + ' . . .')
        for painting in portfolio[1]:
            names.append(painting)
            mininame = os.path.splitext(painting)[0] + '_mini_100_x_100.jpg'
            mini = Image.open(featureDir + 'FeatureData/' + mininame)
            minip = pt.paletteConvert(mini,palette)
            # getcolors() lists only the colours that occur; completeColours
            # presumably pads the list to all nc colours -- confirm in paletteTools
            colours = pt.completeColours(minip.getcolors(),nc)[:nc]
            # keep the first field of every entry (the pixel count)
            data.append([c[0] for c in colours])
    # write the cga colour data and the filenames to their json files
    print('\nwriting json index file . . .')
    with open(cgaColoursFile,'w') as jsonfile:
        print('writing CGA colours file . . .')
        jsonfile.write(json.dumps(data,indent=2))
    with open(namesFile,'w') as jsonfile:
        print('writing names file . . .')
        jsonfile.write(json.dumps(names,indent=2))

In [None]:
# load a single pair of miniatures
def loadPair(pair, mc = 'mini'):
    """Open the two 100 x 100 feature images that belong to one pair.

    pair -- sequence whose first two entries are painting filenames
    mc   -- image variant used in the file name: 'mini' (default) or 'cutout'

    Returns the two opened images as a tuple, in the order of the pair.
    The files are read from '../Data/FeatureData/' and are named
    '<painting name without extension>_<mc>_100_x_100.jpg'.
    """
    import os.path
    import PIL.Image as Image
    featureDir = '../Data/FeatureData/'

    def openVariant(painting):
        # swap the original extension for the variant suffix and open the file
        stem = os.path.splitext(painting)[0]
        return Image.open(featureDir + stem + '_' + mc + '_100_x_100.jpg')

    return openVariant(pair[0]), openVariant(pair[1])

In [None]:
# load a pair by the same artist
pairnum = 999
mini1, mini2 = loadPair(pairs[pairnum])
# show (full) images
#showImage(pairs[pairnum][0])
#showImage(pairs[pairnum][1])


# compare colour distribution in simplified palette between
# members of the pair
%matplotlib inline
import matplotlib.pyplot as plt
palette = pt.CGApalette(ncolours=16)

mini1p = pt.paletteConvert(mini1,palette)
mini2p = pt.paletteConvert(mini2,palette)

pt.plotColourDistribution(mini1p.getcolors(),pt.unflatten(palette))
pt.plotColourDistribution(mini2p.getcolors(),pt.unflatten(palette))

plt.show()

In [None]:
# repeat for a pair by different artists
# (indices from npairs upwards are different-artist pairs);
# `palette` and `plt` come from the previous cell
pairnum = 1001
mini1, mini2 = loadPair(pairs[pairnum])
# show (full) images
#showImage(pairs[pairnum][0])
#showImage(pairs[pairnum][1])

mini1p, mini2p = [pt.paletteConvert(mini,palette) for mini in (mini1, mini2)]

for converted in (mini1p, mini2p):
    pt.plotColourDistribution(converted.getcolors(),pt.unflatten(palette))

plt.show()

## Pairwise comparisons based on CGA data

In [None]:
# for all pairs in "pairs", compute distances in colour space
import math
distance = []
for pair in pairs:
    inds = names.index(pair[0]), names.index(pair[1])
    distance.append(sum((x1-data[inds[1]][i])**2 for i,x1 in enumerate(data[inds[0]])))
print('mean distance for sames, diffs:')
print(np.mean(distance[:npairs]), np.mean(distance[-npairs:]))
print('\n')

# plot Euclidean distance in pixel-count space for
# pairs that are by same artist (blue) and
# pairs that are by different artists (red)
# (not very exciting!)
%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),distance[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),distance[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Distance in 16-colour pixel-count space\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

# repeat with a different norm ... (but which?)
# what about just rank difference by colour
distance2 = []
ncol = len(data[0])
for pair in pairs:
    inds = names.index(pair[0]), names.index(pair[1])
    ranks1 = [aa[1] for aa in sorted(zip(data[inds[0]],range(ncol)))]
    ranks2 = [aa[1] for aa in sorted(zip(data[inds[1]],range(ncol)))]
    distance2.append(sum((r - ranks2[i])**2 for i,r in enumerate(ranks1)))
print('mean distance for sames, diffs:')
print(np.mean(distance2[:npairs]), np.mean(distance2[-npairs:]))
print('\n')

# plot rank distance in pixel-count space for
# pairs that are by same artist (blue) and
# pairs that are by different artists (red)
# (not exciting at all!!)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),distance2[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),distance2[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Distance in 16-colour pixel-count \"rank\" space\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

# repeat with yet another different norm ... (but which?)
# what about on/off switch if colour is more than 5%
distance3 = []
ncol = len(data[0])
thresh = 0.05*sum(data[0])
for pair in pairs:
    inds = names.index(pair[0]), names.index(pair[1])
    onoff1 = [int(x>thresh) for x in data[inds[0]]]
    onoff2 = [int(x>thresh) for x in data[inds[1]]]
    distance3.append(sum((r - onoff2[i])**2 for i,r in enumerate(onoff1)))
print('mean distance for sames, diffs:')
print(np.mean(distance3[:npairs]), np.mean(distance3[-npairs:]))
print('\n')

# plot distance in threshold space for
# pairs that are by same artist (blue) and
# pairs that are by different artists (red)
# (slightly more exciting :-())
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),distance3[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),distance3[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Distance in 16-colour pixel-count number-of-colours space\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

# repeat with yet another different norm ... (but which?)
# what about on/off switch if colour is more than 5%
# but this time count only common colours
distance4 = []
ncol = len(data[0])
thresh = 0.05*sum(data[0])
for pair in pairs:
    inds = names.index(pair[0]), names.index(pair[1])
    onoff1 = [int(x>thresh) for x in data[inds[0]]]
    onoff2 = [int(x>thresh) for x in data[inds[1]]]
    distance4.append(sum(int(r+onoff2[i]==2) for i,r in enumerate(onoff1)))
print('mean distance for sames, diffs:')
print(np.mean(distance4[:npairs]), np.mean(distance4[-npairs:]))
print('\n')

# plot distance in threshold space for
# pairs that are by same artist (blue) and
# pairs that are by different artists (red)
# (slightly more exciting :-())
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),distance4[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),distance4[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Distance in number-of-colours-in-comon space\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

In [None]:
# What is actually going on here????
# Let's look at particular pairs: print, for one pair, the raw inputs of each
# measure next to the value computed in the previous cell
# (uses ncol and thresh from that cell)
pairnum = 7
inds = tuple(names.index(name) for name in pairs[pairnum][:2])
counts1, counts2 = data[inds[0]], data[inds[1]]

# show (full) images
#showImage(pairs[pairnum][0])
#showImage(pairs[pairnum][1])

# distance1: pixel counts side by side
print('Euclidean')
print(zip(counts1, counts2))
print(distance[pairnum])

# distance2: colour indices ordered by ascending pixel count
print('\nRank')
ranks1 = [colour for count, colour in sorted(zip(counts1, range(ncol)))]
ranks2 = [colour for count, colour in sorted(zip(counts2, range(ncol)))]
print(zip(ranks1, ranks2))
print(distance2[pairnum])


# distance3: on/off switches (colour has more than thresh pixels)
print('\nThreshold')
onoff1 = [int(count > thresh) for count in counts1]
onoff2 = [int(count > thresh) for count in counts2]
print(zip(onoff1, onoff2))
print(distance3[pairnum])

# distance4: same switches, only colours that are on in both are counted
print('\nCommon Threshold')
print(zip(onoff1, onoff2))
print(distance4[pairnum])

## Pairwise comparisons based on HSV data

In [None]:
# maybe hue/saturation/value is more interesting!!!
import PBNFeatures.cylindrical as cyl
reload(cyl)

pairnum=999

# show (full) images
#showImage(pairs[pairnum][0])
#showImage(pairs[pairnum][1])

# load miniatures
mini1,mini2 = loadPair(pairs[pairnum],mc='mini')

# compute hue/saturation/value statistics (8 hue bins) for both miniatures
hsvstats1, hsvstats2 = [cyl.hsv_stats(cyl.jpg_to_hsv(mini),hue_bins=8)
                        for mini in (mini1, mini2)]

# histograms of hue distribution
# (module reloaded first, to pick up edits to the plotting code)
reload(cyl)
for stats in (hsvstats1, hsvstats2):
    cyl.plotHueDistribution(stats['hue_bins'])

# hsvstats1, hsvstats2

In [None]:
# If they exist, load the json text files containing the hsv stats of the
# paintings by the 10 most prolific artists (contained in train_1.zip and
# train_2.zip) and the matching list of painting filenames.
#
# Otherwise process all miniatures and write the json files.
#
# Result: hsvstats[i] holds the statistics of painting names[i].
#
import PBNFeatures.cylindrical as cyl
from PIL import Image
import os.path
import json
featureDir = '../Data/'
hsvstatsFile = featureDir + 'hsvstats_top10.json'
namesFile = featureDir + 'names_top10.json'
# Use the cache only when BOTH files are present: the branch below reads both,
# so checking the stats file alone would crash on a missing names file
# instead of simply recomputing.
if (os.path.isfile(hsvstatsFile) and os.path.isfile(namesFile)):
    with open(hsvstatsFile,'r') as jsonfile:
        hsvstats = json.load(jsonfile)
        print('reading hsvstats file . . .')
    with open(namesFile,'r') as jsonfile:
        names = json.load(jsonfile)
        print('reading names file . . .')
else:
    # load all miniatures and compute saturation, value and hue
    # statistics (16 hue bins) for each (slow),
    # save filenames and statistics in two lists
    hsvstats = []
    names = []
    for portfolio in portfolios_uni:
        print('processing paintings by artist ' + portfolio[0] + ' . . .')
        for painting in portfolio[1]:
            names.append(painting)
            mininame = os.path.splitext(painting)[0] + '_mini_100_x_100.jpg'
            mini = Image.open(featureDir + 'FeatureData/' + mininame)
            hsvstats.append(cyl.hsv_stats(cyl.jpg_to_hsv(mini),hue_bins=16))
    # write the hsv stats and the filenames to their json files
    print('\nwriting json index file . . .')
    with open(hsvstatsFile,'w') as jsonfile:
        print('writing hsvstats file . . .')
        jsonfile.write(json.dumps(hsvstats,indent=2))
    with open(namesFile,'w') as jsonfile:
        print('writing names file . . .')
        jsonfile.write(json.dumps(names,indent=2))

In [None]:
# let's look at some hsv statistics for a set of paintings
# (whole population or a single painter, or whatever)
%matplotlib inline
def hsvStatsPlots(hsvst, nbins=20, hue_thresh=1000, nhues=16):
    """Plot the distribution of hsv statistics over a set of paintings.

    hsvst      -- list of per-painting statistics; each entry is a mapping with
                  the keys 'sat_mean', 'sat_std', 'val_mean', 'val_std' and
                  'hue_bins' (a list of (hue bin, pixel count) entries whose
                  first entry is treated as the dominant hue -- presumably the
                  list is sorted by descending count, confirm in cyl.hsv_stats)
    nbins      -- number of bins of the five histograms
    hue_thresh -- a hue bin counts as "important" for a painting when it holds
                  more than this many pixels
    nhues      -- number of hue bins the statistics were computed with

    Shows seven figures: histograms of mean saturation, saturation std, mean
    value, value std and pixel count of the dominant hue, a bar plot of the
    number of important hue bins, and the occurrence of each dominant hue.
    """
    import matplotlib.pyplot as plt

    sat_means = [h['sat_mean'] for h in hsvst]
    sat_stds = [h['sat_std'] for h in hsvst]
    val_means = [h['val_mean'] for h in hsvst]
    val_stds = [h['val_std'] for h in hsvst]
    dom_hue = [h['hue_bins'][0][0] for h in hsvst]
    dom_hue_count = [h['hue_bins'][0][1] for h in hsvst]
    hue_variety = [sum(int(hue[1]>hue_thresh) for hue in h['hue_bins']) for h in hsvst]

    # histograms: (values, x-axis label, bar colour)
    # NOTE: `normed` was removed in matplotlib 3.1; use density=True when
    # porting this notebook to a current matplotlib.
    histograms = [
        (sat_means, 'Mean saturation', 'green'),
        (sat_stds, 'Saturation std', 'red'),
        (val_means, 'Mean value (brightness)', 'blue'),
        (val_stds, 'Value (brightness) std', 'green'),
        (dom_hue_count, 'Pixels of dominant hue', 'red'),
    ]
    for values, label, colour in histograms:
        plt.hist(values, bins=nbins, normed=True, facecolor=colour, alpha=0.75)
        plt.xlabel(label)
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.show()

    # barplot: how many paintings have 0, 1, 2, ... important hue bins
    tab = rd.table(hue_variety)
    plt.xlabel('Number of important hue bins (out of ' + str(nhues) + ')')
    plt.bar([t[0] for t in tab], [t[1] for t in tab], width=0.8)
    plt.show()

    # how often each hue bin is the dominant one
    tab = rd.table(dom_hue)
    cyl.plotHueDistribution(tab, nhues=nhues)
    plt.title('Occurrence of dominant hues')
    plt.ylabel('occurrence')
    plt.xlim([0,nhues])
    plt.show()

In [None]:
# stats for full population: all paintings of all ten artists
hsvStatsPlots(hsvstats, nbins=20)

In [None]:
# painter 2: the same plots restricted to the paintings of the artist at
# index 2 of portfolios_uni (fewer histogram bins, as the sample is smaller)
painterNum = 2
paintings = portfolios_uni[painterNum][1]
# pick this artist's statistics out of the full list via the filename index
hsvst = [hsvstats[names.index(p)] for p in paintings]
hsvStatsPlots(hsvst, nbins=10)

In [None]:
# painter 3: the same plots restricted to the paintings of the artist at
# index 3 of portfolios_uni (fewer histogram bins, as the sample is smaller)
painterNum = 3
paintings = portfolios_uni[painterNum][1]
# pick this artist's statistics out of the full list via the filename index
hsvst = [hsvstats[names.index(p)] for p in paintings]

hsvStatsPlots(hsvst, nbins=10)

In [None]:
# for all pairs in "pairs", compute difference in 
# saturation and value means and std and the
# distance in dominant hue
import math
sat_mean_dist = []
sat_std_dist = []
val_mean_dist = []
val_std_dist = []
total_hue_dist = []
dom_hue_dist = []
dom_hue_count_dist = []
hue_variety_dist = []
nbins = len(hsvstats[0]['hue_bins'])
hue_ave = sum(pix for i,pix in hsvstats[0]['hue_bins'])/nbins

for pair in pairs:
    inds = names.index(pair[0]), names.index(pair[1])
    sat_mean_dist.append( \
                abs(hsvstats[inds[0]]['sat_mean'] - hsvstats[inds[1]]['sat_mean']))
    sat_std_dist.append( \
                abs(hsvstats[inds[0]]['sat_std'] - hsvstats[inds[1]]['sat_std']))
    val_mean_dist.append( \
                abs(hsvstats[inds[0]]['val_mean'] - hsvstats[inds[1]]['val_mean']))
    val_std_dist.append( \
                abs(hsvstats[inds[0]]['val_std'] - hsvstats[inds[1]]['val_std']))
    hues1 = cyl.completeSortHues(hsvstats[inds[0]]['hue_bins'],nhues=16)
    hues2 = cyl.completeSortHues(hsvstats[inds[1]]['hue_bins'],nhues=16)
    total_hue_dist.append( \
                sum((x1[1]-hues2[i][1])**2 for i,x1 in enumerate(hues1)))
    dom_hue_dist.append( \
                min( \
                    (hsvstats[inds[0]]['hue_bins'][0][0] - \
                     hsvstats[inds[1]]['hue_bins'][0][0]) % nbins, \
                    (hsvstats[inds[1]]['hue_bins'][0][0] - \
                     hsvstats[inds[0]]['hue_bins'][0][0]) % nbins) )
    dom_hue_count_dist.append( \
                abs(hsvstats[inds[0]]['hue_bins'][0][1] - \
                    hsvstats[inds[1]]['hue_bins'][0][1]) )
    # number of hue bins with more than the average number of pixels
    hue_variety_dist.append( \
                abs(sum(int(h[1]>1000) \
                        for h in hsvstats[inds[0]]['hue_bins']) - \
                    sum(int(h[1]>1000) \
                        for h in hsvstats[inds[1]]['hue_bins']) ) )

print('mean distances for sames, diffs:\n')
print('sat_mean:')
print(np.mean(sat_mean_dist[:npairs]), np.mean(sat_mean_dist[-npairs:]))
print('sat_std:')
print(np.mean(sat_std_dist[:npairs]), np.mean(sat_std_dist[-npairs:]))
print('val_mean:')
print(np.mean(val_mean_dist[:npairs]), np.mean(val_mean_dist[-npairs:]))
print('val_std:')
print(np.mean(val_std_dist[:npairs]), np.mean(val_std_dist[-npairs:]))
print('total_hue_dist:')
print(np.mean(total_hue_dist[:npairs]), np.mean(total_hue_dist[-npairs:]))
print('dom_hue:')
print(np.mean(dom_hue_dist[:npairs]), np.mean(dom_hue_dist[-npairs:]))
print('dom_hue_count:')
print(np.mean(dom_hue_count_dist[:npairs]), np.mean(dom_hue_count_dist[-npairs:]))
print('hue_variety:')
print(np.mean(hue_variety_dist[:npairs]), np.mean(hue_variety_dist[-npairs:]))
print('\n')

# plot Euclidean distance in hsv-stats space for
# pairs that are by same artist (blue) and
# pairs that are by different artists (red)
# (still not very exciting!)
%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),sat_mean_dist[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),sat_mean_dist[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Difference in mean-saturation\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),val_mean_dist[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),val_mean_dist[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Difference in mean-value (brightness)\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),total_hue_dist[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),total_hue_dist[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Eucl. dist. between hue distributions (16 bins)\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),dom_hue_dist[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),dom_hue_dist[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Difference in dominant hue (1 of 16 bins)\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),dom_hue_count_dist[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),dom_hue_count_dist[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Difference in pixels in the dominant hue (1 of 16 bins)\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

fig = plt.figure()
ax = plt.axes()
plt.plot(range(npairs),hue_variety_dist[:npairs],linestyle='none',color='blue',marker='.')
plt.plot(range(npairs,2*npairs),hue_variety_dist[-npairs:],linestyle='none',color='red',marker='.')
plt.title('Difference in number of significant hues (of 16 bins)\n' \
        +'same painter (blue), different painter (red)')
plt.xlabel('pair number')
plt.show()

## Generate texture features

In [None]:
import numpy as np
import PBNFeatures.textureTools as tt
reload(tt)
reload(cyl)

pairnum = 500

# load cutouts
cutout1,cutout2 = loadPair(pairs[pairnum],mc='cutout')
print(cutout1.size)

# display cutout1 as loaded
cutout1.show()

# compute tuple of hue/saturation/value for each pixel
hsvlist1 = cyl.jpg_to_hsv(cutout1)

# smooth the value channel three times in a row; after every pass
# rebuild the image from the smoothed data and display it
for ipass in range(3):
    hsvlist1 = tt.defocusValue(hsvlist1)
    cutout1.putdata(cyl.hsv_to_rgb(hsvlist1))
    cutout1.show()


In [None]:
# If they exist, load the json text files containing the focus stats of the
# paintings by the 10 most prolific artists (contained in train_1.zip and
# train_2.zip) and the matching list of painting filenames.
#
# Otherwise process all cutouts and write the json files.
#
# Result: focusstats[i] holds the statistics of painting names[i].
#
import PBNFeatures.cylindrical as cyl
import PBNFeatures.textureTools as tt
from PIL import Image
import os.path
import json
featureDir = '../Data/'
focusstatsFile = featureDir + 'focusstats_top10.json'
namesFile = featureDir + 'names_top10.json'
# Use the cache only when BOTH files are present: the branch below reads both,
# so checking the stats file alone would crash on a missing names file
# instead of simply recomputing.
if (os.path.isfile(focusstatsFile) and os.path.isfile(namesFile)):
    with open(focusstatsFile,'r') as jsonfile:
        focusstats = json.load(jsonfile)
        print('reading focusstats file . . .')
    with open(namesFile,'r') as jsonfile:
        names = json.load(jsonfile)
        print('reading names file . . .')
else:
    # load all cutouts and compute the sharpness and the change in
    # sharpness after smoothing twice each with window=1 and with window=2
    # (presumably the half-width, i.e. a 3-pixel and a 5-pixel window --
    # confirm in tt.defocusValue)
    #
    # save filenames and focus stats in two lists
    focusstats = []
    names = []
    for portfolio in portfolios_uni:
        print('processing paintings by artist ' + portfolio[0] + ' . . .')
        for painting in portfolio[1]:
            names.append(painting)
            cutoutname = os.path.splitext(painting)[0] + '_cutout_100_x_100.jpg'
            cutout = Image.open(featureDir + 'FeatureData/' + cutoutname)
            # convert image data to hsvlist
            hsvlist = cyl.jpg_to_hsv(cutout)
            # sharpness of the unsmoothed cutout: first entry of both series
            sharp0 = tt.focusDetect(hsvlist,size=cutout.size)
            sharp1 = [sharp0]
            sharp2 = [sharp0]
            # per window: start again from the unsmoothed data, smooth twice
            # and record the sharpness after each pass
            for window, sharp in ((1, sharp1), (2, sharp2)):
                hsvlist_s = hsvlist
                for ipass in range(2):
                    hsvlist_s = tt.defocusValue(hsvlist_s,size=cutout.size,window=window)
                    sharp.append(tt.focusDetect(hsvlist_s,size=cutout.size))
            # add focus stats to list
            focusstats.append({'sharp1': sharp1, 'sharp2': sharp2})
    # write the focus stats and the filenames to their json files
    print('\nwriting json index file . . .')
    with open(focusstatsFile,'w') as jsonfile:
        print('writing focusstats file . . .')
        jsonfile.write(json.dumps(focusstats,indent=2))
    with open(namesFile,'w') as jsonfile:
        print('writing names file . . .')
        jsonfile.write(json.dumps(names,indent=2))


# Logistic Regression

In [None]:
# Let's try a logistic regression using a few of the features
# that look a tiny bit promising
import numpy as np
import Predictors.logRegres as lr
reload(lr)
# one row per pair, one column per feature; the leading column of ones
# is the intercept term
featureMat = np.matrix([
                        len(distance)*[1.0],
                        distance,
                        distance3,
                        sat_mean_dist,
                        sat_std_dist,
                        val_mean_dist,
                        total_hue_dist,
                        dom_hue_dist,
                       ]).transpose()
# 1 = same artist, 0 = different artists
classLabels = [pair[2] for pair in pairs]

# partition features and classes into training
# and testing sets
splitRatio = 0.8
# number of training pairs taken from EACH class
ntrain = int(0.5*splitRatio*len(featureMat))

# training set: the first ntrain pairs (same artist) and the last ntrain
# pairs (different artists)
featureTrain = np.concatenate((featureMat[:ntrain],featureMat[-ntrain:]),axis=0)
classTrain = np.concatenate((classLabels[:ntrain],classLabels[-ntrain:]),axis=0)

# testing set: everything in between, so that test row i is pairs[ntrain + i]
featureTest = featureMat[ntrain:-ntrain]
classTest = classLabels[ntrain:-ntrain]


In [None]:
# standardize (centre and normalize) training data; the intercept column 0
# is left out.
# NOTE(review): the return values are not written back, so both calls
# presumably modify the matrices in place -- confirm in Predictors.logRegres.
# If so, re-running this cell standardizes already standardized data.
obj = lr.standardize(featureTrain[:,1:])
# apply same transformation to testing data
lr.standardize_predict(featureTest[:,1:],obj)

# train logistic regression model
weights = lr.gradAscent(featureTrain,classTrain,alpha=0.001,iterations=200,report=75)
# print out weights
weights

In [None]:
# in-sample predictions: confusion matrix at a decision threshold of 0.5
reload(lr)
predictTrain = lr.predict(featureTrain,weights)
trainConfusion = lr.confusion(classTrain,predictTrain,0.5)
print(trainConfusion)
# accuracy = entries on the diagonal of the confusion matrix / all entries
trainAccuracy = float(np.trace(trainConfusion))/np.sum(trainConfusion)
print('\nin-sample accuracy: ' + str(trainAccuracy))

In [None]:
# out-of-sample predictions: confusion matrix at a decision threshold of 0.5
predictTest = lr.predict(featureTest,weights)
testConfusion = lr.confusion(classTest,predictTest,0.5)
print(testConfusion)
# accuracy = entries on the diagonal of the confusion matrix / all entries
testAccuracy = float(np.trace(testConfusion))/np.sum(testConfusion)
print('\nout-of-sample accuracy: ' + str(testAccuracy))

In [None]:
# maximum confidence prediction same artist:
# the test pair with the highest predicted value
mostSame = np.argmax(predictTest)
print((mostSame, max(predictTest)))
print(featureTest[mostSame])

# test row i is pair ntrain + i of the full list
for painting in pairs[ntrain+mostSame][:2]:
    showImage(painting)

In [None]:
# maximum confidence prediction different artist:
# the test pair with the lowest predicted value
mostDifferent = np.argmin(predictTest)
print((mostDifferent, min(predictTest)))
print(featureTest[mostDifferent])

# test row i is pair ntrain + i of the full list
for painting in pairs[ntrain+mostDifferent][:2]:
    showImage(painting)

In [None]:
%matplotlib inline
rocObj = lr.ROC(classTrain,predictTrain, step=0.1)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.axes()
plt.xlabel('False Positive')
plt.ylabel('True Positive')
plt.plot(rocObj['fpr'],rocObj['tpr'])
plt.plot([0,1],[0,1],'r--')
plt.show()

In [None]:
# compute PCA for 16-colour CGA colour distributions in miniatures
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import PBNFeatures.paletteTools as pt
import PBNPCA.pca as pbnpca
import os.path
import PIL.Image as Image
#import math
palette = pt.CGApalette(ncolours=16)
# number of palette colours; the palette is presumably a flat list of R,G,B
# values (cf. pt.unflatten). Floor division keeps nc an integer under true
# division (Python 3 / `from __future__ import division`) as well.
nc = len(palette)//3
featureDir = '../Data/FeatureData/'

In [None]:
# compute the PC of data (the CGA pixel counts, one row per painting)
reload(pbnpca)
ncomp = 15
pobj = pbnpca.pca(np.array(data),ncomp)

# pcs: one entry per column of pobj['eigvecs'], taken from the last column
# down to the first. Each column is turned into a "colours" list of
# (component value, colour index) entries so that it can be drawn with
# pt.plotColourDistribution (clumsy but should be okay)
pcs = [[(vec[col], icolour) for icolour, vec in enumerate(pobj['eigvecs'])]
       for col in xrange(ncomp-1,-1,-1)]

In [None]:
# plot leading eigenvectors (PC): the first six entries of pcs
for ipc in range(6):
    pt.plotColourDistribution(pcs[ipc],pt.unflatten(palette))
plt.show()

In [None]:
# project row 750 of data (one painting's colour counts) onto the PC and show
# the last eight entries of the result (presumably the coefficients of the
# leading components, cf. the reversed column order used for pcs -- confirm)
pbnpca.pcaProject(data[750],pobj)[-8:]

In [None]:
# compare full vector and projection onto first 8:
# the raw colour counts of painting 750, followed by the result of
# pbnpca.pcaTrunc for 8 components (presumably the reconstruction -- confirm)
print(data[750])
print('\n')
print(pbnpca.pcaTrunc(data[750],pobj,8))

# leftover experiments:
#np.zeros((len(data[0]),1))
#np.array(pobj['meanvec']).reshape((len(data[0]),1))

In [None]:
# ideas for other features ...
#    saturation, lightness
#    texture, edges, dots
#    size
#    number of significant colours
# neural network idea?  (Auto-encoder-decoder)
# topic (=style) modelling

# contrast detection, "focus and sharpness"