In [3]:
# Packages that we use 
from __future__ import division
import json as js
import pandas as pd
import numpy as np
import webcolors
import time
from PIL import Image as pil
import urllib.request as urllib
import io
from matplotlib import colors as mcolors
import math as mt

# run this command too - just to allow more data to be displayed than default
pd.set_option('display.max_rows', 50)
# this one ensures graphs properly display in the notebook
%matplotlib inline



In [4]:
# %load ./colorgram/__init__.py

# colorgram.py, a module to extract colors from images.
# Based on Jan Forst's original JavaScript version.

from __future__ import absolute_import



# The following code is taken from the colorgram package and adapted to our specific needs. It is further optimized because of the large amount of data that is going to be analysed. This code extracts the N most prevalent colors of a picture in RGB format, where N is defined by the user.

https://github.com/obskyr/colorgram.py

In [5]:
# %load ./colorgram/colorgram.py


from __future__ import unicode_literals
from __future__ import division

import array
from collections import namedtuple
from PIL import Image

import sys
# Python 2/3 compatibility shim: choose the array typecode literal expected
# by the running interpreter and, on Python 2 only, alias ``range`` to ``xrange``.
if sys.version_info[0] > 2:
    ARRAY_DATATYPE = 'l'
else:
    range = xrange
    ARRAY_DATATYPE = b'l'

class Color(object):
    """One extracted colour: an ``(r, g, b)`` tuple plus its share of pixels.

    ``proportion`` is stored exactly as given; ``__repr__`` shows it
    multiplied by 100, i.e. as a percentage.
    """

    def __init__(self, r, g, b, proportion):
        self.proportion = proportion
        self.rgb = (r, g, b)

    def __repr__(self):
        rgb_text = str(self.rgb)
        percent_text = str(self.proportion * 100)
        return "<colorgram.py Color: {}, {}%>".format(rgb_text, percent_text)
    
def extract(f, number_of_colors):
    """Download an image and return its most prevalent colours.

    Parameters
    ----------
    f : str
        URL of the image (anything ``urllib.urlopen`` accepts).
    number_of_colors : int
        Maximum number of colours to return.

    Returns
    -------
    list of Color
        The result of ``get_colors`` for the buckets of the image, ordered
        by decreasing pixel count.

    Download and decoding errors are not handled here; they propagate to
    the caller.
    """
    # Close the HTTP response as soon as the bytes are read: this function is
    # called once per artwork, so an unclosed response leaks one connection
    # per image.
    with urllib.urlopen(f) as response:
        payload = response.read()
    image = Image.open(io.BytesIO(payload))
    image = image.convert('RGB')
    samples = sample(image)
    used = pick_used(samples)
    # Most populated buckets first.
    used.sort(key=lambda x: x[0], reverse=True)
    return get_colors(samples, used, number_of_colors)

def sample(image):
    """Accumulate per-bucket RGB sums and pixel counts for ``image``.

    A pixel's bucket is chosen from the top two bits of its luminance only
    (no hue or lightness bits are packed in this adapted version), so at
    most four buckets are ever populated.

    Parameters
    ----------
    image : object
        Must expose ``size`` as ``(width, height)`` and ``load()`` returning
        a pixel accessor indexable as ``[x, y]``; the first three components
        of every pixel are read as r, g, b.

    Returns
    -------
    array.array
        Flat array of ``(1 << 2) ** 7`` integers. A bucket starting at offset
        ``i`` holds ``[sum_r, sum_g, sum_b, pixel_count]`` in ``i .. i + 3``.
    """
    luminance_mask = 0b11000000

    side = 1 << 2  # Left by the number of bits used.
    slot_total = side ** 7

    samples = array.array(ARRAY_DATATYPE, [0] * slot_total)
    width, height = image.size

    pixels = image.load()
    for row in range(height):
        for col in range(width):
            r, g, b = pixels[col, row][:3]
            # Standard constants for converting RGB to relative luminance.
            Y = int(r * 0.2126 + g * 0.7152 + b * 0.0722)

            # Move the two luminance bits (bits 7-8) up by four positions,
            # then scale by 4 because each bucket owns four consecutive slots.
            offset = ((Y & luminance_mask) << 4) * 4
            samples[offset] += r
            samples[offset + 1] += g
            samples[offset + 2] += b
            samples[offset + 3] += 1
    return samples

def pick_used(samples):
    """List the non-empty buckets of a flat ``samples`` array.

    ``samples`` is read in groups of four slots; the fourth slot of each
    group is the pixel count. Returns ``(count, offset)`` tuples, in offset
    order, for every group whose count is non-zero.
    """
    return [
        (samples[offset + 3], offset)
        for offset in range(0, len(samples), 4)
        if samples[offset + 3]
    ]

def get_colors(samples, used, number_of_colors):
    """Turn the first ``number_of_colors`` buckets of ``used`` into colours.

    Parameters
    ----------
    samples : sequence of int
        Flat bucket array; offset ``i`` holds the r, g, b sums in
        ``i .. i + 2``.
    used : list of (count, offset)
        Buckets to consider, in the order they should be returned.
    number_of_colors : int
        Upper bound on the number of colours returned.

    Returns
    -------
    list of Color
        Each colour is the integer average of its bucket. ``proportion`` is
        the bucket's pixel count divided by the total count of the returned
        buckets only.
    """
    limit = min(number_of_colors, len(used))
    selected = used[:limit]

    palette = [
        Color(
            samples[offset] // count,
            samples[offset + 1] // count,
            samples[offset + 2] // count,
            count
        )
        for count, offset in selected
    ]

    total = sum(count for count, _ in selected)
    for entry in palette:
        entry.proportion /= total
    return palette


The function `distance` takes a pair of lists (vectors) with 3 elements each and returns the Euclidean distance between the two points that the vectors point to, as follows:

$distance([x_1,x_2,x_3],[y_1,y_2,y_3])=\sqrt{(x_1-y_1)^2+(x_2-y_2)^2+(x_3-y_3)^2}$

Under the RGB representation, the colors are all the points with integer coordinates in 3D Cartesian space that lie within a cube of side 256 placed in the first octant, with one corner at the origin.

This way, we can represent each color as a vector that is contained in the RGB cube.
Then, relying on a set of colors that we know (the `colorsRGB` variable), we can assign names to the colors based on their proximity to the ones we know.

In [6]:
# Home made functions
# Function Cartesian distance to use for color assignement

def distance(x = [0,0,0],y = [0,0,0]):
    '''Euclidean distance between the first three components of x and y.'''
    d0 = x[0] - y[0]
    d1 = x[1] - y[1]
    d2 = x[2] - y[2]
    return mt.sqrt(d0 ** 2 + d1 ** 2 + d2 ** 2)
    
def rgbtolist(rgb):
    '''Return the first three components of ``rgb`` as a new plain list.'''
    red, green, blue = rgb[0], rgb[1], rgb[2]
    return [red, green, blue]

def closestcolor(dist = 1000,x = [0,0,0]):
    '''Find the reference colour closest to ``x``.

    Parameters
    ----------
    dist : number
        Upper bound on the distance: a reference colour is only selected
        when its distance to ``x`` is strictly smaller than the best value
        seen so far, starting from ``dist``.
    x : sequence of 3 numbers
        The RGB colour to name.

    Returns
    -------
    (distance, name)
        The smallest distance found and the matching column label of the
        module-level ``colorsRGB`` data frame (one column per colour, rows
        r, g, b). When no column is strictly closer than ``dist``, ``dist``
        and the first column label are returned.
    '''
    # Column labels are read from colorsRGB itself, so the function does not
    # rely on a separate global list of names being kept in sync.
    best_dist = dist
    best_name = colorsRGB.columns[0]
    # Every column, including the first one, is a candidate; each distance is
    # computed once per column.
    for name, rgb in colorsRGB.items():
        candidate = distance(x, rgb.tolist())
        if best_dist > candidate:
            best_dist = candidate
            best_name = name
    return best_dist, best_name
# the central function of the color recognition algorithm, takes the url of an image and returns its main color palette.
# if unable to extract the colors, returns np.nan

def ColorRecognition(URL, min_proportion=0.08):
    '''Return the names of the main colours of the image found at ``URL``.

    Parameters
    ----------
    URL : str
        Address of the image; passed to ``extract`` together with the
        module-level ``main_color_num``.
    min_proportion : float, optional
        Extracted colours whose proportion is not strictly greater than this
        value are ignored. Defaults to 0.08.

    Returns
    -------
    list of str, or numpy.nan
        One reference colour name per retained colour, in extraction order.
        ``numpy.nan`` is returned when anything fails (download, decoding,
        lookup), so the function can be used with ``Series.apply`` without
        aborting the whole run.
    '''
    try:
        colors = extract(URL, main_color_num)
        # The first reference colour seeds the nearest-colour search.
        first_rgb = colorsRGB.loc[:, colorsRGB.columns[0]].tolist()
        names = []
        for color in colors:
            if color.proportion > min_proportion:
                x = rgbtolist(color.rgb)
                dist = distance(x, first_rgb)
                dist, name = closestcolor(dist, x)
                names.append(name)
        return names
    except Exception:
        # Best effort per image. Only ordinary errors are absorbed, so that
        # KeyboardInterrupt / SystemExit can still stop a long batch run.
        return np.nan


# Generating the reference color palette based on the CSS colors: originally there are 148 colors but, for optimization reasons, only 56 named colors are kept.

It is taken from matplotlib.

In [None]:
# Merge matplotlib's base colours and CSS4 colours into a single name -> value
# dictionary; at this point the CSS4 values are hex strings, e.g. #AB0A15.
colors = dict(mcolors.BASE_COLORS)
colors.update(mcolors.CSS4_COLORS)

In [None]:
# Replace every stored value, in place, by what mcolors.to_rgb returns for it.
for name in list(colors):
    color = colors[name]
    colors[name] = mcolors.to_rgb(color)

In [None]:
# Intended to declare the colour database (colorsRGB) and its names (cols) as globals.
# NOTE: a `global` statement at module (cell) level does not create or bind
# any variable; `cols` and `colorsRGB` only exist once some cell assigns them.
global cols
global colorsRGB

In [None]:
# requires matplotlib package to run
def generateColorPalette():
    '''Build the reference palette used to name extracted colours.

    Returns
    -------
    pandas.DataFrame
        One column per retained CSS4 colour name and three rows (r, g, b),
        with channel values rounded to whole numbers in the 0-255 range.
    '''
    # Only the named CSS4 colours are used. Matplotlib's eight single-letter
    # base colours are left out by construction instead of being dropped by
    # column position afterwards.
    rgb_by_name = {
        name: mcolors.to_rgb(value)
        for name, value in mcolors.CSS4_COLORS.items()
    }
    # storing them in a data frame
    colorsRGB = pd.DataFrame(data = rgb_by_name)
    # to_rgb yields channels in [0, 1]; scaling by 255 (not 256) maps 1.0 to
    # 255 and keeps every channel on its original 8-bit value.
    colorsRGB = (colorsRGB * 255).round(0)
    # The set is reduced from 148 to 56 colours in order to optimize the
    # colour detection on a large data set. The entries are positions in the
    # CSS4 colour ordering provided by matplotlib.
    keepcolors = [0,1,3,5,6,7,9,10,11,15,16,20,42,49,51,53,54,55,57,59,60,61,62,63,65,82,83,84,85,86,97,99,101,103,105,106,107,112,114,115,118,120,121,122,124,128,129,134,137,138,140,141,142,143,144,146]
    kept_names = colorsRGB.columns[keepcolors]
    # keeping only the RGB values we want and return
    return colorsRGB[kept_names]

In [None]:
colorsRGB = generateColorPalette()
# Expose the palette's colour names at module level: the `cols` assigned
# inside generateColorPalette is local to that function, so without this line
# the name does not exist on a fresh kernel.
cols = colorsRGB.columns

In [None]:
# Final color pallette
colorsRGB

# Reading the data to analyze

In [None]:
artworks = pd.read_json('CleanedDataJG.json')

In [None]:
artworks

In [None]:
# producing a sample data set for initial tests; the fixed seed makes the
# draw reproducible from one run of the notebook to the next
artworksSample = artworks.sample(frac = 0.001, axis = 0, random_state = 42)

In [None]:
artworksSample

# TEST ON THE SAMPLE DATA SET 

In [None]:
#number of colors to extract from each image
main_color_num = 6

# the %prun command returns the detailed times it took for each function call within the cell; it is initially used for  finding the time consuming operations in order to try to reduce it. 

In [None]:
# Running a test on a sample that constitutes 0.001 of the whole data set

%prun artworksSample['Colors'] = artworksSample['ThumbnailURL'].apply(ColorRecognition)

In [None]:
artworksSample

# Visual verification of the outputs on the sample data set

In [None]:
# Print the detected colour names of each sampled artwork, then open its thumbnail.
# Colours and URLs are paired by position, and each HTTP response is closed
# once its bytes have been read.
for palette, url in zip(artworksSample['Colors'], artworksSample['ThumbnailURL']):
    print(palette)
    with urllib.urlopen(url) as response:
        Image.open(io.BytesIO(response.read())).show()

# MAIN COLOR EXTRACTION

# The best result in term of time was 19 seconds for the sample data set with 69 images ; so the $t_{min}= 0.27$ second per image;
# The total computation for 70000 files should take around 6-7 hours depending on the internet connection

In [None]:
# Line that runs for 8 hours and generates the coor palette of each artwork of the moma collection. 
%%prun
artworks.loc['Colors'] = artworks.loc['ThumbnailURL'].apply(ColorRecognition)

In [12]:
artworks.to_csv('artworksandColors.csv', sep = ';')
artworks.to_json('ArtworksandColors.json')

1st run : 0-1k works, 660 secs
2nd run : 1k-5k , 1267 secs
3rd run : 5k - 15k, 3048 secs
4th run : 15k - 25k, 3656 secs
5th run : 25k-35k, 3862 secs
6th run : 35k-45k, 4315 secs
7th run : 45k - 55k , 5250 secs
8th run : 55k-65k , 5397 secs
9th run : 65k-end , 2024 secs

TOTAL TIME : 29479 seconds, 8.2 hours, not so bad ! 

In [5]:
artworks = pd.read_json('ArtworksandColors.json')

# Summary statistics

In [7]:
l1 = artworks.shape[0]

In [8]:
l1

68688

In [9]:
l2 = len(artworks.explode('Colors'))

In [10]:
l2

166713

In [11]:
r = l2/l1

In [12]:
r

2.427105171208945

In [13]:
artworks = artworks.explode('Colors')

In [19]:
artworks.shape[0]

166713

In [25]:
# Count of the most frequent colour. value_counts() is indexed by colour
# name, so the first row is taken explicitly by position with .iloc.
dimgray = artworks.Colors.value_counts().iloc[0]

In [26]:
dimgray

38321

In [28]:
# Ten most frequent colour names, each count divided by l1.
color_counts = artworks.Colors.value_counts()
top10 = color_counts.iloc[:10] / l1

In [29]:
top10

dimgray         0.557899
rosybrown       0.461565
black           0.340831
gainsboro       0.335706
silver          0.138685
linen           0.119642
wheat           0.070871
gray            0.069663
antiquewhite    0.063636
tan             0.054071
Name: Colors, dtype: float64

In [30]:
artworks

Unnamed: 0,FID,Title,Artist,NationalityCleaner,Classification,tidyDate,Decade,GenderClean,Medium,ThumbnailURL,ConstituentID,ObjectID,AccessionNumber,URL,Colors
0,0,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",[Otto Wagner],[Austrian],Architecture,1896,1890,[Male],Ink and cut-and-pasted painted pages on paper,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,[6210],2,885.1996,http://www.moma.org/collection/works/2,rosybrown
1,1,"City of Music, National Superior Conservatory ...",[Christian de Portzamparc],[French],Architecture,1987,1980,[Male],Paint and colored pencil on print,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,[7470],3,1.1995,http://www.moma.org/collection/works/3,rosybrown
1,1,"City of Music, National Superior Conservatory ...",[Christian de Portzamparc],[French],Architecture,1987,1980,[Male],Paint and colored pencil on print,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,[7470],3,1.1995,http://www.moma.org/collection/works/3,dimgray
2,2,"Villa near Vienna Project, Outside Vienna, Aus...",[Emil Hoppe],[Austrian],Architecture,1903,1900,[Male],"Graphite, pen, color pencil, ink, and gouache ...",http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,[7605],4,1.1997,http://www.moma.org/collection/works/4,wheat
2,2,"Villa near Vienna Project, Outside Vienna, Aus...",[Emil Hoppe],[Austrian],Architecture,1903,1900,[Male],"Graphite, pen, color pencil, ink, and gouache ...",http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,[7605],4,1.1997,http://www.moma.org/collection/works/4,rosybrown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68692,138979,Plate from Pro dva kvadrata. Suprematicheskii ...,[El Lissitzky],[Russian],Illustrated Book,1922,1920,[Male],One from an illustrated book with seven letter...,http://www.moma.org/media/W1siZiIsIjIyNzQ2MiJd...,[3569],409778,89.2001.3,http://www.moma.org/collection/works/409778,black
68693,138980,Plate from Pro dva kvadrata. Suprematicheskii ...,[El Lissitzky],[Russian],Illustrated Book,1922,1920,[Male],One from an illustrated book with seven letter...,http://www.moma.org/media/W1siZiIsIjIyNzQ2MyJd...,[3569],409779,89.2001.4,http://www.moma.org/collection/works/409779,gainsboro
68694,138981,Plate from Pro dva kvadrata. Suprematicheskii ...,[El Lissitzky],[Russian],Illustrated Book,1922,1920,[Male],One from an illustrated book with seven letter...,http://www.moma.org/media/W1siZiIsIjIyNzQ2NCJd...,[3569],409780,89.2001.5,http://www.moma.org/collection/works/409780,gainsboro
68695,138982,Plate from Pro dva kvadrata. Suprematicheskii ...,[El Lissitzky],[Russian],Illustrated Book,1922,1920,[Male],One from an illustrated book with seven letter...,http://www.moma.org/media/W1siZiIsIjIyNzQ2NSJd...,[3569],409781,89.2001.6,http://www.moma.org/collection/works/409781,gainsboro


NOT USED
colorsRGB = {
    'Black' : (0,0,0),
    'White': (255,255,255),
    'Red': (255,0,0),
    'Lime' : (0,255,0),
    'Blue': (0,0,255),
    'Yellow': (255,255,0),
    'Cyan': (0,255,255),
    'Magenta': (255,0,255),
    'Silver': (192,192,192),
    'Gray': (128,128,128),
    'Maroon': (128,0,0),
    'Olive': (128,128,0),
    'Green': (0,128,0),
    'Purple': (128,0,128),
    'Teal': (0,128,128),
    'Navy': (0,0,128)
}