In [1]:
import json
import numpy as np
import colorspacious
import scipy.interpolate

# Post-processing data from Heer & Stone (2012)

Heer & Stone (2012) produced an English color naming model consisting of 153 names by analyzing data from the xkcd color survey. Along with naming colors, the model can also be used to define a color saliency metric. However, the model has an excessive number of colors, so most of the colors in the model are not the preferred color of any point in the color gamut. Additionally, synonyms are not merged. Both the excessive number of colors and the lack of synonym merging reduce the effectiveness of the color saliency metric, since neither the presence of rarely used alternative names nor the presence of synonyms negatively affects the ease with which a given color can be named. Furthermore, the lack of synonym merging reduces the accuracy of the model's color naming in cases where a third name is more probable than either of two synonyms but less probable than the combined probability of the synonyms, since the third name will be used instead of the name shared by the synonyms. To remedy these shortcomings, synonyms should be merged, and rarely used colors should be eliminated.

## Loading Heer & Stone (2012) data

The results JSON file is loaded, and the counts table is constructed. Additionally, helper functions and a Hellinger distance function are defined.

In [2]:
# Read the Heer & Stone (2012) results file
with open("c3_data.json") as infile:
    c3_data = json.load(infile)

# Normalize term spelling to US English ("grey" -> "gray")
c3_data["terms"] = [term.replace("grey", "gray") for term in c3_data["terms"]]

# Number of color terms in the model
W = len(c3_data["terms"])

# Color coordinates, arranged as one row of three components per color
c3_color = np.array(c3_data["color"])
c3_color = c3_color.reshape((c3_color.size // 3, 3))

# The count table is stored as alternating [index, count, index, count, ...]
# entries; pair them up into an {index: count} mapping
T = dict(zip(c3_data["T"][0::2], c3_data["T"][1::2]))

# Map each color coordinate triple to its row index in c3_color
cmap = {tuple(row): pos for pos, row in enumerate(c3_color)}

# Tally the total count per color and per term; a table index encodes
# (color index) * W + (term index)
ccount = np.zeros(c3_color.shape[0], dtype=int)
tcount = np.zeros(W, dtype=int)
for idx, count in T.items():
    c, w = divmod(idx, W)
    ccount[c] += count
    tcount[w] += count

In [3]:
def color_name_idx_from_cmap_idx(idx):
    """Return the index of the term with the highest count for a color.

    `idx` is a color index; its counts live in the global table `T` under the
    keys `idx * W + w` for each term index `w`. Terms without an entry are
    ignored. On ties the lowest term index wins. Raises ValueError when the
    color has no entries at all.
    """
    base = idx * W
    present = [w for w in range(W) if base + w in T]
    return max(present, key=lambda w: T[base + w])

In [4]:
def color_name_prob_from_cmap_idx(idx):
    """Return the per-term scores for a color index.

    The result is a list of length `W` holding, for each term index `w`, the
    raw count stored in the global table `T` under `idx * W + w`, or 0 when
    there is no such entry. The values are counts, not normalized
    probabilities.
    """
    base = idx * W
    return [T.get(base + w, 0) for w in range(W)]

In [5]:
def color_hellinger(a, b):
    """Calculate the Hellinger distance between two color terms.

    `a` and `b` are term indices (positions in `c3_data["terms"]`). Each
    term's counts over all colors, taken from the global table `T`, are
    treated as a distribution normalized by the term's total `tcount`.

    Returns a value in [0, 1]: 0 for identical distributions, 1 for
    distributions that share no color. Returns NaN when either term has a
    total count of zero, since its distribution is then undefined.
    """
    # Convert to float before multiplying so the product cannot overflow a
    # fixed-width integer dtype
    z = np.sqrt(float(tcount[a]) * float(tcount[b]))
    if z == 0:
        return np.float64("nan")

    # Bhattacharyya coefficient, accumulated over every color. Iterate over
    # the rows of c3_color rather than len(c3_data["color"]): the raw JSON
    # list may be flat, with three entries per color.
    bc = 0.0
    for c in range(c3_color.shape[0]):
        pa = T.get(c * W + a, 0)
        pb = T.get(c * W + b, 0)
        bc += np.sqrt(pa * pb)

    # Rounding can push bc / z marginally above 1; clamp so the square root
    # does not turn a (near-)zero distance into NaN
    return np.sqrt(max(0.0, 1.0 - bc / z))

## Find synonyms

To find synonyms, a list of colors is constructed such that each color is the preferred color for at least one voxel in the Lab color gamut. Then, synonyms are found by looking for other color names with a Hellinger distance of <0.25 from each of the preferred colors.

In [6]:
# Find every term that is the top-ranked name of at least one color sample
names_idx_idx = np.array([color_name_idx_from_cmap_idx(i) for i in range(len(cmap))])
num_names = 0
remaining_name_idx = []
for i in range(W):
    # Number of color samples whose top-ranked term is term i
    voxel_count = np.count_nonzero(names_idx_idx == i)
    if voxel_count == 0:
        continue
    print(i, voxel_count, c3_data["terms"][i])
    remaining_name_idx.append(i)
    num_names += 1
print(num_names)

0 1322 green
1 1015 blue
2 1264 purple
3 314 red
4 897 pink
5 270 yellow
6 307 orange
7 713 brown
8 216 teal
9 217 lightblue
10 453 gray
11 12 limegreen
12 39 magenta
13 92 lightgreen
16 52 cyan
19 109 darkgreen
21 124 olive
22 113 navyblue
23 114 lavender
25 140 black
29 154 tan
33 4 yellowgreen
34 112 maroon
35 24 darkpurple
36 52 salmon
37 72 peach
38 65 beige
41 15 mustard
49 18 mauve
57 2 lightpink
68 1 darkteal
75 22 white
88 1 offwhite
33


In [7]:
# For each preferred term, collect the non-preferred terms whose usage is
# very similar to it (Hellinger distance below the threshold)
synonym_threshold = 0.25
preferred_terms = set(remaining_name_idx)  # set gives O(1) membership tests
color_synonyms = {cidx: [] for cidx in remaining_name_idx}
for cidx in remaining_name_idx:
    print(c3_data["terms"][cidx] + ":")
    for i in range(len(c3_data["terms"])):
        # Only non-preferred terms are synonym candidates; this also rules
        # out i == cidx, because cidx is itself a preferred term
        if i in preferred_terms:
            continue
        # Evaluate the distance once: each call loops over every color
        distance = color_hellinger(cidx, i)
        if distance < synonym_threshold:
            color_synonyms[cidx].append(i)
            print(distance, c3_data["terms"][i])

green:
blue:
purple:
0.13934000761791623 violet
red:
pink:
yellow:
orange:
brown:
teal:
0.20058096340649395 turquoise
0.22216206461580287 bluegreen
lightblue:
0.1399636840687085 skyblue
gray:
limegreen:
0.15156836542273974 lime
magenta:
0.15614773714168304 fuchsia
lightgreen:
cyan:
0.24624808916839397 aqua
darkgreen:
0.16059734080569615 forestgreen
olive:
navyblue:
0.18581406431217756 darkblue
lavender:
0.20054279242594514 lightpurple
0.13068359809102645 lilac
black:
tan:
yellowgreen:
0.22223365451259827 greenyellow
0.21869958196766523 chartreuse
maroon:
0.21133744257750556 burgundy
darkpurple:
salmon:
peach:
beige:
mustard:
0.18176374441303011 gold
0.21610061396993244 darkyellow
0.23209732977246744 mustardyellow
mauve:
lightpink:
darkteal:
white:
offwhite:


In [8]:
# Verify that no term was picked up as a synonym of more than one color
all_color_synonyms = [syn for syns in color_synonyms.values() for syn in syns]
assert len(set(all_color_synonyms)) == len(all_color_synonyms)
print(len(all_color_synonyms))

17


## Simplify color list

To simplify the color list, synonyms are combined, with the most commonly used name, not the one with the highest preferred voxel count, used to refer to the combined data. Next, the voxel counts of the basic color terms from Berlin & Kay (1969) are tallied, and the voxel counts of the other colors are compared to the lowest basic color term voxel count; colors with lower voxel counts are eliminated. Finally, the remaining color names are returned to their correct forms by re-adding spaces and the like.

In [9]:
# Build the (color, term) score table: one row of term counts per color
score_rows = [color_name_prob_from_cmap_idx(i) for i in range(len(cmap))]
all_scores = np.array(score_rows)
# Total count of each term across all colors
per_color_count = np.sum(all_scores, axis=0)
print(all_scores.shape)

(8325, 153)


In [10]:
# Within each synonym group, key the group by its most frequently used term
# rather than by the term that had the highest voxel count
for cidx in [k for k, syns in color_synonyms.items() if len(syns) > 0]:
    vals = [cidx] + color_synonyms[cidx]
    max_idx = np.argmax(per_color_count[vals])
    if vals[max_idx] != cidx:
        # Promote the most used term to be the key; the other terms,
        # including the old key, become its synonyms
        most_used = vals.pop(max_idx)
        color_synonyms[most_used] = vals
        del color_synonyms[cidx]
remaining_name_idx = sorted(color_synonyms.keys())

In [11]:
# Fold the scores of every synonym into the column of its representative term
for cidx, syns in color_synonyms.items():
    if syns:
        all_scores[:, cidx] += all_scores[:, syns].sum(axis=1)

In [12]:
# Keep only the score columns of the remaining terms, in ascending column
# order. A single selection replaces deleting columns one at a time, which
# re-copied the whole array for every removed column and carried an `else`
# branch that could never run.
keep_columns = [
    col for col in range(all_scores.shape[1]) if col in remaining_name_idx
]
all_scores = np.ascontiguousarray(all_scores[:, keep_columns])

In [13]:
# The eleven basic color terms (BCTs), as listed in
# "Basic color terms: Their universality and evolution", p. 2
berlin_kay_color_names = [
    "white", "black", "red", "green",
    "yellow", "blue", "brown", "purple",
    "pink", "orange", "gray",
]

In [14]:
# Determine the smallest number of voxels for which any single BCT is the
# preferred color
names_idx_idx2 = all_scores.argmax(axis=1)
min_bct_count = all_scores.shape[0]
for bct in berlin_kay_color_names:
    # Column of this BCT in the reduced score table
    bct_column = remaining_name_idx.index(c3_data["terms"].index(bct))
    bct_voxels = np.count_nonzero(names_idx_idx2 == bct_column)
    if bct_voxels < min_bct_count:
        min_bct_count = bct_voxels
print(min_bct_count)

21


In [15]:
# Keep only the terms that are preferred for at least as many voxels as the
# least common BCT
num_names2 = 0
remaining_name_idx2 = []
remaining_names2 = []
for i, term_idx in enumerate(remaining_name_idx):
    voxel_count = np.count_nonzero(names_idx_idx2 == i)
    if voxel_count < min_bct_count:
        continue
    name = c3_data["terms"][term_idx]
    remaining_names2.append(name)
    print(i, voxel_count, name)
    remaining_name_idx2.append(i)
    num_names2 += 1
print(num_names2)

0 1139 green
1 789 blue
2 1189 purple
3 301 red
4 799 pink
5 231 yellow
6 299 orange
7 691 brown
8 341 teal
9 325 lightblue
10 417 gray
11 33 limegreen
12 97 magenta
13 90 lightgreen
14 51 cyan
15 191 darkblue
16 180 darkgreen
17 112 olive
18 279 lavender
19 125 black
20 149 tan
21 46 yellowgreen
22 140 maroon
24 52 salmon
25 72 peach
26 65 beige
27 73 mustard
31 21 white
28


In [16]:
# Keep only the score columns of the terms that survived the voxel-count
# cut, in ascending column order. A single selection replaces deleting
# columns one at a time, which re-copied the whole array for every removed
# column and carried an `else` branch that could never run.
keep_columns = [
    col for col in range(all_scores.shape[1]) if col in remaining_name_idx2
]
all_scores = np.ascontiguousarray(all_scores[:, keep_columns])

In [17]:
# Restore the spaces and hyphens that the run-together term names lack
for squashed, proper in (
    ("lightblue", "light blue"),
    ("limegreen", "lime green"),
    ("lightgreen", "light green"),
    ("darkgreen", "dark green"),
    ("darkblue", "dark blue"),
    ("yellowgreen", "yellow-green"),
):
    remaining_names2[remaining_names2.index(squashed)] = proper

In [18]:
# Display the cleaned-up color names
remaining_names2

['green',
 'blue',
 'purple',
 'red',
 'pink',
 'yellow',
 'orange',
 'brown',
 'teal',
 'light blue',
 'gray',
 'lime green',
 'magenta',
 'light green',
 'cyan',
 'dark blue',
 'dark green',
 'olive',
 'lavender',
 'black',
 'tan',
 'yellow-green',
 'maroon',
 'salmon',
 'peach',
 'beige',
 'mustard',
 'white']

In [19]:
# Display the dimensions of the score table: (colors, remaining names)
all_scores.shape

(8325, 28)

In [20]:
# Display the largest single score
# NOTE(review): presumably a check that the scores fit in the uint16 used
# when the model is saved -- confirm
np.max(all_scores)

1138

## Names and Saliencies

Finally, the most probable name and most probable basic color term for each color are determined, and saliencies are calculated. For saliencies, the negative entropy is used, as is done in Heer & Stone (2012). However, a different normalization is used, ensuring that the saliencies span the full zero to one range.

In [21]:
# Enumerate every 24-bit sRGB color; row k holds the components unpacked from
# k as (k & 0xff, (k >> 8) & 0xff, (k >> 16) & 0xff)
rgb_colors = np.arange(2**24)
rgb_colors = np.array(
    [rgb_colors & 0xFF, (rgb_colors >> 8) & 0xFF, (rgb_colors >> 16) & 0xFF]
).T
lab_colors = colorspacious.cspace_convert(
    rgb_colors, "sRGB255", {"name": "CIELab", "XYZ100_w": "D65"}
)

# Linearly interpolate the per-name scores from the sampled colors to every
# sRGB color
sample_points = np.array(list(cmap.keys()), dtype=np.int8)
interpolated = scipy.interpolate.griddata(
    sample_points, all_scores, lab_colors, method="linear"
)

# Linear interpolation yields NaN for query points outside the convex hull of
# the sample points, and np.argmax treats NaN as the maximum, so such colors
# would silently be assigned index 0. Use the nearest sample for them instead.
outside_hull = np.isnan(interpolated).any(axis=1)
if outside_hull.any():
    interpolated[outside_hull] = scipy.interpolate.griddata(
        sample_points, all_scores, lab_colors[outside_hull], method="nearest"
    )

# Most probable name for every sRGB color
name_idxs = np.argmax(interpolated, axis=1).astype(np.uint8)

# Most probable basic color term: zero out every non-BCT column so that only
# BCT columns can win the argmax
bct_columns = [remaining_names2.index(name) for name in berlin_kay_color_names]
interpolated_bct = np.zeros(interpolated.shape)
interpolated_bct[:, bct_columns] = interpolated[:, bct_columns]
bct_idxs = np.argmax(interpolated_bct, axis=1).astype(np.uint8)

In [22]:
# Turn each row of scores into a probability distribution over names
row_totals = np.sum(all_scores, axis=1, keepdims=True)
probs = all_scores / row_totals

# Negative entropy per color: sum of p * log2(p); the logarithm is only
# evaluated where p > 0, so zero-probability entries contribute nothing
log_probs = np.zeros_like(probs)
np.log2(probs, out=log_probs, where=probs > 0)
saliencies = np.sum(probs * log_probs, axis=1)

In [23]:
# Rescale the saliencies linearly so that they span exactly [0, 1]
min_saliency = saliencies.min()
max_saliency = saliencies.max()
print(f"min: {min_saliency:6.3f}")
print(f"max: {max_saliency:6.3f}")
saliency_span = max_saliency - min_saliency
saliencies = ((saliencies - min_saliency) / saliency_span).astype(np.float32)

min: -3.158
max:  0.000


In [24]:
# Write the finished model to a compressed NumPy archive
model_arrays = {
    "scores": all_scores.astype(np.uint16),
    "names": remaining_names2,
    "saliencies": saliencies,
    "name_idxs": name_idxs,
    "bct_idxs": bct_idxs,
    # Coordinates of the sampled colors, in the same order as the score rows
    "cmap": np.array(list(cmap.keys()), dtype=np.int8),
}
np.savez_compressed("colornamemodel.npz", **model_arrays)