## setup

In [1]:
import umap

In [2]:
from transformers import AutoTokenizer, DistilBertModel
import torch
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px

# https://huggingface.co/distilbert-base-uncased
# https://huggingface.co/docs/transformers/v4.35.0/en/model_doc/distilbert
# Single source of truth for the checkpoint name, so the tokenizer and the
# model are always loaded from the same checkpoint.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# create embeddings

def get_embeddings(tokenizer, model, words):
    """Embed each string in ``words`` as the mean of its token vectors.

    Every word is encoded on its own (one forward pass per word). The first
    and last positions of the final hidden state are dropped and the
    remaining token vectors are averaged into a single vector.

    Args:
        tokenizer: object whose ``encode(text)`` returns a list of token ids
            with one special token at each end ([CLS] ... [SEP]).
        model: callable that takes a ``(1, seq_len)`` tensor of token ids and
            returns a sequence whose first item is the last hidden state.
        words: iterable of strings to embed.

    Returns:
        A list with one entry per word (same order); each entry is a plain
        Python list of floats.
    """
    # ❗️ note: I am averaging the embeddings for each word in the class
    # ❓ question: are we interested in the final contextual embedding for each class? currently, we're looking at the final hidden state.
    embeddings = []
    # Inference only: disabling autograd avoids building a graph that is
    # never used for a backward pass.
    with torch.no_grad():
        for word in words:
            input_ids = torch.tensor(tokenizer.encode(word)).unsqueeze(0)
            last_hidden_states = model(input_ids)[0]
            # skip the first token, which is the [CLS] token, and skip the last token, which is the [SEP] token
            # average the rest of the tokens
            embeddings.append(last_hidden_states[0][1:-1].mean(dim=0).tolist())
    return embeddings

In [4]:
# convert embeddings to a dataframe (one row per word, with the 'word' column first)
def process_embeddings(words, embeddings):
    """Tabulate embeddings next to the words they belong to.

    Args:
        words: labels, one per embedding vector.
        embeddings: sequence of embedding vectors (one row each).

    Returns:
        A pandas DataFrame whose first column is 'word'; the remaining
        columns hold the embedding values under pandas' default labels.
    """
    # values are stored as given; no rounding is applied
    table = pd.DataFrame(data=embeddings)

    # put the label in front of the numeric columns
    table.insert(loc=0, column='word', value=words)

    return table

## hex colors

In [5]:
# hex color list
# '#RRGGBB' strings; each one is later passed as text to get_embeddings
# and also used to label the points in the plots.
hex_colors = [
    '#016B75', '#319C04', '#4D1630', '#A1BF45', '#B75DC2', '#9FB278', '#B1A35F', '#17FCA4', '#C1EA3F', '#5B76FD', 
    '#AEB607', '#B71F50', '#AFD624', '#CEA521', '#C64073', '#01C59A', '#B847A6', '#AFE746', '#6DFE18', '#DAF20C', 
    '#819DBC', '#209E63', '#958DEB', '#A05248', '#60FC17', '#219E8A', '#F9B6A3', '#7D296F', '#613298', '#2058C9', 
    '#7650C1', '#16845F', '#5931AD', '#F6985E', '#62D81F', '#0391EA', '#5E10AF', '#4EFA57', '#5DC76E', '#6A9382', 
    '#7ADB3F', '#A903DF', '#1D398C', '#A75C9B', '#3FC60D', '#67DEB5', '#C7D4FE', '#29364B', '#D64B9C', '#FD12E7', 
    '#34FDAB', '#E90D12', '#BD76E4', '#96A2B5', '#E1B7F2', '#A8B6DF', '#DE825C', '#3086B9', '#F529AB', '#54F3EC', 
    '#74FAC8', '#FA839E', '#BA8562', '#F920BE', '#20973F', '#DAC279', '#A97C02', '#69B4F1', '#BF934D', '#15C290', 
    '#24AD70', '#08CE7F', '#1698F5', '#75B6EF', '#C462BD', '#1045EC', '#891B30', '#752A31', '#83D175', '#9B6580', 
    '#5F6AEC', '#38C7FD', '#E9D763', '#7E432F', '#72E8C3', '#F84071', '#C3A5E4', '#3C7654', '#C05D28', '#267F5C', 
    '#CB1865', '#CF9514', '#923E6B', '#42D56C', '#76F2BD', '#E7029A', '#8F49C3', '#A6D253', '#1F0B82', '#1AC9B3'
]


In [6]:
# sanity check: number of entries in hex_colors
len(hex_colors)

100

In [7]:
# embed every hex string, then tabulate the vectors next to their source string
hex_embeddings = get_embeddings(tokenizer, model, hex_colors)
hex_df = process_embeddings(hex_colors, hex_embeddings)

# sanity check: preview the first rows ('word' column followed by the embedding dimensions)
hex_df.head()

Unnamed: 0,word,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,#016B75,-0.041097,-0.060254,0.015227,-0.335,0.472685,0.027859,-0.038826,0.167444,-0.329404,...,0.108277,0.13946,-0.233798,-0.478629,0.383578,-0.13325,0.017718,-0.139436,0.148138,-0.033521
1,#319C04,-0.033801,-0.166829,0.371787,-0.261019,0.44936,0.068444,-0.087322,0.293612,-0.423338,...,0.1231,0.16682,-0.065852,-0.540716,0.140492,-0.099883,0.144653,-0.149762,0.159682,0.104678
2,#4D1630,-0.047302,-0.056122,0.258664,-0.035974,0.207901,0.12044,-0.078138,-0.110812,-0.256565,...,0.175543,0.175414,-0.193573,-0.381425,0.147723,-0.053388,0.128342,-0.07422,0.175073,0.086331
3,#A1BF45,0.003316,0.148153,0.124057,-0.252602,0.200356,0.198172,0.13085,-0.078454,-0.362433,...,0.162428,0.177945,0.019736,-0.172148,0.355101,-0.138481,-0.025503,-0.075893,0.052348,-0.048452
4,#B75DC2,0.039551,-0.145425,0.071734,-0.18751,0.266204,-0.126067,0.136231,0.035757,-0.347607,...,0.299077,0.2416,-0.142906,-0.412847,0.33769,-0.149937,0.065814,-0.151042,0.138075,0.107318


In [8]:
# perform pca of hex colors (2d)
# fit on the hex embeddings and keep their projection onto 2 components
pca = PCA(n_components=2)
hex_pca = pca.fit_transform(hex_embeddings)


In [9]:
# perform pca of hex colors (3d)
# separate fit with 3 components; the projection is used by the 3d scatter plot
pca3d = PCA(n_components=3)
hex_pca3d = pca3d.fit_transform(hex_embeddings)


In [10]:
# plot 2d results with plotly
# use the 2-component projection (hex_pca) so the 2d figure is drawn from the
# 2d fit rather than from the first two columns of the 3d fit
fig = px.scatter(
    hex_pca, x=0, y=1, color=hex_colors,
    labels={'color': 'hex color'},
    title='PCA of DistilBERT embeddings of hex colors (2d)'
)

# hide legend
fig.update_layout(showlegend=False)
fig.show()

In [11]:
# 3d scatter of the 3-component PCA projection of the hex embeddings
fig = px.scatter_3d(
    hex_pca3d,
    x=0,
    y=1,
    z=2,
    color=hex_colors,
    title='PCA of DistilBERT embeddings of hex colors (3d)',
    labels={'color': 'hex color'},
).update_layout(showlegend=False)  # legend hidden

fig.show()

In [19]:
# try umap
# UMAP is stochastic: pin the seed so the layout is the same on every run
reducer = umap.UMAP(random_state=42)
hex_umap = reducer.fit_transform(hex_embeddings)

# plot umap results with plotly
fig = px.scatter(
    hex_umap, x=0, y=1, color=hex_colors,
    labels={'color': 'hex color'},
    title='UMAP of DistilBERT embeddings of hex colors'
)

# hide legend
fig.update_layout(showlegend=False)
fig.show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
# try umap in 3d
# UMAP is stochastic: pin the seed so the layout is the same on every run
reducer3d = umap.UMAP(n_components=3, random_state=42)
hex_umap3d = reducer3d.fit_transform(hex_embeddings)

# plot umap results with plotly
fig = px.scatter_3d(
    hex_umap3d, x=0, y=1, z=2, color=hex_colors,
    labels={'color': 'hex color'},
    title='UMAP of DistilBERT embeddings of hex colors'
)

# hide legend
fig.update_layout(showlegend=False)
fig.show()

## rgb colors

In [13]:
# rgb color list
# 'rgb(r,g,b)' strings written without spaces; each one is later passed as
# text to get_embeddings and also used to label the points in the plots.
rgb_colors = [
    'rgb(188,103,238)', 'rgb(97,81,54)', 'rgb(252,15,162)', 'rgb(84,85,58)', 'rgb(14,216,69)', 
    'rgb(130,60,47)', 'rgb(176,71,183)', 'rgb(254,115,20)', 'rgb(98,64,237)', 'rgb(40,156,255)', 
    'rgb(126,203,31)', 'rgb(121,30,28)', 'rgb(240,33,189)', 'rgb(150,8,42)', 'rgb(47,160,19)', 
    'rgb(251,42,60)', 'rgb(192,90,121)', 'rgb(55,109,123)', 'rgb(87,210,111)', 'rgb(142,245,129)', 
    'rgb(186,165,150)', 'rgb(249,160,157)', 'rgb(209,241,224)', 'rgb(21,137,107)', 'rgb(33,189,179)', 
    'rgb(101,235,204)', 'rgb(177,113,97)', 'rgb(0,202,112)', 'rgb(60,139,204)', 'rgb(201,172,176)', 
    'rgb(192,192,156)', 'rgb(244,168,222)', 'rgb(200,213,123)', 'rgb(192,37,81)', 'rgb(234,240,134)', 
    'rgb(223,133,22)', 'rgb(141,4,108)', 'rgb(131,213,43)', 'rgb(68,62,57)', 'rgb(108,67,56)', 
    'rgb(16,96,145)', 'rgb(149,121,63)', 'rgb(111,238,142)', 'rgb(76,33,241)', 'rgb(169,100,217)', 
    'rgb(38,33,179)', 'rgb(240,131,30)', 'rgb(219,104,187)', 'rgb(229,197,107)', 'rgb(107,42,232)', 
    'rgb(134,85,163)', 'rgb(196,162,157)', 'rgb(171,62,119)', 'rgb(110,50,193)', 'rgb(165,241,54)', 
    'rgb(121,210,47)', 'rgb(153,76,23)', 'rgb(93,131,47)', 'rgb(67,27,213)', 'rgb(194,67,93)', 
    'rgb(232,97,13)', 'rgb(60,113,122)', 'rgb(88,112,214)', 'rgb(54,193,89)', 'rgb(135,50,51)', 
    'rgb(86,42,236)', 'rgb(182,76,18)', 'rgb(1,142,71)', 'rgb(188,64,197)', 'rgb(80,78,33)', 
    'rgb(132,59,110)', 'rgb(140,197,241)', 'rgb(30,14,77)', 'rgb(148,98,34)', 'rgb(194,75,211)', 
    'rgb(11,74,194)', 'rgb(141,72,201)', 'rgb(14,153,214)', 'rgb(100,153,76)', 'rgb(98,220,10)', 
    'rgb(174,242,82)', 'rgb(164,206,205)', 'rgb(146,59,190)', 'rgb(166,95,171)', 'rgb(46,81,210)', 
    'rgb(52,132,191)', 'rgb(203,152,93)', 'rgb(79,255,95)', 'rgb(190,105,145)', 'rgb(196,244,215)', 
    'rgb(21,28,245)', 'rgb(177,79,212)', 'rgb(160,67,120)', 'rgb(237,101,163)', 'rgb(133,22,133)', 
    'rgb(70,144,59)', 'rgb(217,230,170)', 'rgb(86,155,9)', 'rgb(222,101,185)', 'rgb(16,75,0)'
]


In [14]:
# sanity check: number of entries in rgb_colors
len(rgb_colors)

100

In [15]:
# embed every rgb string, then tabulate the vectors next to their source string
rgb_embeddings = get_embeddings(tokenizer, model, rgb_colors)
rgb_df = process_embeddings(rgb_colors, rgb_embeddings)

# sanity check: preview the first rows ('word' column followed by the embedding dimensions)
rgb_df.head()

Unnamed: 0,word,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,"rgb(188,103,238)",0.017656,0.112886,0.390108,-0.018283,0.078841,0.201212,-0.062687,0.198341,-0.17966,...,0.133042,0.35588,-0.188991,-0.060454,0.260127,-0.252567,-0.134221,-0.095889,0.175725,-0.084634
1,"rgb(97,81,54)",-0.217185,-0.06849,0.559675,-0.071991,-0.00089,0.244733,0.017081,0.259954,-0.209885,...,0.198512,0.321227,-0.090673,-0.128226,0.271363,-0.162526,-0.032537,-0.093063,0.183469,-0.133734
2,"rgb(252,15,162)",-0.106713,0.112892,0.445009,-0.077836,0.081173,0.386877,-0.058155,0.245033,-0.228639,...,0.216179,0.278108,-0.100687,-0.120121,0.237165,-0.25766,-0.047887,-0.11647,0.206646,-0.087084
3,"rgb(84,85,58)",-0.180302,0.150298,0.599966,-0.17522,-0.031473,0.286632,-0.044158,0.219023,-0.365295,...,0.228512,0.301177,-0.186266,-0.055869,0.242304,-0.216334,-0.072357,-0.140031,0.05219,-0.108971
4,"rgb(14,216,69)",-0.035112,0.014865,0.461914,-0.05562,-0.018628,0.40429,-0.017759,0.323656,-0.300444,...,0.171201,0.350442,-0.094125,-0.147213,0.333543,-0.221303,-0.059972,-0.077612,0.239908,-0.002455


In [16]:
# perform pca of rgb colors (2d)
# note: this rebinds `pca` (and `pca3d` below), which held the hex-color fits
pca = PCA(n_components=2)
rgb_pca = pca.fit_transform(rgb_embeddings)

# perform pca of rgb colors (3d)
pca3d = PCA(n_components=3)
rgb_pca3d = pca3d.fit_transform(rgb_embeddings)


In [17]:
# plot 2d results with plotly
# use the 2-component projection (rgb_pca) so the 2d figure is drawn from the
# 2d fit rather than from the first two columns of the 3d fit
fig = px.scatter(
    rgb_pca, x=0, y=1, color=rgb_colors,
    labels={'color': 'rgb color'},
    title='PCA of DistilBERT embeddings of rgb colors (2d)'
)

# hide legend
fig.update_layout(showlegend=False)
fig.show()

In [18]:
# 3d scatter of the 3-component PCA projection of the rgb embeddings
fig = px.scatter_3d(
    rgb_pca3d,
    x=0,
    y=1,
    z=2,
    color=rgb_colors,
    title='PCA of DistilBERT embeddings of rgb colors (3d)',
    labels={'color': 'rgb color'},
).update_layout(showlegend=False)  # legend hidden

fig.show()

In [21]:
# try umap in 2d
# UMAP is stochastic: pin the seed so the layout is the same on every run
reducer = umap.UMAP(n_components=2, random_state=42)
rgb_umap = reducer.fit_transform(rgb_embeddings)

# plot umap results with plotly
fig = px.scatter(
    rgb_umap, x=0, y=1, color=rgb_colors,
    labels={'color': 'rgb color'},
    title='UMAP of DistilBERT embeddings of rgb colors'
)

# hide legend
fig.update_layout(showlegend=False)
fig.show()

In [22]:
# try umap in 3d
# UMAP is stochastic: pin the seed so the layout is the same on every run
reducer3d = umap.UMAP(n_components=3, random_state=42)
rgb_umap3d = reducer3d.fit_transform(rgb_embeddings)

# plot umap results with plotly
fig = px.scatter_3d(
    rgb_umap3d, x=0, y=1, z=2, color=rgb_colors,
    labels={'color': 'rgb color'},
    title='UMAP of DistilBERT embeddings of rgb colors'
)

# hide legend
fig.update_layout(showlegend=False)
fig.show()