In [1]:
from transformers import BertTokenizer, BertModel
import torch

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states = True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# will be 11 Tokens for Bert
sentence = "Sphinx of black quartz, judge my vow."

In [11]:
# Add the special tokens.
marked_text = f"[CLS] {sentence} [SEP]"

# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

segments_ids = [1 for _ in range(len(tokenized_text))]

tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():
    outputs = model(tokens_tensor, segments_tensors)
    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[-1]


# `hidden_states` has shape [13 x 1 x 11 x 768]
# `token_vecs` is a tensor with shape [11 x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [14]:
token_vecs.shape

torch.Size([11, 768])

In [37]:
from sentence_transformers import SentenceTransformer, util
from sklearn.decomposition import PCA
import pandas as pd
import plotly.express as px
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')
corpus = """So she was considering in her own mind (as well as she could,
            for the hot day made her feel very sleepy and stupid), whether
            the pleasure of making a daisy-chain would be worth the trouble
            of getting up and picking the daisies, when suddenly a White
            Rabbit with pink eyes ran close by her.""".split()

df = pd.DataFrame()

for i in range(3, len(corpus)):

    corpus_embeddings = embedder.encode(corpus[:i], convert_to_tensor=True)
    three_dim = PCA(n_components=3, random_state=0).fit_transform(corpus_embeddings)

    df = pd.DataFrame(data=[x for x in three_dim], columns=["x", "y", "z"])
    df["label"] = corpus[:i]
    df["frame"] = i
    df = pd.concat([df, df])

# normalize
df["x"] = (df["x"] - df["x"].min()) / (df["x"].max() - df["x"].min())
df["y"] = (df["y"] - df["y"].min()) / (df["y"].max() - df["y"].min())
df["z"] = (df["z"] - df["z"].min()) / (df["z"].max() - df["z"].min())

### Helper Functions

In [34]:
from sentence_transformers import SentenceTransformer, util
from sklearn.decomposition import KernelPCA
import pandas as pd

embedder = SentenceTransformer('all-MiniLM-L6-v2')

def create_pca_colors(corpus: list, normalize: bool = True) -> pd.DataFrame:
    """Converts a list of words or sentences to color vectors
    via PCA dimensionality reduction. Uses the all-MiniLM-l6-v2
    Model for embeddings.
    
    Params:
        corpus: a list of strings, either words or sentences
        normalize: optional, already normalized the values to 0-1
    
    Returns:
        DataFrame with columns: red, green, blue, label
    
    NOTE: PCA and KernelPCA (with non linear Kernel) give
    equal results.
    NOTE: TSNE was excluded here because of the higher
    computational cost in high dimensions > 50
    """
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    three_dim = KernelPCA(n_components=3, random_state=0, kernel="poly").fit_transform(corpus_embeddings)

    df = pd.DataFrame(data=[x for x in three_dim], columns=["red", "green", "blue"])
    df["label"] = corpus

    if normalize:
        df["red"] = (df["red"] - df["red"].min()) / (df["red"].max() - df["red"].min())
        df["green"] = (df["green"] - df["green"].min()) / (df["green"].max() - df["green"].min())
        df["blue"] = (df["blue"] - df["blue"].min()) / (df["blue"].max() - df["blue"].min())
    
    return df

In [27]:
from PIL import Image
import imageio.v2 as imageio
from PIL import ImageDraw
from PIL import ImageFont


def save_as_gif(colors: pd.DataFrame, filepath: str, size: tuple = (300, 300), freq: int = 5, font_size: int = 25):
    """Creates and saves a gif from the word to color DataFrame
    
    Params:
        colors: DataFrame with red, green, blue, label
        filepath: destination for gif
        size: a tuple with (width, height)
        freq: repeat rate of frames per color, higher value -> longer per color
        font_size: size of the text
    """
    images = []
    fnt = ImageFont.truetype("FreeMono.ttf", font_size)
    for _, row in colors.iterrows():
        img = Image.new("RGB", size, (int(row["red"]*255), int(row["green"]*255), int(row["blue"]*255)))
        ctx = ImageDraw.Draw(img)
        ctx.text((100, 100), row["label"], fill=(255, 255, 255), font=fnt, stroke_width=1)
        for _ in range(freq):
            images.append(img)
    imageio.mimsave(filepath, images)

### Testing

In [20]:
corpus = """So she was considering in her own mind (as well as she could,
            for the hot day made her feel very sleepy and stupid), whether
            the pleasure of making a daisy-chain would be worth the trouble
            of getting up and picking the daisies, when suddenly a White
            Rabbit with pink eyes ran close by her.""".split()

In [24]:
# check for linearity of the color space
# using PCA or kernel PCA with a POLY KERNEL does not change
# the output too much
colors = create_pca_colors(corpus)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X = colors["red"].to_numpy().reshape(-1, 1)
Y = colors["blue"].to_numpy().reshape(-1, 1)

regressor = LinearRegression().fit(X, Y)
r2_score(regressor.predict(X), Y)

-5.231364703270346e+31

In [28]:
colors = create_pca_colors(corpus)
save_as_gif(colors, "test2.gif")

In [None]:
# try out feeding in a sliding window over the text
# what length is the best? Make it to short and the change isn't smooth anymore
# make it to large and there is no change?
# coming up next!