## An exploration of *token embedding* with a transformer

Luca Mari, January 2025

In [1]:
import _keys
from tokenizeutils import Model
from pprint import pprint

# Build the project-local Model wrapper for the checkpoint identified by
# 'google-bert/bert-large-uncased'.
# NOTE(review): the meaning of the second positional argument (False) is not
# visible from this notebook — confirm it in tokenizeutils.Model.
model = Model('google-bert/bert-large-uncased', False)

# Report the two sizes exposed by the wrapper: number of tokens in the
# vocabulary and length of the vector associated with each token.
print(f"The vocabulary has {model.vocab_size} tokens.")
print(f"Each token is associated to (embedded in) a vector of {model.embedding_dim} numbers.")

The vocabulary has 30522 tokens.
Each token is associated to (embedded in) a vector of 1024 numbers.


In [5]:
# Look up the vocabulary identifier of a single token and report whether the
# token is actually in the vocabulary.
token = "library"
token_id = model.token_to_id(token)

# Identifier that the tokenizer assigns to its "unknown" token; computed once
# and reused below.
unknown_token_id = model.tokenizer.convert_tokens_to_ids(model.tokenizer.unk_token) # type: ignore

# Compare identifiers by value (`!=`), not by object identity (`is not`):
# identity of integers is an implementation detail and would also give the
# wrong answer if token_to_id returned a non-builtin integer type.
# NOTE(review): assumes token_to_id returns the unknown-token identifier for
# out-of-vocabulary tokens — confirm in tokenizeutils.
if token_id != unknown_token_id:
    print(f"The token '{token}' has the identifier {token_id}")
else:
    print(f"The token '{token}' is not in the vocabulary.")
print(f"(To any token not in the vocabulary the identifier {unknown_token_id} is associated).")

The token 'library' has the identifier 3075
(To any token not in the vocabulary the identifier 100 is associated).


Il modello è stato addestrato a mappare (_to embed_, appunto) ogni token, con il suo identificatore, in una successione di numeri. C'è da considerare che i transformer, come `BERT`, operano sulla base di un embedding dinamico, in cui la successione di numeri associata a ogni token dipende anche dalla sua posizione nel testo ('embedding posizionale') e dal contesto in cui il token compare: qui noi lavoriamo solo con la componente statica del mapping.

In [6]:
# Retrieve the vector associated with `token` (set in an earlier cell) and
# show its length together with its first five components.
embedding = model.token_to_embedding(token)
print(f"The token '{token}' is associated to a vector of {len(embedding)} elements whose first 5 elements are:\n{embedding[:5]}")

The token 'library' is associated to a vector of 1024 elements whose first 5 elements are:
[-0.02522797 -0.02368372 -0.04366647 -0.00486308 -0.00388711]


In [7]:
# Number of neighbouring tokens to list for `token` (set in an earlier cell).
n = 10
print(f"\nThe {n} tokens most similar to '{token}' in the vocabulary:")
# The result is displayed as a list of (token, score) pairs.
# NOTE(review): the similarity measure behind the score is not visible here
# (presumably cosine similarity) — confirm in tokenizeutils.
pprint(model.most_similar(token, top_n=n))


The 10 tokens most similar to 'library' in the vocabulary:
[('libraries', 0.67),
 ('librarian', 0.53),
 ('archives', 0.41),
 ('bookstore', 0.4),
 ('museum', 0.4),
 ('archive', 0.39),
 ('collection', 0.34),
 ('database', 0.34),
 ('bibliography', 0.33),
 ('repository', 0.33)]


In [7]:
# Analogy "man : king = woman : ?".
# NOTE(review): combine_meanings presumably adds the embeddings of the positive
# examples, subtracts those of the negative ones, and returns the closest
# vocabulary tokens — confirm in tokenizeutils.
positive_examples = ["king", "woman"]
negative_examples = ["man"]
# Only the single best candidate is requested (top_n=1).
pprint(model.combine_meanings(positive_examples, negative_examples, top_n=1))

[('queen', 0.57)]


In [None]:
# Analogy "Italy : Rome = Spain : ?".
# NOTE(review): these tokens are capitalised although the checkpoint name says
# 'uncased'; presumably the wrapper lower-cases its input — confirm.
positive_examples = ["Rome", "Spain"]
negative_examples = ["Italy"]
# Only the single best candidate is requested (top_n=1).
print(model.combine_meanings(positive_examples, negative_examples, top_n=1))

[('madrid', 0.42)]


In [14]:
# Analogy "warm : summer = cold : ?".
positive_examples = ["summer", "cold"] 
negative_examples = ["warm"]
# Only the single best candidate is requested (top_n=1).
print(model.combine_meanings(positive_examples, negative_examples, top_n=1))

[('winter', 0.48)]


In [15]:
# Analogy "guitarist : guitar = pianist : ?".
positive_examples = ["guitar", "pianist"]
negative_examples = ["guitarist"]
# top_n is not passed, so combine_meanings uses its default.
print(model.combine_meanings(positive_examples, negative_examples))

[('piano', 0.62)]


In [None]:
# Analogy "son : father = daughter : ?".
positive_examples = ["father", "daughter"]
negative_examples = ["son"]
# top_n is not passed, so combine_meanings uses its default.
print(model.combine_meanings(positive_examples, negative_examples))

[('mother', 0.62)]


In [None]:
# Analogy "man : actor = woman : ?".
positive_examples = ["actor", "woman"]
negative_examples = ["man"]
# top_n is not passed, so combine_meanings uses its default.
print(model.combine_meanings(positive_examples, negative_examples))

[('actress', 0.73)]


In [None]:
# Analogy "ugly : fine = bad : ?".
positive_examples = ["fine", "bad"]
negative_examples = ["ugly"]
# top_n is not passed, so combine_meanings uses its default.
print(model.combine_meanings(positive_examples, negative_examples))

[('good', 0.46)]


Se i precedenti sono esempi ricchi semanticamente, proviamo a sperimentare anche con esempi solo grammaticali.

In [None]:
# Grammatical analogy (plural -> singular): "trains : train = automobiles : ?".
positive_examples = ["train", "automobiles"]
negative_examples = ["trains"]
# top_n is not passed, so combine_meanings uses its default.
print(model.combine_meanings(positive_examples, negative_examples))

[('automobile', 0.61)]


In [None]:
# Grammatical analogy (past participle -> base form): "gone : go = seen : ?".
positive_examples = ["go", "seen"]
negative_examples = ["gone"]
# top_n is not passed, so combine_meanings uses its default.
print(model.combine_meanings(positive_examples, negative_examples))

[('see', 0.48)]


In [None]:
# Grammatical analogy (base form -> -ing form): "think : thinking = listen : ?".
positive_examples = ["thinking", "listen"]
negative_examples = ["think"]
# top_n is not passed, so combine_meanings uses its default.
print(model.combine_meanings(positive_examples, negative_examples))

[('listening', 0.63)]
