In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import argparse

import torch.nn.functional as F

MODEL_CONFIGS = {
    "gpt2": {
        "model_name": "gpt2",
        "tokenizer_name": "gpt2",
    },
    "pythia1.4b": {
        "model_name": "EleutherAI/pythia-1.4b-v0",
        "tokenizer_name": "EleutherAI/pythia-1.4b-v0",
    },
    "gemma2b": {
        "model_name": "google/gemma-2-2b",
        "tokenizer_name": "google/gemma-2-2b",
    },
    "qwen2": {
        "model_name": "Qwen/Qwen2.5-1.5B-Instruct",
        "tokenizer_name": "Qwen/Qwen2.5-1.5B-Instruct",
    },
    "bert-base-uncased": {
        "model_name": "bert-base-uncased",
        "tokenizer_name": "bert-base-uncased",
    },
    "bert-large-uncased": {
        "model_name": "bert-large-uncased",
        "tokenizer_name": "bert-large-uncased",
    },
    "distilbert-base-uncased": {
        "model_name": "distilbert-base-uncased",
        "tokenizer_name": "distilbert-base-uncased",
    },
}

def get_embedding(tokenizer, embeddings, word, method="sum"):
    if method == "tokenize":
        toks = tokenizer.tokenize(word, add_special_tokens=False)
        ids = tokenizer.convert_tokens_to_ids(toks)
        vecs = embeddings[ids]
        return vecs.mean(dim=0)
    else: # sum
        toks = tokenizer.tokenize(" " + word, add_special_tokens=False)
        ids = tokenizer.convert_tokens_to_ids(toks)
        vecs = embeddings[ids]
        return vecs.sum(dim=0)
    
def get_word_rank(tokenizer, embeddings, query_vec, word, method="sum"):
    emb_norm = F.normalize(embeddings, dim=1)
    q_norm = F.normalize(query_vec.unsqueeze(0), dim=1)
    sims = torch.mm(q_norm, emb_norm.t()).squeeze(0)

    if method == "tokenize":
        toks = tokenizer.tokenize(word, add_special_tokens=False)
        ids_for_rank = tokenizer.convert_tokens_to_ids(toks)
    else:  # sum
        toks = tokenizer.tokenize(" " + word, add_special_tokens=False)
        ids_for_rank = tokenizer.convert_tokens_to_ids(toks)
    
    sorted_idxs = torch.argsort(sims, descending=True)
    ranks = []
    for tid in ids_for_rank:
        pos = (sorted_idxs == tid).nonzero(as_tuple=True)[0]
        ranks.append(pos.item() + 1)
    return sum(ranks) / len(ranks)

def find_closest(tokenizer, embeddings, query_vec, top_k=5):
    emb_norm = F.normalize(embeddings, dim=1)
    q_norm = F.normalize(query_vec.unsqueeze(0), dim=1)
    sims = torch.mm(q_norm, emb_norm.t()).squeeze(0)
    vals, idxs = torch.topk(sims, k=top_k*10)
    results, seen = [], set()
    
    for score, idx in zip(vals.tolist(), idxs.tolist()):
        tok = tokenizer.decode([idx]).strip()
        # tok = tokenizer.convert_ids_to_tokens([idx])[0].strip()
        if not tok.isalpha() or tok in seen: # don't include byte-level tokens
            continue
        seen.add(tok)
        results.append((tok, score))
        if len(results) >= top_k:
            break
        
    return results

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
cfg = MODEL_CONFIGS["gpt2"]
tokenizer = AutoTokenizer.from_pretrained(cfg["tokenizer_name"])
model = AutoModel.from_pretrained(cfg["model_name"])
embeddings = model.get_input_embeddings().weight.data

tests = [
    ("king", "man", "woman", "queen"),
    ("man", "king", "queen", "woman"),
    ("walked", "walk", "jump", "jumped"),
    ("go", "went", "run", "ran"),
    ("sang", "sing", "ring", "rang"),
    ("sing", "sang", "rang", "ring"),
]

for a, b, c, d in tests:
    print(f"\n=== Analogy ({a}-{b}+{c}) expecting {d} ===")
    for method in ("tokenize", "sum"):
        va = get_embedding(tokenizer, embeddings, a, method=method)
        vb = get_embedding(tokenizer, embeddings, b, method=method)
        vc = get_embedding(tokenizer, embeddings, c, method=method)
        query = va - vb + vc

        rank = get_word_rank(tokenizer, embeddings, query, d)
        print(f"\n method={method}: rank of '{d}' = {int(rank)}")
        for tok, sim in find_closest(tokenizer, embeddings, query, top_k=5):
            print(f"   {tok!r:<10} sim={sim:.4f}")
            # print(f"   {tok!r}  cos_sim={sim:.4f}")

print("\n=== E('ed') + E('jump') comparison ===")
for method in ("tokenize", "sum"):
    v_ed = get_embedding(tokenizer, embeddings, "ed",   method=method)
    v_jump = get_embedding(tokenizer, embeddings, "jump", method=method)
    query = v_jump+v_ed

    rank = get_word_rank(tokenizer, embeddings, query, "jumped", method=method)
    print(f"\n method={method}: rank of 'jumped' = {int(rank)}")
    print("  top-5:", [(tok, f"{score:.4f}") for tok, score in find_closest(tokenizer, embeddings, query, top_k=5)])
    
    sim = F.cosine_similarity(
        query.unsqueeze(0),
        get_embedding(tokenizer, embeddings, "jumped", method=method).unsqueeze(0),
        dim=1
    ).item()
    print(f"  cos_sim={sim:.4f}")


=== Analogy (king-man+woman) expecting queen ===

 method=tokenize: rank of 'queen' = 115
   'king'     sim=0.7752
   'woman'    sim=0.5815
   'ked'      sim=0.5442
   'KING'     sim=0.5017
   'women'    sim=0.4754

 method=sum: rank of 'queen' = 2
   'king'     sim=0.7758
   'queen'    sim=0.7085
   'princess' sim=0.6046
   'Queen'    sim=0.5964
   'kings'    sim=0.5932

=== Analogy (man-king+queen) expecting woman ===

 method=tokenize: rank of 'woman' = 25
   'man'      sim=0.5980
   'en'       sim=0.4626
   'men'      sim=0.4309
   'MAN'      sim=0.3857
   'que'      sim=0.3844

 method=sum: rank of 'woman' = 2
   'man'      sim=0.6716
   'woman'    sim=0.6622
   'queen'    sim=0.5638
   'lady'     sim=0.4987
   'girl'     sim=0.4858

=== Analogy (walked-walk+jump) expecting jumped ===

 method=tokenize: rank of 'jumped' = 8
   'jump'     sim=0.8265
   'Jump'     sim=0.5245
   'ed'       sim=0.5114
   'jumps'    sim=0.4896
   'jumping'  sim=0.4872

 method=sum: rank of 'jumped' = 

In [7]:
cfg = MODEL_CONFIGS["bert-base-uncased"]
tokenizer = AutoTokenizer.from_pretrained(cfg["tokenizer_name"])
model = AutoModel.from_pretrained(cfg["model_name"])
embeddings = model.get_input_embeddings().weight.data

tests = [
    ("king", "man", "woman", "queen"),
    ("man", "king", "queen", "woman"),
    ("walked", "walk", "jump", "jumped"),
    ("go", "went", "run", "ran"),
    ("sang", "sing", "ring", "rang"),
    ("sing", "sang", "rang", "ring"),
]

for a, b, c, d in tests:
    print(f"\n=== Analogy ({a}-{b}+{c}) expecting {d} ===")
    for method in ("tokenize", "sum"):
        va = get_embedding(tokenizer, embeddings, a, method=method)
        vb = get_embedding(tokenizer, embeddings, b, method=method)
        vc = get_embedding(tokenizer, embeddings, c, method=method)
        query = va - vb + vc

        rank = get_word_rank(tokenizer, embeddings, query, d)
        print(f"\n method={method}: rank of '{d}' = {int(rank)}")
        for tok, sim in find_closest(tokenizer, embeddings, query, top_k=5):
            print(f"   {tok!r:<10} sim={sim:.4f}")
            # print(f"   {tok!r}  cos_sim={sim:.4f}")

print("\n=== E('ed') + E('jump') comparison ===")
for method in ("tokenize", "sum"):
    v_ed = get_embedding(tokenizer, embeddings, "ed",   method=method)
    v_jump = get_embedding(tokenizer, embeddings, "jump", method=method)
    query = v_jump+v_ed

    rank = get_word_rank(tokenizer, embeddings, query, "jumped", method=method)
    print(f"\n method={method}: rank of 'jumped' = {int(rank)}")
    print("  top-5:", [(tok, f"{score:.4f}") for tok, score in find_closest(tokenizer, embeddings, query, top_k=5)])
    
    sim = F.cosine_similarity(
        query.unsqueeze(0),
        get_embedding(tokenizer, embeddings, "jumped", method=method).unsqueeze(0),
        dim=1
    ).item()
    print(f"  cos_sim={sim:.4f}")


=== Analogy (king-man+woman) expecting queen ===

 method=tokenize: rank of 'queen' = 2
   'king'     sim=0.7370
   'queen'    sim=0.6469
   'woman'    sim=0.4885
   'princess' sim=0.4752
   'kings'    sim=0.4659

 method=sum: rank of 'queen' = 2
   'king'     sim=0.7370
   'queen'    sim=0.6469
   'woman'    sim=0.4885
   'princess' sim=0.4752
   'kings'    sim=0.4659

=== Analogy (man-king+queen) expecting woman ===

 method=tokenize: rank of 'woman' = 2
   'man'      sim=0.7196
   'woman'    sim=0.6336
   'queen'    sim=0.5380
   'girl'     sim=0.5340
   'lady'     sim=0.4520

 method=sum: rank of 'woman' = 2
   'man'      sim=0.7196
   'woman'    sim=0.6336
   'queen'    sim=0.5380
   'girl'     sim=0.5340
   'lady'     sim=0.4520

=== Analogy (walked-walk+jump) expecting jumped ===

 method=tokenize: rank of 'jumped' = 2
   'jump'     sim=0.8086
   'jumped'   sim=0.7265
   'jumps'    sim=0.6664
   'jumping'  sim=0.6360
   'leaped'   sim=0.6048

 method=sum: rank of 'jumped' = 2
 

In [8]:
cfg = MODEL_CONFIGS["qwen2"]
tokenizer = AutoTokenizer.from_pretrained(cfg["tokenizer_name"])
model = AutoModel.from_pretrained(cfg["model_name"])
embeddings = model.get_input_embeddings().weight.data

tests = [
    ("king", "man", "woman", "queen"),
    ("man", "king", "queen", "woman"),
    ("walked", "walk", "jump", "jumped"),
    ("go", "went", "run", "ran"),
    ("sang", "sing", "ring", "rang"),
    ("sing", "sang", "rang", "ring"),
]

for a, b, c, d in tests:
    print(f"\n=== Analogy ({a}-{b}+{c}) expecting {d} ===")
    for method in ("tokenize", "sum"):
        va = get_embedding(tokenizer, embeddings, a, method=method)
        vb = get_embedding(tokenizer, embeddings, b, method=method)
        vc = get_embedding(tokenizer, embeddings, c, method=method)
        query = va - vb + vc

        rank = get_word_rank(tokenizer, embeddings, query, d)
        print(f"\n method={method}: rank of '{d}' = {int(rank)}")
        for tok, sim in find_closest(tokenizer, embeddings, query, top_k=5):
            print(f"   {tok!r:<10} sim={sim:.4f}")
            # print(f"   {tok!r}  cos_sim={sim:.4f}")

print("\n=== E('ed') + E('jump') comparison ===")
for method in ("tokenize", "sum"):
    v_ed = get_embedding(tokenizer, embeddings, "ed",   method=method)
    v_jump = get_embedding(tokenizer, embeddings, "jump", method=method)
    query = v_jump+v_ed

    rank = get_word_rank(tokenizer, embeddings, query, "jumped", method=method)
    print(f"\n method={method}: rank of 'jumped' = {int(rank)}")
    print("  top-5:", [(tok, f"{score:.4f}") for tok, score in find_closest(tokenizer, embeddings, query, top_k=5)])
    
    sim = F.cosine_similarity(
        query.unsqueeze(0),
        get_embedding(tokenizer, embeddings, "jumped", method=method).unsqueeze(0),
        dim=1
    ).item()
    print(f"  cos_sim={sim:.4f}")


=== Analogy (king-man+woman) expecting queen ===

 method=tokenize: rank of 'queen' = 6
   'king'     sim=0.6939
   'KING'     sim=0.4254
   'woman'    sim=0.4185
   'King'     sim=0.3925
   'queen'    sim=0.3881

 method=sum: rank of 'queen' = 2
   'king'     sim=0.6690
   'queen'    sim=0.5566
   'King'     sim=0.5379
   'KING'     sim=0.5199
   'kings'    sim=0.5119

=== Analogy (man-king+queen) expecting woman ===

 method=tokenize: rank of 'woman' = 12
   'man'      sim=0.6176
   'MAN'      sim=0.4735
   'queen'    sim=0.4342
   'Man'      sim=0.4330
   'woman'    sim=0.3886

 method=sum: rank of 'woman' = 4
   'man'      sim=0.7234
   'Man'      sim=0.5867
   'woman'    sim=0.5149
   'MAN'      sim=0.4944
   'manifold' sim=0.4252

=== Analogy (walked-walk+jump) expecting jumped ===

 method=tokenize: rank of 'jumped' = 7
   'jump'     sim=0.7297
   'Jump'     sim=0.5395
   'jumping'  sim=0.5046
   'jumps'    sim=0.4881
   'jumped'   sim=0.4749

 method=sum: rank of 'jumped' = 2


In [9]:
cfg = MODEL_CONFIGS["gemma2b"]
tokenizer = AutoTokenizer.from_pretrained(cfg["tokenizer_name"])
model = AutoModel.from_pretrained(cfg["model_name"])
embeddings = model.get_input_embeddings().weight.data

tests = [
    ("king", "man", "woman", "queen"),
    ("man", "king", "queen", "woman"),
    ("walked", "walk", "jump", "jumped"),
    ("go", "went", "run", "ran"),
    ("sang", "sing", "ring", "rang"),
    ("sing", "sang", "rang", "ring"),
]

for a, b, c, d in tests:
    print(f"\n=== Analogy ({a}-{b}+{c}) expecting {d} ===")
    for method in ("tokenize", "sum"):
        va = get_embedding(tokenizer, embeddings, a, method=method)
        vb = get_embedding(tokenizer, embeddings, b, method=method)
        vc = get_embedding(tokenizer, embeddings, c, method=method)
        query = va - vb + vc

        rank = get_word_rank(tokenizer, embeddings, query, d)
        print(f"\n method={method}: rank of '{d}' = {int(rank)}")
        for tok, sim in find_closest(tokenizer, embeddings, query, top_k=5):
            print(f"   {tok!r:<10} sim={sim:.4f}")
            # print(f"   {tok!r}  cos_sim={sim:.4f}")

print("\n=== E('ed') + E('jump') comparison ===")
for method in ("tokenize", "sum"):
    v_ed = get_embedding(tokenizer, embeddings, "ed",   method=method)
    v_jump = get_embedding(tokenizer, embeddings, "jump", method=method)
    query = v_jump+v_ed

    rank = get_word_rank(tokenizer, embeddings, query, "jumped", method=method)
    print(f"\n method={method}: rank of 'jumped' = {int(rank)}")
    print("  top-5:", [(tok, f"{score:.4f}") for tok, score in find_closest(tokenizer, embeddings, query, top_k=5)])
    
    sim = F.cosine_similarity(
        query.unsqueeze(0),
        get_embedding(tokenizer, embeddings, "jumped", method=method).unsqueeze(0),
        dim=1
    ).item()
    print(f"  cos_sim={sim:.4f}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 38.40it/s]



=== Analogy (king-man+woman) expecting queen ===

 method=tokenize: rank of 'queen' = 16
   'king'     sim=0.5567
   'KING'     sim=0.5456
   'woman'    sim=0.5124
   'women'    sim=0.4308
   'Woman'    sim=0.4152

 method=sum: rank of 'queen' = 4
   'king'     sim=0.6380
   'KING'     sim=0.5579
   'kings'    sim=0.5487
   'queen'    sim=0.5401
   'King'     sim=0.4936

=== Analogy (man-king+queen) expecting woman ===

 method=tokenize: rank of 'woman' = 24
   'man'      sim=0.5386
   'MAN'      sim=0.4883
   'queen'    sim=0.4669
   'Queen'    sim=0.3955
   'Man'      sim=0.3947

 method=sum: rank of 'woman' = 6
   'man'      sim=0.7066
   'Man'      sim=0.5206
   'MAN'      sim=0.5068
   'woman'    sim=0.4132
   'hombre'   sim=0.3574

=== Analogy (walked-walk+jump) expecting jumped ===

 method=tokenize: rank of 'jumped' = 5
   'jump'     sim=0.7561
   'Jump'     sim=0.6496
   'jumped'   sim=0.5866
   'jumps'    sim=0.5515
   'jumping'  sim=0.5470

 method=sum: rank of 'jumped' = 1