In [1]:
!python -V

Python 3.11.13


In [2]:
# !python -m pip install onnx

In [3]:
!mkdir -p ../models/tmp/static-retrieval-mrl-en-v1

In [4]:
# Number of leading embedding dimensions to keep. Used below as the model's
# `truncate_dim` and as the column cut-off when slicing the weight matrix.
tgt_embedding_size = 256

In [5]:
import torch
from sentence_transformers import SentenceTransformer

class WrappedModel(torch.nn.Module):
  """Thin wrapper exposing a model's first-module ``embedding`` layer.

  The wrapper takes the usual padded ``(input_ids, attention_mask)`` pair and
  converts it to the flat ``(indices, offsets)`` form that the wrapped
  embedding layer is called with (in this notebook that layer is an
  ``EmbeddingBag`` in ``mean`` mode).
  """

  def __init__(self, m):
    """Keep a reference to ``m[0].embedding``.

    Args:
      m: Indexable model whose first module has an ``embedding`` attribute.
    """
    super().__init__()
    self.embedding = m[0].embedding

  def forward(self, input_ids, attention_mask):
    """Embed a padded batch.

    Args:
      input_ids: Integer tensor of token ids, one row per sequence.
      attention_mask: Tensor of the same shape; entries equal to 1 mark the
        positions to keep, everything else is treated as padding.

    Returns:
      Whatever ``self.embedding(indices, offsets)`` returns, one bag per row.
    """
    # Boolean indexing flattens row by row, so the kept ids of row 0 come
    # first, then those of row 1, and so on.
    indices = input_ids[attention_mask == 1]
    lengths = attention_mask.sum(dim=1)
    # Bag i starts where bags 0..i-1 end. The leading zero is allocated from
    # `lengths` (not via torch.tensor([0])) so it shares its dtype and device;
    # a CPU-only constant would make torch.cat fail for inputs on another device.
    offsets = torch.cat([lengths.new_zeros(1), lengths[:-1].cumsum(dim=0)])
    return self.embedding(indices, offsets)

# Toy batch of 3 sequences x 4 positions. Positions where attention_mask is 0
# are padding; their input_ids (here -1 or 0) are dropped by WrappedModel.forward.
shape = (3, 4)
input_ids = torch.tensor([1, 2, 3, 4, 5, 6, -1, -1, 1, 1, 1, 0]).view(shape)
attention_mask = torch.tensor([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]).view(shape)

# Load the static embedding model on CPU with truncate_dim=tgt_embedding_size.
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu", truncate_dim=tgt_embedding_size)
wrapped = WrappedModel(model) # only constructs the wrapper; forward() is not called in this cell

# # Export the model
# torch.onnx.export(wrapped,
#                   (input_ids, attention_mask),
#                   "../models/tmp/static-retrieval-mrl-en-v1/model.onnx",
#                   export_params=True,
#                   opset_version=14,
#                   do_constant_folding=True,
#                   input_names = ['input_ids', 'attention_mask'],
#                   output_names = ['sentence_embedding'],
#                   dynamic_axes={
#                       'input_ids' : {0 : 'batch_size', 1: 'sequence_length'},
#                       'attention_mask' : {0 : 'batch_size', 1: 'sequence_length'},
#                       'sentence_embedding' : {0 : 'batch_size'},
#                   })

In [6]:
wrapped.embedding

EmbeddingBag(30522, 1024, mode='mean')

In [7]:
# Pull the raw embedding weight matrix out of the model's first module as a NumPy array.
embeddings = model[0].embedding.state_dict()['weight'].numpy()

In [8]:
embeddings.shape

(30522, 1024)

In [9]:
# Keep only the first tgt_embedding_size columns of every row.
embeddings_tgt = embeddings[:,:tgt_embedding_size]
embeddings_tgt.shape

(30522, 256)

In [10]:
len(model[0].tokenizer.get_vocab())

30522

In [11]:
vocab = model[0].tokenizer.get_vocab()

In [12]:
# vocab
# Peek at the first 11 vocabulary entries in dict order.
# The id is bound to `token_id` rather than `id` so the builtin `id()` is not
# shadowed for the rest of the kernel session.
steps = 0
for token, token_id in vocab.items():
    print(f"Token: {token}, id = {token_id}")
    steps += 1
    if steps > 10:
        break

Token: goodbye, id = 9119
Token: argues, id = 9251
Token: 2009, id = 2268
Token: benches, id = 19571
Token: megan, id = 12756
Token: ##tters, id = 24168
Token: guillermo, id = 21070
Token: ##erland, id = 22492
Token: ##eous, id = 14769
Token: „, id = 1525
Token: ##ammed, id = 27479


In [13]:
# Same token -> id mapping, re-inserted in ascending id order so that
# iterating over the dict walks the tokens by id.
vocab_sorted = dict(sorted(vocab.items(), key=lambda pair: pair[1]))

In [14]:
# vocab_sorted

In [15]:
# sorted([(k, v) for k,v in vocab.items()], key=lambda x: x[1])[1000:1100]

In [16]:
import numpy as np

# def embed(query):
#     input_ids = [vocab.get(word, 0) for word in query.lower().split()]
#     return np.mean([embeddings[input_id, :tgt_embedding_size] for input_id in input_ids], axis=0)

from transformers import BertTokenizer

# Initialize the tokenizer with a pre-trained BERT model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Your input sentence.
# sentence = "hello world this is a test extraordinarily the simplest"


def embed(query):
    """Embed ``query`` as the mean of its token vectors.

    The query is split with the module-level ``tokenizer``, each token is
    mapped to a row of ``embeddings`` through ``vocab`` (tokens missing from
    ``vocab`` fall back to id 0), and the first ``tgt_embedding_size`` columns
    of those rows are averaged.

    Args:
        query: Text to embed.

    Returns:
        1-D NumPy array with the averaged vector. A query that produces no
        tokens yields an all-zero vector of the same shape and dtype.
    """
    tokens = tokenizer.tokenize(query)
    print("Tokens:", tokens)
    input_ids = [vocab.get(token, 0) for token in tokens]
    if not input_ids:
        # np.mean over an empty list returns a scalar NaN (with a
        # RuntimeWarning) instead of a vector. Return zeros instead, which is
        # what torch.nn.EmbeddingBag documents for empty bags.
        return np.zeros_like(embeddings[0, :tgt_embedding_size])
    return np.mean([embeddings[input_id, :tgt_embedding_size] for input_id in input_ids], axis=0)

In [17]:
# query = "fun filled morning at school"
# query = "bus departing time"
# query = "it was an action movie"
# query = "cs algorithms"
# query = "basketball player"
query = "diabetes treatment"
# query = "hello world this is a test extra ##ord ##ina ##rily the simplest"

# e1 = embed(query)

# Sample queries used to compare the hand-rolled embed() against the model.
queries = [
    "fun filled morning at school",
    "bus departing time",
    "it was an action movie",
    "cs algorithms",
    "basketball player",
    "diabetes treatment",
    "hello world this is a test extraordinarily the simplest"
]

# One embedding row per query, stacked into a 2-D array.
e1 = np.array([embed(query) for query in queries])


Tokens: ['fun', 'filled', 'morning', 'at', 'school']
Tokens: ['bus', 'departing', 'time']
Tokens: ['it', 'was', 'an', 'action', 'movie']
Tokens: ['cs', 'algorithms']
Tokens: ['basketball', 'player']
Tokens: ['diabetes', 'treatment']
Tokens: ['hello', 'world', 'this', 'is', 'a', 'test', 'extra', '##ord', '##ina', '##rily', 'the', 'simplest']


In [18]:
e1[6, :10]

array([ 6.879415 ,  1.8830419,  1.8674997,  1.0040632,  4.127914 ,
       -2.4525557,  0.5300663,  0.842521 ,  5.7000585,  8.80159  ],
      dtype=float32)

In [19]:
e1.shape

(7, 256)

In [20]:
# Reference embeddings for the same queries, produced by the model itself.
e1_from_transformers = model.encode(queries)

In [21]:
e1_from_transformers.shape

(7, 256)

In [22]:
e1_from_transformers[6, :10]

array([ 6.879415 ,  1.8830419,  1.8674997,  1.0040632,  4.127914 ,
       -2.4525557,  0.5300663,  0.842521 ,  5.7000585,  8.80159  ],
      dtype=float32)

In [23]:
e1.shape == e1_from_transformers.shape

True

In [24]:
e1.shape[0]

7

In [25]:
# Exact element-wise comparison of the hand-rolled embeddings with the model's
# output; np.array_equal also returns False (instead of raising or
# broadcasting) if the two shapes differ.
np.array_equal(e1, e1_from_transformers)

True

In [26]:
# A plain `all(e1 == e1_from_transformers)` raises on 2-D arrays because the
# truth value of a whole row is ambiguous; np.array_equal compares the shapes
# and every element in a single call.
np.array_equal(e1, e1_from_transformers)

True

In [27]:
embeddings_tgt.shape

(30522, 256)

In [28]:
embeddings_tgt[vocab['##ord'], :10]

array([  8.065368  ,  -8.882175  , -20.208576  , -18.220303  ,
         0.26347125,   2.1979225 ,  19.462276  ,  -1.8650678 ,
        11.297153  , -10.203839  ], dtype=float32)

In [29]:
corpus = [
    "School is kind to students",
    "lovely day and wonderful weather",
    "nearest pharmacy",
    "lebran james",
    "train schedule",
    "hollywood flick",
    "computers",
    "soccer is nice to watch",
    "insulin research",
]

In [30]:
# Embed every corpus entry with the same embed() used for the queries.
corpus_embeddings = [embed(text) for text in corpus]


Tokens: ['school', 'is', 'kind', 'to', 'students']
Tokens: ['lovely', 'day', 'and', 'wonderful', 'weather']
Tokens: ['nearest', 'pharmacy']
Tokens: ['le', '##bra', '##n', 'james']
Tokens: ['train', 'schedule']
Tokens: ['hollywood', 'flick']
Tokens: ['computers']
Tokens: ['soccer', 'is', 'nice', 'to', 'watch']
Tokens: ['insulin', 'research']


In [31]:

def cosine_similarity(query_embedding: list[float], corpus_embeddings: list[list[float]]) -> np.intp:
    """Print the cosine similarity of a query to each corpus vector and return the best index.

    Args:
        query_embedding: A single 1-D vector (list or array) of length ``dim``.
        corpus_embeddings: A sequence of ``n`` vectors, each of length ``dim``.

    Returns:
        The ``np.argmax`` of the ``n`` similarities, i.e. the position of the
        most similar corpus vector (a NumPy integer; call ``.item()`` for a
        plain ``int``).
    """
    query = np.asarray(query_embedding)
    corpus = np.asarray(corpus_embeddings)

    query_len = np.linalg.norm(query)
    corpus_len = np.linalg.norm(corpus, axis=1, keepdims=True)

    # A zero-length vector has no direction. Dividing by its norm would yield
    # NaN, and np.argmax then reports the NaN's position as the "best" match.
    # Dividing by 1 instead leaves the zeros in place, so the similarity is 0.
    corpus_len[corpus_len == 0] = 1
    query_unit = query / (query_len if query_len > 0 else 1)
    corpus_unit = corpus / corpus_len

    # Dot product of unit vectors == cosine similarity.
    similarities = corpus_unit.dot(query_unit)
    print(similarities)
    return np.argmax(similarities)

In [32]:
e1.shape

(7, 256)

In [33]:
# cosine_similarity takes one query vector, so pass a single row of e1
# (row 5, matching queries[5]) rather than the whole matrix:
# similar_idx = cosine_similarity(e1, corpus_embeddings).item()
print(f"query = {queries[5]}")
similar_idx = cosine_similarity(e1[5], corpus_embeddings).item()
# Show the corpus entry with the highest similarity to the query.
corpus[similar_idx]

query = diabetes treatment
[-0.1352294  -0.04187926  0.08710091  0.00459079 -0.11412142 -0.02789718
 -0.00536665  0.04453617  0.39935726]


'insulin research'

In [34]:
# inputs = model[0].tokenize(["hello world and the earthly"])

In [35]:
# inputs

In [None]:
# vocab['the']

In [None]:
# import numpy as np
# np.mean([embeddings[input_id.item(), :tgt_embedding_size] for input_id in inputs['input_ids']], axis=0)

In [None]:
# # vocab_sorted
# list(vocab_sorted.keys())

In [None]:
import struct
import numpy as np

def save_embeddings(filename, vocab, embeddings):
    """Write a vocabulary and its embedding matrix to a binary file.

    File layout (all integers are little-endian unsigned 32-bit):
        1. magic bytes ``b"EMBD"``
        2. ``vocab_size``, ``embedding_dim``
        3. for each word: byte length of its UTF-8 encoding, then the bytes
        4. the matrix as ``vocab_size * embedding_dim`` little-endian float32
           values in row-major order

    Args:
        filename: Path of the file to create (overwritten if it exists).
        vocab: Iterable of strings, one per matrix row, in row order.
        embeddings: 2-D array-like of shape ``(vocab_size, embedding_dim)``.

    Raises:
        ValueError: If the number of words differs from the number of rows.
    """
    words = list(vocab)
    # Force little-endian float32 so the bytes match the documented format
    # regardless of the host's native byte order; also makes strided views
    # (e.g. a column slice of a wider matrix) contiguous.
    matrix = np.ascontiguousarray(embeddings, dtype="<f4")
    vocab_size, embedding_dim = matrix.shape

    # Validate before opening the file so a mismatch never leaves a
    # truncated or inconsistent file behind.
    if len(words) != vocab_size:
        raise ValueError(
            f"vocab has {len(words)} entries but embeddings has {vocab_size} rows"
        )

    with open(filename, "wb") as f:
        # Header: magic number "EMBD", vocab_size, and embedding_dim (unsigned ints)
        f.write(b"EMBD")
        f.write(struct.pack("<II", vocab_size, embedding_dim))

        # Each word: first its encoded length, then the word bytes
        for word in words:
            encoded = word.encode("utf-8")
            f.write(struct.pack("<I", len(encoded)))
            f.write(encoded)

        # Embedding matrix as one contiguous block of little-endian floats
        f.write(matrix.tobytes())

# Stand-alone example with random data:
# vocab = ["hello", "world", "foo", "bar"]
# embeddings = np.random.rand(len(vocab), 300).astype(np.float32)
# Actual export: tokens in ascending id order alongside the truncated matrix.
save_embeddings(f"embeddings_dim_{tgt_embedding_size}.bin", list(vocab_sorted.keys()), embeddings_tgt)


In [None]:
embeddings_tgt.shape

In [None]:
# diabetes treatment
vocab["treatment"]

In [None]:
embeddings_tgt[3949, :10]

In [None]:
# !python -m pip install huggingface_hub

In [None]:
# !huggingface-cli whoami

In [None]:
# Upload the exported binary to the Hugging Face Hub model repo.
# NOTE(review): HfFolder and Repository are imported but not used in this cell —
# confirm whether any other cell needs them before trimming the import.
from huggingface_hub import HfApi, HfFolder, Repository

model_id = "Mozilla/static-retrieval-mrl-en-v1"
# The local file name and the path inside the repo are the same string.
bin_file_path = f"embeddings_dim_{tgt_embedding_size}.bin"
target_path_in_repo = f"embeddings_dim_{tgt_embedding_size}.bin"
print(f"target_path_in_repo = {target_path_in_repo}")

# NOTE(review): presumably relies on credentials from a prior Hugging Face
# login (see the commented-out `huggingface-cli whoami` cell) — confirm.
api = HfApi()
api.upload_file(
    path_or_fileobj=bin_file_path,
    path_in_repo=target_path_in_repo,
    repo_id=model_id,
    repo_type="model",
)