In [3]:
import numpy as np
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer, BatchEncoding
from transformers import AutoTokenizer, AutoModel
import sys
import os
import json
import torch.nn.functional as F
from dotenv import load_dotenv

load_dotenv()

sys.path.append("..")  # Adds the parent directory to sys path

from mailio_ai_libs.create_embeddings import Embedder

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Identifier of the embedding model; it is passed to `from_pretrained` below and
# its last path component names the data subfolder.
# The environment-variable lookup is currently disabled in favor of a fixed id:
# model_id = os.getenv("MODEL_ID")
# print(model_id)
model_id = "intfloat/e5-large-v2"

In [5]:
# Root folder holding the raw *.jsonl message dumps.
base_data_dir = "../data"
# Per-model artifacts live in a subfolder named after the last component of the model id.
subfolder = model_id.split("/")[-1]
data_dir = f"{base_data_dir}/{subfolder}"
embeddings_path = f"{data_dir}/embeddings.npy"
index_path = f"{data_dir}/embeddings_index.npy"
# os.listdir returns entries in arbitrary order. Sort them so the files are always
# processed in the same order and the notebook gives the same result on every run
# (the order decides which record wins when a message_id appears in several files).
jsonl_files = sorted(
    name
    for name in os.listdir(base_data_dir)
    if name.endswith(".jsonl") and os.path.isfile(os.path.join(base_data_dir, name))
)

In [6]:
# Build a lookup table of message records keyed by their "message_id".
database_dict = {}
for file in jsonl_files:
    file_path = os.path.join(base_data_dir, file)
    # Explicit UTF-8 so decoding does not depend on the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of loading it fully into memory.
        for line in f:
            line = line.strip()
            if not line:
                # Blank or whitespace-only lines carry no record.
                continue
            j = json.loads(line)
            # Records without a "message_id" are ignored; a repeated id
            # overwrites the record stored earlier.
            if "message_id" in j:
                database_dict[j["message_id"]] = j

In [7]:
# Number of records loaded (one entry per distinct message_id).
len(database_dict)

12432

In [8]:
# Load the precomputed embedding matrix and its companion index array from disk.
# NOTE(review): `index` presumably maps each embedding row to a message_id
# (it is used that way in the final cell) - confirm against the code that wrote it.
embeddings = np.load(embeddings_path)
index = np.load(index_path)

In [9]:
# Sanity check: there must be exactly one index entry per embedding row.
print(embeddings.shape, index.shape)
assert embeddings.shape[0] == index.shape[0], (
    f"embeddings has {embeddings.shape[0]} rows but index has {index.shape[0]} entries"
)

(16328, 1024) (16328,)


In [10]:
# Convert to a torch tensor for the similarity search. torch.as_tensor wraps a
# NumPy array without copying (like torch.from_numpy) but also accepts a tensor
# unchanged, so re-running this cell no longer raises a TypeError.
embeddings = torch.as_tensor(embeddings)

In [11]:
# Shape of the embedding matrix after conversion to a torch tensor.
embeddings.shape

torch.Size([16328, 1024])

In [12]:
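# Disabled: explicit L2 normalization is not required, because the search below
# uses F.cosine_similarity, which already divides by the vector norms.
# embeddings = F.normalize(embeddings, p=2, dim=1)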

In [13]:
# Optional (currently disabled): quantize the embeddings to half precision.
# embeddings = embeddings.type(torch.HalfTensor)

In [14]:
# Load the tokenizer and model for `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

# Wrap both in the project's Embedder; its `embed` method is what the search uses.
embedder = Embedder(model, tokenizer)

In [15]:
def search_embeddings(embedder, query, embeddings, index, limit=10):
    """Find the rows of ``embeddings`` most similar to ``query``.

    Args:
        embedder: Object whose ``embed(list_of_str)`` returns a NumPy array;
            it is called with ``[query]``.
        query: Text to search for.
        embeddings: 2-D torch tensor with one embedding per row.
        index: Not used by this function; kept in the signature so existing
            callers keep working. Callers map the returned row positions
            through it themselves.
        limit: Maximum number of results. If it exceeds the number of rows
            in ``embeddings``, all rows are returned instead of raising.

    Returns:
        Tuple ``(indices, values)`` of 1-D NumPy arrays: the row positions in
        ``embeddings`` and their cosine similarities, best match first.
    """
    q = torch.from_numpy(embedder.embed([query]))
    # Debug output of the query vector.
    print(q)
    # q = F.normalize(q, p=2, dim=1)
    similarity = F.cosine_similarity(q, embeddings, dim=1)
    # topk raises when k is larger than the number of candidates, so clamp it.
    k = min(limit, similarity.shape[0])
    values, indices = similarity.topk(k, dim=0)
    return indices.detach().cpu().numpy().ravel(), values.detach().cpu().numpy().ravel()

In [24]:
# Run a sample query and keep the 20 best-scoring embedding rows.
query = "example"
indices, scores = search_embeddings(embedder, query, embeddings, index, limit=20)

tensor([[ 0.0047, -0.9481,  0.5189,  ..., -0.8574,  1.0762,  0.5967]])


In [17]:
# Row positions into `embeddings` and the matching cosine-similarity scores.
indices, scores

(array([13462, 13529, 13470,   937,  3653,   687,  2441,  3413,  2520,
         3410,  2658,  2256,  8785,  3830, 11648, 12864,  9964,   744,
          897,  1121]),
 array([0.91508114, 0.9070264 , 0.87404835, 0.8667275 , 0.779546  ,
        0.7770086 , 0.77420497, 0.76401925, 0.76401925, 0.76401925,
        0.76401925, 0.7614256 , 0.7606676 , 0.7586557 , 0.75803566,
        0.7578621 , 0.7568965 , 0.7547649 , 0.75395954, 0.7536761 ],
       dtype=float32))

In [25]:
# Stored embedding of the best match, for comparison with the query vector printed above.
embeddings[indices[0]]

tensor([ 0.0147, -0.5494,  0.5068,  ..., -0.9313,  0.9302,  0.3688])

In [None]:
# Translate the top row positions into message ids and show each matching record.
result_ids = index[indices]
for score, message_id in zip(scores, result_ids):
    item = database_dict[message_id.item()]
    print(f"Score: {score}, Subject: {item['subject']}, id: {item['message_id']}, sentences: {item['sentences']}")