In [1]:
import os

PATH = os.getcwd() + "/.cache/huggingface"
os.environ["HF_HOME"] = PATH
os.environ["HF_DATASETS_CACHE"] = PATH
os.environ["TORCH_HOME"] = PATH

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient
from qdrant_client.models import (
    PointStruct,
    Distance,
    VectorParams,
    SparseVectorParams,
    Modifier,
    Prefetch,
    SparseVector,
    FusionQuery,
    Fusion,
)
import pandas as pd
import math
from tqdm.notebook import tqdm
from BM25 import BM25
from pprint import pprint

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("../models/all-MiniLM-L6-v2-tokenizer")
model = AutoModel.from_pretrained("../models/all-MiniLM-L6-v2-model")
model = model.to(DEVICE)

bm25 = BM25(
    stopwords_dir=os.path.abspath("./stopwords"), languages=["english", "bengali"]
)

In [3]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


def dense_embedding(texts: list[str]):
    encoded_queries = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    )

    with torch.no_grad():
        queries_outputs = model(**encoded_queries.to(DEVICE))

    embeddings = mean_pooling(queries_outputs, encoded_queries["attention_mask"])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy()

In [4]:
COLLECTION_NAME = "product_collection_all-MiniLM-L6-v2"
client = QdrantClient(url="http://localhost:6333", timeout=600)

In [5]:
client.delete_collection(collection_name=COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={"dense_vector": VectorParams(size=384, distance=Distance.COSINE)},
    sparse_vectors_config={"sparse_vector": SparseVectorParams(modifier=Modifier.IDF)},
)

True

In [16]:
product_info_df = pd.read_csv("./final_5000_products.csv")
product_info_df.head(5)

Unnamed: 0,title,price,description
0,Casio DJ-120D Plus Check & Recheck Basic Calcu...,1305,
1,Colorful CN600 PRO 1TB M.2 NVMe SSD,7300,Capacity: 1TB\nFlash Type: 3D NAND\nInterface:...
2,Anker Soundcore Space One Foldable Over-Ear Bl...,7990,Frequency Range: 20Hz-20KHz\nInput Jack: AUX C...
3,"Smart SEL-50V24K 50"" 4K Voice Control Android ...",51900,Display Type: LED\nScreen Size: 50 Inch\nResol...
4,EZVIZ H3c 3MP Wi-Fi Smart Home Outdoor Securit...,4324,Image Sensor: 1/2.7”Progressive Scan CMOS\nEff...


In [17]:
total_row = product_info_df.shape[0]
batch_size = 10
total_batch = math.ceil(total_row / batch_size)

In [21]:
text_for_embedding_format = "Name: {}\n{}\nPrice: {}"
documents = []

for idx, row in product_info_df.iterrows():
    title = row["title"]
    description = row["description"]
    price = row["price"]
    formatted_document = text_for_embedding_format.format(title, description, price)
    documents.append(formatted_document)

In [25]:
bm25.calculate_avg_doc_len(documents)
print(bm25.avg_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


95.3308


In [27]:
for start in tqdm(range(0, total_row, batch_size)):
    batch = product_info_df.iloc[start : start + batch_size]

    titles = batch["title"].tolist()
    descriptions = batch["description"].tolist()
    prices = batch["price"].tolist()

    texts_for_embedding = [
        text_for_embedding_format.format(title, description, price)
        for title, description, price in zip(titles, descriptions, prices)
    ]
    dense_vectors = dense_embedding(texts_for_embedding)
    sparse_vectors = bm25.raw_embed(texts_for_embedding)

    points = []
    for idx, (batch_idx, row) in enumerate(batch.iterrows()):
        title = row["title"]
        description = row["description"]
        price = row["price"]

        points.append(
            PointStruct(
                id=batch_idx,
                vector={
                    "dense_vector": dense_vectors[idx],
                    "sparse_vector": sparse_vectors[idx],
                },
                payload={
                    "title": title,
                    "description": description,
                    "price": price,
                },
            )
        )

    operation_info = client.upsert(
        collection_name=COLLECTION_NAME, wait=True, points=points
    )
    print(operation_info, end="\r")

  0%|          | 0/500 [00:00<?, ?it/s]

  return F.linear(input, self.weight, self.bias)


operation_id=499 status=<UpdateStatus.COMPLETED: 'completed'>

In [28]:
def query(query_text: str):
    dense_vector = dense_embedding([query_text])[0]
    sparse_vector = bm25.raw_embed([query_text])[0]

    prefetch = [
        Prefetch(query=dense_vector, using="dense_vector", limit=10),
        Prefetch(query=SparseVector(**sparse_vector), using="sparse_vector", limit=10),
    ]

    results = client.query_points(
        collection_name=COLLECTION_NAME,
        prefetch=prefetch,
        query=FusionQuery(fusion=Fusion.RRF),
        with_payload=True,
        limit=5,
    )

    # results = client.query_points(
    #     collection_name=COLLECTION_NAME,
    #     query=dense_vector,
    #     using="dense_vector",
    #     with_payload=True,
    #     limit=5
    # )

    return [{"score": point.score, "payload": point.payload} for point in results.points]

In [30]:
%%time
query_result = query("large monitor")
pprint(query_result)

[{'payload': {'description': 'Functions: Print Only\n'
                             'Printer Type: Single Function Large Format '
                             'Printer\n'
                             'Output Color: Color\n'
                             'Technology: Single Function Large Format '
                             'Printer\n'
                             'Print Speed: Plain Paper\n'
                             'Print Resolution: 2400 x 1200 dpi\n'
                             'Memory: 1GB RAM\n'
                             'Connectivity: Inputs/Outputs\n'
                             'Power Consumption: Power Supply: AC 100 - 240 V, '
                             '50 - 60 Hz\n'
                             'Color: White\n'
                             'Dimension: Main Unit:\n'
                             'Weight: Main Unit: 43 kg\n'
                             'Toner/ Cartridge/ Refill: Black / Cyan / Magenta '
                             '/ Yellow: PFI-050 (70 ml)\n'
  