In [3]:
import os

# Redirect the Hugging Face, datasets and torch caches to a folder under the
# current working directory. These variables are set before torch and
# sentence_transformers are imported below — presumably so the libraries see
# them when they initialise (TODO confirm).
PATH = os.getcwd() + "/.cache/huggingface"
os.environ["HF_HOME"] = PATH
os.environ["HF_DATASETS_CACHE"] = PATH
os.environ["TORCH_HOME"] = PATH

import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient
from qdrant_client.models import (
    PointStruct,
    Distance,
    VectorParams,
    SparseVectorParams,
    Modifier,
    Prefetch,
    SparseVector,
    FusionQuery,
    Fusion,
    MultiVectorConfig,
    MultiVectorComparator,
)
import pandas as pd
import math
from tqdm.notebook import tqdm
from BM25 import BM25
from pprint import pprint
import numpy as np
from fastembed import LateInteractionTextEmbedding

In [4]:
# Use CUDA for the dense encoder when torch reports it as available, else CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Dense sentence encoder loaded from a local directory of model files.
dense_embedding_model = SentenceTransformer(
    "./trained_models/all_mpnet_base_v2", device=DEVICE
)
# Late-interaction (multi-vector) encoder; its files are cached under ./.cache.
# NOTE(review): the ROCm execution provider is hard-coded — confirm how this
# behaves on machines without ROCm.
late_interaction_embedding_model = LateInteractionTextEmbedding(
    "colbert-ir/colbertv2.0", cache_dir="./.cache",
    providers=["ROCMExecutionProvider"]
)

# Project-local BM25 helper (imported from the BM25 module), configured with a
# stopword directory and the two languages it should handle.
bm25 = BM25(
    stopwords_dir=os.path.abspath("./stopwords"), languages=["english", "bengali"]
)

[0;93m2025-04-05 13:40:53.027489909 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-04-05 13:40:53.027507041 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [5]:
# Probe the late-interaction model: embed two short passages, take the first
# token vector of the first passage, and display its length (the per-token
# embedding width).
probe_embeddings = list(
    late_interaction_embedding_model.passage_embed(
        ["Hello there hi there", "Hi there"]
    )
)
first_token_vector = probe_embeddings[0][0]
len(first_token_vector)

128

In [6]:
# Name of the Qdrant collection used by every cell below.
COLLECTION_NAME = "product_collection_all_mpnet_base_v2_trained_rerank"
# Client for a Qdrant instance on localhost; the timeout is raised to 600
# (presumably seconds — TODO confirm) for the upserts further down.
client = QdrantClient(url="http://localhost:6333", timeout=600)

In [7]:
# Vector layout of the collection: a 768-wide dense vector, a 128-wide
# multi-vector compared with MAX_SIM (128 is the width shown by the probe cell
# above), and a sparse vector that uses the IDF modifier.
named_vectors = {
    "dense_vector": VectorParams(size=768, distance=Distance.COSINE),
    "rerank_vector": VectorParams(
        size=128,
        distance=Distance.COSINE,
        multivector_config=MultiVectorConfig(
            comparator=MultiVectorComparator.MAX_SIM
        ),
    ),
}
named_sparse_vectors = {
    "sparse_vector": SparseVectorParams(modifier=Modifier.IDF)
}

# Remove the collection first, then create it again with the layout above.
client.delete_collection(collection_name=COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=named_vectors,
    sparse_vectors_config=named_sparse_vectors,
)

True

In [8]:
# Load the product catalogue and swap NaN cells for None in one chained step,
# then preview the first five rows.
product_info_df = pd.read_csv("./datasets/final_5000_products.csv").replace(
    np.nan, None
)
product_info_df.head(5)

Unnamed: 0,title,price,description,id
0,Casio DJ-120D Plus Check & Recheck Basic Calcu...,1305,,009b7e66-ef69-49fc-87c8-9d40d53e0e33
1,Colorful CN600 PRO 1TB M.2 NVMe SSD,7300,Capacity: 1TB\nFlash Type: 3D NAND\nInterface:...,7bd5da56-89e9-4b68-92e2-cd31f0578bcb
2,Anker Soundcore Space One Foldable Over-Ear Bl...,7990,Frequency Range: 20Hz-20KHz\nInput Jack: AUX C...,3c7d8f65-a7b7-47cd-b808-d6e8c445ca69
3,"Smart SEL-50V24K 50"" 4K Voice Control Android ...",51900,Display Type: LED\nScreen Size: 50 Inch\nResol...,212bc014-cec5-4bc6-ad82-2591098ab808
4,EZVIZ H3c 3MP Wi-Fi Smart Home Outdoor Securit...,4324,Image Sensor: 1/2.7”Progressive Scan CMOS\nEff...,617e0e00-cfd2-4465-b46f-9537476327a4


In [9]:
def format_product_details(name, price, description):
    """Build the text representation of a single product.

    The result always starts with the name and the price in taka, each on its
    own line. The description is appended on a further line unless it is
    ``None``.
    """
    header = f"Name: {name}\nPrice: {price} taka"
    if description is None:
        return header
    return f"{header}\n{description}"

In [10]:
# Batching parameters for the indexing loop: rows per upsert, total number of
# rows, and the resulting number of batches.
batch_size = 10
total_row = len(product_info_df)
total_batch = math.ceil(total_row / batch_size)

In [11]:
# One formatted text per product, in DataFrame row order.
documents = [
    format_product_details(title, price, description)
    for title, price, description in zip(
        product_info_df["title"],
        product_info_df["price"],
        product_info_df["description"],
    )
]

In [12]:
# Feed every formatted document to the BM25 helper, then print its `avg_len`
# attribute. Presumably the call computes the corpus-wide average document
# length and stores it there — verify against the BM25 module.
bm25.calculate_avg_doc_len(documents)
print(bm25.avg_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


96.1696


In [13]:
# Embed and upsert the catalogue batch by batch.
for start in tqdm(range(0, total_row, batch_size)):
    batch = product_info_df.iloc[start : start + batch_size]

    # Column values for this batch, as plain Python lists.
    titles = batch["title"].tolist()
    descriptions = batch["description"].tolist()
    prices = batch["price"].tolist()

    texts_for_embedding = [
        format_product_details(title, price, description)
        for title, price, description in zip(titles, prices, descriptions)
    ]

    # Three representations of the same texts: dense, late-interaction, BM25.
    dense_vectors = dense_embedding_model.encode(texts_for_embedding)
    rerank_vector = list(
        late_interaction_embedding_model.passage_embed(texts_for_embedding)
    )
    sparse_vectors = bm25.raw_embed(texts_for_embedding)

    # One point per row; the DataFrame index label is used as the point id and
    # `position` selects the matching vectors inside this batch.
    points = [
        PointStruct(
            id=point_id,
            vector={
                "dense_vector": dense_vectors[position],
                "rerank_vector": rerank_vector[position],
                "sparse_vector": sparse_vectors[position],
            },
            payload={
                "title": title,
                "description": description,
                "price": price,
            },
        )
        for position, (point_id, title, description, price) in enumerate(
            zip(batch.index, titles, descriptions, prices)
        )
    ]

    # wait=True makes the call return only after the upsert has been applied.
    operation_info = client.upsert(
        collection_name=COLLECTION_NAME, wait=True, points=points
    )
    print(operation_info, end="\r")

  0%|          | 0/500 [00:00<?, ?it/s]

  return F.linear(input, self.weight, self.bias)


operation_id=499 status=<UpdateStatus.COMPLETED: 'completed'>

In [27]:
def query(query_text: str, rerank: bool = False, limit: int = 5):
    """Search the product collection with hybrid (dense + BM25) retrieval.

    Two candidate lists are prefetched, one from the dense vector and one from
    the sparse BM25 vector. They are then combined in one of two ways:

    * ``rerank=False`` (default): the lists are merged with Reciprocal Rank
      Fusion, exactly as before.
    * ``rerank=True``: the prefetched candidates are rescored against the
      ``rerank_vector`` multi-vector using a late-interaction query embedding.

    The late-interaction embedding is only computed when it is actually used;
    previously it was computed on every call and then discarded.

    Args:
        query_text: Free-text search query.
        rerank: Rescore the candidates with the late-interaction model instead
            of fusing them with RRF.
        limit: Maximum number of results to return.

    Returns:
        A list of ``{"score": ..., "payload": ...}`` dicts, one per hit, in the
        order returned by Qdrant.
    """
    dense_vector = dense_embedding_model.encode([query_text])[0]
    # raw_embed is expected to return a mapping that can be expanded into
    # SparseVector (see its use below) — verify against the BM25 module.
    sparse_vector = bm25.raw_embed([query_text])[0]

    # Keep the original 10 candidates per branch, but never fewer than the
    # number of results requested.
    prefetch_limit = max(10, limit)
    prefetch = [
        Prefetch(query=dense_vector, using="dense_vector", limit=prefetch_limit),
        Prefetch(
            query=SparseVector(**sparse_vector),
            using="sparse_vector",
            limit=prefetch_limit,
        ),
    ]

    if rerank:
        # Queries go through query_embed (not the generic embed) so the model
        # applies its query-side encoding.
        late_query_vector = next(
            late_interaction_embedding_model.query_embed(query_text)
        )
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            prefetch=prefetch,
            query=late_query_vector,
            using="rerank_vector",
            with_payload=True,
            limit=limit,
        )
    else:
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            prefetch=prefetch,
            query=FusionQuery(fusion=Fusion.RRF),
            with_payload=True,
            limit=limit,
        )

    return [
        {"score": point.score, "payload": point.payload} for point in results.points
    ]

In [28]:
# Smoke-test the search with a sample query and pretty-print the scored hits.
query_result = query("long battery life smartphone")
pprint(query_result)

[{'payload': {'description': 'Microphone Type: Lavalier\n'
                             'Noise/Signal: 70dB\n'
                             'Others: 8 Hours Long Battery Life Microphone\n'
                             'Sensitivity: -38 ± 3dB\n'
                             'Dimension: Case Size - (60*27*59)Mm\n'
                             'Polar pattern: Omni-Directional\n'
                             'Battery: Mic Battery: 80mAH\n'
                             'Audio recorders: Yes\n'
                             'Warranty: 1 year',
              'price': '2940',
              'title': 'Fantech LEVIOSA AIR WMV11C Lavalier Wireless '
                       'Microphone'},
  'score': 0.5},
 {'payload': {'description': 'Display: 1.32 inches AMOLED color screen\n'
                             'Battery: 7-day battery life for maximum usage\n'
                             'Connectivity: NFC Supported\n'
                             'Material: Watch Case: Stainless steel\n'
               