In [1]:
import os

# Route the Hugging Face and Torch caches into a directory under the current
# working directory.
# NOTE(review): this runs ahead of the torch / sentence_transformers imports,
# presumably so those libraries see the variables when they load - keep the order.
PATH = os.getcwd() + "/.cache/huggingface"
os.environ.update(dict.fromkeys(("HF_HOME", "HF_DATASETS_CACHE", "TORCH_HOME"), PATH))

import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient
from qdrant_client.models import (
    PointStruct,
    Distance,
    VectorParams,
    SparseVectorParams,
    Modifier,
    Prefetch,
    SparseVector,
    FusionQuery,
    Fusion,
)
import pandas as pd
import math
from tqdm.notebook import tqdm
from BM25 import BM25
from pprint import pprint
import numpy as np
import ollama
from pydantic import BaseModel

In [2]:
# On-disk locations of the fine-tuned sentence encoder and the stopword lists.
MODEL_DIR = "./trained_models/all_mpnet_base_v2"
STOPWORDS_DIR = os.path.abspath("./stopwords")

# NOTE(review): DEVICE is computed here but the encoder below is pinned to "cpu",
# so DEVICE is currently unused - confirm whether that is intentional.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dense encoder (sentence embeddings) and sparse BM25 encoder used for hybrid search.
model = SentenceTransformer(MODEL_DIR, device="cpu")
bm25 = BM25(stopwords_dir=STOPWORDS_DIR, languages=["english", "bengali"])

In [3]:
# Qdrant connection settings, named so they can be tuned in one place.
COLLECTION_NAME = "product_collection_all_mpnet_base_v2"
QDRANT_URL = "http://localhost:6333"
QDRANT_TIMEOUT = 600  # forwarded as the client's `timeout`

client = QdrantClient(url=QDRANT_URL, timeout=QDRANT_TIMEOUT)

In [4]:
# Drop and recreate the collection so that re-running the notebook always starts
# from an empty index.
client.delete_collection(collection_name=COLLECTION_NAME)

# Take the dense vector size from the loaded encoder instead of hard-coding 768,
# so the collection cannot silently drift from the model's embedding width.
dense_dim = model.get_sentence_embedding_dimension()

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        "dense_vector": VectorParams(size=dense_dim, distance=Distance.COSINE)
    },
    sparse_vectors_config={"sparse_vector": SparseVectorParams(modifier=Modifier.IDF)},
)

True

In [4]:
# Load the product catalogue from the spreadsheet.
product_info_df = pd.read_excel("./datasets/startech.xlsx")

# Turn missing cells into None so downstream code can test `is None`.
# The dict form is used on purpose: with the positional form
# `replace(np.nan, None)`, pandas releases before 1.4 ignored the explicit None
# and pad-filled the gaps with the previous row's value instead.
product_info_df = product_info_df.replace({np.nan: None})
product_info_df.head(5)

Unnamed: 0,id,name,price,category,specification
0,ff9f4b9c-64f8-4b20-8cbb-698db1302f05,AMD Ryzen 5 5600G Budget Desktop PC,26699,Star PC,Processor: AMD Ryzen 5 5600G Processor with Ra...
1,34c772a6-ef7d-4f3c-ae95-3c8acff1a1b4,AMD Ryzen 5 5600G Desktop PC,29500,Star PC,Processor: AMD Ryzen 5 5600G Processor with Ra...
2,e3c3d67a-5138-481b-930c-dcb64ea67b93,Intel 12th Gen Core i5-12400 Desktop PC,31200,Star PC,Processor: Intel 12th Gen Core i5-12400 Alder ...
3,797a3b0a-8c2b-4cf6-8a90-2f36a82bd722,AMD Ryzen 7 5700G Custom Desktop PC,32400,Star PC,Processor: AMD Ryzen 7 5700G Processor with Ra...
4,71be6c35-b744-4b69-8f1b-ee665c29a76a,AMD Ryzen 5 8500G Desktop PC,37499,Star PC,Processor: AMD Ryzen 5 8500G Processor with Ra...


In [5]:
def format_product_details(name, price, specification):
    """Render one product as the plain-text document used for embedding.

    Args:
        name: Product name.
        price: Product price; rendered followed by the word "taka".
        specification: Free-text specification, or None when the product has none.

    Returns:
        "Name: ...\\nPrice: ... taka", followed by a newline and the stripped
        specification when one is given.
    """
    header = f"Name: {name}\nPrice: {price} taka"
    if specification is None:
        return header
    return f"{header}\n{specification.strip()}"

In [7]:
# Batch bookkeeping for the ingestion loop: rows per upsert, number of rows in
# the catalogue, and the resulting number of batches (rounded up).
batch_size = 10
total_row = len(product_info_df)
total_batch = math.ceil(total_row / batch_size)

In [8]:
# One formatted text document per catalogue row, in row order.
documents = [
    format_product_details(row["name"], row["price"], row["specification"])
    for _, row in product_info_df.iterrows()
]

In [9]:
# Feed the whole corpus to the BM25 encoder before any sparse embedding is made.
# NOTE(review): presumably this populates `bm25.avg_len` (printed below) for
# BM25 length normalisation - confirm against the BM25 module.
bm25.calculate_avg_doc_len(documents)
print(bm25.avg_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


145.60198926428797


In [None]:
# Embed the catalogue in slices of `batch_size` rows and upsert each slice into Qdrant.
for offset in tqdm(range(0, total_row, batch_size)):
    chunk = product_info_df.iloc[offset : offset + batch_size]

    # Same text layout as the corpus documents, one entry per row of the slice.
    texts_for_embedding = [
        format_product_details(name, price, specification)
        for name, specification, price in zip(
            chunk["name"], chunk["specification"], chunk["price"]
        )
    ]
    dense_vectors = model.encode(texts_for_embedding)
    sparse_vectors = bm25.raw_embed(texts_for_embedding)

    # `pos` indexes into this slice's vectors; `row_label` is the DataFrame index
    # label and becomes the Qdrant point id.
    points = [
        PointStruct(
            id=row_label,
            vector={
                "dense_vector": dense_vectors[pos],
                "sparse_vector": sparse_vectors[pos],
            },
            payload={
                "name": row["name"],
                "specification": row["specification"],
                "price": row["price"],
            },
        )
        for pos, (row_label, row) in enumerate(chunk.iterrows())
    ]

    # wait=True blocks until the upsert has been applied.
    operation_info = client.upsert(
        collection_name=COLLECTION_NAME, wait=True, points=points
    )
    print(operation_info, end="\r")

  0%|          | 0/634 [00:00<?, ?it/s]

operation_id=633 status=<UpdateStatus.COMPLETED: 'completed'>

In [101]:
# Structured-output schema for the LLM re-ranker: its JSON schema is passed to
# ollama.generate as `format` in rank_result, and the reply is validated against it.
class RelevanceScore(BaseModel):
    # Relevance rating returned by the LLM for a single product.
    relevance_score: int


def rank_result(
    query_resuls: list[dict],
    user_query: str,
    min_score: int = 4,
    llm_model: str = "gemma3",
) -> list[tuple[int, dict]]:
    """Re-rank search hits by asking an LLM to score each one against the query.

    Each hit is rendered as text and sent to ``ollama.generate`` with a system
    prompt containing the user query; the reply is validated against
    ``RelevanceScore``.

    Args:
        query_resuls: Search hits; each must have a ``"payload"`` dict with
            ``"name"``, ``"price"`` and ``"specification"`` keys.
        user_query: The user's original search text.
        min_score: Hits scoring below this value are dropped (default 4).
        llm_model: Name of the Ollama model to call (default "gemma3").

    Returns:
        ``(relevance_score, hit)`` tuples sorted by score, highest first. Hits
        whose LLM reply cannot be parsed are skipped with a logged warning.
    """
    import logging

    system_prompt = f"""
    Rate the relevance of this product details for the given query. Note that user can MISSPEEL.
        Consider these aspects:
        - Direct answer relevance (0-5)
        - Information completeness (0-3)
        - Factual accuracy (0-2)
        
        Query: {user_query}
        
        Provide scores for each aspect and a total score out of 10.
    """

    # The schema does not change between hits, so build it once.
    response_schema = RelevanceScore.model_json_schema()
    results_with_score = []

    for result in query_resuls:
        payload = result["payload"]
        formatted_result = (
            f'Name: {payload["name"]}\n'
            f'Price: {payload["price"]}\n'
            f'Specification: {payload["specification"]}\n'
        ).strip()

        response = ollama.generate(
            model=llm_model,
            system=system_prompt,
            prompt=formatted_result,
            format=response_schema,
            options={"temperature": 0.1, "top_k": 4, "top_p": 0.80, "num_predict": 512},
        )

        try:
            relevance = RelevanceScore.model_validate_json(
                response.response
            ).relevance_score
        except Exception as exc:
            # Best-effort ranking: a malformed reply drops this hit only, but the
            # failure is reported instead of being swallowed silently.
            logging.getLogger(__name__).warning(
                "Skipping %r: could not parse relevance score (%s)",
                payload["name"],
                exc,
            )
            continue

        if relevance >= min_score:
            results_with_score.append((relevance, result))

    return sorted(results_with_score, key=lambda scored: scored[0], reverse=True)


def query(query_text: str, query_type: str = "hybrid", is_rank_result: bool = False):
    """Search the product collection with dense, sparse or hybrid retrieval.

    Args:
        query_text: The user's search text.
        query_type: ``"hybrid"`` (dense + sparse prefetch fused with RRF, up to
            10 hits), ``"sparse"`` (up to 5 hits) or ``"dense"`` (up to 5 hits).
        is_rank_result: When True, the hits are passed through ``rank_result``.

    Returns:
        A list of ``{"score", "payload"}`` dicts, or whatever ``rank_result``
        returns for them when ``is_rank_result`` is True.

    Raises:
        ValueError: If ``query_type`` is not one of the supported modes.
    """
    supported_types = ("hybrid", "sparse", "dense")
    if query_type not in supported_types:
        # Previously an unknown mode fell through every branch and failed later
        # with an UnboundLocalError on `results`.
        raise ValueError(
            f"Unsupported query_type {query_type!r}; expected one of {supported_types}"
        )

    # Encode only the representations the chosen mode actually uses.
    dense_vector = None
    sparse_vector = None
    if query_type in ("hybrid", "dense"):
        dense_vector = model.encode([query_text])[0]
    if query_type in ("hybrid", "sparse"):
        sparse_vector = SparseVector(**bm25.raw_embed([query_text])[0])

    if query_type == "hybrid":
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            prefetch=[
                Prefetch(query=dense_vector, using="dense_vector", limit=10),
                Prefetch(query=sparse_vector, using="sparse_vector", limit=10),
            ],
            query=FusionQuery(fusion=Fusion.RRF),
            with_payload=True,
            limit=10,
        )
    elif query_type == "sparse":
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            query=sparse_vector,
            using="sparse_vector",
            with_payload=True,
            limit=5,
        )
    else:  # "dense"
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            query=dense_vector,
            using="dense_vector",
            with_payload=True,
            limit=5,
        )

    results = [
        {"score": point.score, "payload": point.payload} for point in results.points
    ]

    if is_rank_result:
        results = rank_result(results, query_text)
    return results

In [110]:
# Example: dense-only retrieval without LLM re-ranking; prints the raw hits.
user_query = "dji drone"
query_result = query(query_text=user_query, query_type="dense", is_rank_result=False)
pprint(query_result)

[{'payload': {'name': 'DJI Flip Drone',
              'price': 72990,
              'specification': 'Weight (Battery & Propellers Included): '
                               'Takeoff Weight \u200c< 249 g\n'
                               'Diagonal Size (Propellers Excluded): Folded: '
                               '136×62×165 mm (L×W×H). Unfolded: 233×280×79 mm '
                               '(L×W×H)\n'
                               'Max Ascent Speed: 5 m/s (Sport mode). 5 m/s '
                               '(Normal mode). 2 m/s (Cine mode)\n'
                               'Max Descent Speed: 5 m/s (Sport mode). 5 m/s '
                               '(Normal mode). 1.5 m/s (Cine mode)\n'
                               'Max Speed: Max Horizontal Speed. At sea level, '
                               'in windless conditions:. 12 m/s* (Sport mode). '
                               '12 m/s (tracking status). At sea level, with 4 '
                               'm/s tailwind, while

In [112]:
# Same dense query, now re-ranked by the LLM; reuses `user_query` from the cell
# above, and each entry becomes a (relevance_score, hit) tuple.
query_result = query(query_text=user_query, query_type="dense", is_rank_result=True)
pprint(query_result)

[(5,
  {'payload': {'name': 'DJI Flip Drone',
               'price': 72990,
               'specification': 'Weight (Battery & Propellers Included): '
                                'Takeoff Weight \u200c< 249 g\n'
                                'Diagonal Size (Propellers Excluded): Folded: '
                                '136×62×165 mm (L×W×H). Unfolded: 233×280×79 '
                                'mm (L×W×H)\n'
                                'Max Ascent Speed: 5 m/s (Sport mode). 5 m/s '
                                '(Normal mode). 2 m/s (Cine mode)\n'
                                'Max Descent Speed: 5 m/s (Sport mode). 5 m/s '
                                '(Normal mode). 1.5 m/s (Cine mode)\n'
                                'Max Speed: Max Horizontal Speed. At sea '
                                'level, in windless conditions:. 12 m/s* '
                                '(Sport mode). 12 m/s (tracking status). At '
                                'sea level, with 4