In [1]:
import os

# Project-local cache directory shared by the Hugging Face and torch caches.
# These variables are set before torch / sentence_transformers are imported
# further down in this cell (presumably so the libraries pick the location up
# when they are first loaded -- confirm if the import order is ever changed).
PATH = f"{os.getcwd()}/.cache/huggingface"
os.environ.update(
    {
        "HF_HOME": PATH,
        "HF_DATASETS_CACHE": PATH,
        "TORCH_HOME": PATH,
    }
)

import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient
from qdrant_client.models import (
    PointStruct,
    Distance,
    VectorParams,
    SparseVectorParams,
    Modifier,
    Prefetch,
    SparseVector,
    FusionQuery,
    Fusion,
)
import pandas as pd
import math
from tqdm.notebook import tqdm
from BM25 import BM25
from pprint import pprint
import numpy as np

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Dense encoder, loaded from a local model directory onto the chosen device.
model = SentenceTransformer("./trained_models/mini_lm_l6_v2", device=DEVICE)

# Sparse (BM25) encoder from the project-local BM25 module, configured with
# the stopword directory and the two languages used by the catalogue.
bm25 = BM25(
    languages=["english", "bengali"],
    stopwords_dir=os.path.abspath("./stopwords"),
)

In [3]:
# Connection settings for the Qdrant instance used by this notebook.
QDRANT_URL = "http://localhost:6333"
QDRANT_TIMEOUT = 600

# Name of the collection that every later cell reads from / writes to.
COLLECTION_NAME = "product_collection_all_minilm_l6_v2_trained"

client = QdrantClient(url=QDRANT_URL, timeout=QDRANT_TIMEOUT)

In [4]:
# Start from a clean slate: drop any existing collection with this name
# before recreating it. NOTE: this discards all previously indexed points.
client.delete_collection(collection_name=COLLECTION_NAME)

# Read the dense vector size from the loaded encoder instead of hard-coding
# it, so the collection schema cannot drift from the model if the model
# directory is swapped for one with a different output width.
dense_dim = model.get_sentence_embedding_dimension()

# One named dense vector (cosine distance) and one named sparse vector per
# point. The vector names must match the ones used when upserting/querying.
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        "dense_vector": VectorParams(size=dense_dim, distance=Distance.COSINE)
    },
    sparse_vectors_config={
        "sparse_vector": SparseVectorParams(modifier=Modifier.IDF)
    },
)

True

In [5]:
# Load the product catalogue from the spreadsheet.
product_info_df = pd.read_excel("./datasets/startech.xlsx")

# Turn missing cells (NaN) into None so the `is not None` check in
# format_product_details works. The mapping form is used on purpose: with the
# scalar form `replace(np.nan, None)` older pandas releases ignore the
# explicit None and pad-fill from the previous row instead.
product_info_df = product_info_df.replace({np.nan: None})

# Quick sanity check of the loaded data: dimensions and the first rows.
print(product_info_df.shape)
product_info_df.head(5)

(6334, 5)


Unnamed: 0,id,name,price,category,specification
0,ff9f4b9c-64f8-4b20-8cbb-698db1302f05,AMD Ryzen 5 5600G Budget Desktop PC,26699,Star PC,Processor: AMD Ryzen 5 5600G Processor with Ra...
1,34c772a6-ef7d-4f3c-ae95-3c8acff1a1b4,AMD Ryzen 5 5600G Desktop PC,29500,Star PC,Processor: AMD Ryzen 5 5600G Processor with Ra...
2,e3c3d67a-5138-481b-930c-dcb64ea67b93,Intel 12th Gen Core i5-12400 Desktop PC,31200,Star PC,Processor: Intel 12th Gen Core i5-12400 Alder ...
3,797a3b0a-8c2b-4cf6-8a90-2f36a82bd722,AMD Ryzen 7 5700G Custom Desktop PC,32400,Star PC,Processor: AMD Ryzen 7 5700G Processor with Ra...
4,71be6c35-b744-4b69-8f1b-ee665c29a76a,AMD Ryzen 5 8500G Desktop PC,37499,Star PC,Processor: AMD Ryzen 5 8500G Processor with Ra...


In [4]:
def format_product_details(name, price, specification):
    """Render one product as the plain-text document that gets embedded.

    Args:
        name: Product name, interpolated after "Name: ".
        price: Product price, interpolated before " taka".
        specification: Specification text, or None when the product has
            none. When present it is stripped of surrounding whitespace and
            appended on its own line(s).

    Returns:
        A newline-separated string: the name line, the price line and, if a
        specification was given, the stripped specification.
    """
    lines = [f"Name: {name}", f"Price: {price} taka"]
    if specification is not None:
        lines.append(specification.strip())
    return "\n".join(lines)

In [7]:
# Number of products embedded and upserted per request.
batch_size = 10

# Row count of the catalogue and the number of batches needed to cover it
# (the last batch may be smaller than batch_size).
total_row = len(product_info_df)
total_batch = math.ceil(total_row / batch_size)

In [8]:
# One formatted text document per catalogue row, in row order.
documents = [
    format_product_details(row["name"], row["price"], row["specification"])
    for _, row in product_info_df.iterrows()
]

In [9]:
# Hand the whole corpus to the BM25 helper so it can compute the average
# document length before any sparse vectors are produced.
bm25.calculate_avg_doc_len(documents)
# NOTE(review): presumably `avg_len` is populated by the call above -- confirm
# in the BM25 module. Printed here as a sanity check.
print(bm25.avg_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


145.60198926428797


In [10]:
# Embed the catalogue batch by batch and upsert each batch into Qdrant.
for offset in tqdm(range(0, total_row, batch_size)):
    batch = product_info_df.iloc[offset : offset + batch_size]

    # The same formatted text is fed to both the dense and the sparse encoder.
    texts_for_embedding = [
        format_product_details(name, price, specification)
        for name, specification, price in zip(
            batch["name"], batch["specification"], batch["price"]
        )
    ]
    dense_vectors = model.encode(texts_for_embedding)
    sparse_vectors = bm25.raw_embed(texts_for_embedding)

    # One point per row: the DataFrame index label is used as the point id,
    # and `position` selects the matching embeddings within this batch.
    points = [
        PointStruct(
            id=row_label,
            vector={
                "dense_vector": dense_vectors[position],
                "sparse_vector": sparse_vectors[position],
            },
            payload={
                "name": row["name"],
                "specification": row["specification"],
                "price": row["price"],
            },
        )
        for position, (row_label, row) in enumerate(batch.iterrows())
    ]

    # wait=True blocks until the server has applied the batch.
    operation_info = client.upsert(
        collection_name=COLLECTION_NAME, wait=True, points=points
    )
    # "\r" keeps the status on a single, continuously overwritten line.
    print(operation_info, end="\r")

  0%|          | 0/634 [00:00<?, ?it/s]

  return F.linear(input, self.weight, self.bias)


operation_id=633 status=<UpdateStatus.COMPLETED: 'completed'>

In [6]:
def query(
    query_text: str,
    query_type: str = "hybrid",
    limit: int = 5,
    prefetch_limit: int = 10,
):
    """Search the product collection and return the best matching products.

    Args:
        query_text: Free-text search query.
        query_type: Retrieval mode. "dense" searches the dense vector only,
            "sparse" searches the BM25 sparse vector only, and "hybrid" runs
            both as prefetches and fuses them with reciprocal rank fusion.
        limit: Maximum number of results to return.
        prefetch_limit: Number of candidates each prefetch contributes to the
            fusion step; only used when query_type is "hybrid".

    Returns:
        A list of dicts, one per hit, each with the hit's "score" and its
        stored "payload".

    Raises:
        ValueError: If query_type is not "hybrid", "sparse" or "dense".
    """
    supported_types = ("hybrid", "sparse", "dense")
    # Validate first: previously an unknown mode fell through every branch and
    # failed later with an UnboundLocalError on `results`.
    if query_type not in supported_types:
        raise ValueError(
            f"Unsupported query_type {query_type!r}; "
            f"expected one of {supported_types}"
        )

    # Only compute the embeddings the selected mode actually needs.
    if query_type in ("hybrid", "dense"):
        # Plain list of floats: a form both Prefetch and query_points accept.
        dense_vector = model.encode([query_text])[0].tolist()
    if query_type in ("hybrid", "sparse"):
        sparse_vector = SparseVector(**bm25.raw_embed([query_text])[0])

    if query_type == "hybrid":
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            prefetch=[
                Prefetch(
                    query=dense_vector, using="dense_vector", limit=prefetch_limit
                ),
                Prefetch(
                    query=sparse_vector, using="sparse_vector", limit=prefetch_limit
                ),
            ],
            query=FusionQuery(fusion=Fusion.RRF),
            with_payload=True,
            limit=limit,
        )
    elif query_type == "sparse":
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            query=sparse_vector,
            using="sparse_vector",
            with_payload=True,
            limit=limit,
        )
    else:  # "dense"
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            query=dense_vector,
            using="dense_vector",
            with_payload=True,
            limit=limit,
        )

    return [
        {"score": point.score, "payload": point.payload} for point in results.points
    ]

In [13]:
# Smoke-test the dense-only retrieval path and pretty-print the hits.
# NOTE(review): "quite fan" may be a typo for "quiet fan" -- confirm whether
# the misspelling is intentional before relying on these results.
query_result = query("quite fan", "dense")
pprint(query_result)

[{'payload': {'name': 'Sony FDR-AX43 4K Handycam Camcorder',
              'price': 80000,
              'specification': 'Image Sensor: 1/2.5 type (7.20 mm) '
                               'back-illuminated Exmor R CMOS Sensor\n'
                               'Resolution: Actual: 8.57 Megapixel\n'
                               'Effective Sensor Resolution: Effective: 8.29 '
                               'Megapixel\n'
                               'Shutter Speed: Auto Control Range: '
                               '1/6-1/10000. Standard: 1/50-1/10000. Manual '
                               'Iris Control\u3000(Photo mode): 1/25-1/10000. '
                               'Manual Shutter: 1/6-1/10000. Smooth Slow: '
                               '1/215-1/10000\n'
                               'Recording Media: Video Resolution: XAVC S 4K: '
                               '3840x2160 / 25p,24p, XAVC S HD: 1920x1080 / '
                               '50p,25p,24p, AVCHD: 1920x1080 / 