In [1]:
from pathlib import Path
from pprint import pprint
from random import randint

import numpy as np
import ollama
import pandas as pd
from pydantic import BaseModel
from tqdm.notebook import tqdm

In [2]:
# Load the product catalogue and swap NaN for None in one step, so that
# missing values can be detected later with a plain `is not None` check.
product_df = pd.read_csv("./final_5000_products.csv").replace(np.nan, None)
product_df.head(5)

Unnamed: 0,title,price,description,id
0,Casio DJ-120D Plus Check & Recheck Basic Calcu...,1305,,009b7e66-ef69-49fc-87c8-9d40d53e0e33
1,Colorful CN600 PRO 1TB M.2 NVMe SSD,7300,Capacity: 1TB\nFlash Type: 3D NAND\nInterface:...,7bd5da56-89e9-4b68-92e2-cd31f0578bcb
2,Anker Soundcore Space One Foldable Over-Ear Bl...,7990,Frequency Range: 20Hz-20KHz\nInput Jack: AUX C...,3c7d8f65-a7b7-47cd-b808-d6e8c445ca69
3,"Smart SEL-50V24K 50"" 4K Voice Control Android ...",51900,Display Type: LED\nScreen Size: 50 Inch\nResol...,212bc014-cec5-4bc6-ad82-2591098ab808
4,EZVIZ H3c 3MP Wi-Fi Smart Home Outdoor Securit...,4324,Image Sensor: 1/2.7”Progressive Scan CMOS\nEff...,617e0e00-cfd2-4465-b46f-9537476327a4


In [3]:
# Structured-output schema for the model reply: its JSON schema is passed as
# `format` to ollama.generate, and the raw reply is validated against it.
# NOTE(review): documented with comments rather than a docstring because a
# class docstring would presumably be emitted into the JSON schema — confirm.
class Query(BaseModel):
    relevant_query: list[str]  # search queries the model labels as relevant to the product
    irrelevant_query: list[str]  # search queries the model labels as irrelevant to the product

In [4]:
def format_product_details(name, price, description):
    """Build the plain-text description of one product used as the prompt.

    The result always contains a ``Name:`` line followed by a
    ``Price: ... taka`` line. ``description`` is appended on a new line
    unless it is ``None``, in which case only the two header lines are
    returned.
    """
    lines = [f"Name: {name}", f"Price: {price} taka"]
    if description is not None:
        lines.append(f"{description}")
    return "\n".join(lines)

In [5]:
# Name of the model requested from ollama for every generation call.
model = "llama3.2"

# System prompt sent with every request; split across lines for readability,
# the concatenated value is a single sentence.
system_prompt = (
    "Based on the product description, give five exact product-relevant "
    "and five irrelevant queries without specifying any country "
    "that a user might search on an e-commerce platform."
)
# NOTE: possible prompt extension, currently not applied — also ask the model
# to include product attributes in the relevant queries.

In [6]:
# Spot-check the prompt on one randomly chosen product before the batch run.
# randint() includes BOTH endpoints, so the upper bound has to be the last
# valid row position (len - 1); passing the row count itself can select a
# position one past the end of the frame.
idx = randint(0, len(product_df) - 1)

# Positional access keeps the lookup valid regardless of the frame's index labels.
name = product_df["title"].iloc[idx]
price = product_df["price"].iloc[idx]
description = product_df["description"].iloc[idx]
product_details = format_product_details(name, price, description)

# Constrain the reply to the Query JSON schema so it can be validated below.
response = ollama.generate(
    model=model,
    system=system_prompt,
    prompt=product_details,
    format=Query.model_json_schema(),
    options={"temperature": 0.25, "top_k": 5, "top_p": 0.65, "num_predict": 2048},
)

query = Query.model_validate_json(response.response)

print(f"Product details: {product_details}\n")
print(f"Relevent query: {query.relevant_query}")
print(f"Irrelevant query: {query.irrelevant_query}")

Product details: Name: Xtrfy M4 RGB Wireless Ultra-Light Gaming Mouse
Price: 3999 taka
Connection Type: 2.4 GHz wireless / USB
Optical Sensor: Pixart 3370 optical gaming sensor
Resolution: 400–19000
Polling Rate: 125/500/1000 Hz
Mechanical Switches: Kailh GM 8.0
Acceleration: 50 G
Color: Black
Weight: 71 g
Battery Life: 500 mA (up to 75 h)
Warranty: 1 Year

Relevent query: ['Xtrfy M4 RGB Wireless Ultra-Light Gaming Mouse features', 'Xtrfy M4 RGB Wireless Ultra-Light Gaming Mouse price in Bangladesh', 'Xtrfy M4 RGB Wireless Ultra-Light Gaming Mouse review', 'Xtrfy M4 RGB Wireless Ultra-Light Gaming Mouse specs', 'Xtrfy M4 RGB Wireless Ultra-Light Gaming Mouse warranty details']
Irrelevant query: ['Best wireless headphones for music streaming', 'How to cook a perfect steak', 'New iPhone release date 2024', 'Top 10 best selling books of all time', 'Average cost of a used car in Europe']


In [7]:
# Accumulators for the long-format result: one entry per
# (product, relevant query, irrelevant query) triple.
queries = {
    "product_id": [],
    "relevant_query": [],
    "irrelevant_query": [],
}

# Half-open range of row positions handled in this run; the same two values
# are used to name the output file.
start = 0
end = 500

# Never iterate past the last row, even if `end` exceeds the frame length.
stop = min(end, len(product_df))

# Loop-invariant request settings, built once instead of once per product.
response_schema = Query.model_json_schema()
generation_options = {"temperature": 0.25, "top_k": 5, "top_p": 0.65, "num_predict": 2048}

for idx in tqdm(range(start, stop)):
    # `product_id` rather than `id`, which would shadow the builtin.
    product_id = product_df["id"].iloc[idx]
    product_details = format_product_details(
        product_df["title"].iloc[idx],
        product_df["price"].iloc[idx],
        product_df["description"].iloc[idx],
    )

    try:
        # The model call sits inside the try as well: a single failed request
        # or malformed reply skips this product instead of aborting the run
        # and discarding everything collected so far.
        response = ollama.generate(
            model=model,
            system=system_prompt,
            prompt=product_details,
            format=response_schema,
            options=generation_options,
        )
        query = Query.model_validate_json(response.response)
    except Exception as e:
        print(f"Error for product {product_id}: {e}")
    else:
        # Keep at most five pairs, truncated to the shorter list so the
        # three output columns always stay the same length.
        min_len = min(5, len(query.relevant_query), len(query.irrelevant_query))

        queries["product_id"].extend([product_id] * min_len)
        queries["relevant_query"].extend(query.relevant_query[:min_len])
        queries["irrelevant_query"].extend(query.irrelevant_query[:min_len])

    # Count relative to `start` so the progress line is right for any slice.
    print(f"Done: {idx - start + 1} out of {stop - start}", end="\r")

  0%|          | 0/500 [00:00<?, ?it/s]

Done: 500 out of 500

In [8]:
# Persist this batch; the file name records the row range it covers.
output_dir = Path("./generated_queries")
# to_csv does not create missing directories, so make sure the target exists
# before writing — otherwise the save fails after the whole generation run.
output_dir.mkdir(parents=True, exist_ok=True)

query_df = pd.DataFrame.from_dict(queries)
query_df.to_csv(output_dir / f"queries_{start}_{end}.csv", index=False)