In [5]:
from pathlib import Path
from pprint import pprint
from random import randint

import numpy as np
import ollama
import pandas as pd
from pydantic import BaseModel
from tqdm.notebook import tqdm

In [6]:
# Load the product catalogue and, in the same chain, turn every NaN into None
# so that later cells can test missing fields with `is not None`.
product_df = pd.read_csv("./datasets/final_5000_products.csv").replace(np.nan, None)

# Preview the first five rows.
product_df.head(5)

Unnamed: 0,title,price,description,id
0,Casio DJ-120D Plus Check & Recheck Basic Calcu...,1305,,009b7e66-ef69-49fc-87c8-9d40d53e0e33
1,Colorful CN600 PRO 1TB M.2 NVMe SSD,7300,Capacity: 1TB\nFlash Type: 3D NAND\nInterface:...,7bd5da56-89e9-4b68-92e2-cd31f0578bcb
2,Anker Soundcore Space One Foldable Over-Ear Bl...,7990,Frequency Range: 20Hz-20KHz\nInput Jack: AUX C...,3c7d8f65-a7b7-47cd-b808-d6e8c445ca69
3,"Smart SEL-50V24K 50"" 4K Voice Control Android ...",51900,Display Type: LED\nScreen Size: 50 Inch\nResol...,212bc014-cec5-4bc6-ad82-2591098ab808
4,EZVIZ H3c 3MP Wi-Fi Smart Home Outdoor Securit...,4324,Image Sensor: 1/2.7”Progressive Scan CMOS\nEff...,617e0e00-cfd2-4465-b46f-9537476327a4


In [7]:
# Structured-output schema for one product's generated search queries.
# Its JSON schema is passed to `ollama.generate(format=...)` and the raw model
# response is parsed back with `Query.model_validate_json(...)`.
class Query(BaseModel):
    # Search queries that should match the product.
    relevant_query: list[str]
    # Search queries that should NOT match the product.
    irrelevant_query: list[str]

In [8]:
def format_product_details(name, price, description):
    """Build the plain-text product summary that is sent to the model as the prompt.

    Args:
        name: Product title.
        price: Product price; rendered as-is and followed by " taka".
        description: Free-form description text. None, a float NaN (what
            pandas uses for empty CSV cells) and blank/whitespace-only strings
            are all treated as "no description".

    Returns:
        A string with a "Name: ..." line and a "Price: ... taka" line,
        followed by the description on the next line(s) when one is present.
    """
    product_details = f"Name: {name}\nPrice: {price} taka"

    # NaN is the only float that is not equal to itself; checking it this way
    # avoids rendering a literal "nan" line when the NaN -> None replacement
    # was skipped or did not take effect.
    is_nan = isinstance(description, float) and description != description
    # A blank description would otherwise leave a dangling trailing newline.
    is_blank = isinstance(description, str) and not description.strip()

    if description is None or is_nan or is_blank:
        return product_details

    return f"{product_details}\n{description}"

In [38]:
# Ollama model tag used by every generation call in this notebook.
model = "llama3.2"

# --- Earlier prompt drafts, kept verbatim for reference (not used) ---

# system_prompt = "Based on the product description, give five exact product-relevant and five irrelevant queries without specifying any country that a user might search on an e-commerce platform."

# system_prompt = "Given the following product description, generate five natural user search queries that a potential customer might use on an e-commerce platform. The queries should reflect real-world search behavior, including specific product attributes, pricing, features, and user intent. Focus on how customers typically search for similar products online."

# system_prompt = """Generate two sets of search queries for the following product description:
# Five (5) RELEVANT search queries that accurately reflect user intent when searching for this specific product or products with similar key features
# Five (5) IRRELEVANT search queries that would not lead to this product or would represent misaligned user intent"""

# system_prompt = "Based on the product description below, generate exactly 5 product-relevant and 5 product-irrelevant search queries that a user might enter on an e-commerce platform. For the product-relevant queries, avoid using specific product names, model numbers, or detailed technical attributes. Instead, use generic adjectives and phrases that describe the product type and its key features. For the product-irrelevant queries, ensure they pertain to entirely different product categories that do not overlap with the product's category and more like relavent queries but for other product categories. Also these irrelevent queries should follow the structure of the relevent quries but for other category of products.  Do not include any country-specific or regional terms"

# --- Active prompt ---
# One tuple entry per sentence; joining with single spaces yields exactly the
# text that is sent to the model as the system prompt.
system_prompt = " ".join(
    (
        "Based on the product description below, generate exactly 5 product-relevant search queries and exactly 5 product-irrelevant search queries that a user might enter on an e-commerce platform.",
        "For the product-relevant queries, do not include specific product names, model numbers, or detailed technical attributes.",
        "Instead, use generic adjectives and phrases that describe the product type and its key features.",
        "The irrelevant queries should follow the structure of the relevent quries but for other category of products.",
        "Use product related synonyms occationally in relevant queries.",
        "Do not include any country-specific or regional terms.",
    )
)

In [45]:
# Spot-check the prompt on a single random product before running the batch.

# randint() includes BOTH endpoints, so the upper bound has to be the last
# valid row label (row count - 1); using the row count itself could pick a
# non-existent row and raise KeyError.
idx = randint(0, product_df.shape[0] - 1)
name = product_df["title"][idx]
price = product_df["price"][idx]
description = product_df["description"][idx]
product_details = format_product_details(name, price, description)

# Ask the model for queries, constraining the output to the Query JSON schema.
response = ollama.generate(
    model=model,
    system=system_prompt,
    prompt=product_details,
    format=Query.model_json_schema(),
    options={"temperature": 0.6, "top_k": 5, "top_p": 0.7, "num_predict": 2048},
)

# Parse and validate the raw JSON text returned by the model.
query = Query.model_validate_json(response.response)

print(f"Product details: {product_details}\n")
print(f"Relevent query: {query.relevant_query}")
print(f"Irrelevant query: {query.irrelevant_query}")

Product details: Name: ASUS TUF Gaming GeForce RTX 4070 Ti SUPER BTF White OC Edition 16GB GDDR6X Graphics Card
Price: 161000 taka
Type: GDDR6X
Size: 16GB
Resolution: 7680 x 4320
Core Clock: OC mode: 2670 MHz
Memory Clock: 21 Gbps
BUS Type: PCI Express 4.0
Memory Interface: 256-bit
CUDA Cores: 8448
Display Port: 3x Native DisplayPort 1.4a
HDMI: 2x Native HDMI 2.1
HDCP: HDCP  2.3
Recommended PSU: 750W
Multi Display: 4
OpenGL: 4.6
Dimensions: 12 x 5.43 x 2.55 inch
Others: 3.25 Slot
Manufacturing Warranty: 3 years

Relevent query: ['gaming graphics card with high resolution support', 'high-end graphics card for gaming', 'powerful graphics card for 4K display', 'fast memory clock speed graphics card', 'PCIe 4.0 graphics card']
Irrelevant query: ['best laptop for college students', 'top rated smartwatches for men', 'gaming keyboard with mechanical switches', 'high-end headphones for music streaming', 'powerful laptops for video editing']


In [7]:
# Column-oriented accumulator: each appended position across the three lists
# forms one (product, relevant query, irrelevant query) row.
queries = {
    "product_id": [],
    "relevant_query": [],
    "irrelevant_query": [],
}

# Half-open range [start, end) of product_df row labels processed in this run.
start = 0
end = 500

# Upper bound on query pairs kept per product (the prompt asks for exactly 5).
max_pairs = 5

for idx in tqdm(range(start, end)):
    # Named product_id rather than `id` so the builtin id() is not shadowed
    # for the rest of the kernel session.
    product_id = product_df["id"][idx]
    name = product_df["title"][idx]
    price = product_df["price"][idx]
    description = product_df["description"][idx]

    product_details = format_product_details(name, price, description)

    response = ollama.generate(
        model=model,
        system=system_prompt,
        prompt=product_details,
        format=Query.model_json_schema(),
        options={"temperature": 0.25, "top_k": 5, "top_p": 0.65, "num_predict": 2048},
    )

    try:
        query = Query.model_validate_json(response.response)

        # zip() stops at the shorter list, so both columns stay the same
        # length; the slice then caps the number of pairs per product.
        pairs = list(zip(query.relevant_query, query.irrelevant_query))[:max_pairs]

        queries["product_id"].extend([product_id] * len(pairs))
        queries["relevant_query"].extend(relevant for relevant, _ in pairs)
        queries["irrelevant_query"].extend(irrelevant for _, irrelevant in pairs)
    except Exception as e:
        # Best effort: a response that fails validation is reported with its
        # row label and skipped so the rest of the batch still runs.
        print(f"Error at row {idx}: {e}")

    # Count relative to `start` so the progress line is also correct for runs
    # that do not begin at row 0.
    print(f"Done: {idx - start + 1} out of {end - start}", end="\r")

  0%|          | 0/500 [00:00<?, ?it/s]

Done: 500 out of 500

In [8]:
# Turn the accumulated query lists into a table, one row per query pair.
query_df = pd.DataFrame.from_dict(queries)

# The file name records which row range this run covered.
output_path = Path(f"./generated_queries/queries_{start}_{end}.csv")

# to_csv() does not create missing folders; create the output directory first
# so the results of a long generation run are not lost to a save error.
output_path.parent.mkdir(parents=True, exist_ok=True)
query_df.to_csv(output_path, index=False)