In [1]:
import ollama
from pprint import pprint
from pydantic import BaseModel
import pandas as pd
from random import randint
import numpy as np
from tqdm.notebook import tqdm

In [3]:
# Load the product catalogue and turn missing cells (NaN) into None in one pass,
# so downstream code can test optional fields with `is None`.
product_df = pd.read_excel("./datasets/startech_first_half.xlsx").replace(np.nan, None)
product_df.head()

Unnamed: 0,id,name,price,category,specification
0,6195dfc6-1544-450f-adb0-a7c31c47ddd6,AMD Ryzen 5 5600G Budget Desktop PC,26699,Star PC,Processor: AMD Ryzen 5 5600G Processor with Ra...
1,0ea7d2cb-9d73-4abb-9986-f9c0d44c5c5f,AMD Ryzen 5 5600G Desktop PC,29500,Star PC,Processor: AMD Ryzen 5 5600G Processor with Ra...
2,5aaf0d1a-8304-4849-8c96-ecb9d3bf840b,Intel 12th Gen Core i5-12400 Desktop PC,31200,Star PC,Processor: Intel 12th Gen Core i5-12400 Alder ...
3,9f7f0b17-1422-4288-9dd2-86298d4ba38b,AMD Ryzen 7 5700G Custom Desktop PC,32400,Star PC,Processor: AMD Ryzen 7 5700G Processor with Ra...
4,41c74a5c-5afd-4a30-88b1-412f5609622a,AMD Ryzen 5 8500G Desktop PC,37499,Star PC,Processor: AMD Ryzen 5 8500G Processor with Ra...


In [4]:
class Query(BaseModel):
    # Structured-output schema: generate_response passes this model's JSON schema
    # as the `format` argument to ollama.generate and validates the reply against it.
    # Search queries produced for a single product.
    relevant_query: list[str]

In [36]:
def format_product_details(name, price, specification):
    """Build the plain-text product description used as the LLM prompt.

    Args:
        name: Product name.
        price: Product price; rendered followed by the word "taka".
        specification: Optional specification text. ``None``, a float NaN
            (an unconverted empty spreadsheet cell) and blank strings are all
            treated as "no specification". Non-string values are converted
            with ``str()`` instead of raising ``AttributeError``.

    Returns:
        ``"Name: ...\\nPrice: ... taka"``, followed by a newline and the
        stripped specification when one is present.
    """
    import math

    header = f"Name: {name}\nPrice: {price} taka"

    if specification is None:
        return header
    # NaN can reach this point if the NaN -> None replacement did not apply.
    if isinstance(specification, float) and math.isnan(specification):
        return header

    spec_text = str(specification).strip()
    if not spec_text:
        # Avoid emitting a dangling newline for whitespace-only specifications.
        return header

    return f"{header}\n{spec_text}"


def generate_response(model, system_prompt, prompt):
    """Request search queries from an Ollama model and parse the structured reply.

    Args:
        model: Name of the Ollama model to run.
        system_prompt: Text sent as the system instruction.
        prompt: Text sent as the user prompt (the product details).

    Returns:
        A validated ``Query`` instance, or ``None`` when the reply cannot be
        validated against the ``Query`` schema (the error is printed).
    """
    sampling_options = {
        "temperature": 0.6,
        "top_k": 5,
        "top_p": 0.7,
        "num_predict": 2048,
    }
    reply = ollama.generate(
        model=model,
        system=system_prompt,
        prompt=prompt,
        # Constrain the output to the JSON schema of the Query model.
        format=Query.model_json_schema(),
        options=sampling_options,
    )

    try:
        return Query.model_validate_json(reply.response)
    except Exception as err:
        print(f"Error: {err}")
        return None

In [37]:
# Name of the Ollama model passed to generate_response.
model = "gemma3"

# System instruction sent with every request: asks for exactly 7 search queries per
# product, 5 generic ones and 2 that contain the exact brand / model number / name.
system_prompt = "Based on the given product description which is basically the technical specifications of a tech product, generate exactly 7 relevant product search queries that a user might enter on an e-commerce platform search bar. Out of these 7   relevant queries, 5 of them must not include directly the product, or specific product names, model numbers, brands or detailed technical attributes. Instead, use generic adjectives, context and phrases that describe the product type , specifications and its key features. Try understanding the intent of the product description given. Try to find relevant queries that a user might search to reach to this product. Use product related synonyms occasionally in relevant queries. And the rest two relevant queries must include the exact brand name, model number and/or name. Do not include any country-specific or regional terms."

In [38]:
# Spot-check the prompt on one randomly chosen product.
# randint is inclusive at BOTH ends, so the upper bound has to be the last valid
# row position; using shape[0] directly could pick an index one past the end.
idx = randint(0, product_df.shape[0] - 1)

# .iloc makes the lookup positional, matching how idx was drawn.
name = product_df["name"].iloc[idx]
price = product_df["price"].iloc[idx]
specification = product_df["specification"].iloc[idx]
product_details = format_product_details(name, price, specification)

response = generate_response(model, system_prompt, product_details)
if response is not None:
    print(f"Product details: {product_details}")
    print(f"Relevent query: {response.relevant_query}")

Product details: Name: Dahua 4 unit CC camera package
Price: 17300 taka
Camera: Dahua 4 unit 1080P night vision security cc camera
NVR/XVR: Dahua 4 Channel 720p Recording XVR
Storage: 1000GB Hard Disk Drive
Monitor: No
Adapter: 4 unit power adapter good quality
Connector: 4 unit video balun Good quality
Cable: 100m. Cat6 & Power cable.
Relevent query: ['4 channel security camera system', '1080p night vision surveillance kit', 'high resolution outdoor security camera', 'small business security camera package', 'network video recorder 4 channel', 'Dahua 4 Channel XVR 720p', 'Dahua DVR 4 unit CC camera']


In [39]:
# Column-oriented accumulator: one entry per (product id, generated query) pair.
queries = {
    "id": [],
    "relevant_query": [],
}

# Half-open range [start, end) of row positions to process in this run.
start = 0
end = 5

# Never index past the last row: an oversized `end` would otherwise raise midway
# through the run, after the earlier (slow) model calls had already been made.
stop = min(end, len(product_df))

for idx in tqdm(range(start, stop)):
    # Named product_id rather than `id` so the builtin id() is not shadowed
    # for the rest of the kernel session.
    product_id = product_df["id"].iloc[idx]
    name = product_df["name"].iloc[idx]
    price = product_df["price"].iloc[idx]
    specification = product_df["specification"].iloc[idx]
    product_details = format_product_details(name, price, specification)

    response = generate_response(model, system_prompt, product_details)

    # Products whose reply failed validation are skipped (response is None).
    if response is not None:
        relevant_query = response.relevant_query
        # Repeat the product id once per query so both columns stay aligned.
        queries["id"].extend([product_id] * len(relevant_query))
        queries["relevant_query"].extend(relevant_query)

    if (idx + 1) % 100 == 0:
        print(f"Done: {idx + 1}")

  0%|          | 0/5 [00:00<?, ?it/s]

In [44]:
# Turn the accumulated columns into a frame: one row per (product id, query) pair.
query_df = pd.DataFrame(queries)
query_df.head(10)

Unnamed: 0,id,relevant_query
0,6195dfc6-1544-450f-adb0-a7c31c47ddd6,budget desktop pc for gaming
1,6195dfc6-1544-450f-adb0-a7c31c47ddd6,entry level gaming computer
2,6195dfc6-1544-450f-adb0-a7c31c47ddd6,small form factor pc build
3,6195dfc6-1544-450f-adb0-a7c31c47ddd6,gaming pc with integrated graphics
4,6195dfc6-1544-450f-adb0-a7c31c47ddd6,AMD Ryzen 5 desktop processor
5,6195dfc6-1544-450f-adb0-a7c31c47ddd6,MSI A520M motherboard specs
6,6195dfc6-1544-450f-adb0-a7c31c47ddd6,Ryzen 5 5600G gaming setup
7,0ea7d2cb-9d73-4abb-9986-f9c0d44c5c5f,gaming desktop pc build
8,0ea7d2cb-9d73-4abb-9986-f9c0d44c5c5f,powerful desktop processor for gaming
9,0ea7d2cb-9d73-4abb-9986-f9c0d44c5c5f,compact gaming computer setup


In [43]:
# Persist this batch; the file name records the processed [start, end) range.
query_df.to_excel(
    f"queries_{start}_{end}.xlsx",
    index=False,
    engine="xlsxwriter",
)