In [None]:
from pathlib import Path
from pprint import pprint
from random import randint

import numpy as np
import ollama
import pandas as pd
from pydantic import BaseModel
from tqdm.notebook import tqdm

In [None]:
# Load the product catalogue and turn every NaN cell into None in one pass,
# so later cells can test for missing values with a plain `is None`.
product_df = pd.read_csv("./final_5000_products.csv").replace(np.nan, None)

# Preview the first five rows as the cell output.
product_df.head(5)

In [None]:
# Structured-output schema for one product's generated search queries.
# Its JSON schema is passed to ollama.generate(format=...) and the raw model
# response is parsed back with Query.model_validate_json.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated JSON schema
# ("description"), which would change what is sent to the model.
class Query(BaseModel):
    # Queries a shopper might type that SHOULD match the product.
    relevant_query: list[str]
    # Queries a shopper might type that should NOT match the product.
    irrelevant_query: list[str]

In [None]:
def format_product_details(name, price, description):
    """Build the plain-text product summary that is sent to the model as the prompt.

    Missing fields are left out instead of being rendered literally: the
    product table stores missing cells as None, and a line such as
    "Price: None taka" would only mislead the model.

    Args:
        name: Product title; always emitted as the first line.
        price: Product price in taka, or None when it is unknown.
        description: Free-text product description, or None when missing.

    Returns:
        The lines "Name: <name>", "Price: <price> taka" (only when price is
        not None) and the description (only when it is not None and not
        blank), joined with newlines and without a trailing newline.
    """
    lines = [f"Name: {name}"]

    if price is not None:
        lines.append(f"Price: {price} taka")

    # A whitespace-only description carries no information; skipping it also
    # avoids ending the prompt with a dangling newline.
    if description is not None and str(description).strip():
        lines.append(f"{description}")

    return "\n".join(lines)

In [None]:
# Generation settings shared by the sampling cell and the batch loop.
model = "llama3.2"

# Instruction given to the model as the system message. The literal is split
# across lines purely for readability; adjacent string literals concatenate
# into the exact same single-line prompt.
system_prompt = (
    "Based on the product description, give five exact product-relevant "
    "and five irrelevant queries without specifying any country "
    "that a user might search on an e-commerce platform."
)
# TODO: consider also asking the model to include product attributes in the
# relevant queries (not part of the prompt yet).

In [None]:
# Smoke test: run the prompt on one randomly chosen product and eyeball the
# result before committing to the long batch run below.
#
# random.randint includes BOTH endpoints, so the upper bound has to be the
# last valid row position (row count - 1). Using the row count itself would
# occasionally pick an index one past the end and fail the lookup.
idx = randint(0, product_df.shape[0] - 1)
name = product_df["title"][idx]
price = product_df["price"][idx]
description = product_df["description"][idx]
product_details = format_product_details(name, price, description)

# `format` constrains the model output to the Query JSON schema so that it
# can be parsed directly afterwards.
response = ollama.generate(
    model=model,
    system=system_prompt,
    prompt=product_details,
    format=Query.model_json_schema(),
    options={"temperature": 0.25, "top_k": 5, "top_p": 0.65, "num_predict": 2048},
)

query = Query.model_validate_json(response.response)

print(f"Product details: {product_details}\n")
print(f"Relevent query: {query.relevant_query}")
print(f"Irrelevant query: {query.irrelevant_query}")

In [None]:
# Batch generation: ask the model for queries for every product in the
# half-open row range [start, end) and collect them column-wise so the
# result can be turned into a DataFrame in one step.

# Upper limit on the number of relevant/irrelevant pairs kept per product.
MAX_QUERIES_PER_PRODUCT = 5

queries = {
    "product_id": [],
    "relevant_query": [],
    "irrelevant_query": [],
}
# Ids of products whose model output could not be parsed into a Query.
failed_product_ids = []

start = 0
end = 500
# Never index past the last row if `end` is larger than the table.
stop = min(end, len(product_df))

for idx in tqdm(range(start, stop)):
    # Named `product_id` rather than `id` so the builtin `id` is not
    # shadowed for the rest of the kernel session.
    product_id = product_df["id"][idx]
    name = product_df["title"][idx]
    price = product_df["price"][idx]
    description = product_df["description"][idx]

    product_details = format_product_details(name, price, description)

    response = ollama.generate(
        model=model,
        system=system_prompt,
        prompt=product_details,
        format=Query.model_json_schema(),
        options={"temperature": 0.25, "top_k": 5, "top_p": 0.65, "num_predict": 2048},
    )

    try:
        query = Query.model_validate_json(response.response)
    except ValueError:
        # pydantic's ValidationError is a ValueError subclass. One malformed
        # or truncated response should not abort a long run: remember the
        # product and move on so it can be retried later.
        failed_product_ids.append(product_id)
        continue

    # Keep the same number of relevant and irrelevant queries so the three
    # output columns stay aligned row by row.
    pair_count = min(
        MAX_QUERIES_PER_PRODUCT,
        len(query.relevant_query),
        len(query.irrelevant_query),
    )

    queries["product_id"].extend([product_id] * pair_count)
    queries["relevant_query"].extend(query.relevant_query[:pair_count])
    queries["irrelevant_query"].extend(query.irrelevant_query[:pair_count])

if failed_product_ids:
    print(f"Skipped {len(failed_product_ids)} product(s) with unparseable output: {failed_product_ids}")

In [None]:
# Persist the collected queries, one CSV per processed row range.
query_df = pd.DataFrame.from_dict(queries)

# DataFrame.to_csv does not create missing parent directories; make sure the
# output folder exists so a long generation run is not lost at the last step.
output_path = Path("./generated_queries") / f"queries_{start}_{end}.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
query_df.to_csv(output_path, index=False)