In [1]:
# imports

import os
import re
import math
import json
from tqdm import tqdm
import random
from dotenv import load_dotenv
from huggingface_hub import login
# import matplotlib.pyplot as plt
import numpy as np
import pickle
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb

### Internal Classes
# from testing import Tester

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# environment

load_dotenv(override=True)


def _require_env(name):
    """Return the value of environment variable ``name`` or fail loudly.

    Assigning ``None`` into ``os.environ`` raises an opaque
    ``TypeError: str expected, not NoneType``; raising here instead names the
    variable that is actually missing.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"{name} is not set; define it in .env or in the environment")
    return value


os.environ['OPENAI_API_KEY'] = _require_env('OPENAI_API_KEY')
os.environ['HF_TOKEN'] = _require_env('HF_TOKEN')

In [3]:
# Authenticate against the HuggingFace Hub with the token held in the environment

hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
# OpenAI client used by gpt_5_mini_rag below.
# NOTE(review): constructed without arguments, so it presumably reads OPENAI_API_KEY from the environment — confirm.
openai = OpenAI()

In [5]:
# Load the pickled test items from the working directory.
# NOTE(review): pickle.load executes arbitrary code from the file — only load a test.pkl you produced yourself.
with open("test.pkl", "rb") as f: 
    test = pickle.load(f)

In [6]:
# Path (relative to the working directory) handed to the Chroma persistent client
DB = "products_vectorstore/"

# Client rooted at DB, and the "product" collection that find_similars queries
client = chromadb.PersistentClient(path=DB)
collection = client.get_or_create_collection("product")

In [7]:
# Sentence-embedding model used by vector() to encode item descriptions.
# NOTE(review): presumably must match the model used to populate the collection — confirm.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [8]:
# Display the test prompt of the first test item (shown as the cell output)
test[0].test_prompt()

'How much does this cost to the nearest dollar?\n\nMain_Category: Toys & Games\n\nMedicom Batman Hush Black Suit Version Batman Real Hero Action Figure\nFrom the Manufacturer From MEDICOM Toy! Based on the best-selling series Batman  Hush by Jeph Loeb and Jim Lee, Medicom unveils two more releases in their DC Comics Real Action Hero line of figures. Superman and Batman (Black Suit) are both sculpted from original designs by Jim Lee and are built using the finest craftsmanship and costume tailoring. If you can find a nicer looking Batman or Superman, buy him! From Medicom Toys Based on the best-selling DC Comics miniseries Batman  Hush by Jeph Loeb and Jim Lee 1 figure Sculpted from original designs by Jim Lee Built using the finest craftmanship and costume tailoring  "Product Dimensions"  "12 x \n\nPrice is $'

In [9]:
def description(item):
    """Return the descriptive part of ``item.prompt``.

    The leading question sentence is removed wherever it occurs, and everything
    from the first "Price is $" marker onwards is dropped. If the marker is
    absent, the question-stripped text is returned unchanged.
    """
    question = "How much does this cost to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"
    without_question = item.prompt.replace(question, "")
    before_price, _, _ = without_question.partition(price_marker)
    return before_price

In [10]:
def vector(item):
    """Encode the item's description with the notebook-level ``embedding_model``.

    The description is wrapped in a one-element list before encoding, so the
    result is whatever ``encode`` returns for a batch containing a single text.
    """
    texts = [description(item)]
    return embedding_model.encode(texts)

In [11]:
def find_similars(item):
    """Query the collection with the item's embedding.

    Returns a ``(documents, prices)`` pair taken from the first (and only)
    query's results: the matched document texts and the ``"price"`` entry of
    each match's metadata, in the order the collection returned them.
    """
    hits = collection.query(query_embeddings=vector(item))
    matched_docs = hits["documents"][0][:]
    matched_metas = hits["metadatas"][0]
    matched_prices = [entry["price"] for entry in matched_metas]
    return matched_docs, matched_prices

In [12]:
# Display the (documents, prices) pair retrieved for the second test item
find_similars(test[1])

(['Main_Category: Tools & Home Improvement\n\nPorlik 8 Inch Heavy Duty Metal Peg Board Shelving Hooks, Seven-Station Waterfall Hook, Pegboard Hook Steel Stainless Hooks Pegs Duty，Heavy Control Wall Hangers，Heavy Duty Pegboard Pack\nThese hooks are a simple yet brilliant storage solution for any kitchen or work area where space is limited and you might need to something up. These hooks are made of stainless steel which is strong and safe, so you can heavier items without fear of them falling or becoming damaged. Material  Stainless Steel. - Take little space for hanging displaying goods or tools, easy to install and use. - for hanging tools, brushes, extension cables, sports equipment, etc., to make your room more concise and tidy. - Hooks used for grid wall brackets to make your products great cube displaying. - ',
  'Main_Category: Tools & Home Improvement\n\nFactorDuty 200 Pack 2 inch Length Steel Black PEG Board Pegboard Shelving Hooks Wholesale Lot 20 lbs PEG Hooks Organizer Tools 

In [15]:
def make_context(similars, prices):
    """Build the context preamble listing related products and their prices.

    ``similars`` and ``prices`` are paired positionally (extra entries in the
    longer one are ignored); each price is rendered with two decimals.
    """
    intro = "To provide some context, here are some other items that might be similar to the item you need to estimate.\n\n"
    entries = [
        f"Potentially related product:\n{doc}\nPrice is ${cost:.2f}\n\n"
        for doc, cost in zip(similars, prices)
    ]
    return intro + "".join(entries)

In [16]:
def messages_for(item, similars, prices):
    """Assemble the chat messages for estimating ``item``'s price.

    The user message is the related-products context followed by the item's
    test prompt with "to the nearest dollar" and "Price is $" removed; the
    final assistant message carries the "Price is $" lead-in instead.
    """
    context = make_context(similars, prices)
    question = item.test_prompt().replace("to the nearest dollar", "").replace("Price is $", "")
    user_prompt = f"{context}And now the question for you:\n\n{question}"

    system_turn = {
        "role": "system",
        "content": "You estimate prices of items. Reply only with the price, no explanation.",
    }
    user_turn = {"role": "user", "content": user_prompt}
    assistant_turn = {"role": "assistant", "content": "Price is $"}
    return [system_turn, user_turn, assistant_turn]

In [17]:
# Retrieve similar documents and their prices for the third test item; reused by the cells below
docs, prices = find_similars(test[2])

In [18]:
# Show the item the neighbours above were retrieved for
print(test[2])

<Nady RSM-5 Ribbon Microphone - Unique compact shape perfect for close miking, includes microphone clip and soft cloth pouch = $109.99>


In [None]:
# Render the context preamble built from the retrieved documents and prices
print(make_context(docs, prices))

In [20]:
# docs/prices were retrieved for test[2], so build the messages for that same item;
# pairing them with a different item would show an unrelated context next to the question.
print(messages_for(test[2], docs, prices))

[{'role': 'system', 'content': 'You estimate prices of items. Reply only with the price, no explanation.'}, {'role': 'user', 'content': 'To provide some context, here are some other items that might be similar to the item you need to estimate.\n\nPotentially related product:\nMain_Category: Musical Instruments\n\nCascade Microphones 98-G-A FAT HEAD Ribbon Microphone, Brown Body/Gold Grill\nUnlike most ribbon motors designed today with an offset ribbon the new Cascade FAT HEAD houses a hand-tuned ribbon element that incorporates the legendary symmetrical ribbon design. This design offers a true figure 8 pattern. The corrugated aluminum membrane itself is positioned in the center from front to back, thus producing a balanced audio input signal to both sides of the ribbon assembly. This design is very useful when executing a mid-side or Blumlein recording set-up and also great for live stage use. The FAT HEAD warm full-bodied signature and increased sensitivity is what you would expect an

In [60]:
def get_price(s):
    """Extract the first number in a model reply and return it as a float.

    Dollar signs and thousands separators (commas) are stripped before
    searching. Returns 0 when ``s`` is ``None``/empty or contains no digits.
    """
    if not s:
        return 0
    cleaned = s.replace("$", "").replace(",", "")
    # The decimal point must be escaped: an unescaped "." matches ANY character,
    # so replies such as "USD45" or "12-15" matched "D45" / "12-15" and then
    # raised ValueError inside float().
    match = re.search(r"[-+]?\d*\.\d+|\d+", cleaned)
    return float(match.group()) if match else 0

In [74]:
def gpt_5_mini_rag(item):
    """Estimate ``item``'s price with gpt-5-mini, using retrieved neighbours as context.

    Looks up similar products, sends the assembled messages to the chat
    completions endpoint, prints both the prompt and the raw reply, and returns
    the number parsed from the reply by ``get_price``.
    """
    documents, prices = find_similars(item)
    # Build the prompt once so the printed messages are exactly the ones sent
    messages = messages_for(item, documents, prices)
    response = openai.chat.completions.create(
        model="gpt-5-mini",
        messages=messages,
        seed=42
    )
    print(messages)
    reply = response.choices[0].message.content
    print(reply)
    # NOTE(review): the SDK types message.content as optional; treat a missing
    # reply as empty text so parsing yields 0 instead of raising.
    return get_price(reply or "")

In [97]:
# Stored price of test item 500, for comparison with the model estimate in the next cell
test[500].price

139.37

In [100]:
# Spot check: run the RAG estimator on test item 500 (prints the prompt and reply, returns the parsed price)
gpt_5_mini_rag(test[500])

$259.99


259.99