In [1]:
import chromadb
import pandas as pd

# Read the processed data
data = pd.read_csv('processed_data.csv')

client = chromadb.Client()
# Create a table to store data
collection = client.get_or_create_collection('Test')

In [2]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,brand,description,initial_price,final_price,currency,availability,reviews_count,images_count,...,discount,manufacturer,delivery,buybox_prices,parent_asin,is_available,images,product_details,max_quantity_available,return_policy
0,0,New 15.6 inches Compatible with ROG Strix GL50...,Generic,New Item. Direct form Original Factory. Compat...,68.3,68.3,USD,In Stock,0,4,...,,Generic,"[""FREE delivery July 8 - 10. "",""Or fastest del...","{""final_price"":68.3,""unit_price"":null}",B07DH6VZB3,True,"[""https://m.media-amazon.com/images/I/61EhnSIM...","[{""type"":""Package Dimensions"",""value"":""12 x 8 ...",30.0,30-day refund/replacement
1,1,"Li-ion 800mAh Battery for FinePix F100fd, Fine...",GAXI,Cameron Sino created equal or higher capacity ...,,,USD,Currently unavailable. We don't know when or i...,0,4,...,,Cameron Sino Technology Limited,,"{""unit_price"":null}",B07ZQC3918,False,"[""https://m.media-amazon.com/images/I/41N-GtE+...","[{""type"":""Product Dimensions"",""value"":""1 x 1 x...",,
2,2,Killer E3100 USB-C 3.1 to RJ-45 2.5Gbps Ethern...,Killer,Killer E3100 USB-C 3.1 to RJ-45 2.5Gbps Ethern...,48.61,48.61,USD,Only 11 left in stock - order soon.,108,5,...,,Killer,"[""FREE delivery Wednesday, July 16. Order with...","{""final_price"":48.61,""unit_price"":null}",B08HNJC3R8,True,"[""https://m.media-amazon.com/images/I/51mUbLqP...","[{""type"":""Brand"",""value"":""Killer""},{""type"":""It...",11.0,
3,3,Cameron-Sino Replacement Battery for Panasonic...,Cameron-Sino,Cameron-Sino Replacement Battery for Panasonic...,,,USD,,0,5,...,,Cameron-Sino,,"{""unit_price"":null}",B07VSNN7XX,False,"[""https://m.media-amazon.com/images/I/415EF5Tz...","[{""type"":""Product Dimensions"",""value"":""3.31 x ...",,
4,4,MightySkins Skin Compatible with HP Pavilion x...,MIGHTY SKINS,Give your HP Pavilion x360 - 11t Touch Laptop ...,,,USD,Currently unavailable. We don't know when or i...,0,3,...,,MightySkins,,"{""unit_price"":null}",B01BLSXRFY,False,"[""https://m.media-amazon.com/images/I/51HaEk3I...","[{""type"":""Product Dimensions"",""value"":""16 x 1 ...",,


In [3]:
# Enhanced prompt template using data from processed_data.csv
def create_product_prompt(row):
    """
    Create a comprehensive product prompt for LLM using all available data
    """
    import pandas as pd
    
    # Helper function to safely get values and handle NaN
    def safe_get(key, default=None):
        value = row.get(key, default)
        if pd.isna(value) or value == '' or value == 'nan':
            return default
        return value
    
    # Basic product information
    title = safe_get('title', 'Product name not available')
    brand = safe_get('brand', 'Brand not specified')
    description = safe_get('description', 'No description available')
    
    # Pricing information
    initial_price = safe_get('initial_price')
    final_price = safe_get('final_price')
    currency = safe_get('currency', 'USD')
    discount = safe_get('discount')
    
    # Create price text
    if final_price:
        try:
            final_price = float(final_price)
            if initial_price:
                initial_price = float(initial_price)
                if initial_price != final_price:
                    price_text = f"The current price is {final_price} {currency}, originally priced at {initial_price} {currency}"
                    if discount:
                        price_text += f" (discount: {discount})"
                else:
                    price_text = f"The price is {final_price} {currency}"
            else:
                price_text = f"The price is {final_price} {currency}"
        except (ValueError, TypeError):
            price_text = "The price is not available - please contact the seller"
    else:
        price_text = "The price is not available - please contact the seller"
    
    # Availability and stock information
    availability = safe_get('availability', 'Availability unknown')
    is_available = safe_get('is_available', False)
    max_quantity = safe_get('max_quantity_available')
    
    availability_text = f"Availability: {availability}"
    if max_quantity:
        try:
            max_quantity = float(max_quantity)
            availability_text += f" (Maximum quantity available: {int(max_quantity)})"
        except (ValueError, TypeError):
            pass
    
    # Product specifications
    item_weight = safe_get('item_weight')
    product_dimensions = safe_get('product_dimensions')
    
    specs_text = ""
    if item_weight:
        specs_text += f"Item weight: {item_weight}. "
    if product_dimensions:
        specs_text += f"Product dimensions: {product_dimensions}. "
    
    # Reviews and ratings
    rating = safe_get('rating')
    reviews_count = safe_get('reviews_count', 0)
    
    try:
        reviews_count = int(float(reviews_count)) if reviews_count else 0
        rating = float(rating) if rating else None
    except (ValueError, TypeError):
        reviews_count = 0
        rating = None
    
    if rating and reviews_count:
        reviews_text = f"Customer rating: {rating} out of 5 stars based on {reviews_count} reviews"
    elif reviews_count > 0:
        reviews_text = f"Based on {reviews_count} customer reviews"
    else:
        reviews_text = "No customer reviews available yet"
    
    # Delivery information
    delivery = safe_get('delivery')
    delivery_text = f"Delivery: {delivery}" if delivery else "Delivery information not specified"
    
    # Return policy
    return_policy = safe_get('return_policy')
    return_text = f"Return policy: {return_policy}" if return_policy else "Return policy not specified"
    
    # Manufacturer information
    manufacturer = safe_get('manufacturer')
    manufacturer_text = f"Manufactured by {manufacturer}" if manufacturer else ""
    
    # Images information
    images_count = safe_get('images_count', 0)
    try:
        images_count = int(float(images_count)) if images_count else 0
    except (ValueError, TypeError):
        images_count = 0
    
    images_text = f"Product has {images_count} images available" if images_count > 0 else "No product images available"
    
    # Construct the comprehensive prompt
    prompt = f"""
Product Information:
{title} is sold on Amazon from the brand {brand}.

Description:
{description}

Pricing:
{price_text}

Availability:
{availability_text}

Customer Reviews:
{reviews_text}

Product Specifications:
{specs_text if specs_text else 'Product specifications not available'}

Shipping & Returns:
{delivery_text}
{return_text}

Additional Information:
{manufacturer_text}
{images_text}
"""
    
    return prompt.strip()

# Example usage with a sample row
sample_prompt = create_product_prompt(data.iloc[0])
print("Sample prompt for first product:")
print("="*50)
print(sample_prompt)

Sample prompt for first product:
Product Information:
New 15.6 inches Compatible with ROG Strix GL503VM-DB74 Gaming Laptop IPS FHD 1080P Laptop LED LCD Replacement Screen is sold on Amazon from the brand Generic.

Description:
New Item. Direct form Original Factory. Compatibility Guaranteed. Pixels Policy in Accordance with ISO Regulation. Fullcom – We export high quality LCD screens throughout more than 80 countries all over the world. With our longer than 15 years LCD-business experience, we have the confidence to provide you several advantages make your business successfully.First of all, we have strong support directly from original manufacturers, we can offer you stable quality LCD at reasonable price. Secondly, we have professional purchasing team locates in Taiwan. We deem to search for reliable quality screens. Most importantly, in order to keep quality stable, we also test screens 100% in order to confirm all screens must approve by ISO-13406. To work in this way, we hope to s

In [12]:
import ollama
from tqdm._tqdm_notebook import tqdm

embeddings = []

print("Creating prompts for all products...")
for i in tqdm(range(50)):
    prompt = data.title[i]

    embed = ollama.embed(
        model='llama3.2:1B',
        input=prompt
    )

    embeddings.append((i, embed))

Creating prompts for all products...


  0%|          | 0/50 [00:00<?, ?it/s]

In [15]:
# Convert embeddings to the right format (remove index from tuples)
embed_vectors = []
for i in range(50):
    embed_vectors.append(embeddings[i][1]['embeddings'])

ids = [data.iloc[i, 1] for i in range(50)]

# Create proper serializable metadata dictionaries
metadatas = []
for i in range(50):
    # Convert the row to a dictionary and keep only string/numeric values
    row_dict = data.iloc[i].to_dict()
    # Add the source title as a special field
    row_dict["source"] = f"Title: {data.iloc[i]['title']}"
    metadatas.append(row_dict)

# Add documents to collection
collection.add(
    ids=ids,
    embeddings=[embed_vectors[i][0] for i in range(50)],
    metadatas=metadatas
)

In [17]:
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction

llama_embeder = OllamaEmbeddingFunction(model_name='llama3.2:1B')
res_list = []

input_query = '''
ROG gaming laptop from Generic.
'''


q_embed = llama_embeder([input_query])

res_list.append(collection.query(
    query_embeddings=q_embed,
    n_results=1
))

res_list

[{'ids': [['Recorder Battery Replacement for Casio Exilim EX-S6SR,Exilim EX-S8,Exilim EX-S8BE,Exilim EX-S8BK,Exilim EX-S8PE,Exilim EX-S8PK NP-82']],
  'embeddings': None,
  'documents': [[None]],
  'uris': None,
  'included': ['metadatas', 'documents', 'distances'],
  'data': None,
  'metadatas': [[{'availability': "Currently unavailable. We don't know when or if this item will be back in stock.",
     'currency': 'USD',
     'parent_asin': 'B0B616NHFB',
     'url': 'https://www.amazon.com/Recorder-Battery-Replacement-EX-S6SR-EX-S8BE/dp/B0B616NHFB?th=1&psc=1&language=en_US&currency=USD',
     'images_count': '5',
     'title': 'Recorder Battery Replacement for Casio Exilim EX-S6SR,Exilim EX-S8,Exilim EX-S8BE,Exilim EX-S8BK,Exilim EX-S8PE,Exilim EX-S8PK NP-82',
     'reviews_count': '0',
     'manufacturer': 'Willump',
     'item_weight': '1.41 ounces',
     'source': 'Title: Recorder Battery Replacement for Casio Exilim EX-S6SR,Exilim EX-S8,Exilim EX-S8BE,Exilim EX-S8BK,Exilim EX-S8PE,

# Stop here, the code above is having a problem

In [81]:
import ollama
from tqdm._tqdm_notebook import tqdm

eval_list = []

for i in tqdm(range(100)):
  instruction_prompt = f'''
  You are a helpful chatbot that gives a concise and short answer.
  Use only the following pieces of context to answer the question. Don't make up any new information:
  {res_list[i]['metadatas'][0][0]['text'], res_list[i]['metadatas'][0][1]['text']}
  '''

  eval_prompt = '''
  You are an answer evaluator. Compare the correct answer with the predicted answer and determine if they match.
  The predicted answer must be CONSISTENT and CLEAR throughout - contradictory statements should be marked as incorrect.

  You can ONLY output:
  - "1" if the predicted answer is correct, consistent, and substantially matches the correct answer
  - "0" if the predicted answer is incorrect, contradictory, unclear, or does not match the correct answer

  Do not provide any explanation, just output the number.
  '''

  answer = ollama.chat(
      model='llama3.2:1B',
      messages=[
        {'role': 'system', 'content': instruction_prompt},
        {'role': 'user', 'content': test['test']['question'][i]},
      ]
  )

  # print the response from the chatbot in real-time
  # print('Chatbot response:')
  # print(answer['message']['content'])

  # Get the correct answer from your test dataset
  correct_answer = test['test']['answer'][i]  # Adjust index as needed
  predicted_answer = answer['message']['content']

  evaluation_input = f"Correct answer: {correct_answer}\nPredicted answer: {predicted_answer}"

  evaluation = ollama.chat(
      model='llama3.2:1B',
      messages=[
        {'role': 'system', 'content': eval_prompt},
        {'role': 'user', 'content': evaluation_input},
      ]
  )


  evaluation_score = evaluation['message']['content'].strip()
  eval_list.append(int(evaluation_score))

  0%|          | 0/100 [00:00<?, ?it/s]

KeyError: 'text'

In [None]:
correct = 0
for i in eval_list:
    if i == 1:
        correct += 1
correct

97

In [None]:
test['test']['answer'][3]

'18 months'