In [1]:
import chromadb
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
import numpy as np
import pandas as pd
import time
from PIL import Image
from IPython.display import display
import matplotlib.pyplot as plt
import requests
import os

  from pandas.core import (


In [2]:
# Open the on-disk Chroma database stored under ./chromadb and fetch (or create)
# the "apparel_50k" collection with an OpenCLIP embedding function and an image
# data loader attached.
client = chromadb.PersistentClient(path="chromadb")
clip_embedding_function = OpenCLIPEmbeddingFunction()
image_loader = ImageLoader()
apparel_collection = client.get_or_create_collection(
    "apparel_50k",
    embedding_function=clip_embedding_function,
    data_loader=image_loader,
)
# Last expression of the cell: display how many items the collection holds.
apparel_collection.count()

90000

In [3]:
# Read apparel_test.csv into a DataFrame. Later cells use its "category_id",
# "price" and "title" columns.
df = pd.read_csv("apparel_test.csv")

In [4]:
# Build the evaluation subset: shuffle every row reproducibly (fixed seed), then
# keep the first 25 shuffled rows of each category_id.
random_state = 329
shuffled_df = df.sample(n=len(df), random_state=random_state)
test_df = shuffled_df.groupby("category_id").head(25)

# Parallel arrays consumed by the evaluation loop: ground-truth price and the
# product title that is used as the query text.
test_prices = test_df["price"].values
test_descriptions = test_df["title"].values

len(test_df)

250

In [6]:
import langchain
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain_community.vectorstores import Chroma
from langchain_experimental.open_clip import OpenCLIPEmbeddings
from langchain_core.runnables import RunnablePassthrough

# Only the LangChain project name and the tracing flag are set here.
# NOTE(review): LANGCHAIN_API_KEY and HUGGINGFACEHUB_API_TOKEN are not set in
# this cell; presumably they are expected to already be present in the
# environment -- confirm before running on a fresh machine.
os.environ["LANGCHAIN_PROJECT"] = "price-discovery"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [7]:
# LLM used for both price-range generation and output fixing: the
# mistralai/Mixtral-8x7B-Instruct-v0.1 repo served through a Hugging Face endpoint.
# NOTE(review): "use_cache": False is presumably meant to stop the endpoint from
# returning a cached completion for a repeated prompt -- confirm it is honoured
# when passed through model_kwargs.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"use_cache": False}
)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\trung\.cache\huggingface\token
Login successful


In [8]:
# Expose the existing "apparel_50k" collection through LangChain's Chroma
# wrapper, reusing the persistent client opened earlier.
# NOTE(review): the OpenCLIP model/checkpoint here should match the one used
# when the collection was populated -- confirm.
clip_embeddings = OpenCLIPEmbeddings(
    model_name="ViT-B-32", checkpoint="laion2b_s34b_b79k"
)
langchain_apparel_collection = Chroma(
    client=client,
    collection_name="apparel_50k",
    embedding_function=clip_embeddings,
)

# The retriever returns 20 neighbours per query; get_similar_products only
# uses the first five of them.
retriever_search_kwargs = {"k": 20}
retriever = langchain_apparel_collection.as_retriever(
    search_kwargs=retriever_search_kwargs
)

# Global log of retrieved descriptions; get_similar_products appends one list
# per call.
contexts = []


def get_similar_products(docs):
    """Render the first five retrieved documents as a bullet list for the prompt.

    Each bullet has the form ``-<description>. Price: $<price>`` followed by a
    newline, built from the document's ``metadata`` dict.

    Side effect: appends the list of the descriptions used to the module-level
    ``contexts`` list (one entry per call).

    Args:
        docs: Sequence of retrieved documents exposing ``metadata["description"]``
            and ``metadata["price"]``.

    Returns:
        The concatenated bullet lines (empty string when ``docs`` is empty).
    """
    bullet_lines = []
    used_descriptions = []
    for retrieved in docs[:5]:
        meta = retrieved.metadata
        bullet_lines.append(
            "-{}. Price: ${}\n".format(meta["description"], meta["price"])
        )
        used_descriptions.append(meta["description"])
    contexts.append(used_descriptions)
    return "".join(bullet_lines)

In [9]:
# retriever.get_relevant_documents(query='Men\'s Retro 6"Hare Neutral Grey/Black-White (CT8529 062)')

In [10]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import OutputFixingParser

# Fields the LLM is asked to return, as (name, description, type) triples.
_PRICE_RANGE_FIELDS = (
    ("min_price", "The reasonable minimum price for the product", "number"),
    ("max_price", "The reasonable maximum price for the product", "number"),
    ("reason", "Explanation for establishing the price range", "text"),
)

# One ResponseSchema per field, bound to the same names as before.
min_price_schema, max_price_schema, reason_schema = (
    ResponseSchema(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in _PRICE_RANGE_FIELDS
)

# Parser that supplies the format instructions for the prompt and parses the
# LLM completion into a dict with the three keys above.
price_range_parser = StructuredOutputParser.from_response_schemas(
    [min_price_schema, max_price_schema, reason_schema]
)

In [11]:
# Prompt asking the model for a price range plus rationale. Placeholders:
#   {original_product}    - description of the product being priced
#   {similar_products}    - bullet list produced by get_similar_products
#   {format_instructions} - output-format text from price_range_parser
# NOTE(review): the template closes with "</s>" right after "[/INST]". Mistral's
# published instruct format puts "</s>" after the model's answer, not at the end
# of the prompt -- confirm this (and the explicit leading "<s>") is intended.
price_range_prompt_template = """<s> [INST]
Imagine you are an expert in the field of apparel and you are asked to provide a price range for the following product: {original_product}
Here are some products that are related to the product you are asked to provide a price range for. \
Pick the most similar products and use them to come up with an accurate price range. \
Similar products are those with closely matching specifications based on criteria such as type of product, functionality, target users, style, material, and brand.

SIMILAR PRODUCTS:
{similar_products}

Please provide a price range for the product you are asked to provide a price range for and a comprehensive and detailed rationale for the specified price range. \
Don't put any comments in the final answer.
{format_instructions} [/INST] </s>
"""

# format_instructions is filled in once here via partial_variables; the other
# two placeholders are left to be supplied when the prompt is invoked.
price_range_prompt = PromptTemplate.from_template(
    price_range_prompt_template,
    partial_variables={
        "format_instructions": price_range_parser.get_format_instructions()
    },
)

In [12]:
from huggingface_hub.utils import HfHubHTTPError
from langchain_core.exceptions import OutputParserException
import time


def _fix_response_format(parser, llm_response):
    """Ask the LLM to rewrite a completion that `parser` rejected, then parse it.

    Raises OutputParserException if the rewritten completion still fails to parse.
    """
    output_fixing_prompt = PromptTemplate(
        input_variables=["completion", "error", "instructions"],
        template="<s> [INST] Instructions:\n--------------\n{instructions}\n--------------\nCompletion:\n--------------\n{completion}\n--------------\n\nAbove, the Completion did not satisfy the constraints given in the Instructions.\nError:\n--------------\n{error}\n--------------\n\nPlease try again. Please only respond with an answer that satisfies the constraints laid out in the Instructions: [/INST] </s>",
    )
    output_fixing_parser = OutputFixingParser.from_llm(
        parser=parser, prompt=output_fixing_prompt, llm=llm
    )
    return output_fixing_parser.parse(llm_response)


def hf_call(chain, input=None, parser=None, max_tries=5):
    """Invoke `chain` with retries and optionally parse its output.

    Args:
        chain: Runnable whose ``invoke(input)`` returns the raw LLM completion.
        input: Value passed to ``chain.invoke`` (usually a dictionary). ``None``
            is treated as an empty dict.
        parser: Optional output parser. When ``None`` the raw completion is
            returned unparsed.
        max_tries: Maximum number of attempts that may end in a parse failure.
            HTTP errors do not count towards this limit.

    Returns:
        The raw completion (no parser), the parsed completion, or ``None`` when
        every attempt ended with output that could not be parsed or fixed.

    Side effects:
        Reads and trims the module-level ``contexts`` list: entries added to it
        during an attempt that has to be retried are removed again, so a
        successful call leaves only the entries of its final attempt.
        On ``HfHubHTTPError`` the call sleeps for one hour before retrying.
    """
    if input is None:
        input = {}

    tries = 0
    while tries < max_tries:
        # Remember where this attempt starts so that whatever the chain appends
        # to `contexts` can be rolled back if the attempt is thrown away.
        mark = len(contexts)
        try:
            llm_response = chain.invoke(input)
            if parser is None:
                return llm_response
            try:
                return parser.parse(llm_response)
            except OutputParserException as e:
                print(e)
                print("Trying to fix the response format")
                # Errors raised here (parse or HTTP) reach the outer handlers.
                return _fix_response_format(parser, llm_response)
        except OutputParserException as e:
            print(e)
            print("Failed to fix the response format. Will send another LLM call")
            del contexts[mark:]
            tries += 1
        except HfHubHTTPError as e:
            print(e)
            print(f"Slepping for 1 hour since {time.ctime()}")
            # The retry will retrieve again, so drop this attempt's contexts
            # first; otherwise `contexts` gains a duplicate entry.
            del contexts[mark:]
            time.sleep(3600)
    return None

In [None]:
# Retrieval-augmented chain: the query text goes both to the retriever (whose
# documents get_similar_products renders as a bullet list) and, unchanged, into
# the prompt as the product to price; the filled prompt is then sent to the LLM.
rag_chain = (
    {
        "similar_products": retriever | get_similar_products,
        "original_product": RunnablePassthrough(),
    }
    | price_range_prompt
    | llm
)

# Reset the accumulators so re-running this cell starts from a clean state.
# `contexts` is the global list that get_similar_products appends to.
contexts = []
price_ranges = []
reasons = []
for i, (test_description, test_price) in enumerate(
    zip(test_descriptions, test_prices)
):
    print(i)
    print("Test Description: ", test_description)
    print("Test Price: $", test_price)

    contexts_before = len(contexts)
    parsed_response = hf_call(
        rag_chain, input=test_description, parser=price_range_parser
    )
    succeeded = parsed_response is not None

    # Keep exactly one `contexts` entry per test item so that contexts,
    # price_ranges and reasons stay index-aligned. Retries inside hf_call may
    # have left zero or several entries for this item; the last one belongs to
    # the attempt that produced the answer.
    if succeeded and len(contexts) > contexts_before:
        item_context = contexts[-1]
    else:
        item_context = []
    contexts[contexts_before:] = [item_context]

    if not succeeded:
        # hf_call returns None once it runs out of tries; record placeholders
        # instead of failing on parsed_response["min_price"].
        print("No valid price range after retries; recording placeholders", "\n")
        price_ranges.append([None, None])
        reasons.append(None)
        continue

    price_ranges.append([parsed_response["min_price"], parsed_response["max_price"]])
    reasons.append(parsed_response["reason"])
    print(parsed_response, "\n")

In [14]:
# Bundle the evaluation inputs and the collected outputs of the RAG run into a
# single dict of parallel lists (the two arrays are converted to plain lists).
rag_mixtral_dataset = dict(
    test_descriptions=list(test_descriptions),
    test_prices=list(test_prices),
    price_ranges=price_ranges,
    reasons=reasons,
    contexts=contexts,
)

In [16]:
# import pickle
# with open('rag_eval/rag_mixtral_dataset_250.pkl', 'wb') as f:
#     pickle.dump(rag_mixtral_dataset, f)

Filtering the retrieved products

In [17]:
# from langchain_core.pydantic_v1 import BaseModel, Field
# from langchain.output_parsers import PydanticOutputParser
# from langchain.output_parsers import OutputFixingParser

# class IsSimilar(BaseModel):
#     similar: bool = Field(..., title="Is Similar", description="Whether the two products are similar to each other (True or False)")

# filter_parser = PydanticOutputParser(pydantic_object=IsSimilar)

# output_fixing_prompt = PromptTemplate(input_variables=['completion', 'error', 'instructions'], template='<s> [INST] Instructions:\n--------------\n{instructions}\n--------------\nCompletion:\n--------------\n{completion}\n--------------\n\nAbove, the Completion did not satisfy the constraints given in the Instructions.\nError:\n--------------\n{error}\n--------------\n\nPlease try again. Please only respond with an answer that satisfies the constraints laid out in the Instructions: [/INST] </s>')
# filter_output_fixing_parser = OutputFixingParser.from_llm(parser=filter_parser, prompt=output_fixing_prompt, llm=llm, max_retries=3)

In [18]:
# filter_prompt_template = """<s> [INST]
#     Imagine you are a apparel retailer and you want to know if the two products are in the same type of apparel category.\
#     There are a few criteria that need to satisfied in order for two products to be in the same category.
#     - Same type of products (e.g. both are T-shirts, both are watches)
#     - Same functionality: Consider the primary use of the product (e.g. running shoes, casual shoes)
#     - Same target users: Consider gender, age group, etc.
#     - Similar style: Consider the design, color, etc.
#     - Same material: Consider the material used in the product
#     - Same brand: Consider the brand name
#     You should evaluate each of these criteria and decide if the two products are in the same category.
#     Some of these criteria may not be available in the product description. In that case, you can make your own judgement based on the available information.\
#     These are just a few examples of the criteria. You can use your own judgement to decide if the two products are in the same category.
#     These are the two products' descriptions:
#     Product 1: {product_1}
#     Product 2: {product_2}

#     Please evaluate if the two products are in the same category(True or False). Make sure to put the result in between three backticks.
#     {format_instructions} [/INST] </s>
#     """
# filter_prompt = PromptTemplate.from_template(filter_prompt_template, partial_variables={"format_instructions": filter_parser.get_format_instructions()})

In [19]:
# filter_prompt.invoke({"product_1": "This is a T-shirt", "product_2": "This is a T-shirt"})

In [21]:
# import time

# filter_chain = (
#     filter_prompt
#     | llm
# )

# llm_responses = {}
# not_enough = 0
# num_true = 0
# num_false = 0
# false_products = []

# filtered_prices = []
# filtered_descriptions = []
# for i in range(len(retrieved_descriptions)):
#     filtered_price_lst = []
#     filtered_description_lst = []
#     for j in range(len(retrieved_descriptions[i])):
#         print(f"{i}-{j}. {test_descriptions[i]} vs {retrieved_descriptions[i][j]}")

#         try:
#             llm_response = filter_chain.invoke({"product_1": test_descriptions[i], "product_2": retrieved_descriptions[i][j]})
#         except Exception as e:
#             print(e)
#             print(f"Slepping for 1 hour since {time.ctime()}")
#             time.sleep(3600)
#             llm_response = filter_chain.invoke({"product_1": test_descriptions[i], "product_2": retrieved_descriptions[i][j]})

#         try:
#             parsed_response = filter_parser.parse(llm_response)
#         except Exception as e:
#             print(e)
#             print("Try to fix output...")
#             parsed_response = filter_output_fixing_parser.parse(llm_response)

#         print(parsed_response)

#         if parsed_response.similar:
#             num_true += 1
#             filtered_price_lst.append(retrieved_prices[i][j])
#             filtered_description_lst.append(retrieved_descriptions[i][j])
#         else:
#             num_false += 1
#             false_products.append(f"{i}-{j}. {test_descriptions[i]} vs {retrieved_descriptions[i][j]}")

#         if len(filtered_price_lst) == 5:
#             break

#     if len(filtered_price_lst) < 3:
#         not_enough += 1
#         filtered_price_lst = retrieved_prices[i][:5]
#         filtered_description_lst = retrieved_descriptions[i][:5]

#     filtered_prices.append(filtered_price_lst)
#     filtered_descriptions.append(filtered_description_lst)

In [22]:
# print("Not enough:", not_enough)
# print("True:", num_true)
# print("False:", num_false)
# for product in false_products:
#     print(product)

In [23]:
# filterd_similar_products = []
# for i in range(size):
#     similar_product = ""
#     for j in range(len(filtered_descriptions[i])):
#         similar_product += "-{}. Price: ${}\n".format(filtered_descriptions[i][j], filtered_prices[i][j])
#     filterd_similar_products.append(similar_product)

In [24]:
# filter_rag_chain = (
#     price_range_prompt
#     | llm
# )

# filter_responses = []
# for i in range(size):
#     test_description = test_descriptions[i]
#     test_price = test_prices[i]
#     similar_products = filterd_similar_products[i]

#     print("Test Description: ", test_description)
#     print("Test Price: $", test_price)

#     llm_response = filter_rag_chain.invoke({"similar_products": similar_products, "original_product": test_description})

#     try:
#         parsed_response = parser.parse(llm_response)
#     except Exception as e:
#         print(e)
#         print("Try to fix output...")
#         parsed_response = output_fixing_parser.parse(llm_response)

#     filter_responses.append(parsed_response)
#     print(parsed_response, "\n")

In [25]:
# filter_rag_dataset = {
#     "test_descriptions": test_descriptions,
#     "test_prices": test_prices,
#     "contexts": filtered_descriptions,
#     "responses": filter_responses
# }
# # Save the data to pickle
# import pickle
# with open('rag_eval/filter_rag_dataset.pkl', 'wb') as f:
#     pickle.dump(filter_rag_dataset, f)