In [1]:
import openai
from qdrant_client import QdrantClient

from langsmith import Client
from qdrant_client import QdrantClient

# installed via ragas; we don't need to install it directly
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

### Download an example reference data point from LangSmith

In [2]:
# LangSmith client used by the cells below to read the evaluation dataset.
# NOTE(review): constructed with no arguments — presumably credentials/endpoint
# come from the environment; confirm before running on a fresh machine.
client = Client()

In [3]:
# Look up the LangSmith dataset by name; `dataset.id` is used below to list
# its examples.
dataset = client.read_dataset(
    dataset_name="rag-evaluation-dataset"
)

In [4]:
dataset

Dataset(name='rag-evaluation-dataset', description='Dataset for evaluating RAG pipeline', data_type=<DataType.kv: 'kv'>, id=UUID('2757993d-580d-4c2e-8193-d96a991b34db'), created_at=datetime.datetime(2026, 1, 18, 0, 26, 31, 617616, tzinfo=TzInfo(0)), modified_at=datetime.datetime(2026, 1, 18, 0, 26, 31, 617616, tzinfo=TzInfo(0)), example_count=40, session_count=0, last_session_start_time=None, inputs_schema=None, outputs_schema=None, transformations=None, metadata={'runtime': {'sdk': 'langsmith-py', 'library': 'langsmith', 'runtime': 'python', 'platform': 'macOS-15.7.3-arm64-arm-64bit', 'sdk_version': '0.6.4', 'runtime_version': '3.12.10', 'langchain_version': None, 'py_implementation': 'CPython', 'langchain_core_version': None}})

In [5]:
# Peek at the `outputs` of the first example returned for this dataset
# (requests up to 10 examples and keeps only index 0).
list(client.list_examples(dataset_id=dataset.id, limit=10))[0].outputs

{'ground_truth': "Examples of unanswerable questions with this dataset: 1) 'How many units of KEEPRO Pencil are currently in stock?' ‚Äî no inventory counts provided. 2) 'What is the customer return rate for the Fossduck Bluetooth adapter?' ‚Äî no sales/returns data. 3) 'Do you offer bulk pricing or discounts for buying 100 Tenda A33 extenders?' ‚Äî pricing and bulk policy absent. 4) 'What is the shipment lead time from your warehouse to my ZIP code?' ‚Äî no fulfillment/shipping info. 5) 'Has the ACEMAGICIAN Mini PC passed any independent performance benchmarks compared to other Ryzen 5 5500U systems?' ‚Äî no benchmark comparisons in chunks.",
 'reference_context_ids': ['B0BF18F6R7',
  'B09PFSVK44',
  'B0BZ5R7CVP',
  'B0CF57H28T',
  'B0C9XFF3CT'],
 'reference_descriptions': ['KEEPRO Pencil 2nd Generation for iPad, Magnetic Wireless Charge Tilt Sensitivity Palm Rejection Active Pen for Apple iPad Pro 11" 4/3/2/1, iPad Pro 12.9" 6/5/4/3, iPad Air 4/5, iPad Mini 6 [Compatibility]- ONLY co

In [6]:
# Peek at the `inputs` of the first example returned for this dataset
# (requests up to 10 examples and keeps only index 0).
list(client.list_examples(dataset_id=dataset.id, limit=10))[0].inputs

{'question': 'Which five questions cannot be answered from these chunks? (Provide examples of types of questions that lack information here.)'}

In [7]:
# Fetch the example list once and reuse the first item, so that the input and
# the output are guaranteed to belong to the same example and we avoid a
# second, redundant request. `limit=10` and index 0 are kept so this picks the
# same example as the display cells above.
reference_example = list(client.list_examples(dataset_id=dataset.id, limit=10))[0]
reference_input = reference_example.inputs
reference_output = reference_example.outputs

### RAG Pipeline

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Value forwarded as ``input`` to ``openai.embeddings.create``.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` list.
    """
    embedding_response = openai.embeddings.create(model=model, input=text)
    first_entry = embedding_response.data[0]
    return first_entry.embedding


def retrieve_data(query, qdrant_client, k=5, collection_name="Amazon-items-collection-00"):
    """Embed ``query`` and fetch the ``k`` nearest points from a Qdrant collection.

    Args:
        query: Text to embed with ``get_embedding``.
        qdrant_client: Object exposing ``query_points(collection_name=..., query=..., limit=...)``.
        k: Maximum number of points to request.
        collection_name: Collection to search. Defaults to the collection this
            notebook was written against, so existing callers are unaffected.

    Returns:
        A dict of four parallel lists, one entry per returned point:
        ``retrieved_context_ids`` (payload ``parent_asin``),
        ``retrieved_context`` (payload ``description``),
        ``retrieved_context_ratings`` (payload ``average_rating``) and
        ``similarity_scores`` (the point's ``score``).

    Raises:
        KeyError: If a returned point's payload lacks one of the expected keys.
    """
    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=k,
    )

    retrieved_context_ids = []
    retrieved_context = []
    similarity_scores = []
    retrieved_context_ratings = []

    for point in results.points:
        payload = point.payload
        retrieved_context_ids.append(payload["parent_asin"])
        retrieved_context.append(payload["description"])
        retrieved_context_ratings.append(payload["average_rating"])
        similarity_scores.append(point.score)

    return {
        "retrieved_context_ids": retrieved_context_ids,
        "retrieved_context": retrieved_context,
        "retrieved_context_ratings": retrieved_context_ratings,
        "similarity_scores": similarity_scores,
    }


def process_context(context):
    """Render retrieved items as text, one ``- ID: ...`` line per item.

    Args:
        context: Mapping with the parallel lists ``retrieved_context_ids``,
            ``retrieved_context`` and ``retrieved_context_ratings``.

    Returns:
        A string with one newline-terminated line per item; the empty string
        when there are no items.
    """
    rows = zip(
        context["retrieved_context_ids"],
        context["retrieved_context"],
        context["retrieved_context_ratings"],
    )
    return "".join(
        f"- ID: {item_id}, rating: {item_rating}, description: {item_text}\n"
        for item_id, item_text, item_rating in rows
    )


def build_prompt(preprocessed_context, question):
    """Fill the shopping-assistant prompt template.

    Args:
        preprocessed_context: Text inserted under the ``Context:`` heading.
        question: Text inserted under the ``Question:`` heading.

    Returns:
        The complete prompt string (it begins and ends with a newline).
    """
    return f"""
You are a shopping assistant that can answer questions about the products in stock.

You will be given a question and a list of context.

Instructions:
- You need to answer the question based on the provided context only.
- Never use word context and refer to it as the available products.

Context:
{preprocessed_context}

Question:
{question}
"""


def generate_answer(prompt, model="gpt-5-nano", reasoning_effort="minimal"):
    """Send ``prompt`` as a single system message and return the reply text.

    Args:
        prompt: Full prompt text; sent as the content of one ``system`` message.
        model: Chat model name. Defaults to the model this notebook was
            written against, so existing callers are unaffected.
        reasoning_effort: Forwarded to the chat completions call. Pass ``None``
            to omit the argument entirely, e.g. for models that do not accept it.

    Returns:
        ``message.content`` of the first choice in the response.
    """
    request_kwargs = {
        "model": model,
        "messages": [{"role": "system", "content": prompt}],
    }
    # Only send reasoning_effort when requested, so the function can also be
    # used with models that reject this parameter.
    if reasoning_effort is not None:
        request_kwargs["reasoning_effort"] = reasoning_effort

    response = openai.chat.completions.create(**request_kwargs)

    return response.choices[0].message.content


def rag_pipeline(question, top_k=5, qdrant_client=None, qdrant_url="http://localhost:6333"):
    """Run retrieval and answer generation for one question.

    Args:
        question: User question; used both as the retrieval query and in the prompt.
        top_k: Number of items to retrieve.
        qdrant_client: Optional existing client to reuse across calls. When
            ``None`` (the default) a client is created for this call and
            closed once retrieval has finished.
        qdrant_url: URL used only when a client has to be created here.

    Returns:
        A dict with ``answer``, ``question``, ``retrieved_context_ids``,
        ``retrieved_context`` and ``similarity_scores``.
    """
    # Only close clients created here; a caller-supplied client stays open so
    # it can be shared across many calls (e.g. during a dataset-wide evaluation).
    owns_client = qdrant_client is None
    if owns_client:
        qdrant_client = QdrantClient(url=qdrant_url)

    try:
        retrieved_context = retrieve_data(question, qdrant_client, top_k)
    finally:
        if owns_client:
            qdrant_client.close()

    preprocessed_context = process_context(retrieved_context)
    prompt = build_prompt(preprocessed_context, question)
    answer = generate_answer(prompt)

    final_result = {
        "answer": answer,
        "question": question,
        "retrieved_context_ids": retrieved_context["retrieved_context_ids"],
        "retrieved_context": retrieved_context["retrieved_context"],
        "similarity_scores": retrieved_context["similarity_scores"]
    }

    return final_result

In [9]:
# Smoke test: run the pipeline end-to-end on an ad-hoc question and display
# the returned dict.
rag_pipeline("Can I get some charger?", top_k=5)

{'answer': 'Yes. Here are charger options from the available products:\n- B0BYYLJRHT: iPhone charger cables (3-pack, 3 ft each) with Apple MFi certification.\n- B0BFPZGYLD: 5 in 1 USB C to Multi Charging Cable (10 ft), includes Lightning, USB-C, Micro USB. Note: charging only, not for data.\n- B09TNXY54Y: MUXA 6-pack colorful nylon Lightning cables (various lengths: 3/3/6/6/10/10 ft), Apple MFi certified.\n- B0BV6PWVCG: GREPHONE 2-pack USB-C to Lightning cables (6 ft), Apple MFi certified.\n- B0BGDQLZD2: Mixblu charger cable replacement for Fitbit Inspire 3 (2-pack, 3.3 ft).\n\nIf you tell me which device you‚Äôre charging (iPhone, Android, Fitbit, etc.) and preferred length, I‚Äôll narrow down the best match.',
 'question': 'Can I get some charger?',
 'retrieved_context_ids': ['B0BYYLJRHT',
  'B0BFPZGYLD',
  'B09TNXY54Y',
  'B0BV6PWVCG',
  'B0BGDQLZD2'],
 'retrieved_context': ['iPhone Charger Cord Lightning Cables, Original 2022 Upgraded [3Pack 3ft] Apple MFi Certified USB A Charging 

### RAGAS metrics

In [10]:
from ragas.dataset_schema import SingleTurnSample 
from ragas.metrics import IDBasedContextPrecision, IDBasedContextRecall, Faithfulness, ResponseRelevancy

  from ragas.metrics import IDBasedContextPrecision, IDBasedContextRecall, Faithfulness, ResponseRelevancy
  from ragas.metrics import IDBasedContextPrecision, IDBasedContextRecall, Faithfulness, ResponseRelevancy
  from ragas.metrics import IDBasedContextPrecision, IDBasedContextRecall, Faithfulness, ResponseRelevancy
  from ragas.metrics import IDBasedContextPrecision, IDBasedContextRecall, Faithfulness, ResponseRelevancy


In [11]:
# Wrap the LangChain OpenAI chat model and embeddings in the ragas adapter
# classes; the metric functions below pass these to the ragas scorers.
ragas_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))
ragas_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))

  ragas_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))
  ragas_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))


In [12]:
reference_input

{'question': 'Which five questions cannot be answered from these chunks? (Provide examples of types of questions that lack information here.)'}

In [13]:
reference_output

{'ground_truth': "Examples of unanswerable questions with this dataset: 1) 'How many units of KEEPRO Pencil are currently in stock?' ‚Äî no inventory counts provided. 2) 'What is the customer return rate for the Fossduck Bluetooth adapter?' ‚Äî no sales/returns data. 3) 'Do you offer bulk pricing or discounts for buying 100 Tenda A33 extenders?' ‚Äî pricing and bulk policy absent. 4) 'What is the shipment lead time from your warehouse to my ZIP code?' ‚Äî no fulfillment/shipping info. 5) 'Has the ACEMAGICIAN Mini PC passed any independent performance benchmarks compared to other Ryzen 5 5500U systems?' ‚Äî no benchmark comparisons in chunks.",
 'reference_context_ids': ['B0BF18F6R7',
  'B09PFSVK44',
  'B0BZ5R7CVP',
  'B0CF57H28T',
  'B0C9XFF3CT'],
 'reference_descriptions': ['KEEPRO Pencil 2nd Generation for iPad, Magnetic Wireless Charge Tilt Sensitivity Palm Rejection Active Pen for Apple iPad Pro 11" 4/3/2/1, iPad Pro 12.9" 6/5/4/3, iPad Air 4/5, iPad Mini 6 [Compatibility]- ONLY co

In [14]:
# Run the pipeline on the reference question; `result` is the `run` argument
# for every metric call below.
result = rag_pipeline(reference_input["question"])

In [15]:
result

{'answer': 'From the available products, here are examples of five question types that cannot be answered because the chunks do not contain the needed information:\n\n- Questions about price or current sale status for any item.\n  Example: ‚ÄúHow much does this 5-in-1 USB C to Multi Charging Cable cost today?‚Äù\n\n- Questions about availability or stock status (in-stock, out-of-stock) for items not mentioned.\n  Example: ‚ÄúIs the 1TB USB Flash Drive currently available in black?‚Äù\n\n- Questions about compatibility beyond what is stated in the descriptions.\n  Example: ‚ÄúWill the MFi cable work with an iPad Pro or other tablets?‚Äù (noted as not for iPad in the description)\n\n- Questions about warranty terms or return policy specifics not detailed in the text.\n  Example: ‚ÄúWhat is the exact return window for the iPhone charger cables?‚Äù\n\n- Questions about performance details not covered, such as data transfer speeds for the USB cables (only charging speeds are specified for s

In [16]:
async def ragas_faithfulness(run, example):
    """Run the ragas ``Faithfulness`` metric on one pipeline result.

    Args:
        run: Mapping with ``question``, ``answer`` and ``retrieved_context`` keys.
        example: Not used by this metric; accepted so that every metric
            function shares the ``(run, example)`` signature.

    Returns:
        Whatever ``Faithfulness.single_turn_ascore`` returns for the sample.
    """
    turn = SingleTurnSample(
        user_input=run["question"],
        response=run["answer"],
        retrieved_contexts=run["retrieved_context"],
    )
    metric = Faithfulness(llm=ragas_llm)
    score = await metric.single_turn_ascore(turn)
    return score

In [17]:
# `example` is not read by ragas_faithfulness, so a placeholder is passed.
await ragas_faithfulness(result, "")

1.0

In [18]:
async def ragas_response_relevancy(run, example):
    """Run the ragas ``ResponseRelevancy`` metric on one pipeline result.

    Args:
        run: Mapping with ``question``, ``answer`` and ``retrieved_context`` keys.
        example: Not used by this metric; accepted so that every metric
            function shares the ``(run, example)`` signature.

    Returns:
        Whatever ``ResponseRelevancy.single_turn_ascore`` returns for the sample.
    """
    turn = SingleTurnSample(
        user_input=run["question"],
        response=run["answer"],
        retrieved_contexts=run["retrieved_context"],
    )
    metric = ResponseRelevancy(llm=ragas_llm, embeddings=ragas_embeddings)
    score = await metric.single_turn_ascore(turn)
    return score

In [19]:
# `example` is not read by ragas_response_relevancy, so a placeholder is passed.
await ragas_response_relevancy(result, "")

np.float64(0.0)

In [20]:
async def ragas_context_precision_id_based(run, example):
    """Run the ragas ``IDBasedContextPrecision`` metric on one pipeline result.

    Args:
        run: Mapping with a ``retrieved_context_ids`` key.
        example: Mapping with a ``reference_context_ids`` key.

    Returns:
        Whatever ``IDBasedContextPrecision.single_turn_ascore`` returns for
        the sample.
    """
    id_sample = SingleTurnSample(
        retrieved_context_ids=run["retrieved_context_ids"],
        reference_context_ids=example["reference_context_ids"],
    )
    metric = IDBasedContextPrecision()
    score = await metric.single_turn_ascore(id_sample)
    return score

In [21]:
# Compare the pipeline's retrieved IDs against the reference example's
# `reference_context_ids`.
await ragas_context_precision_id_based(result, reference_output)

0.2

In [22]:
async def ragas_context_recall_id_based(run, example):
    """Run the ragas ``IDBasedContextRecall`` metric on one pipeline result.

    Args:
        run: Mapping with a ``retrieved_context_ids`` key.
        example: Mapping with a ``reference_context_ids`` key.

    Returns:
        Whatever ``IDBasedContextRecall.single_turn_ascore`` returns for
        the sample.
    """
    id_sample = SingleTurnSample(
        retrieved_context_ids=run["retrieved_context_ids"],
        reference_context_ids=example["reference_context_ids"],
    )
    metric = IDBasedContextRecall()
    score = await metric.single_turn_ascore(id_sample)
    return score

In [23]:
# Compare the pipeline's retrieved IDs against the reference example's
# `reference_context_ids`.
await ragas_context_recall_id_based(result, reference_output)

0.2