In [2]:
import pandas as pd

In [3]:
import os

In [4]:
from dotenv import load_dotenv
load_dotenv()

True

## Vector embeddings (Qdrant)¶

In [None]:
from qdrant_client import QdrantClient, models

In [None]:
qd_client = QdrantClient("http://localhost:6333")

In [None]:
documents = pd.read_json('data/AWSBedrockRAG.json')

In [None]:
documents = documents.to_dict(orient='records')

In [8]:
from fastembed import TextEmbedding

In [9]:
TextEmbedding.list_supported_models()

[{'model': 'BAAI/bge-base-en',
  'sources': {'hf': 'Qdrant/fast-bge-base-en',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.42,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model': 'BAAI/bge-base-en-v1.5',
  'sources': {'hf': 'qdrant/bge-base-en-v1.5-onnx-q',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.21,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model':

In [10]:
import json

EMBEDDING_DIMENSIONALITY = 512

for model in TextEmbedding.list_supported_models():
    if model['dim'] == EMBEDDING_DIMENSIONALITY:
        print(json.dumps(model, indent=2))

{
  "model": "BAAI/bge-small-zh-v1.5",
  "sources": {
    "hf": "Qdrant/bge-small-zh-v1.5",
    "url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz",
    "_deprecated_tar_struct": true
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.09,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "Qdrant/clip-ViT-B-32-text",
  "sources": {
    "hf": "Qdrant/clip-ViT-B-32-text",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "model.onnx",
  "description": "Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year",
  "license": "mit",
  "size_in_GB": 0.25,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "jinaai/jina-embeddings-v2-small-e

In [11]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [12]:
collection_name = "bedrock_rag"

In [13]:
#qd_client.delete_collection(collection_name=collection_name)

In [14]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [15]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="category",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [16]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="title",
    field_schema="keyword"
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [17]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="tags",
    field_schema="keyword"
)

UpdateResult(operation_id=5, status=<UpdateStatus.COMPLETED: 'completed'>)

In [18]:
points = []

for i, doc in enumerate(documents):
    text = doc['title'] + ' ' + doc['content']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [20]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=6, status=<UpdateStatus.COMPLETED: 'completed'>)

In [21]:
import boto3

In [22]:
def vector_search(question):
    print('vector_search is used')
    
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle 
        ),
        limit=5,
        with_payload=True
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)
    
    return results

In [76]:
prompt_template = """
You're a aws expert. Answer the QUESTION based on the CONTEXT from our aws core service database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
id: {id}
service: {service}
category: {category}
title: {title}
content: {content}
tags: {tags}
""".strip()

In [77]:
def build_prompt(query, search_results):
    context =""

    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [24]:
def llm(prompt, model):
    client = boto3.client("bedrock-runtime", region_name="us-east-1")

    messages = [{"role": "user", "content": [{"text": prompt}] }]

    inference_config = {"temperature": 0.1, "topP": 0.9}

    response = client.converse(modelId=model, messages=messages, inferenceConfig=inference_config)
    
    try:
        return response["output"]["message"]["content"][0]["text"]
    except (KeyError, IndexError, TypeError):
        return ""

In [25]:
def rag(query, model):
    search_results = vector_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [57]:
question = "LLM evaluation 0 points: No evaluation of final LLM output is provided 1 point: Only one approach (e.g., one prompt) is evaluated 2 points: Multiple approaches are evaluated, and the best one is used = can you explain which are multiple appraoches"
model_id = "amazon.nova-micro-v1:0"
answer = rag(question, model_id)
print(answer)

Based on the provided context from AWS core service database, the question pertains to the evaluation of multiple approaches in the context of Amazon Bedrock Guardrails. Specifically, the question asks which scenarios involve multiple approaches.

The context provided does not explicitly mention any evaluation of multiple approaches for the final LLM (Large Language Model) output. However, it does describe various functionalities related to Amazon Bedrock Guardrails, such as creating versions, adding denied topics, supported regions for safeguard tiers, deleting guardrail versions, and content filters and prompt attacks language support.

To answer the question about multiple approaches, we need to infer from the context. The most relevant section that hints at multiple approaches is:

**id: 200**
service: Amazon Bedrock
category: Batch Inference
title: Content filters and prompt attacks language support

This section discusses the language support for text-based content filters and pr

## Retrieval Evaluation

In [27]:
df_question = pd.read_csv('data/ground-truth-retrieval.csv')

In [28]:
df_question.head()

Unnamed: 0,id,question
0,0,What is the definition of a foundation model i...
1,0,How can a foundation model generate a variety ...
2,0,What types of data can a foundation model conv...
3,0,What is required before using an Amazon Bedroc...
4,0,What are the different use cases for a foundat...


In [29]:
ground_truth = df_question.to_dict(orient='records')

In [30]:
ground_truth[0]

{'id': 0,
 'question': 'What is the definition of a foundation model in Amazon Bedrock?'}

In [31]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [32]:
def vector_search(question):
    #print('vector_search is used')
    
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle 
        ),
        limit=5,
        with_payload=True
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)
    
    return results

In [33]:
relevance_total = []

In [34]:
def evaluate(ground_truth, search_function):

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [35]:
from tqdm.auto import tqdm

In [36]:
evaluate(ground_truth, lambda q: vector_search(q['question']))

  0%|          | 0/1620 [00:00<?, ?it/s]

{'hit_rate': 0.10679012345679012, 'mrr': 0.051862139917695484}

## RAG evaluation Appraoch 1

In [37]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

Return only valid JSON in this exact format:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}

Do not include any text before or after the JSON object.
""".strip()

In [38]:
len(ground_truth)

1620

In [39]:
record = ground_truth[0]

In [40]:
print(answer)

The cost for fine-tuning Amazon Titan Text models in Amazon Bedrock depends on the number of epochs used, with each epoch processing the entire training dataset once. Specific hyperparameters like `epochCount` directly impact the customization cost. For precise pricing details, refer to the AWS pricing page for Amazon Bedrock.


In [41]:
prompt = prompt2_template.format(question=question, answer_llm=answer)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: how much it cost for fine tunning model? answer in concise way.
Generated Answer: The cost for fine-tuning Amazon Titan Text models in Amazon Bedrock depends on the number of epochs used, with each epoch processing the entire training dataset once. Specific hyperparameters like `epochCount` directly impact the customization cost. For precise pricing details, refer to the AWS pricing page for Amazon Bedrock.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

Return only valid JSON in this exact format:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation

In [42]:
import json

In [43]:
df_sample = df_question.sample(n=200, random_state=1)

In [44]:
sample = df_sample.to_dict(orient='records')

In [45]:
model = 'us.anthropic.claude-3-5-haiku-20241022-v1:0'
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt, model)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [46]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [47]:
df_eval.relevance.value_counts(normalize=True)

relevance
NON_RELEVANT       0.880
PARTLY_RELEVANT    0.085
RELEVANT           0.035
Name: proportion, dtype: float64

In [50]:
df_eval.to_csv('data/rag-eval-claude-3-5-haiku.csv', index=False)

In [56]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
0,"Based on the provided context, there is no spe...",480,What role do custom outputs play in integratin...,NON_RELEVANT,The generated answer explicitly states that th...
1,"Based on the provided CONTEXT, I cannot find s...",543,How does Amazon Bedrock secure data at rest an...,NON_RELEVANT,The generated answer explicitly states that it...
2,"I apologize, but I cannot find specific inform...",121,Which permissions does the service role assume...,NON_RELEVANT,The generated answer explicitly states that it...
3,"I apologize, but based on the provided CONTEXT...",65,Which API operations does the TwelveLabs Pegas...,NON_RELEVANT,The generated answer explicitly states that th...
4,"Based on the provided CONTEXT, I cannot find s...",452,How does Amazon Bedrock Flows facilitate inter...,NON_RELEVANT,The generated answer explicitly states that it...
...,...,...,...,...,...
195,"I apologize, but based on the provided CONTEXT...",469,What techniques are used for model customizati...,NON_RELEVANT,The generated answer explicitly states that it...
196,"Based on the provided context, I cannot find s...",121,What is the recommended practice for configuri...,NON_RELEVANT,The generated answer explicitly states that it...
197,"I apologize, but based on the provided CONTEXT...",238,What is required to send a StopEvaluationJob r...,NON_RELEVANT,The generated answer explicitly states that it...
198,"Based on the provided CONTEXT, there is no spe...",99,If I have the `AmazonBedrockFullAccess` policy...,NON_RELEVANT,The generated answer does not provide any subs...


In [59]:
model_id = "amazon.nova-micro-v1:0"
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt, model)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [61]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [62]:
df_eval.relevance.value_counts()

relevance
NON_RELEVANT       176
PARTLY_RELEVANT     16
RELEVANT             8
Name: count, dtype: int64

In [63]:
df_eval.relevance.value_counts(normalize=True)

relevance
NON_RELEVANT       0.88
PARTLY_RELEVANT    0.08
RELEVANT           0.04
Name: proportion, dtype: float64

In [65]:
df_eval.to_csv('data/rag-eval-amazon.nova-micro.csv', index=False)

## RAG evaluation Appraoch 2

In [88]:
prompt_template_2 = """
You are an expert AWS assistant. Use the context to answer the user query step-by-step and then give a final answer.

Example 1:
Q: What is S3 used for?
A: 
Step 1: Identify S3 function → object storage.
Step 2: Common use cases → backups, static website hosting, data lake.
Final Answer: S3 is an object storage service used for backups, hosting, and analytics.

Example 2:
Q: How to secure an EC2 instance?
A:
Step 1: Consider network layer → security groups.
Step 2: Consider permissions → IAM roles.
Step 3: Consider data protection → patching, encryption.
Final Answer: Secure EC2 with security groups, IAM roles, patching, and encryption.

Context:
{context}

User Question: {query}
Answer step-by-step, then give Final Answer:
""".strip()

In [89]:
def build_prompt_2(query, search_results):
    context =""

    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template_2.format(question=query, context=context).strip()
    return prompt

In [90]:
def rag_2(query, model):
    search_results = vector_search(query)
    prompt = build_prompt_2(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [91]:
question = "LLM evaluation 0 points: No evaluation of final LLM output is provided 1 point: Only one approach (e.g., one prompt) is evaluated 2 points: Multiple approaches are evaluated, and the best one is used = can you explain which are multiple appraoches"
model_id = "amazon.nova-micro-v1:0"
answer = rag_2(question, model_id)
print(answer)

KeyError: 'top_retrieved_passages'

In [None]:
model = 'us.anthropic.claude-3-5-haiku-20241022-v1:0'
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag_2(question, model) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm_2(prompt, model)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))