## Load PDF and Asynchronously Extract Pages

In [None]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(r"NPL.pdf")
pages = []
async for page in loader.alazy_load():
    pages.append(page)

## Define a Pydantic Schema and Clean Page Content

In [2]:
from typing import Dict, Any
from pydantic import BaseModel, Field
import re

class Document_Custom(BaseModel):
    """Pydantic model for one PDF page: a metadata mapping plus the page text."""
    # Free-form key/value metadata for the page (no default is supplied).
    metadata: Dict[str, Any] = Field(..., description="Metadata of the page")
    # Text of the page (no default is supplied).
    page_content: str = Field(..., description="The text content of the page")


# Patterns used to tidy the raw page text before it is wrapped in the schema.
_SPACING_RE = re.compile(r'\xa0+|\s{4,}')
_HEADER_RE = re.compile(r'^\s*Nepal\s*2009\s*Compiled by Sagar Mishra\s*')


def _clean_page_text(raw_text):
    """Collapse non-breaking/long whitespace runs, drop the leading header, trim."""
    collapsed = _SPACING_RE.sub(' ', raw_text)
    return _HEADER_RE.sub('', collapsed).strip()


def _to_custom_document(page):
    """Wrap one loaded page in Document_Custom, keeping only three metadata keys."""
    source_meta = page.metadata
    return Document_Custom(
        metadata={
            'page': source_meta['page'],
            'total_pages': source_meta['total_pages'],
            'page_label': source_meta['page_label'],
        },
        page_content=_clean_page_text(page.page_content),
    )


pages_schema = [_to_custom_document(page) for page in pages]

# Report how many pages were converted and show the first one.
first_document = pages_schema[0]
print(f"Total Documents: {len(pages_schema)}\n\n")
print(f"First Document: {first_document}")

Total Documents: 39


First Document: metadata={'page': 0, 'total_pages': 39, 'page_label': '1'} page_content='NEPAL Country profile: 1) General information of Nepal \nFormal Name: The Federal Democratic Republic of Nepal. \nNepal is a landlocked country and the world’s youngest republic. The establishment and \ndeclaration of a federal dem ocratic republic is 28 May 2008 by the elected constitution \nassembly.  Nepal has always been an independent  and sovereign country with glorious \nhistory, culture and tradition that date back to time immemorial.  \nNepal is characterized by a rugged topogr aphy, very high relief, variable climatic \nconditions, complex geological structure wit h active tectonic process and continued \nseismic activities. It is situated in centra l part of the Himalayan belt. It is situated \nbetween the latitudes of 26’22’’to 30’27’’north and the longitudes of 80’4”to 88’12’’ east. \nIt covers a landmass area of 147,181sq km and 4,000 sq.km Inland water bodies la

## Split Documents into Overlapping Chunks

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings: chunk_size=500 with chunk_overlap=200 between neighbouring chunks.
splitter_options = {"chunk_size": 500, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every cleaned page into overlapping chunks.
chunk_texts = text_splitter.split_documents(pages_schema)

print(f"Total chunks: {len(chunk_texts)}")

Total chunks: 262


## Set Up AWS Credentials and Clients

In [4]:
from dotenv import load_dotenv
import boto3
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Region used for both the Bedrock runtime and the S3 Vectors clients.
AWS_REGION = "us-east-1"

# The profile name comes from AWS_PROFILE_NAME; os.getenv returns None when it
# is unset, in which case boto3 resolves credentials through its default chain.
session = boto3.Session(profile_name=os.getenv('AWS_PROFILE_NAME'))

# Build both clients from the session instead of copying a frozen snapshot of
# its keys into each call: the session keeps temporary credentials refreshable,
# and the raw secret values never sit in a notebook variable.
bedrock = session.client("bedrock-runtime", region_name=AWS_REGION)
s3vectors = session.client("s3vectors", region_name=AWS_REGION)

## Define Function to Get Embeddings from Bedrock

In [5]:
def get_embedding(text: str) -> list:
    """Embed ``text`` with the Titan text-embedding model on Bedrock.

    Sends ``text`` as ``inputText`` to ``amazon.titan-embed-text-v2:0`` through
    the module-level ``bedrock`` client and returns the ``embedding`` entry of
    the JSON response, or an empty list when that entry is absent.
    """
    import json

    request_payload = json.dumps({"inputText": text})
    raw_response = bedrock.invoke_model(
        modelId="amazon.titan-embed-text-v2:0",
        body=request_payload,
        contentType="application/json",
    )

    # The response body is a stream; read it fully before decoding the JSON.
    parsed = json.loads(raw_response['body'].read())
    return parsed.get("embedding", [])

# Smoke test: embed a short sample string and display the embedding's length.
emd = get_embedding("i am miraj")
len(emd)

1024

## Generate Embeddings for All Chunks

In [6]:
# One get_embedding call per chunk; results keep the order of chunk_texts.
Embeddings = [get_embedding(chunk.page_content) for chunk in chunk_texts]

## Prepare Items for S3 Vector Storage

In [7]:
import uuid

def _build_vector_item(vector, text):
    """Pair one embedding with its chunk as a put_vectors item under a unique key."""
    chunk_meta = text.metadata
    return {
        # Page number plus a random UUID keeps keys unique across chunks of a page.
        "key": f"npl-page-{text.metadata['page']}-{uuid.uuid4()}",
        "data": {"float32": vector},
        "metadata": {
            "page": chunk_meta['page'],
            "total_pages": chunk_meta['total_pages'],
            "page_label": chunk_meta['page_label'],
            "content": text.page_content,
        },
    }


# Embeddings and chunk_texts are index-aligned, so zip pairs each vector with its chunk.
items_to_insert = [
    _build_vector_item(vector, text)
    for vector, text in zip(Embeddings, chunk_texts)
]
  

## Write Embeddings and Metadata to S3 Vector Index

In [8]:
# Upload every prepared item (vector plus metadata) to the index in one request.
put_request = {
    "vectorBucketName": "npl-bucket",
    "indexName": "npl-index",
    "vectors": items_to_insert,
}
s3vectors.put_vectors(**put_request)

{'ResponseMetadata': {'RequestId': 'b60ef2f1-0bdd-4387-a420-63928100ed69',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 28 Aug 2025 11:46:59 GMT',
   'content-type': 'application/json',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amz-request-id': 'b60ef2f1-0bdd-4387-a420-63928100ed69',
   'access-control-allow-origin': '*',
   'vary': 'origin, access-control-request-method, access-control-request-headers',
   'access-control-expose-headers': '*'},
  'RetryAttempts': 0}}

## Query the S3 Vector Index for Relevant Chunks

In [15]:
import json

input_text = "how many people died in Koshi flood 2008?"

# Embed the question with the same helper used for the stored chunks.
input_embeddings = get_embedding(input_text)

# Similarity search: request the 3 closest vectors, with distances and metadata.
query_request = {
    "vectorBucketName": "npl-bucket",
    "indexName": "npl-index",
    "queryVector": {"float32": input_embeddings},
    "topK": 3,
    "returnDistance": True,
    "returnMetadata": True,
}
# No metadata filter is applied; to add one, set e.g. query_request["filter"] = {"page": 16}.
retrival_response = s3vectors.query_vectors(**query_request)

matched_vectors = retrival_response["vectors"]
print(json.dumps(matched_vectors, indent=2))
    

[
  {
    "key": "npl-page-12-7cfa7141-492e-4d21-a835-36876d479e89",
    "metadata": {
      "page": 12,
      "total_pages": 39,
      "content": "damage and the risk could not be ignored anymore. \n(2) Koshi flood 2008 \nTorrential rain that start ed in the beginning of august 17,2008 killed more than 20 and \nmissing more than 50 injured 200, affect ed approximately 150000 thousand and 20000 \nhousehold completely destroyed nearly $3 billion in properly damaged and loss. \n(3) Landslide (August 2007) \nFloods and landslides triggered by torrential rains in August 2007 killed 203, affected",
      "page_label": "13"
    },
    "distance": 0.21141356229782104
  },
  {
    "key": "npl-page-5-ac60f478-30a1-42fb-b830-43573a6521bc",
    "metadata": {
      "content": "infrastructure like roads and bridges. Inundati ons have disrupted social and economic \ndevelopment of many parts of terrain region in the country. The flood of august 2008 in \nKoshi river was the most devastating .The fo 

## RAG Model: Generate Answer Using Retrieved Context

In [16]:
def RagModel(prompt_template, bedrock_client, model_selected):
    """Send a single-turn prompt to a Bedrock model through ``converse``.

    Args:
        prompt_template: Fully formatted prompt text; sent as the only user message.
        bedrock_client: Object exposing ``converse(**kwargs)``.
        model_selected: Identifier passed to ``converse`` as ``modelId``.

    Returns:
        The ``text`` of the first content block in the reply that carries text.
        If the call raises, or the reply holds no text block, the error is
        printed and the string "An error occurred during inference." is returned.
    """
    # Sampling settings sent as ``inferenceConfig``.
    inference_config = {
        "temperature": 0.5,
        "topP": 0.5,
        "stopSequences": [],
        "maxTokens": 4096,
    }

    # Extra model-specific request field passed through unchanged.
    additional_model_fields = {
        "top_k": 250,
    }

    chat_messages = [
        {"role": "user", "content": [{"text": prompt_template}]}
    ]

    try:
        response = bedrock_client.converse(
            modelId=model_selected,
            messages=chat_messages,
            inferenceConfig=inference_config,
            additionalModelRequestFields=additional_model_fields
        )

        # The reply is a list of content blocks and the first one is not
        # guaranteed to be text, so scan for the first block that has a
        # "text" entry instead of indexing [0] blindly.
        content_blocks = response['output']['message']['content']
        for block in content_blocks:
            if 'text' in block:
                return block['text']
        raise ValueError("model response contained no text block")

    except Exception as e:
        # Best-effort by design: report the problem and hand back a fixed message.
        print(f"Error during conversation: {str(e)}")
        return "An error occurred during inference."


# Prompt with {context} and {question} placeholders, filled in via str.format.
# Opened with exactly three quotes: a fourth one would become a stray '"'
# character at the very start of the text sent to the model.
prompt = """

You are an expert in Nepal's history,Your task is to provide a comprehensive answer based on the provided context.
Please read the following context carefully and answer the question at the end.
Context:
{context}\n\n

Question: {question}\n\n

Answer:"""


# Use only the retrieved chunk texts as context. Formatting the whole
# query_vectors response into the prompt would also include HTTP response
# metadata, vector keys and distances, none of which help answer the question.
retrieved_chunks = [
    match.get("metadata", {}).get("content", "")
    for match in retrival_response.get("vectors", [])
]
context_text = "\n\n".join(chunk for chunk in retrieved_chunks if chunk)

model_response = RagModel(
    prompt_template=prompt.format(context=context_text, question=input_text),
    bedrock_client=bedrock,
    model_selected="us.anthropic.claude-3-7-sonnet-20250219-v1:0"
)

print("Model Response:", model_response)

Model Response: Based on the provided context, the Koshi flood of 2008, which began with torrential rain on August 17, 2008, killed more than 20 people. The flood also left more than 50 people missing, injured approximately 200 people, affected around 150,000 people, and completely destroyed about 20,000 households. The disaster caused nearly $3 billion in property damage and loss. This flood was described as "the most devastating" among the types of floods observed in Nepal.
