In [2]:
!pip install langchain langchain-community pypdf sentence-transformers faiss-cpu chromadb transformers torch

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-6.6.2-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloa

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load PDF
print("Loading PDF...")
loader = PyPDFLoader("/content/BNBC 2020_Earthquake load.pdf")
documents = loader.load()

# Split into chunks
print("Splitting documents...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
texts = text_splitter.split_documents(documents)
print(f"Created {len(texts)} text chunks")

# Create embeddings
print("Creating embeddings...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Create vector store
print("Building vector store...")
vectorstore = FAISS.from_documents(texts, embeddings)
print("✅ Vector store created!")

Loading PDF...
Splitting documents...
Created 172 text chunks
Creating embeddings...


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Building vector store...
✅ Vector store created!


In [6]:
class BuildingGuidelinesRAG:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.vectorstore = None

    def setup(self):
        from langchain_community.document_loaders import PyPDFLoader
        from langchain_text_splitters import RecursiveCharacterTextSplitter
        from langchain_community.embeddings import HuggingFaceEmbeddings
        from langchain_community.vectorstores import FAISS

        # Load PDF
        print("📄 Loading PDF...")
        loader = PyPDFLoader(self.pdf_path)
        documents = loader.load()
        print(f"   Loaded {len(documents)} pages")

        # Split
        print("✂️  Splitting into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        texts = text_splitter.split_documents(documents)
        print(f"   Created {len(texts)} chunks")

        # Create vector store
        print("🔍 Creating embeddings and vector store...")
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.vectorstore = FAISS.from_documents(texts, embeddings)
        print("✅ RAG system ready!\n")

    def search(self, question, k=3):
        """Search for relevant chunks"""
        docs = self.vectorstore.similarity_search(question, k=k)
        return docs

    def query(self, question, k=3):
        """Query and display results"""
        print(f"\n❓ Question: {question}\n")
        print("=" * 80)

        docs = self.search(question, k=k)

        for i, doc in enumerate(docs, 1):
            page_num = doc.metadata.get('page', 'N/A')
            print(f"\n📄 Result {i} (Page {page_num}):")
            print("-" * 80)
            print(doc.page_content)
            print("=" * 80)

        return docs

    def get_context(self, question, k=3):
        """Get context for use with external LLM"""
        docs = self.search(question, k=k)
        context = "\n\n".join([doc.page_content for doc in docs])
        return context

# Usage
rag = BuildingGuidelinesRAG("/content/BNBC 2020_Earthquake load.pdf")
rag.setup()

# Ask questions
rag.query("What are the seismic zone classifications?")

📄 Loading PDF...
   Loaded 64 pages
✂️  Splitting into chunks...
   Created 172 chunks
🔍 Creating embeddings and vector store...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


✅ RAG system ready!


❓ Question: What are the seismic zone classifications?


📄 Result 1 (Page 0):
--------------------------------------------------------------------------------
earthquake ground motions in order to minimize the risk to life for all 
structures, to increase the expected performance of higher occupancy structures 
as compared to ordinary structures, and to improve the capability of essential 
structures to function after an earthquake.  It is not economically feasible to 
design and construct buildings without any damage for a major earthquake 
event. The intent is therefore to allow inelastic deformation and structural 
damage at preferred locations in the structure without endangering structural 
integrity and to prevent structural collapse during a major earthquake. 
The seismic zoning map (Fig. 6.2.24) divides the country into four seismic zones 
with different expected levels of intensity of ground motion. Each seismic zone 
has a zone coefficient which provides

[Document(id='5dee3e8d-0366-4e03-9546-995b96023560', metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2026-02-05T11:37:29+00:00', 'source': '/content/BNBC 2020_Earthquake load.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1'}, page_content='earthquake ground motions in order to minimize the risk to life for all \nstructures, to increase the expected performance of higher occupancy structures \nas compared to ordinary structures, and to improve the capability of essential \nstructures to function after an earthquake.  It is not economically feasible to \ndesign and construct buildings without any damage for a major earthquake \nevent. The intent is therefore to allow inelastic deformation and structural \ndamage at preferred locations in the structure without endangering structural \nintegrity and to prevent structural collapse during a major earthquake. \nThe seismic zoning map (Fig. 6.2.24) divides the country into four seismic zones \nwith di

In [7]:
from transformers import pipeline

class BuildingGuidelinesRAGWithLLM:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.vectorstore = None
        self.qa_pipeline = None

    def setup(self):
        from langchain_community.document_loaders import PyPDFLoader
        from langchain_text_splitters import RecursiveCharacterTextSplitter
        from langchain_community.embeddings import HuggingFaceEmbeddings
        from langchain_community.vectorstores import FAISS

        # Load PDF
        print("📄 Loading PDF...")
        loader = PyPDFLoader(self.pdf_path)
        documents = loader.load()

        # Split
        print("✂️  Splitting into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        texts = text_splitter.split_documents(documents)

        # Create vector store
        print("🔍 Creating embeddings...")
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.vectorstore = FAISS.from_documents(texts, embeddings)

        # Load QA model
        print("🤖 Loading question-answering model...")
        self.qa_pipeline = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad"
        )

        print("✅ RAG system ready!\n")

    def query(self, question, k=3):
        """Query with answer generation"""
        print(f"\n❓ Question: {question}\n")

        # Get relevant documents
        docs = self.vectorstore.similarity_search(question, k=k)

        # Combine context
        context = "\n\n".join([doc.page_content for doc in docs])

        # Generate answer
        try:
            result = self.qa_pipeline(question=question, context=context)

            print("💡 Answer:")
            print("-" * 80)
            print(result['answer'])
            print(f"\nConfidence: {result['score']:.2%}")
            print("=" * 80)

            print("\n📚 Source Documents:")
            print("=" * 80)
            for i, doc in enumerate(docs, 1):
                page_num = doc.metadata.get('page', 'N/A')
                print(f"\nSource {i} (Page {page_num}):")
                print("-" * 80)
                print(doc.page_content[:300] + "...")
            print("=" * 80)

        except Exception as e:
            print(f"Error generating answer: {e}")
            print("\n📚 Relevant Documents:")
            for i, doc in enumerate(docs, 1):
                print(f"\n{i}. Page {doc.metadata.get('page', 'N/A')}:")
                print(doc.page_content)

        return docs

# Usage
rag = BuildingGuidelinesRAGWithLLM("/content/BNBC 2020_Earthquake load.pdf")
rag.setup()


📄 Loading PDF...
✂️  Splitting into chunks...
🔍 Creating embeddings...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


🤖 Loading question-answering model...


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/102 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

✅ RAG system ready!



In [None]:

print(rag.query("?"))

In [8]:
rag.query("What are the seismic zone classifications?")



❓ Question: What are the seismic zone classifications?

💡 Answer:
--------------------------------------------------------------------------------
site 
profile and evaluated soil properties

Confidence: 61.71%

📚 Source Documents:

Source 1 (Page 0):
--------------------------------------------------------------------------------
earthquake ground motions in order to minimize the risk to life for all 
structures, to increase the expected performance of higher occupancy structures 
as compared to ordinary structures, and to improve the capability of essential 
structures to function after an earthquake.  It is not economicall...

Source 2 (Page 4):
--------------------------------------------------------------------------------
evsjv‡`k †M‡RU, AwZwi³, †deªæqvwi 11, 2021 3189    
 
  
 
The intent of the site investigation is to classify the Site into one of types SA, SB, 
SC, SD, SE, S 1 and S 2 as defined in Sec 2.5.3.2. Such classification is based on site 
profile and evaluated soi

[Document(id='75ecddbd-113a-4385-a9f2-6165cf2ad005', metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2026-02-05T11:37:29+00:00', 'source': '/content/BNBC 2020_Earthquake load.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1'}, page_content='earthquake ground motions in order to minimize the risk to life for all \nstructures, to increase the expected performance of higher occupancy structures \nas compared to ordinary structures, and to improve the capability of essential \nstructures to function after an earthquake.  It is not economically feasible to \ndesign and construct buildings without any damage for a major earthquake \nevent. The intent is therefore to allow inelastic deformation and structural \ndamage at preferred locations in the structure without endangering structural \nintegrity and to prevent structural collapse during a major earthquake. \nThe seismic zoning map (Fig. 6.2.24) divides the country into four seismic zones \nwith di