In [77]:
!pip install langchain sentence-transformers faiss-cpu streamlit python-docx pdfplumber beautifulsoup4 requests


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

print("Setup successful!")

model = SentenceTransformer("all-MiniLM-L6-v2")

vec = model.encode(["hello world"])
print("Embedding shape:", vec.shape)

# FAISS test
dim = vec.shape[1]
index = faiss.IndexFlatIP(dim)
faiss.normalize_L2(vec)
index.add(vec)
print("Index size:", index.ntotal)


Setup successful!


In [None]:
import os

folders = ["data_raw", "data_processed"]
for f in folders:
    os.makedirs(f, exist_ok=True)

print("Folders created:", folders)


Folders created: ['data_raw', 'data_processed']


In [None]:
import pdfplumber

def load_pdf(path):
    pages_text = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text = page.extract_text() or ""
            pages_text.append(text)
    return pages_text



In [None]:
pages = load_pdf('data_raw/sample.pdf')
print(pages[:1])

['Somalia Flood Exposure Methodology Note\nAnalysis for 2024 HNRP\nThis technical note summarises the methodology used to calculate the number of people\npotentially exposed to flooding in Somalia in the 2024 Somalia Humanitarian Needs and\nResponse Plan (HNRP). The UN OCHA Centre for Humanitarian Data worked with awiderange\nof technical partners to develop a methodology that was then endorsed by the Somalia ICCG\nandHCT.\nDaily FloodScan (1998-2022) & WorldPop (2020 UN Adjusted) raster data wasanalysedtogain\nunderstanding of flood conditions across Somalia for both March-April-May (MAM) and\nOctober-November-December (OND) seasons. FloodScan daily flood fraction Standard Flood\nExposure Depiction (SFED) was aggregated to yearly seasonal maximum fraction composites\nforboththeMAMandONDseasonsforallyearsofhistoricalFloodScandata(1998-2022).\nTheyearlyseasonalSFEDcompositeswerethenprocessed/reclassifiedintwodistinctways:\n1. Compositesreclassifiedtobinaryusinga20percentfloodfractionthr

# pull text from any website URL so it can use that information for retrieval & answering questions.

In [None]:
import requests
from bs4 import BeautifulSoup

def load_url(url):
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    
    # remove script + style
    for tag in soup(["script", "style"]):
        tag.decompose()
    
    text = soup.get_text(separator="\n")
    clean_text = "\n".join([line.strip() for line in text.splitlines() if line.strip()])
    return clean_text

# Example:
text = load_url("https://example.com")
print(text[:500])


Example Domain
Example Domain
This domain is for use in documentation examples without needing permission. Avoid use in operations.
Learn more


In [None]:
def ingest_file(path_or_url):
    if path_or_url.startswith("http"):
        content = load_url(path_or_url)
        return [{"source": path_or_url, "text": content}]
    
    if path_or_url.endswith(".pdf"):
        pages = load_pdf(path_or_url)
        return [{"source": path_or_url, "text": p} for p in pages]
    
    if path_or_url.endswith(".csv"):
        rows = load_csv(path_or_url)
        return [{"source": path_or_url, "text": r} for r in rows]
    
    return []


In [None]:
test_pdf = "data_raw/sample.pdf"
docs = ingest_file(test_pdf)

print("Loaded:", len(docs), "items")
docs[:2]


Loaded: 2 items


[{'source': 'data_raw/sample.pdf',
  'text': 'Somalia Flood Exposure Methodology Note\nAnalysis for 2024 HNRP\nThis technical note summarises the methodology used to calculate the number of people\npotentially exposed to flooding in Somalia in the 2024 Somalia Humanitarian Needs and\nResponse Plan (HNRP). The UN OCHA Centre for Humanitarian Data worked with awiderange\nof technical partners to develop a methodology that was then endorsed by the Somalia ICCG\nandHCT.\nDaily FloodScan (1998-2022) & WorldPop (2020 UN Adjusted) raster data wasanalysedtogain\nunderstanding of flood conditions across Somalia for both March-April-May (MAM) and\nOctober-November-December (OND) seasons. FloodScan daily flood fraction Standard Flood\nExposure Depiction (SFED) was aggregated to yearly seasonal maximum fraction composites\nforboththeMAMandONDseasonsforallyearsofhistoricalFloodScandata(1998-2022).\nTheyearlyseasonalSFEDcompositeswerethenprocessed/reclassifiedintwodistinctways:\n1. Compositesreclass

# Processing & Chunking Text
Chunking is critical for RAG.
Instead of sending entire documents to the model, we break them into small meaningful pieces.

In [None]:
!pip install langchain
!pip install langchain-text-splitters


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n\n", "\n", ".", "!", "?", " ", ""]
)

def chunk_documents(docs):
    """
    Input: docs = [{"source": "...", "text": "..."}]
    Output: list of chunks with metadata
    """
    processed = []

    for d in docs:
        chunks = text_splitter.split_text(d["text"])
        for i, c in enumerate(chunks):
            processed.append({
                "source": d["source"],
                "chunk_id": i,
                "text": c
            })

    return processed


In [None]:
docs = ingest_file("data_raw/sample.pdf")  # reuse from Step 2
chunks = chunk_documents(docs)

print("Original pages:", len(docs))
print("Total chunks:", len(chunks))
chunks[:2]  # show first two chunks


Original pages: 2
Total chunks: 5


[{'source': 'data_raw/sample.pdf',
  'chunk_id': 0,
  'text': 'Somalia Flood Exposure Methodology Note\nAnalysis for 2024 HNRP\nThis technical note summarises the methodology used to calculate the number of people\npotentially exposed to flooding in Somalia in the 2024 Somalia Humanitarian Needs and\nResponse Plan (HNRP). The UN OCHA Centre for Humanitarian Data worked with awiderange\nof technical partners to develop a methodology that was then endorsed by the Somalia ICCG\nandHCT.\nDaily FloodScan (1998-2022) & WorldPop (2020 UN Adjusted) raster data wasanalysedtogain\nunderstanding of flood conditions across Somalia for both March-April-May (MAM) and\nOctober-November-December (OND) seasons. FloodScan daily flood fraction Standard Flood\nExposure Depiction (SFED) was aggregated to yearly seasonal maximum fraction composites'},
 {'source': 'data_raw/sample.pdf',
  'chunk_id': 1,
  'text': 'Exposure Depiction (SFED) was aggregated to yearly seasonal maximum fraction composites\nforbot

In [None]:
import json

def save_chunks(chunks, filename="processed.json"):
    path = f"data_processed/{filename}"
    with open(path, "w") as f:
        json.dump(chunks, f, indent=2)
    print("Saved:", path)


In [None]:
save_chunks(chunks, "sample_chunks.json")


Saved: data_processed/sample_chunks.json


In [None]:
def load_chunks(filename="processed.json"):
    with open(f"data_processed/{filename}", "r") as f:
        return json.load(f)


# Embeddings + Vector Database (FAISS)

In [None]:
import json

def load_chunks(filename):
    with open(f"data_processed/{filename}", "r") as f:
        return json.load(f)


chunks = load_chunks("sample_chunks.json")
print("Loaded chunks:", len(chunks))
chunks[:2]


Loaded chunks: 5


[{'source': 'data_raw/sample.pdf',
  'chunk_id': 0,
  'text': 'Somalia Flood Exposure Methodology Note\nAnalysis for 2024 HNRP\nThis technical note summarises the methodology used to calculate the number of people\npotentially exposed to flooding in Somalia in the 2024 Somalia Humanitarian Needs and\nResponse Plan (HNRP). The UN OCHA Centre for Humanitarian Data worked with awiderange\nof technical partners to develop a methodology that was then endorsed by the Somalia ICCG\nandHCT.\nDaily FloodScan (1998-2022) & WorldPop (2020 UN Adjusted) raster data wasanalysedtogain\nunderstanding of flood conditions across Somalia for both March-April-May (MAM) and\nOctober-November-December (OND) seasons. FloodScan daily flood fraction Standard Flood\nExposure Depiction (SFED) was aggregated to yearly seasonal maximum fraction composites'},
 {'source': 'data_raw/sample.pdf',
  'chunk_id': 1,
  'text': 'Exposure Depiction (SFED) was aggregated to yearly seasonal maximum fraction composites\nforbot

In [None]:
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
print("Embedding model loaded!")


Embedding model loaded!


In [None]:
import numpy as np

def embed_chunks(chunks):
    texts = [c["text"] for c in chunks]
    embeddings = embed_model.encode(
        texts, 
        convert_to_numpy=True, 
        show_progress_bar=True
    )
    return embeddings

embeddings = embed_chunks(chunks)
embeddings.shape


Batches: 100%|██████████| 1/1 [00:00<00:00, 12.63it/s]


(5, 384)

In [None]:
import faiss

def build_faiss_index(embeddings):
    dim = embeddings.shape[1]

    # Create IndexFlatIP (Inner Product = cosine similarity after normalization)
    index = faiss.IndexFlatIP(dim)

    # Normalize to make inner product = cosine similarity
    faiss.normalize_L2(embeddings)
    
    # Add vectors to index
    index.add(embeddings)

    return index

index = build_faiss_index(embeddings)
print("FAISS index size:", index.ntotal)


FAISS index size: 5


In [None]:
import pickle

def save_faiss(index, chunks, embeddings, filename="faiss_store.pkl"):
    store = {
        "index": index,
        "chunks": chunks,
        "embeddings": embeddings
    }
    with open(filename, "wb") as f:
        pickle.dump(store, f)
    print("Saved:", filename)

save_faiss(index, chunks, embeddings)


Saved: faiss_store.pkl


In [None]:
def load_faiss(filename="faiss_store.pkl"):
    with open(filename, "rb") as f:
        return pickle.load(f)

faiss_store = load_faiss()
faiss_store.keys()


dict_keys(['index', 'chunks', 'embeddings'])

In [None]:
def search_index(query, store, k=3):
    index = store["index"]
    chunks = store["chunks"]

    q_emb = embed_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)

    scores, indices = index.search(q_emb, k)

    results = []
    for idx, score in zip(indices[0], scores[0]):
        results.append({
            "score": float(score),
            "source": chunks[idx]["source"],
            "chunk_id": chunks[idx]["chunk_id"],
            "text": chunks[idx]["text"]
        })
    
    return results

query = "What is the document talking about?"
results = search_index(query, faiss_store, k=3)

for r in results:
    print("-----")
    print("Score:", r["score"])
    print("Source:", r["source"])
    print("Chunk:", r["chunk_id"])
    print("Text Preview:", r["text"][:300], "...")


-----
Score: 0.13010777533054352
Source: data_raw/sample.pdf
Chunk: 0
Text Preview: Somalia Flood Exposure Methodology Note
Analysis for 2024 HNRP
This technical note summarises the methodology used to calculate the number of people
potentially exposed to flooding in Somalia in the 2024 Somalia Humanitarian Needs and
Response Plan (HNRP). The UN OCHA Centre for Humanitarian Data wo ...
-----
Score: 0.11402948200702667
Source: data_raw/sample.pdf
Chunk: 3
Text Preview: season for MAM 2024, the range of populationexposed wasbasedonthe50th-95thpercentile
levels. Since there was no available data to inform predictions for OND 2024, the 25-75th
percentile values were used to bound the range. The lower and upper limits of therangeswere
Page1of2 ...
-----
Score: 0.09291001409292221
Source: data_raw/sample.pdf
Chunk: 0
Text Preview: calculated at the administrative level using all of the historical yearly seasonal flood exposure
estimates. The two sets of range estimates were combined conserva

# Reranking + LLM Answer Generation + Citations

In [None]:
!pip install sentence-transformers
!pip install python-dotenv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
print("Reranker loaded!")


Reranker loaded!


In [None]:
def rerank_results(query, retrieved_chunks):
    pairs = [[query, item["text"]] for item in retrieved_chunks]
    scores = reranker.predict(pairs)

    for item, score in zip(retrieved_chunks, scores):
        item["rerank_score"] = float(score)

    # Sort by score (descending)
    ranked = sorted(retrieved_chunks, key=lambda x: x["rerank_score"], reverse=True)

    return ranked


In [None]:
def build_prompt(query, top_chunks):
    context = ""

    for c in top_chunks:
        context += f"\n[Source: {c['source']} | Chunk: {c['chunk_id']}] \n{c['text']}\n"

    prompt = f"""
You are a helpful assistant. Use ONLY the information from the context below to answer the user's question.

If the context does not contain the answer, reply:
"I don’t know based on the provided documents."

Always include a "Sources:" section at the end.

### CONTEXT:
{context}

### QUESTION:
{query}

### ANSWER (with citations):
"""

    return prompt


In [None]:
from dotenv import load_dotenv
import os
load_dotenv()
print("GROQ KEY =", os.getenv("GROQ_API_KEY"))

from groq import Groq
client = Groq(api_key=os.getenv("GROQ_API_KEY"))


GROQ KEY = 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


In [None]:
import os
from dotenv import load_dotenv

load_dotenv()

print("Current working directory:", os.getcwd())
print("Environment key value:", os.getenv("GROQ_API_KEY"))


Current working directory: /Users/apoorvnathtripathi/Desktop/multi_source_rag_project
Environment key value: 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


In [None]:
test = client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[{"role": "user", "content": "Say hello"}]
)

print(test.choices[0].message.content)


BadRequestError: Error code: 400 - {'error': {'message': 'The model `llama3-70b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}

In [None]:
client

<groq.Groq at 0x123856b70>

In [None]:
try:
    test = client.chat.completions.create(
        model="llama-3.1-70b-versatile",
        messages=[{"role": "user", "content": "Say hello"}],
        max_tokens=20
    )
    print(test.choices[0].message.content)

except Exception as e:
    import json
    print("ERROR TYPE:", type(e))
    try:
        print("ERROR DETAILS:", e.args[0])
    except:
        print("RAW ERROR:", e)


ERROR TYPE: <class 'groq.BadRequestError'>
ERROR DETAILS: Error code: 400 - {'error': {'message': 'The model `llama-3.1-70b-versatile` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}


In [None]:
def generate_answer_groq(prompt):
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    return response.choices[0].message.content


In [None]:
def rag_pipeline_groq(query, store, k=5):
    # 1. Retrieve from FAISS
    retrieved = search_index(query, store, k)

    # 2. Rerank retrieved chunks
    ranked = rerank_results(query, retrieved)

    # 3. Pick top 3
    top_chunks = ranked[:3]

    # 4. Build the LLM prompt
    prompt = build_prompt(query, top_chunks)
    
    # 5. Get the answer from Groq LLaMA 3.1
    answer = generate_answer_groq(prompt)

    return answer, top_chunks


In [None]:
models = client.models.list()
for m in models.data:
    print(m.id)


groq/compound-mini
meta-llama/llama-prompt-guard-2-22m
meta-llama/llama-prompt-guard-2-86m
playai-tts-arabic
openai/gpt-oss-safeguard-20b
qwen/qwen3-32b
moonshotai/kimi-k2-instruct-0905
whisper-large-v3
llama-3.3-70b-versatile
playai-tts
meta-llama/llama-4-scout-17b-16e-instruct
whisper-large-v3-turbo
groq/compound
meta-llama/llama-guard-4-12b
moonshotai/kimi-k2-instruct
openai/gpt-oss-120b
openai/gpt-oss-20b
llama-3.1-8b-instant
meta-llama/llama-4-maverick-17b-128e-instruct
allam-2-7b


In [None]:
query = "What is the document talking about?"

answer, used_chunks = rag_pipeline_groq(query, faiss_store)


print("ANSWER:\n", answer)
print("\n--- SOURCES USED ---")
for c in used_chunks:
    print(f"{c['source']} (chunk {c['chunk_id']})")


ANSWER:
 The document is discussing the methodology used to calculate the number of people potentially exposed to flooding in Somalia, specifically for the 2024 Somalia Humanitarian Needs and Response Plan (HNRP). It outlines the process of aggregating historical flood data, reclassifying and processing the data, and estimating population exposure to flooding.

Sources:
[Source: data_raw/sample.pdf | Chunk: 1], 
[Source: data_raw/sample.pdf | Chunk: 0]

--- SOURCES USED ---
data_raw/sample.pdf (chunk 1)
data_raw/sample.pdf (chunk 0)
data_raw/sample.pdf (chunk 0)
