In [1]:
# Imports 

# Core packages
import os
import tempfile
from dotenv import load_dotenv

# pdf processing
from pdfminer.high_level import extract_text

# vector search
import faiss
import numpy as np

# Text chunking and embedding
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings

# LLM and RAG
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.vectorstores import FAISS

# Interface
import gradio as gr

# Token counting
import tiktoken

# Read variables from a local .env file into the process environment, then
# stop immediately if the OpenAI key is still missing or empty.

load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY is not set in the environment variables")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# PDF extraction and chunking

def extract_pdf_text(file_path: str) -> str:
    """Return the text that pdfminer's ``extract_text`` produces for ``file_path``."""
    pdf_text = extract_text(file_path)
    return pdf_text

def split_text_to_chunks(text: str, chunk_size=1000, chunk_overlap=200) -> list:
    """
    Cut ``text`` into overlapping pieces with LangChain's RecursiveCharacterTextSplitter.

    Parameters:
    - text: the full document text
    - chunk_size: value passed to the splitter as ``chunk_size``
    - chunk_overlap: value passed to the splitter as ``chunk_overlap``

    Returns:
    - whatever ``split_text`` returns for ``text`` (a list of chunk strings)
    """
    # Separators handed to the splitter, from coarsest (blank line) to finest (space).
    separator_priority = ["\n\n", "\n", ".", " "]
    chunker = RecursiveCharacterTextSplitter(
        separators=separator_priority,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = chunker.split_text(text)
    return pieces

In [3]:
# sample run: extract the PDF text, chunk it, and preview the first chunk
pdf_path = "notes.pdf"
raw_text = extract_pdf_text(pdf_path)
text_chunks = split_text_to_chunks(raw_text)

print(f"Extracted {len(text_chunks)} text chunks from the PDF.")
# Guard the preview: if the splitter returned no chunks, indexing
# text_chunks[0] would raise IndexError.
if text_chunks:
    print("Sample chunk:\n", text_chunks[0][:200])
else:
    print("No text chunks were produced; nothing to preview.")

Extracted 646 text chunks from the PDF.
Sample chunk:
 The 

Kubernetes 

Book

2025 Edition

Weapons-grade Kubernetes learning!

Nigel Poulton @nigelpoulton

About this edition

This edition was published in February 2025.

In writing this edition, I've


In [4]:
# Embedding + FAISS Index creation

from langchain_community.vectorstores import FAISS

def create_faiss_index(chunks: list, model_name: str = "text-embedding-3-small") -> FAISS:
    """
    Generate embeddings for text chunks using a specific OpenAI embedding model,
    then store them in a FAISS vector index.

    Parameters:
    - chunks: list of text strings to embed and index
    - model_name: name of an OpenAI *embedding* model. Default is
      'text-embedding-3-small'. Chat models such as 'gpt-4o-mini' are not
      embedding models and must not be passed here.

    Returns:
    - FAISS vector store built from `chunks`

    Raises:
    - ValueError: if `chunks` is empty
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    embedding_model = OpenAIEmbeddings(model=model_name)
    # Index the chunks that were passed in, not the notebook-level
    # `text_chunks` variable, so callers with their own chunks get their own index.
    vectorstore = FAISS.from_texts(chunks, embedding_model)
    return vectorstore

In [5]:
# Sample run
# Build the index with OpenAI's text-embedding-3-small embedding model.
# create_faiss_index constructs its own OpenAIEmbeddings instance, so no
# separate embedding object is created in this cell.
vectorstore = create_faiss_index(text_chunks, model_name="text-embedding-3-small")

# Save for reuse
vectorstore.save_local("faiss_index_store")


  embedding_model = OpenAIEmbeddings()


In [6]:
# Retrieve Top-k Chunks and Build RAG Prompt

def retrieve_relevant_chunks(vectorstore, query: str, k: int = 4) -> list:
    """
    Run a similarity search for ``query`` and return the text of the hits.

    Parameters:
    - vectorstore: object exposing ``similarity_search(query, k=...)``
    - query: the search text
    - k: number of results requested from the store

    Returns:
    - list with the ``page_content`` of each returned document, in search order
    """
    matches = vectorstore.similarity_search(query, k=k)
    texts = []
    for match in matches:
        texts.append(match.page_content)
    return texts

def build_rag_prompt(query: str, context_chunks: list) -> str:
    """
    Assemble the LLM prompt from the retrieved chunks and the user's question.

    The prompt has four sections separated by blank lines: the instruction
    sentence, "Context:" followed by the chunks (joined with a "---" divider
    line), "Question:" followed by the query, and a trailing "Answer:" cue.
    """
    chunk_divider = "\n\n---\n\n"
    joined_context = chunk_divider.join(context_chunks)
    instruction = "You are an expert assistant. Use the following context from a document to answer the user's question. If unsure, say so."
    sections = [
        instruction,
        f"Context:\n{joined_context}",
        f"Question:\n{query}",
        "Answer:",
    ]
    return "\n\n".join(sections)


In [7]:
# Testing
# Step 1: fetch the 4 chunks most similar to the query
query = "Explain resource limits in Kubernetes."
top_chunks = retrieve_relevant_chunks(vectorstore, query=query, k=4)  # tune k to fit the context length

# Step 2: combine the query and retrieved chunks into the LLM prompt
prompt = build_rag_prompt(query=query, context_chunks=top_chunks)

# Preview the first 1000 characters of the prompt
print(prompt[0:1000])


You are an expert assistant. Use the following context from a document to answer the user's question. If unsure, say so.

Context:
worker nodes not only allows the scheduler to spread your

applications over multiple availability zones, but it may also

render DoS attacks on any single node or zone ineﬀective (or

less eﬀective).

You should also conﬁgure appropriate limits for the following:

Memory

CPU

Storage

Limits like these can help prevent essential system resources

from being starved, therefore preventing potential DoS.

Limiting Kubernetes objects can also be a good practice. This

includes limiting things such as the number of ReplicaSets,

Pods, Services, Secrets, and ConﬁgMaps in a particular

Namespace.

Here’s an example manifest that limits the number of Pod

objects in the  skippy  Namespace to 100.

apiVersion: v1

kind: ResourceQuota
metadata:

  name: pod-quota
  namespace: skippy

spec:
  hard:

    pods: "100"

One more feature —  podPidsLimit  — restricts the

In [8]:
# Call OpenAI LLM with the formatted RAG prompt

def get_llm_response(prompt: str, model_name: str = "gpt-4o-mini", temperature: float = 0.2) -> str:
    """
    Sends the RAG prompt to the specified OpenAI Chat model and returns the response.

    Parameters:
    - prompt: full prompt text, sent as a single human message
    - model_name: e.g., "gpt-3.5-turbo", "gpt-4", "gpt-4o"
    - temperature: controls randomness (0.0 = deterministic)

    Returns:
    - Response text from LLM
    """
    llm = ChatOpenAI(model=model_name, temperature=temperature)
    # Use `invoke` rather than calling the model object directly: the
    # `llm([...])` call style is deprecated in LangChain.
    response = llm.invoke([HumanMessage(content=prompt)])
    return response.content


In [9]:
# Sample usage: send the prompt built above to gpt-4o and show the reply
rag_response = get_llm_response(prompt=prompt, model_name="gpt-4o")
print("Answer:\n", rag_response)

  llm = ChatOpenAI(model=model_name, temperature=temperature)
  response = llm([HumanMessage(content=prompt)])


Answer:
 Resource limits in Kubernetes are constraints set on the maximum amount of CPU and memory resources that a container within a Pod can use. These limits are specified in the Pod's YAML configuration under the `resources` section. The limits ensure that a container does not consume more resources than allocated, which helps maintain stability and performance across the cluster by preventing any single container from monopolizing resources.

For example, in a Pod YAML configuration, you might see:

```yaml
resources:
  limits:
    cpu: 1.0
    memory: 512Mi
```

This configuration sets a cap of one CPU and 512Mi of memory for the container. The Kubernetes runtime enforces these limits, ensuring that the container cannot exceed these specified resources. While a container can use more resources if available, it cannot surpass the defined limits, thus preventing resource starvation for other containers and potential denial of service (DoS) scenarios.


In [10]:
# Gradio Interface for PDF QA Bot

import gradio as gr
from pathlib import Path

# Globals to cache state
# Written by process_pdf; handle_question reads VECTORSTORE to answer queries.
VECTORSTORE = None  # index returned by create_faiss_index; None until a PDF has been processed
CHUNKS = []  # text chunks of the processed PDF
PDF_NAME = None  # file stem (name without extension) of the processed PDF

def process_pdf(file_obj, embedding_model: str = "text-embedding-3-small"):
    """
    Extracts text, creates chunks, and builds FAISS index from uploaded PDF.

    Parameters:
    - file_obj: the uploaded file from Gradio; either an object exposing the
      file path as `.name` or a plain path string
    - embedding_model: OpenAI embedding model name; a blank value falls back
      to "text-embedding-3-small"

    Returns:
    - Status message for the UI

    The module-level cache (VECTORSTORE, CHUNKS, PDF_NAME) is only updated
    after every step has succeeded, so a failure part-way through leaves the
    previously processed PDF fully intact.
    """
    global VECTORSTORE, CHUNKS, PDF_NAME

    if not file_obj:
        return "❗ No file provided."

    # Accept both an object with a `.name` path attribute and a bare path string.
    file_path = getattr(file_obj, "name", file_obj)
    pdf_name = Path(file_path).stem

    text = extract_pdf_text(file_path)
    chunks = split_text_to_chunks(text)
    if not chunks:
        # Nothing to index (e.g. no text was extracted); keep the old state.
        return f"❗ No extractable text found in: {pdf_name}"

    # The model name comes from a free-text box and may be empty or padded.
    model_name = (embedding_model or "").strip() or "text-embedding-3-small"
    vectorstore = create_faiss_index(chunks, model_name=model_name)

    # Commit all three values together now that indexing succeeded.
    VECTORSTORE, CHUNKS, PDF_NAME = vectorstore, chunks, pdf_name

    return f"✅ Processed {len(CHUNKS)} chunks from: {PDF_NAME}"

def handle_question(question: str, model: str = "gpt-4o"):
    """
    Handles the user query after PDF is processed.

    Parameters:
    - question: the user's question text
    - model: chat model name forwarded to get_llm_response

    Returns:
    - The LLM's answer, or a "❗ ..." message when no PDF has been processed
      or the question is blank
    """
    if VECTORSTORE is None:
        return "❗ Please upload and process a PDF first."

    # A blank question would otherwise still be sent through retrieval and to the LLM.
    if not question or not question.strip():
        return "❗ Please enter a question."

    relevant = retrieve_relevant_chunks(VECTORSTORE, question, k=4)
    prompt = build_rag_prompt(question, relevant)
    answer = get_llm_response(prompt, model_name=model)
    return answer

# Gradio UI
# Layout: title, a row for uploading/processing a PDF, a status box,
# a row for asking a question, and an answer box.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 RAG-based PDF QA Bot (OpenAI + FAISS)")

    # Row 1: PDF upload, embedding model name, and the button that triggers processing
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        embedding_model = gr.Textbox(label="Embedding Model", value="text-embedding-3-small")
        process_btn = gr.Button("📚 Process PDF")

    # Read-only box showing the message returned by process_pdf
    status = gr.Textbox(label="Status", interactive=False)

    # Row 2: question text, chat model selection, and the button that triggers answering
    with gr.Row():
        question = gr.Textbox(label="Ask a question")
        model_choice = gr.Dropdown(choices=["gpt-4o", "gpt-4", "gpt-3.5-turbo"], value="gpt-4o", label="LLM Model")
        ask_btn = gr.Button("🔍 Get Answer")

    # Box showing the string returned by handle_question
    answer_output = gr.Textbox(label="Answer", lines=8)

    # Button actions
    # process_pdf(file, embedding model name) -> status message
    process_btn.click(process_pdf, inputs=[pdf_input, embedding_model], outputs=status)
    # handle_question(question, model) -> answer text
    ask_btn.click(handle_question, inputs=[question, model_choice], outputs=answer_output)

# Launch the app
demo.launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


