# 🧪 Educational RAG in Google Colab (OpenAI SDK v1.x compatible)
Minimal Retrieval-Augmented Generation pipeline using free components for indexing and the OpenAI API for generation.

**What this does:**
- Index your PDFs or .txt files
- Create embeddings with `sentence-transformers`
- Store vectors in FAISS
- Retrieve top-k chunks
- Generate an answer with the **OpenAI Python SDK v1.x** (`openai` package)

---
## How to use
1) Run **Install** ➜ 2) **Setup & Helpers** ➜ 3) **Load documents** ➜ 4) **Index** ➜ 5) **Ask** (run the cells that define `retrieve` and `rag_answer` before asking).

If you haven't uploaded any files yet, the notebook falls back to an inline `sample_text`.


In [None]:
!pip -q install faiss-cpu sentence-transformers pypdf openai

In [None]:
import os, numpy as np, faiss
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
from typing import List, Dict
from openai import OpenAI

# ---- OpenAI client (v1.x) ----
# Set OPENAI_API_KEY *before* running this cell, e.g.
#   os.environ['OPENAI_API_KEY'] = 'sk-...'
# The key is handed to the client when it is constructed, so re-run this cell
# after setting or changing the key.
_openai_api_key = os.getenv('OPENAI_API_KEY') or None  # treat '' like "unset"
if _openai_api_key is not None:
    client = OpenAI(api_key=_openai_api_key)
else:
    # OpenAI(...) raises when no key is available, which would abort this cell
    # before the warning below could be shown and leave `client` undefined.
    # Build the client with a placeholder and then clear the key so that
    # `client.api_key is None` checks still detect the missing key.
    client = OpenAI(api_key='missing-api-key')
    client.api_key = None
if client.api_key is None:
    print('⚠️ OPENAI_API_KEY is not set. Set it with `os.environ["OPENAI_API_KEY"] = "sk-..."` or in Colab > Secrets.')

# ---- Chunker ----
def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 100) -> List[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        chunk_overlap: Number of words shared between consecutive chunks.
            Must be non-negative. If it is not smaller than ``chunk_size``
            the chunks are emitted back-to-back without overlap.

    Returns:
        The chunks in document order, each a single-space-joined string.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is negative.
    """
    if chunk_size <= 0:
        # A non-positive size can never consume any words (endless loop).
        raise ValueError('chunk_size must be a positive integer.')
    if chunk_overlap < 0:
        # A negative overlap would silently skip words between chunks.
        raise ValueError('chunk_overlap must be non-negative.')

    words = text.split()
    # An overlap >= chunk_size would never advance; fall back to no overlap.
    step = chunk_size - chunk_overlap if chunk_overlap < chunk_size else chunk_size

    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This chunk already reaches the last word. Stepping back by the
            # overlap again would only emit a tail that is fully contained
            # in the chunk just added.
            break
    return chunks

# ---- Prompt template ----
def build_prompt(question: str, context: str) -> str:
    """Assemble the single prompt string sent to the chat model.

    The prompt consists of three sections separated by blank lines: a fixed
    instruction to answer only from the context, the question, and the
    context itself.
    """
    instruction = 'Use ONLY the provided context to answer. If the answer is not in the context, say you do not know.'
    sections = (
        instruction,
        f"Question: {question}",
        f"Context:\n{context}",
    )
    return '\n\n'.join(sections)


In [None]:
# 📄 1) Load your documents (PDF or .txt). Add absolute paths below after uploading to Colab Files.
doc_paths = []  # e.g., ['/content/your.pdf', '/content/notes.txt']

# Parallel lists kept in the same order: chunk_meta[i] describes all_chunks[i].
all_chunks: List[str] = []
chunk_meta: List[Dict] = []

for path in doc_paths:
    text = ''
    if path.lower().endswith('.pdf'):
        reader = PdfReader(path)
        # `or ''` covers pages for which extract_text() gives back nothing.
        text = '\n'.join([(p.extract_text() or '') for p in reader.pages])
    else:
        # Everything that is not a .pdf is read as UTF-8 text; bytes that
        # cannot be decoded are dropped instead of raising.
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
    chunks = chunk_text(text)
    all_chunks.extend(chunks)
    # Build one dict per chunk. `[{...}] * n` would repeat the *same* dict
    # object n times, so editing one chunk's metadata would change them all.
    chunk_meta.extend({'source': path} for _ in chunks)

# Fallback: inline sample text so the notebook works even with no files
if not all_chunks:
    sample_text = (
        'Retrieval-augmented generation (RAG) combines a retriever and a generator. '
        'Embeddings turn text into vectors. FAISS enables efficient nearest-neighbor search. '
        'Chunking controls how text is split before indexing.'
    )
    all_chunks = chunk_text(sample_text)
    chunk_meta = [{'source': 'inline_sample'} for _ in all_chunks]

print(f'Loaded {len(all_chunks)} chunks from {len(doc_paths)} documents (or inline sample).')

In [None]:
# 🔍 2) Build embeddings & FAISS index (robust)
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode every chunk with normalization requested, then force the result
# into a 2D float32 matrix before handing it to FAISS.
X = np.atleast_2d(
    emb_model.encode(all_chunks, normalize_embeddings=True)
).astype('float32')

if not X.size:
    raise ValueError('No embeddings to index. Ensure documents or sample_text are present.')

# Inner product on normalized vectors stands in for cosine similarity.
d = X.shape[1]
index = faiss.IndexFlatIP(d)
index.add(X)
print('FAISS index size:', index.ntotal)

In [None]:
def retrieve(query: str, top_k: int = 5):
    """Return the stored chunks most similar to *query*.

    The query is embedded with the module-level ``emb_model`` and searched
    against the module-level FAISS ``index``; at most ``top_k`` hits are
    requested, capped at the number of indexed vectors.

    Returns:
        A list of ``(score, chunk_text, chunk_metadata)`` tuples in the order
        returned by the index search.

    Raises:
        ValueError: If the index contains no vectors.
    """
    total = index.ntotal
    if total == 0:
        raise ValueError('The FAISS index is empty. Run the indexing cell after loading documents.')

    query_vec = emb_model.encode([query], normalize_embeddings=True).astype('float32')
    scores, ids = index.search(query_vec, min(top_k, total))

    hits = []
    # Only one query was submitted, so row 0 holds all of its results.
    for score, idx in zip(scores[0], ids[0]):
        if idx == -1:
            # -1 marks a slot without a match (presumably padding); skip it.
            continue
        hits.append((float(score), all_chunks[idx], chunk_meta[idx]))
    return hits

# Quick test: run one sample query through retrieve() against the current index.
# NOTE(review): the result is neither printed nor stored; presumably this relies
# on the notebook displaying the value of a cell's last expression.
retrieve('What is RAG?')

In [None]:
def rag_answer(question: str, k: int = 5, model: str = 'gpt-4o-mini') -> str:
    """Answer *question* from retrieved context using the OpenAI chat API.

    Args:
        question: The user's question.
        k: Number of chunks to request from ``retrieve``.
        model: Name of the chat model passed to the OpenAI client.

    Returns:
        The model's reply, or a short explanatory message when nothing was
        retrieved or the client has no API key.
    """
    hits = retrieve(question, top_k=k)
    if not hits:
        return 'I could not retrieve any context. Please add documents first.'

    # Each hit is (score, chunk_text, metadata); only the text goes into the prompt.
    passages = (chunk for _score, chunk, _meta in hits)
    prompt = build_prompt(question, '\n\n---\n\n'.join(passages))

    if client.api_key is None:
        return 'OPENAI_API_KEY not set. Please set it before calling the model.'

    request = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'temperature': 0.2,
        'max_tokens': 400,
    }
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
# 💡 Try your own question here
# Edit `q` and re-run this cell; whatever rag_answer() returns is printed.
q = 'Explain how chunk size affects retrieval.'
print(rag_answer(q))