<a href="https://colab.research.google.com/github/navami-b/RetrievalQA/blob/main/RetrievalQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1 — install required packages (run once)
# NOTE: installs langchain + helper packages, faiss (CPU), sentence-transformers, transformers, and huggingface helper package
!pip install -qU langchain langchain-huggingface langchain-community langchain-text-splitters \
               sentence-transformers transformers huggingface-hub faiss-cpu pypdf


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/2.5 MB[0m [31m15.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m39.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/483.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m483.4/483.4 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [2]:
# Cell 2 — (optional) set Hugging Face token if you'll use HF cloud endpoints
# If you don't want to use HF cloud inference, skip this cell and we'll use a local small model instead.
import os, getpass
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    print("If you plan to use the Hugging Face Inference API, paste your token now (or press Enter to skip):")
    token = getpass.getpass("HUGGINGFACEHUB_API_TOKEN (paste, or blank to skip): ")
    if token:
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = token
        print("HUGGINGFACEHUB_API_TOKEN set in env.")
    else:
        print("No token set — will default to local HF model option.")


If you plan to use the Hugging Face Inference API, paste your token now (or press Enter to skip):
HUGGINGFACEHUB_API_TOKEN (paste, or blank to skip): ··········
No token set — will default to local HF model option.


In [3]:
# Cell 3 — Upload your sample company policy file (TXT or PDF) to Colab
from google.colab import files
uploaded = files.upload()   # choose the company-policy .txt or .pdf
if len(uploaded) == 0:
    raise SystemExit("Upload a file and re-run this cell.")
filepath = next(iter(uploaded.keys()))
print("Uploaded:", filepath)


Saving blind school.pdf to blind school.pdf
Uploaded: blind school.pdf


In [4]:
# Cell 4 — Load, split, embed, build FAISS, and run RetrievalQA
# Robust imports with fallbacks across LangChain versions
import os
from pprint import pprint

# ---- loaders & text splitters ----
try:
    # recent versions: community loaders
    from langchain_community.document_loaders import TextLoader, PyPDFLoader
except Exception:
    from langchain.document_loaders import TextLoader, PyPDFLoader

try:
    # recommended text splitter package
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except Exception:
    # fallback to langchain's own text_splitter
    from langchain.text_splitter import RecursiveCharacterTextSplitter

# ---- embeddings & vectorstore ----
# We'll try LangChain HuggingFace embeddings first (wraps sentence-transformers).
# If import path differs, try alternate packages.
HuggingFaceEmbeddings = None
try:
    # langchain-huggingface / langchain_huggingface wrapper
    from langchain_huggingface.embeddings import HuggingFaceEmbeddings
    HuggingFaceEmbeddings = HuggingFaceEmbeddings
except Exception:
    try:
        from langchain.embeddings.huggingface import HuggingFaceEmbeddings
        HuggingFaceEmbeddings = HuggingFaceEmbeddings
    except Exception:
        # last fallback: use sentence-transformers directly below
        HuggingFaceEmbeddings = None

try:
    # FAISS vectorstore from langchain-community
    from langchain_community.vectorstores import FAISS
except Exception:
    from langchain.vectorstores import FAISS

# ---- LLM wrappers (local HF pipeline or Hugging Face cloud) ----
local_llm_available = True
try:
    # preferred partner package for Hugging Face + LangChain integration
    from langchain_huggingface import HuggingFacePipeline
    from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
except Exception:
    # fallback to langchain.llms.HuggingFacePipeline (older imports)
    try:
        from langchain.llms import HuggingFacePipeline
        from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
    except Exception:
        HuggingFacePipeline = None
        local_llm_available = False

# ---------------- PARAMETERS ----------------
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"   # small, fast, good quality
LOCAL_GEN_MODEL = "google/flan-t5-small"                     # text2text model for local inference (small)
QUESTION = "What is the refund policy?"

# ---------------- 1) Load documents ----------------
ext = filepath.lower().split('.')[-1]
if ext == "txt":
    loader = TextLoader(filepath, encoding="utf-8", autodetect_encoding=True)
    docs = loader.load()
elif ext == "pdf":
    loader = PyPDFLoader(filepath)
    docs = loader.load()
else:
    raise ValueError("Unsupported file type. Use .txt or .pdf")

print(f"Loaded {len(docs)} document(s).")

# ---------------- 2) Split into chunks ----------------
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# splitter may expose split_documents or split_text depending on version
documents = []
for d in docs:
    # ensure we have text
    text = getattr(d, "page_content", str(d))
    # prefer split_documents if present (keeps Document metadata)
    if hasattr(splitter, "split_documents"):
        parts = splitter.split_documents([d])
        documents.extend(parts)
    else:
        parts = splitter.split_text(text)
        # create simple Document objects if the splitter returned strings
        from langchain_core.documents import Document
        for p in parts:
            documents.append(Document(page_content=p, metadata=getattr(d, "metadata", {})))

print("After splitting: total chunks =", len(documents))

# ---------------- 3) Create embeddings ----------------
# Try to use HuggingFaceEmbeddings wrapper (which uses sentence-transformers under the hood)
if HuggingFaceEmbeddings is not None:
    print("Using HuggingFaceEmbeddings (LangChain wrapper) with model:", EMBEDDING_MODEL)
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={"device": "cpu"})
    # FAISS.from_documents helper will call embeddings under the hood
    db = FAISS.from_documents(documents, embeddings)
else:
    # fallback: use sentence-transformers directly and then build a FAISS index manually via LangChain
    print("HuggingFaceEmbeddings not available — using sentence-transformers directly.")
    from sentence_transformers import SentenceTransformer
    import numpy as np
    model = SentenceTransformer(EMBEDDING_MODEL)
    texts = [d.page_content for d in documents]
    embs = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    # create FAISS index via langchain_community FAISS constructor:
    import faiss
    dim = embs.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embs)
    # wrap into LangChain FAISS vectorstore object
    from langchain_core.documents import Document as LC_Doc
    # create docstore mapping
    from langchain_community.docstore.in_memory import InMemoryDocstore
    from uuid import uuid4
    ids = [str(uuid4()) for _ in documents]
    docstore = InMemoryDocstore({i: doc for i, doc in zip(ids, documents)})
    db = FAISS(embedding_function=None, index=index, docstore=docstore, index_to_docstore_id={i: ids[idx] for idx, i in enumerate(range(len(ids)))})
    # Note: this low-level fallback is more complex; prefer HuggingFaceEmbeddings path.

print("Vector store ready. Number of vectors approx:", len(documents))

# ---------------- 4) Make a retriever ----------------
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 3})

# ---------------- 5A) Option: Build a local LLM via transformers pipeline (no HF token required) ----
llm = None
if local_llm_available:
    try:
        print("Loading local small text2text HF model (may download ~100-300MB):", LOCAL_GEN_MODEL)
        pipe = pipeline("text2text-generation", model=LOCAL_GEN_MODEL, max_new_tokens=256, device=-1)  # CPU
        # Wrap pipeline into LangChain LLM wrapper (langchain_huggingface.HuggingFacePipeline)
        try:
            # modern partner package
            from langchain_huggingface import HuggingFacePipeline as HFFPipe
            llm = HFFPipe(pipeline=pipe)
        except Exception:
            from langchain_huggingface import HuggingFacePipeline as HFFPipe
            llm = HFFPipe(pipeline=pipe)
        print("Local LLM ready.")
    except Exception as e:
        print("Local HF model option failed or not available:", e)
        llm = None

# ---------------- 5B) Option: Use Hugging Face Inference / Hub via token (cloud) ----
# If you set HUGGINGFACEHUB_API_TOKEN above, you can uncomment and use this instead of local model:
if llm is None and os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    try:
        print("Falling back to HuggingFaceHub/cloud LLM (requires HF token).")
        from langchain_huggingface import HuggingFaceEndpoint, HuggingFacePipeline, HuggingFaceHub
        # Example: use a text2text model available as an inference endpoint (model ID example)
        repo_id = "google/flan-t5-small"  # or any model you have access to on HF
        # use HuggingFacePipeline.from_model_id or HuggingFaceHub wrapper
        llm = HuggingFacePipeline.from_model_id(model_id=repo_id, task="text2text-generation", pipeline_kwargs={"max_new_tokens":256})
        print("Hugging Face cloud LLM ready.")
    except Exception as e:
        print("Hugging Face cloud LLM option failed:", e)
        llm = None

if llm is None:
    raise SystemExit("No LLM available — either install langchain_huggingface + transformers or provide HUGGINGFACEHUB_API_TOKEN. See earlier instructions.")

# ---------------- 6) Build a RetrievalQA (classic) and run query ----------------
# NOTE: RetrievalQA.from_chain_type is still widely used (works). The modern pattern is create_retrieval_chain.
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)

# Run the question
res = qa({"query": QUESTION})
print("\n===== ANSWER =====")
pprint(res.get("result") if isinstance(res, dict) else res)

# Print source documents returned (if any)
if isinstance(res, dict) and res.get("source_documents"):
    print("\n--- SOURCES (top retrieved chunks) ---")
    for i, s in enumerate(res["source_documents"], 1):
        preview = s.page_content[:400].replace("\n", " ")
        print(f"[source {i}] (metadata={s.metadata})\n{preview}\n")


Loaded 2 document(s).
After splitting: total chunks = 4
Using HuggingFaceEmbeddings (LangChain wrapper) with model: sentence-transformers/all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store ready. Number of vectors approx: 4
Loading local small text2text HF model (may download ~100-300MB): google/flan-t5-small


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Local LLM ready.


  res = qa({"query": QUESTION})
Token indices sequence length is longer than the specified maximum sequence length for this model (1710 > 512). Running this sequence through the model will result in indexing errors



===== ANSWER =====
('d i t o n d i t o n d i n d i n d i n d i n d i n d i n d i n d i n d i n d '
 'i n d i n d i n d i n d i n d i n d i n d i n d i n d i n d i n d i n d i n '
 'd i n d i n d i n d i n d i n d i n d i n d i n d i n d i n d i n d i n d i '
 'n d i n d i n d i n d i n d')

--- SOURCES (top retrieved chunks) ---
[source 1] (metadata={'producer': 'Skia/PDF m133 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'blind school', 'source': 'blind school.pdf', 'total_pages': 2, 'page': 1, 'page_label': '2'})
3 .  F e e d b a c k  o n  A s s i s t i v e  T e c h n o l o g y  : - ● A  m a j o r i t y  o f  s t u d e n t s  p r e f e r  a u d i t o r y  f e e d b a c k  ( b e e p s  o r  a l a r m s )  r a t h e r  t h a n  v i b r a t o r y  f e e d b a c k  d u e  t o  c l a r i t y  a n d  f a m i l i a r i t y . ● T h e  i n t e g r a t i o n  o f  m o i s t u r e , l i g h t  a n d  u l t r a s o n 

[source 2] (metadata={'producer': 'Skia/PDF m133 G