In [1]:
!pip install langchain langchain-community transformers sentence-transformers chromadb faiss-cpu pypdf


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinu

In [2]:
from pathlib import Path
from typing import List, Tuple
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma, FAISS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [3]:
# Default Hugging Face model identifiers: the first is the default for
# load_embeddings(), the second for build_hf_llm().
DEFAULT_EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_LLM_MODEL = "google/flan-t5-base"

# Working directories, all relative to the notebook's current directory.
BASE_DATA_DIR = Path("./data")
PERSIST_DIR = Path("./vector_store")   # passed to Chroma as persist_directory
FAISS_DIR = Path("./faiss_store")      # parent of the saved FAISS "index" folder

# Create every directory up front; re-running the cell is harmless because
# existing directories are accepted.
for _directory in (BASE_DATA_DIR, PERSIST_DIR, FAISS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

In [4]:
def read_uploaded_files(paths: List[str]) -> List[Tuple[str, str]]:
    """Load every file in ``paths`` and return ``(path, text)`` pairs.

    Args:
        paths: File paths to read. A path whose suffix is ``.pdf``
            (case-insensitive) is loaded with ``PyPDFLoader``; any other
            path is loaded with ``TextLoader`` using UTF-8.

    Returns:
        One ``(path, text)`` tuple per input path, in input order, where
        ``text`` is the ``page_content`` of every document the loader
        returned, joined with newlines.
    """

    def _load_documents(file_path):
        # Dispatch on the lower-cased extension only; everything that is not
        # a PDF is treated as plain text.
        if Path(file_path).suffix.lower() == ".pdf":
            return PyPDFLoader(file_path).load()
        return TextLoader(file_path, encoding="utf-8").load()

    return [
        (
            file_path,
            "\n".join(document.page_content for document in _load_documents(file_path)),
        )
        for file_path in paths
    ]

In [5]:
def chunk_texts(texts: List[str], chunk_size=800, chunk_overlap=120):
    """Split each text into chunks and return all chunks as one flat list.

    Args:
        texts: Raw strings to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        A list containing the chunks of the first text, then the chunks of
        the second, and so on.
    """
    # Separators are tried in this order: paragraph break, line break,
    # sentence end (with and without trailing space), then single space.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", ".", " "],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return [piece for text in texts for piece in text_splitter.split_text(text)]

In [6]:
def load_embeddings(model_name=DEFAULT_EMBED_MODEL):
    """Create the embedding model used to vectorise text.

    Args:
        model_name: Model identifier handed to ``HuggingFaceEmbeddings``;
            defaults to ``DEFAULT_EMBED_MODEL``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    # NOTE(review): the saved notebook output shows a warning pointing at this
    # constructor call (message truncated) - presumably a deprecation notice;
    # confirm before upgrading LangChain.
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return embedder

In [7]:

def build_vectorstore(db_type, embeddings, collection_name="rag_collection"):
    """Open (or initialise) the vector store selected by ``db_type``.

    Args:
        db_type: ``"Chroma"`` or ``"FAISS"``.
        embeddings: Embedding object handed to the store.
        collection_name: Chroma collection name; not used for FAISS.

    Returns:
        * ``"Chroma"``: a ``Chroma`` store persisted under ``PERSIST_DIR``.
        * ``"FAISS"``: the index saved under ``FAISS_DIR / "index"`` when
          its ``index.faiss`` file exists, otherwise a new index seeded with
          a single empty-string text.

    Raises:
        ValueError: If ``db_type`` is neither ``"Chroma"`` nor ``"FAISS"``.
    """
    if db_type not in ("Chroma", "FAISS"):
        raise ValueError("Unsupported DB type")

    if db_type == "Chroma":
        return Chroma(
            collection_name=collection_name,
            embedding_function=embeddings,
            persist_directory=str(PERSIST_DIR),
        )

    saved_index_dir = FAISS_DIR / "index"
    if not (saved_index_dir / "index.faiss").exists():
        # Nothing saved yet: seed the index with one empty text so a store
        # object exists; add_chunks_to_store() special-cases this seed.
        return FAISS.from_texts([""], embeddings)

    # NOTE(security): this flag opts in to unsafe deserialization of the saved
    # store. Only load index folders this notebook wrote itself, never
    # files from an untrusted source.
    return FAISS.load_local(
        str(saved_index_dir),
        embeddings,
        allow_dangerous_deserialization=True,
    )

In [8]:
def persist_faiss(vs: FAISS):
    """Save a FAISS store to ``FAISS_DIR / "index"``.

    The target folder is created first (including parents) if it does not
    exist yet, then ``vs.save_local`` writes the store into it.

    Args:
        vs: The FAISS vector store to save.
    """
    target_dir = FAISS_DIR / "index"
    target_dir.mkdir(parents=True, exist_ok=True)
    vs.save_local(str(target_dir))

In [9]:
def _is_placeholder_faiss(vs) -> bool:
    """Return True when ``vs`` holds only the single empty-string seed text."""
    id_map = getattr(vs, "index_to_docstore_id", None)
    if not id_map or len(id_map) != 1:
        return False
    # docstore.search returns the stored document, or an error string when
    # the id is unknown - getattr covers both cases.
    only_doc = vs.docstore.search(next(iter(id_map.values())))
    return getattr(only_doc, "page_content", None) == ""


def add_chunks_to_store(vs, chunks: List[str], db_type: str):
    """Add text chunks to a vector store and persist the result.

    Args:
        vs: The vector store to extend (a Chroma or FAISS store).
        chunks: Text chunks to index. An empty list is a no-op.
        db_type: ``"Chroma"`` or ``"FAISS"``; selects how the store is
            persisted after the chunks are added.

    Returns:
        The store that now contains the chunks. For a FAISS store that only
        holds the empty-string seed created by ``build_vectorstore`` this is
        a NEW store built from ``chunks`` (so the seed never shows up in
        search results); callers must use the returned object. In every
        other case it is ``vs`` itself.
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to embed; building or extending an index from zero texts
        # would only fail further down.
        return vs

    # The seed is detected through the FAISS id map rather than
    # len(vs.docstore): the in-memory docstore does not implement __len__,
    # and a bare "exactly one entry" test would also throw away an index
    # that legitimately contains a single real document.
    if db_type == "FAISS" and _is_placeholder_faiss(vs):
        vs = FAISS.from_texts(chunks, vs.embedding_function)
        persist_faiss(vs)
        return vs

    vs.add_texts(chunks)
    if db_type == "Chroma":
        vs.persist()
    elif db_type == "FAISS":
        persist_faiss(vs)
    return vs

In [10]:
def build_hf_llm(model_name=DEFAULT_LLM_MODEL, max_new_tokens=256, temperature=0.1):
    """Build a LangChain LLM backed by a local seq2seq transformers pipeline.

    Args:
        model_name: Hugging Face model identifier loaded with
            ``AutoTokenizer`` / ``AutoModelForSeq2SeqLM``.
        max_new_tokens: Upper bound on generated tokens per call.
        temperature: Sampling temperature. A positive value enables sampling
            at that temperature; ``0``, a negative value or ``None`` selects
            deterministic (non-sampling) decoding instead.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text2text-generation``
        pipeline.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        # Sampling requires a strictly positive temperature; asking for
        # temperature 0 means "be deterministic", so turn sampling off
        # rather than passing an invalid value through to generation.
        generation_kwargs["do_sample"] = False

    gen = pipeline(
        "text2text-generation",
        model=mdl,
        tokenizer=tok,
        **generation_kwargs,
    )
    return HuggingFacePipeline(pipeline=gen)

In [11]:

def build_qa_chain(retriever, llm, return_source_documents=True):
    """Assemble the retrieval question-answering chain.

    Args:
        retriever: Retriever that supplies the context documents.
        llm: Language model that writes the answer.
        return_source_documents: When True (the default) the chain output
            also carries the retrieved documents under
            ``"source_documents"``, which ``rag_answer`` reads to fill the
            Sources box. Without it that key is never present and the
            Sources box stays empty. Pass False for a chain with the single
            ``"result"`` output key.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` using a prompt that
        restricts the model to the retrieved context.
    """
    template = (
        "You are a helpful assistant. Use ONLY the provided context to answer. "
        "If the answer is not in the context, say you don't know.\n\n"
        "Context:\n{context}\n\n"
        "Question: {question}\n"
        "Answer:"
    )
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=return_source_documents,
    )


In [12]:
import gradio as gr

# Wire the pipeline together once, at module level; rag_answer() below reads
# the resulting `qa_chain` global.
embeddings = load_embeddings()
vectorstore = build_vectorstore(db_type="FAISS", embeddings=embeddings)
# Fetch the 3 nearest chunks for each question.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))
llm = build_hf_llm()
qa_chain = build_qa_chain(retriever=retriever, llm=llm)

def rag_answer(question):
    """Answer ``question`` with the module-level ``qa_chain``.

    Args:
        question: The user's question. ``None``, an empty string or a
            whitespace-only string is rejected with a warning message.

    Returns:
        A ``(answer, sources_text)`` tuple. ``answer`` is the chain's
        ``"result"`` value (``"(no answer)"`` when missing). ``sources_text``
        is a numbered list of the returned source documents, each shortened
        to at most 200 characters, separated by blank lines; it is empty
        when the chain returns no ``"source_documents"``.
    """
    max_snippet_chars = 200

    # A cleared input can arrive as None rather than "", so check that
    # before calling .strip().
    if question is None or not question.strip():
        return "⚠️ Please enter a question", ""

    # invoke() is the supported entry point; calling the chain object
    # directly is deprecated in current LangChain releases.
    result = qa_chain.invoke({"query": question})
    answer = result.get("result", "(no answer)")
    sources = result.get("source_documents") or []

    snippets = []
    for position, document in enumerate(sources, start=1):
        content = document.page_content
        # Only mark a snippet with an ellipsis when text was actually cut.
        if len(content) > max_snippet_chars:
            content = content[:max_snippet_chars] + "..."
        snippets.append(f"{position}. {content}")
    return answer, "\n\n".join(snippets)

  return HuggingFaceEmbeddings(model_name=model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
  return HuggingFacePipeline(pipeline=gen)


In [13]:


# Gradio front end: one question box, a button, and two read-only result
# boxes. Components appear in the page in the order they are created here.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Retrieval-Augmented Generation (RAG) – Q&A")

    q_input = gr.Textbox(label="❓ Ask a Question", placeholder="Type your question here...")
    answer_out = gr.Textbox(label="🧾 Answer")
    sources_out = gr.Textbox(label="📚 Sources")

    q_btn = gr.Button("💬 Get Answer")
    # rag_answer returns (answer, sources_text); the two values are routed to
    # the two output boxes in that order.
    q_btn.click(fn=rag_answer, inputs=q_input, outputs=[answer_out, sources_out])

# share=True requests a public share link (the recorded run shows a
# *.gradio.live URL), so anyone who has the link can query the app.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://941a0e6b13140ca834.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


