In [1]:
!pip install sentence-transformers faiss-cpu transformers accelerate

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [27]:
with open("sample_docs/input.txt") as f:
    text = f.read()


In [28]:
print(text)

Recurrent Depressive Disorder (RDD) is a mood disorder characterized by repeated episodes of depression. 
Each depressive episode typically includes symptoms such as persistent low mood, loss of interest or pleasure, 
reduced energy, disturbed sleep, changes in appetite, feelings of guilt or worthlessness, poor concentration, 
and thoughts of death or suicide.

A diagnosis of Recurrent Depressive Disorder requires that the individual has experienced at least two 
depressive episodes separated by periods of remission. These episodes must last for at least two weeks 
and should not be attributable to substance use or organic mental disorders.

When Recurrent Depressive Disorder is described as “currently in remission,” it indicates that the individual 
does not currently meet the full diagnostic criteria for a depressive episode, although they have a documented 
history of recurrent episodes. During remission, symptoms may be absent or present only in mild form.

In the International Cla

In [29]:
def chunk_text(text, chunk_size=400, overlap=50):

    words = text.split()
    chunks = []

    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = words[start:end]
        chunks.append(" ".join(chunk))
        start = end - overlap  # moving back by overlap tokens

    return chunks


In [30]:
chunks=chunk_text(text)
print(chunk_text(text))

['Recurrent Depressive Disorder (RDD) is a mood disorder characterized by repeated episodes of depression. Each depressive episode typically includes symptoms such as persistent low mood, loss of interest or pleasure, reduced energy, disturbed sleep, changes in appetite, feelings of guilt or worthlessness, poor concentration, and thoughts of death or suicide. A diagnosis of Recurrent Depressive Disorder requires that the individual has experienced at least two depressive episodes separated by periods of remission. These episodes must last for at least two weeks and should not be attributable to substance use or organic mental disorders. When Recurrent Depressive Disorder is described as “currently in remission,” it indicates that the individual does not currently meet the full diagnostic criteria for a depressive episode, although they have a documented history of recurrent episodes. During remission, symptoms may be absent or present only in mild form. In the International Classificat

In [31]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(chunks)


In [32]:
import faiss
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [33]:
query = input("Enter your question:")
query_embedding = embedder.encode([query])
D, I = index.search(query_embedding, k=3)


Enter your question:Give me the correct coded classification for the following diagnosis: Recurrent depressive disorder, currently in remission


In [34]:
from transformers import pipeline

llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device=-1  # CPU
)


Device set to use cpu


In [35]:
k = 3  # number of chunks to retrieve

query_embedding = embedder.encode([query])
distances, indices = index.search(query_embedding, k)

retrieved_chunks = [chunks[i] for i in indices[0]]


In [36]:
context = "\n\n".join(retrieved_chunks)


In [37]:
prompt = f"""
Answer the question using ONLY the context below.

Context:
{context}

Question:
{query}

If the answer is not present in the context, say "Not found in the provided documents."

"""

response = llm(prompt, max_new_tokens=200)
print(response[0]["generated_text"])


Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors


F33.4


In [38]:
!pip install gradio




In [44]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.5.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.5.0-py3-none-any.whl (329 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.6/329.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.5.0


# Single Runner Script

In [46]:

import faiss
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from pypdf import PdfReader
import numpy as np

# ---------------- CONFIG ----------------
CHUNK_SIZE = 300
OVERLAP = 50
TOP_K = 4
FINAL_K = 2
MAX_CONTEXT_WORDS = 350
RELEVANCE_THRESHOLD = 0.15  # cosine similarity threshold

# ---------------- CACHE ----------------
query_cache = {}

# ---------------- MODELS ----------------
embedder = SentenceTransformer("all-MiniLM-L6-v2")

llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device=-1
)

# ---------------- FILE READING ----------------
def read_file(file):
    if file.name.endswith(".pdf"):
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
        return text.strip()
    else:
        return file.read().decode("utf-8").strip()

# ---------------- CHUNKING ----------------
def chunk_text(text, chunk_size=300, overlap=50):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        start = end - overlap
    return chunks

# ---------------- RERANKING ----------------
def rerank(chunks, embeddings, query_embedding, final_k=2):
    scores = embeddings @ query_embedding
    ranked_idx = np.argsort(scores)[::-1]
    return [chunks[i] for i in ranked_idx[:final_k]]

# ---------------- WEB SEARCH (SAFE FALLBACK) ----------------
def web_search_fallback(query):
    """
    Placeholder for real web search.
    This is intentionally explicit and honest.
    """
    return (
        "WEB SEARCH FALLBACK USED\n\n"
        "No sufficiently relevant context was found in the uploaded document.\n"
        "A real deployment would query a web search API here (e.g., Bing, Tavily).\n\n"
        f"Query: {query}"
    )

# ---------------- CORE RAG ----------------
def rag_pipeline(text, query):
    if query in query_cache:
        return query_cache[query]

    chunks = chunk_text(text)
    embeddings = embedder.encode(chunks, normalize_embeddings=True)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    query_embedding = embedder.encode([query], normalize_embeddings=True)[0]
    scores, indices = index.search(query_embedding.reshape(1, -1), TOP_K)

    top_score = scores[0][0]

    # ---------- NO CONTEXT FOUND ----------
    if top_score < RELEVANCE_THRESHOLD:
        answer = web_search_fallback(query)
        result = {
            "source": "web_search",
            "context": None,
            "used_chunks": None,
            "answer": answer
        }
        query_cache[query] = result
        return result

    # ---------- CONTEXT FOUND ----------
    candidate_chunks = [chunks[i] for i in indices[0]]
    candidate_embeddings = embeddings[indices[0]]

    selected_chunks = rerank(
        candidate_chunks,
        candidate_embeddings,
        query_embedding,
        FINAL_K
    )

    context = "\n\n".join(selected_chunks)
    context = " ".join(context.split()[:MAX_CONTEXT_WORDS])

    prompt = f"""
Answer the question using ONLY the context below.
If the answer is not present in the context, say "Not found in the provided documents."

Context:
{context}

Question:
{query}
"""

    response = llm(prompt, max_new_tokens=150)
    answer = response[0]["generated_text"]

    result = {
        "source": "document",
        "context": context,
        "used_chunks": selected_chunks,
        "answer": answer
    }

    query_cache[query] = result
    return result

# ---------------- UI ----------------
def ui(file, question):
    if file is None or question.strip() == "":
        return "Please upload a document and enter a question."

    extracted_text = read_file(file)

    if not extracted_text:
        return "Could not extract text from the document (possibly scanned PDF)."

    result = rag_pipeline(extracted_text, question)

    output = "===== EXTRACTED TEXT =====\n"
    output += extracted_text[:1500] + "\n\n"

    if result["source"] == "document":
        output += "===== RETRIEVED CONTEXT =====\n"
        output += result["context"] + "\n\n"

        output += "===== CONTEXT CHUNKS USED =====\n"
        for i, chunk in enumerate(result["used_chunks"], 1):
            output += f"[Chunk {i}]\n{chunk}\n\n"

        output += "===== ANSWER (FROM DOCUMENT) =====\n"
        output += result["answer"]

    else:
        output += "===== NO RELEVANT CONTEXT FOUND =====\n"
        output += "Falling back to web search.\n\n"
        output += "===== ANSWER (FROM WEB SEARCH) =====\n"
        output += result["answer"]

    return output

# ---------------- GRADIO APP ----------------
gr.Interface(
    fn=ui,
    inputs=[
        gr.File(label="Upload TXT or PDF"),
        gr.Textbox(lines=2, label="Question")
    ],
    outputs="text",
    title="Mini RAG QA System with Fallback Search",
    description="Shows extracted text, retrieved context, source attribution, and fallback behavior."
).launch()


Device set to use cpu


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8a6cdaf01da65de74f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


