In [None]:
!pip -q install pdfplumber scikit-learn transformers accelerate

import os, re, pdfplumber
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

# Fallback answer: returned when retrieval finds nothing relevant enough or the
# model output is empty, and quoted in the prompt as the exact refusal text.
NOT_FOUND = "I couldn’t find that in the provided document."

# Load small LLM: a text2text-generation pipeline around google/flan-t5-base,
# created once at module level and reused for every question.
gen = pipeline("text2text-generation", model="google/flan-t5-base", device_map="auto")

def extract_text(path):
    """Return the text of a ``.txt`` or ``.pdf`` file with outer whitespace removed.

    Args:
        path: Filesystem path to the document. The file extension
            (case-insensitive) selects the reader.

    Returns:
        The document text. Text files are decoded as UTF-8: undecodable bytes
        are dropped and a leading byte-order mark is removed. PDF pages are
        joined with a blank line; a page with no extractable text contributes
        an empty string.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        with open(path, "rb") as f:
            # "utf-8-sig" decodes exactly like "utf-8" but also discards a
            # leading BOM. Plain "utf-8" would keep it as "\ufeff", which
            # str.strip() does not treat as whitespace, so it would end up
            # at the start of the first chunk.
            text = f.read().decode("utf-8-sig", errors="ignore")
    elif ext == ".pdf":
        with pdfplumber.open(path) as pdf:
            # extract_text() may return None for pages without a text layer.
            pages = [p.extract_text() or "" for p in pdf.pages]
        text = "\n\n".join(pages)
    else:
        raise ValueError("Unsupported file type. Use .txt or .pdf")
    return text.strip()

def chunk_text(text, max_chars=1500, overlap=200):
    """Split ``text`` into overlapping chunks of at most ``max_chars`` characters.

    Carriage returns are removed first. Each chunk is a window of up to
    ``max_chars`` characters; when more text follows and the window contains
    a sentence boundary (``". "``) beyond 60% of ``max_chars``, the chunk is
    cut right after that period instead. Chunks are stripped of surrounding
    whitespace and empty ones are skipped.

    Args:
        text: The document text to split.
        max_chars: Maximum chunk length in characters; must be at least 1.
        overlap: Number of characters at the end of one chunk that are
            repeated at the start of the next.

    Returns:
        A list of non-empty chunk strings in document order.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        # A non-positive window can never consume any text.
        raise ValueError("max_chars must be at least 1")
    text = text.replace("\r", "")
    chunks = []
    i, n = 0, len(text)
    while i < n:
        end = min(i + max_chars, n)
        piece = text[i:end]
        if end < n:
            # Prefer ending on a sentence boundary, but only if that keeps
            # the chunk reasonably full (more than 60% of the window).
            cut = piece.rfind(". ")
            if cut > max_chars * 0.6:
                piece = piece[:cut + 1]
                end = i + cut + 1
        piece = piece.strip()
        if piece:
            chunks.append(piece)
        if end >= n:
            break
        # Step back by `overlap` so consecutive chunks share context. If the
        # overlap is as long as the chunk just emitted, stepping back would
        # not advance (the loop would never finish), so continue from `end`
        # without overlap for this step.
        next_start = end - overlap
        i = next_start if next_start > i else end
    return chunks

class Retriever:
    """TF-IDF index over text chunks, searched by cosine similarity."""

    def __init__(self, chunks):
        """Fit a TF-IDF vectorizer on ``chunks`` and store the document matrix.

        Tokens are lowercased runs of ASCII letters and digits.

        Args:
            chunks: Sequence of chunk strings to index. An empty sequence
                produces an empty index whose ``search`` always returns ``[]``.
        """
        self.chunks = chunks
        if len(chunks) == 0:
            # Fitting TF-IDF on zero documents fails with an "empty
            # vocabulary" ValueError; an empty document should simply
            # yield no search hits instead of crashing index construction.
            self.vectorizer = None
            self.mat = None
            return
        self.vectorizer = TfidfVectorizer(lowercase=True, token_pattern=r"[A-Za-z0-9]+")
        self.mat = self.vectorizer.fit_transform(chunks)

    def search(self, query, k=4):
        """Return the ``k`` chunks most similar to ``query``.

        Args:
            query: Free-text query string.
            k: Maximum number of hits to return.

        Returns:
            A list of ``(chunk_index, similarity, chunk_text)`` tuples ordered
            by descending cosine similarity. Hits are not filtered by score,
            so entries with similarity 0.0 may be included.
        """
        if self.mat is None:
            return []
        q = self.vectorizer.transform([query])
        sims = cosine_similarity(q, self.mat)[0]
        idxs = sims.argsort()[::-1][:k]
        return [(int(i), float(sims[i]), self.chunks[int(i)]) for i in idxs]

def build_prompt(question, contexts):
    """Assemble the grounded-answer prompt for the generator.

    Each context passage is labelled ``[#1]``, ``[#2]``, ... and the passages
    are separated by blank lines. The prompt tells the model to answer only
    from that context and to reply with the ``NOT_FOUND`` sentence otherwise,
    then appends the question followed by ``Answer:``.
    """
    numbered = []
    for position, passage in enumerate(contexts, start=1):
        numbered.append(f"[#{position}]\n{passage}")
    ctx = "\n\n".join(numbered)

    instructions = "You are a careful assistant that answers using ONLY the provided context.\n"
    fallback_rule = f"If not in the context, reply exactly: \"{NOT_FOUND}\"\n\n"
    task = f"Context:\n{ctx}\n\nQuestion: {question}\nAnswer:"
    return instructions + fallback_rule + task

def ask_question(question, retriever, k=4, min_score=0.08):
    """Answer ``question`` from the indexed document, or return ``NOT_FOUND``.

    Args:
        question: The user's question.
        retriever: Object whose ``search(query, k=...)`` returns a list of
            ``(index, score, chunk)`` tuples, best match first.
        k: Number of chunks to retrieve and place in the prompt.
        min_score: Minimum similarity the best hit must reach; below it the
            question is treated as unanswerable without calling the model.

    Returns:
        The generated answer with surrounding whitespace removed, or
        ``NOT_FOUND`` when retrieval yields nothing relevant enough or the
        model produces empty output.
    """
    hits = retriever.search(question, k=k)
    # Only the top hit is checked against the threshold; once it passes, all
    # retrieved chunks are passed to the model as context.
    if not hits or hits[0][1] < min_score:
        return NOT_FOUND
    contexts = [chunk for _, _, chunk in hits]
    prompt = build_prompt(question, contexts)
    # Request deterministic (greedy) decoding explicitly. Temperature is a
    # sampling-only setting and 0.0 is not a valid value for it, so it must
    # not be used as a stand-in for "no sampling".
    out = gen(prompt, max_new_tokens=256, do_sample=False)
    text = out[0]["generated_text"].strip()
    return text or NOT_FOUND

# Build the index once: read the document, split it into chunks, fit the retriever.
path = "/content/test.txt"  # change to your file
text = extract_text(path)
chunks = chunk_text(text)
retriever = Retriever(chunks)

print(f"✅ Loaded {len(chunks)} chunks from {path}")

# Interactive question loop; type 'exit' or 'quit' to stop.
while True:
    try:
        q = input("Ask a question (or 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted the prompt: leave the loop
        # quietly instead of ending with a traceback.
        print()
        break
    if q.lower() in ("exit", "quit"):
        break
    if not q:
        # Nothing to look up; ask again rather than running a retrieval.
        continue
    ans = ask_question(q, retriever)
    print("Answer:", ans)
