In [3]:
import pdfplumber
import docx
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr

In [4]:
# 1. Load any document
def load_document(file_path):
    """Read a document from disk and return its text as a single string.

    The format is chosen from the file extension, compared
    case-insensitively so that ``REPORT.PDF`` is treated like ``report.pdf``:

    * ``.pdf``  - text of every page that yields text, joined by newlines
      (via ``pdfplumber``).
    * ``.docx`` - text of every paragraph, joined by newlines
      (via ``python-docx``).
    * ``.txt``  - the file content decoded as UTF-8.

    Args:
        file_path: Path to the document. Anything whose ``str()`` is a
            usable path is accepted.

    Returns:
        str: The extracted text (may be empty, e.g. for a PDF whose pages
        yield no text).

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    path = str(file_path)
    # Compare against a lower-cased copy; the original path is still used
    # for opening the file so case-sensitive filesystems keep working.
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # Extract each page exactly once, then drop pages that yield
            # nothing (extract_text() may return None or "").
            page_texts = [page.extract_text() for page in pdf.pages]
        text = "\n".join(page_text for page_text in page_texts if page_text)
    elif lowered.endswith(".docx"):
        doc = docx.Document(path)
        text = "\n".join(para.text for para in doc.paragraphs)
    elif lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file type. Use .pdf, .docx, or .txt")
    return text

In [5]:
# 2. Chunk text
def chunk_text(text, chunk_size=150):
    """Split text into consecutive, non-overlapping chunks of words.

    The text is tokenised on whitespace; each chunk is the next
    ``chunk_size`` tokens re-joined with single spaces. The final chunk
    may be shorter.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        list[str]: The chunks in document order (empty for blank text).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces


In [6]:
# 3. Embeddings
# Sentence-embedding model, instantiated once when this cell runs and shared
# by embed_chunks() (document chunks) and retrieve_chunks() (queries), so
# both sides are embedded by the same model.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunks(chunks):
    """Embed a list of text chunks with the shared ``embedder`` model.

    Args:
        chunks: The text chunks to encode.

    Returns:
        numpy.ndarray: Whatever ``embedder.encode(chunks)`` produced,
        wrapped in a NumPy array.
    """
    vectors = embedder.encode(chunks)
    return np.array(vectors)

def build_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over a matrix of embeddings.

    Args:
        embeddings: Array-like with one embedding vector per row. It is
            converted to a C-contiguous ``float32`` matrix, which is the
            layout FAISS's Python bindings require.

    Returns:
        faiss.IndexFlatL2: An index containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example
            the 1-D empty array obtained from encoding an empty list), since
            the vector dimension cannot be determined.
    """
    # Coerce up front so float64 or non-contiguous input does not fail
    # deep inside index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim); "
            f"got shape {vectors.shape}"
        )
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index

def retrieve_chunks(query, chunks, index, top_k=5):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: The question text to embed with the shared ``embedder``.
        chunks: The chunk strings, in the same order their embeddings were
            added to ``index``.
        index: A FAISS index exposing ``ntotal`` and ``search``.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` chunks, nearest first. Fewer are returned
        when the index holds fewer vectors; an empty list when nothing can
        be retrieved.
    """
    # Never request more neighbours than exist: FAISS pads missing results
    # with id -1, and chunks[-1] would silently repeat the last chunk.
    k = min(top_k, len(chunks), index.ntotal)
    if k <= 0:
        return []

    query_emb = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, neighbour_ids = index.search(query_emb, k)

    # Defensive filter: drop any padding (-1) or out-of-range ids.
    return [chunks[i] for i in neighbour_ids[0] if 0 <= i < len(chunks)]

In [7]:
# 4. Answering (use Flan-T5 for concise answers)
# Text-to-text generation pipeline backed by google/flan-t5-base, created
# once when this cell runs and reused by answer_question() for every query.
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

def answer_question(query, context_chunks):
    """Generate an answer to ``query`` from the retrieved context chunks.

    The chunks are joined with newlines and placed in a prompt ahead of the
    question; the prompt is passed to the shared ``qa_pipeline`` with
    ``max_new_tokens=200``.

    Args:
        query: The user's question.
        context_chunks: Iterable of chunk strings used as context.

    Returns:
        The ``"generated_text"`` field of the pipeline's first result.
    """
    joined_context = "\n".join(context_chunks)
    prompt = (
        "Answer the question based on the context:\n\n"
        f"{joined_context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    outputs = qa_pipeline(prompt, max_new_tokens=200)
    first_result = outputs[0]
    return first_result["generated_text"]


Device set to use cpu


In [17]:
# 5. Gradio Interface

def chat_with_doc(file, question):
    """Answer a question about an uploaded document (Gradio callback).

    Runs the full pipeline on every call: load the document, split it into
    chunks, embed and index the chunks, retrieve the chunks nearest to the
    question, and generate an answer from them.

    Args:
        file: The uploaded document, either as a filesystem path or as an
            object exposing the path via a ``.name`` attribute. ``None``
            when nothing has been uploaded.
        question: The user's question.

    Returns:
        str: The generated answer, or a short explanatory message when the
        file or question is missing or the document contains no text.
    """
    # Guard the inputs first so a missing upload or blank question yields a
    # readable message instead of an AttributeError or a wasted model call.
    if file is None:
        return "Please upload a document first."
    question = (question or "").strip()
    if not question:
        return "Please enter a question."

    # Accept both a plain path and a file-like wrapper carrying `.name`.
    file_path = getattr(file, "name", file)

    # Load document
    text = load_document(file_path)
    # Chunk
    chunks = chunk_text(text)
    if not chunks:
        # Nothing to embed (e.g. a scanned PDF with no text layer); stop
        # before the embedding/indexing step, which cannot handle zero rows.
        return "The document does not contain any extractable text."

    # Embed + Index
    embeddings = embed_chunks(chunks)
    index = build_faiss_index(embeddings)
    # Retrieve + Answer
    retrieved_chunks = retrieve_chunks(question, chunks, index)
    return answer_question(question, retrieved_chunks)

# Build the UI. Components are created inside the `with` contexts, so the
# statement order below is also the on-screen order: heading, file upload
# (in its own row), question box, answer box, then the button.
with gr.Blocks() as demo:
    gr.Markdown("## 📘 Chat with Your Document")
    with gr.Row():
        # NOTE(review): type="filepath" suggests the callback receives a path
        # rather than a file object — confirm against the installed Gradio
        # version, since chat_with_doc reads `file.name`.
        file_input = gr.File(label="Upload Document", type="filepath")
    question_input = gr.Textbox(label="Ask a Question")
    answer_output = gr.Textbox(label="Answer")
    btn = gr.Button("Get Answer")
    # Clicking the button calls chat_with_doc(file, question) and writes the
    # returned string into the answer box.
    btn.click(chat_with_doc, inputs=[file_input, question_input], outputs=answer_output)

# Start the app; the captured cell output shows it serving on a local URL.
demo.launch()

* Running on local URL:  http://127.0.0.1:7866
* To create a public link, set `share=True` in `launch()`.


