In [None]:
# -------------------------
# Step 0 - Install libraries
# -------------------------
!pip install -q pdfplumber sentence-transformers faiss-cpu transformers gradio nltk


In [None]:
# -------------------------
# Step 1 - Imports
# -------------------------
import pdfplumber
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline
import gradio as gr

In [None]:
# -------------------------
# Step 2 - Download NLTK data
# -------------------------
# Fetch both tokenizer resources: "punkt" plus "punkt_tab", which newer NLTK
# versions look for. Looping keeps the resource list in one place and avoids
# echoing a stray `True` as the cell's output.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

True

In [None]:
# -------------------------
# Step 3 - Load Models (once at startup)
# -------------------------
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Fast embeddings model
generator = pipeline("text2text-generation", model="google/flan-t5-base")  # Small Q&A model

# Warmup models for faster first response
_ = embedder.encode(["Warmup"], convert_to_tensor=False)
# Use max_new_tokens, not max_length: the model's generation config already sets
# max_new_tokens, which takes precedence over max_length and triggers a warning,
# so max_length=10 was silently ignored.
_ = generator("This is a warmup test.", max_new_tokens=10)

Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=10) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [None]:
# -------------------------
# Step 4 - PDF Reading
# -------------------------
def read_pdf(file_path):
    """Return the text of a PDF with pages joined by newlines.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped. Leading and trailing whitespace of the combined
    text is stripped, so a PDF with no extractable text gives ``""``.
    """
    with pdfplumber.open(file_path) as document:
        extracted = (page.extract_text() for page in document.pages)
        page_texts = [content for content in extracted if content]
    return "\n".join(page_texts).strip()


In [None]:
# -------------------------
# Step 5 - Text Chunking
# -------------------------
def chunk_text(text, chunk_size=200):
    """Split text into sentence-aligned chunks for retrieval.

    Sentences (from ``sent_tokenize``) are accumulated until adding the next
    one would push the running word count past ``chunk_size``; the buffer is
    then emitted as one space-joined chunk and a new buffer is started.

    Args:
        text: The text to split.
        chunk_size: Word budget per chunk. A single sentence longer than the
            budget is kept whole as its own chunk rather than being cut.

    Returns:
        A list of chunk strings in document order; empty if no sentences
        were found. No chunk is ever an empty string.
    """
    chunks = []
    current_chunk = []
    current_len = 0
    for sent in sent_tokenize(text):
        sent_len = len(sent.split())
        # Flush only when the buffer has content: otherwise an oversized
        # sentence arriving on an empty buffer would emit an empty "" chunk.
        if current_chunk and current_len + sent_len > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
        current_chunk.append(sent)
        current_len += sent_len
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


In [None]:
# -------------------------
# Step 6 - Create FAISS Index
# -------------------------
def create_faiss_index(chunks):
    """Embed chunks and store them in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks to embed with ``embedder``.

    Returns:
        A ``(index, chunks)`` tuple: the populated ``faiss.IndexFlatL2`` and
        the same ``chunks`` list, so positions returned by a search can be
        mapped back to text.

    Raises:
        ValueError: If ``chunks`` is empty (there is nothing to index and the
            embedding dimension cannot be determined).
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # Local import keeps this cell self-contained; numpy is not imported at
    # the top of the notebook.
    import numpy as np

    # FAISS expects a contiguous float32 matrix of shape (n_chunks, dim).
    embeddings = np.ascontiguousarray(
        embedder.encode(chunks, convert_to_tensor=False), dtype=np.float32
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks

In [None]:
# -------------------------
# Step 7 - Retrieve Relevant Chunks
# -------------------------
def retrieve(query, index, chunks, top_k=3):
    """Retrieve the chunks most relevant to the question.

    Args:
        query: The question text to embed with ``embedder``.
        index: Search index built over ``chunks`` (see ``create_faiss_index``).
        chunks: The chunk texts, in the order they were added to ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings, in the order the index returned them.
        Fewer are returned when there are fewer chunks; ``[]`` when there
        are none or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []
    query_embedding = embedder.encode([query], convert_to_tensor=False)
    _distances, indices = index.search(query_embedding, k)
    # Defensive filter in case the index holds fewer vectors than `chunks`.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]


In [None]:
# -------------------------
# Step 8 - Generate Answer
# -------------------------
def generate_answer(question, context):
    """Use the generator model to answer the question from the context.

    Args:
        question: The user's question.
        context: Retrieved text the answer should be based on.

    Returns:
        The ``generated_text`` of the first result returned by ``generator``.
    """
    prompt = f"Answer the question using the context below:\n\nContext: {context}\n\nQuestion: {question}\nAnswer:"
    # Use max_new_tokens, not max_length: the model's generation config already
    # sets max_new_tokens, which takes precedence over max_length (with a
    # warning), so the intended 150-token cap was never applied.
    result = generator(prompt, max_new_tokens=150, clean_up_tokenization_spaces=True)
    return result[0]["generated_text"]

In [None]:
# -------------------------
# Step 9 - Full Pipeline
# -------------------------
def process_pdf_and_answer(pdf_file, question):
    """Run the full pipeline: read the PDF, index it, retrieve, and answer.

    ``pdf_file`` may be a path string or an object whose ``.name`` attribute
    holds the path. Returns the generated answer, or an explanatory message
    when no text could be extracted from the PDF.
    """
    # Accept either a plain path or a file object carrying its path in .name
    file_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    text = read_pdf(file_path)
    if text:
        index, indexed_chunks = create_faiss_index(chunk_text(text))
        relevant = retrieve(question, index, indexed_chunks)
        return generate_answer(question, " ".join(relevant))

    return "No text could be extracted from this PDF. It might be scanned or image-based."

In [None]:
# -------------------------
# Step 10 - Gradio UI
# -------------------------
def gradio_interface(pdf_file, question):
    """Validate the UI inputs, then answer the question from the PDF.

    Returns a prompt message when the PDF is missing or the question is
    missing or only whitespace; otherwise returns the result of
    ``process_pdf_and_answer``.
    """
    # A whitespace-only question is truthy but not a real question; reject it
    # here instead of running the whole pipeline for nothing.
    if not pdf_file or not question or not question.strip():
        return "Please upload a PDF and ask a question."
    return process_pdf_and_answer(pdf_file, question)

# Build the web UI: a PDF upload plus a question box in, one answer box out.
ui = gr.Interface(
    title="Course Notes Q&A Chatbot",
    description="Upload your PDF notes and ask questions!",
    fn=gradio_interface,
    inputs=[
        # Restrict the picker to PDFs, the only format read_pdf handles.
        gr.File(label="Upload your Course Notes PDF", file_types=[".pdf"]),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs=gr.Textbox(label="Answer"),
)

# Start serving the interface.
ui.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://19bfeaef825fc701e0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# New Section