<a href="https://colab.research.google.com/github/marthaharika4-glitch/AI-Powered-PDF-based-stack-squad-/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =======================
# 🛠️ Install Dependencies
# =======================
!pip install -U pip
!pip install gradio pymupdf faiss-cpu sentence-transformers transformers accelerate --quiet

# =======================
# 📚 Imports and Setup
# =======================
import fitz  # PyMuPDF
import faiss
import gradio as gr
import torch
import tempfile
import os
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from typing import List

# =======================
# 📌 Load Models
# =======================
def load_models():
    """Create the sentence embedder and the text-generation pipeline.

    Returns:
        A ``(embedder, qa_pipeline)`` tuple: the ``all-MiniLM-L6-v2``
        SentenceTransformer and a ``text-generation`` pipeline wrapping the
        Mistral instruct checkpoint, capped at 512 new tokens per call.
    """
    # Open-source instruct checkpoint; change this id to try another model
    # (for example an IBM Granite checkpoint, if one is available).
    checkpoint = "mistralai/Mistral-7B-Instruct-v0.1"

    sentence_encoder = SentenceTransformer("all-MiniLM-L6-v2")

    # Half precision when a CUDA device is present, full precision otherwise.
    if torch.cuda.is_available():
        weight_dtype = torch.float16
    else:
        weight_dtype = torch.float32

    llm_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    llm = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=weight_dtype,
        device_map="auto",
    )
    generator = pipeline(
        "text-generation",
        model=llm,
        tokenizer=llm_tokenizer,
        max_new_tokens=512,
    )
    return sentence_encoder, generator

# Load both models once at module level; process_pdfs and handle_question
# below read these two globals instead of reloading the models per request.
embedder, qa_pipeline = load_models()

# =======================
# 📄 PDF Extraction
# =======================
def _read_pdf_pages(path) -> List[str]:
    """Return the text of every page of the PDF at *path*, in page order."""
    doc = fitz.open(path)
    try:
        return [page.get_text() for page in doc]
    finally:
        # Close the document even if text extraction fails part-way through.
        doc.close()


def extract_text_from_pdfs(pdf_files) -> str:
    """Concatenate the text of every page of every uploaded PDF.

    Args:
        pdf_files: Iterable of uploads. Each item may be a filesystem path
            (``str`` or ``os.PathLike``) or a binary file-like object exposing
            ``read()``. ``None`` or an empty iterable is treated as
            "nothing uploaded".

    Returns:
        The page texts joined in upload/page order, or ``""`` when there is
        nothing to read.
    """
    if not pdf_files:
        return ""

    page_texts = []
    for file in pdf_files:
        # Uploads that arrive as paths can be opened directly by PyMuPDF.
        if isinstance(file, (str, os.PathLike)):
            page_texts.extend(_read_pdf_pages(os.fspath(file)))
            continue

        # File-like upload: spill the bytes to a temporary .pdf first.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        try:
            # Close the handle before PyMuPDF opens the path; some platforms
            # refuse to open a file that is still held open for writing.
            with tmp:
                tmp.write(file.read())
            page_texts.extend(_read_pdf_pages(tmp.name))
        finally:
            # delete=False means we own the cleanup, including on errors.
            os.remove(tmp.name)

    return "".join(page_texts)

# =======================
# 🔪 Text Chunking
# =======================
def chunk_text(text: str, chunk_size=300, overlap=50) -> List[str]:
    """Split *text* into overlapping chunks of whole sentences.

    Sentences are obtained by splitting on ``'.'`` and are re-joined with a
    single space (the periods themselves are not restored).

    Args:
        text: Raw text to split.
        chunk_size: Target maximum number of words per chunk. A single
            sentence longer than this still becomes its own chunk.
        overlap: Maximum number of words repeated from the end of one chunk
            at the start of the next. ``0`` disables overlap.

    Returns:
        The list of chunk strings; empty when *text* contains no sentences.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in text.split('.'):
        sentence = sentence.strip()
        if not sentence:
            continue
        sentence_length = len(sentence.split())

        # Only flush a non-empty chunk, so an oversized first sentence does
        # not produce an empty "" chunk.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))

            # Carry over trailing sentences as overlap, measured in words and
            # limited so the new chunk still fits once `sentence` is added.
            budget = min(overlap, chunk_size - sentence_length)
            tail = []
            tail_length = 0
            # Skipping the first sentence guarantees the next chunk starts
            # later than this one, so chunking always makes progress.
            for previous in reversed(current_chunk[1:]):
                previous_length = len(previous.split())
                if tail_length + previous_length > budget:
                    break
                tail.append(previous)
                tail_length += previous_length
            tail.reverse()

            current_chunk = tail
            current_length = tail_length

        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# =======================
# 🔍 FAISS Search
# =======================
def build_faiss_index(chunks: List[str], embedder):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks to index.
        embedder: Object whose ``encode(chunks)`` returns a 2-D array of
            embeddings, one row per chunk.

    Returns:
        ``(index, embeddings)``: the populated ``faiss.IndexFlatL2`` and the
        embedding matrix that was added to it.

    Raises:
        ValueError: If *chunks* is empty; there is no embedding dimension to
            size the index with.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    embeddings = embedder.encode(chunks)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index, embeddings

def retrieve_relevant_chunks(query, index, chunks, embedder, embeddings, top_k=3):
    """Return the chunks whose embeddings are nearest to *query*.

    Args:
        query: Question text to embed.
        index: Search index exposing ``search(vectors, k)`` and returning
            ``(distances, indices)``.
        chunks: Chunk strings, positionally aligned with the index rows.
        embedder: Object whose ``encode([query])`` yields the query vector.
        embeddings: Unused; kept so existing callers keep working.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to *top_k* chunk strings, in the order the index returned them.
        Empty when there are no chunks or *top_k* is not positive.
    """
    if not chunks or top_k <= 0:
        return []

    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    query_embedding = embedder.encode([query])
    _distances, indices = index.search(query_embedding, k)

    # FAISS pads missing neighbours with -1; Python would read that as
    # "last chunk", so drop anything outside the valid range.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# =======================
# 🧠 Generate Answer
# =======================
def generate_answer(context, question, qa_pipeline):
    """Ask the generation pipeline to answer *question* using *context*.

    Args:
        context: Retrieved passage text placed in the prompt.
        question: The user's question.
        qa_pipeline: Callable taking the prompt and returning a list whose
            first item is a dict with a ``"generated_text"`` entry.

    Returns:
        The text following the last ``### Answer:`` marker in the generated
        text, with surrounding whitespace removed.
    """
    answer_marker = "### Answer:"
    prompt = f"""### Instruction:
Answer the question based on the provided context.

### Context:
{context}

### Question:
{question}

### Answer:"""
    generations = qa_pipeline(prompt)
    generated_text = generations[0]["generated_text"]
    # Keep only what comes after the final marker (the whole text if the
    # marker is absent).
    _, _, answer = generated_text.rpartition(answer_marker)
    return answer.strip()

# =======================
# 🔗 Gradio Interface
# =======================
def process_pdfs(files):
    """Extract, chunk and index the uploaded PDFs for later questions.

    Args:
        files: The uploads from the file input; ``None`` or empty when the
            user has not selected anything.

    Returns:
        ``(status, chunks, index, embeddings)``. On success the last three
        hold the chunk list, the FAISS index and the embedding matrix. When
        nothing was uploaded or no text could be extracted they are
        ``[]``, ``None`` and ``None`` and *status* explains why.
    """
    if not files:
        return "Please upload at least one PDF first.", [], None, None

    text = extract_text_from_pdfs(files)
    chunks = chunk_text(text)
    # PDFs without a text layer yield no chunks, and an index cannot be built
    # from zero embeddings, so report that instead of crashing.
    if not chunks:
        return "No extractable text was found in the uploaded PDFs.", [], None, None

    index, embeddings = build_faiss_index(chunks, embedder)
    return "PDFs processed successfully.", chunks, index, embeddings

def handle_question(question, chunks, index, embeddings):
    """Answer *question* from the previously processed PDF chunks.

    Args:
        question: Text entered by the user.
        chunks: Chunk list produced by ``process_pdfs`` (empty if none).
        index: Search index produced by ``process_pdfs`` (``None`` if none).
        embeddings: Embedding matrix produced by ``process_pdfs``; forwarded
            to the retriever.

    Returns:
        The generated answer, or a short instruction message when no PDFs
        have been processed yet or the question is blank.
    """
    # Explicit None check: the state starts as None, and the truthiness of an
    # index object is not a reliable "is it set" test.
    if not chunks or index is None:
        return "Please upload and process PDFs first."
    # Skip retrieval and generation entirely for an empty question.
    if not question or not question.strip():
        return "Please enter a question."

    top_chunks = retrieve_relevant_chunks(question, index, chunks, embedder, embeddings)
    context = "\n".join(top_chunks)
    return generate_answer(context, question, qa_pipeline)

# Build the Gradio UI: an upload/process row, a status box and a Q&A row.
with gr.Blocks() as demo:
    gr.Markdown("## 📘 StudyMate: AI PDF Q&A for Students")

    # State holders that carry the outputs of process_pdfs (chunks, FAISS
    # index, embeddings) over to handle_question.
    state_chunks = gr.State([])
    state_index = gr.State(None)
    state_embeddings = gr.State(None)

    # Row 1: PDF upload (multiple files allowed) and the button that processes them.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF(s)", file_types=[".pdf"], file_count="multiple")
        process_btn = gr.Button("📄 Process PDFs")

    # Read-only box showing the status string returned by process_pdfs.
    status_box = gr.Textbox(label="Status", interactive=False)

    # Row 2: question entry next to the generated answer.
    with gr.Row():
        question_input = gr.Textbox(label="Ask a Question", placeholder="e.g., What is the main idea in Chapter 2?")
        answer_output = gr.Textbox(label="Answer", lines=6)

    # Clicking the button runs process_pdfs on the uploads; its four return
    # values fill the status box and the three state holders, in that order.
    process_btn.click(
        fn=process_pdfs,
        inputs=pdf_input,
        outputs=[status_box, state_chunks, state_index, state_embeddings]
    )

    # Submitting the question textbox runs handle_question with the question
    # plus the stored state, and writes the result to the answer box.
    question_input.submit(
        fn=handle_question,
        inputs=[question_input, state_chunks, state_index, state_embeddings],
        outputs=answer_output
    )

# Start the app with Gradio's default launch settings.
demo.launch()


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Device set to use cuda:0


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c467837db5ad1cb6fd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


