#  Study Buddy - PDF Quizzer

An interactive PDF question-answering system using extractive QA with DistilBERT.

**Tech Stack:**
- Model: distilbert-base-cased-distilled-squad
- Pipeline: question-answering
- Purpose: Extract answers from textbook chapters


# Install Requirements
Run this cell first to install all dependencies.

In [2]:
# Install required packages (IPython `!` lines run as shell commands)
!pip install transformers torch PyPDF2 ipywidgets

# Enable the ipywidgets notebook extension so the widgets below can render
!jupyter nbextension enable --py widgetsnbextension

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 8.6 MB/s eta 0:00:00
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 47.5 MB/s eta 0:00:00
Installing collected packages: PyPDF2, jedi
Successfully installed PyPDF2-3.0.1 jedi-0.19.2
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK


In [3]:
from transformers import pipeline
import PyPDF2
from ipywidgets import FileUpload, Textarea, Button, VBox, HBox, HTML, Layout
from IPython.display import display, clear_output

# Load the Model



In [4]:
# Extractive question-answering pipeline shared by every lookup in this notebook
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


# PDF Text Extraction and Cleaning

In [5]:
def extract_text_from_pdf(pdf_bytes):
    """Extract whitespace-normalised text from an in-memory PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The text of all pages as one string, with every run of whitespace
        collapsed to a single space, or ``None`` if the PDF could not be
        read (the error is printed).
    """
    from io import BytesIO

    try:
        reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))

        # Defensive: should a page yield None instead of a string, treat it
        # as empty so one page cannot abort extraction of the whole file.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Deliberately broad: this is the boundary with the PDF parser, and
        # callers rely on a None return (not an exception) to signal failure.
        print(f" Error extracting PDF: {str(e)}")
        return None

    # Join pages with a space, then collapse all whitespace runs.
    return " ".join(" ".join(page_texts).split())



In [6]:
def chunk_text(text, chunk_size=2000, overlap=200):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunks in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Without this check the window would
            never advance and the loop would not terminate.
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "chunk_size must be positive and overlap must be smaller than "
            f"chunk_size (got chunk_size={chunk_size}, overlap={overlap})"
        )

    return [text[start:start + chunk_size] for start in range(0, len(text), step)]



In [7]:
def answer_question_improved(context, question):
    """Find the highest-scoring answer to ``question`` across ``context``.

    The context is split into overlapping chunks, each chunk is passed to
    ``qa_pipeline``, and the result with the highest ``score`` is kept.
    Progress is printed as the search runs.

    Args:
        context: Full text to search.
        question: The question to answer.

    Returns:
        The pipeline result dict with the best score, or ``None`` if no
        chunk produced an answer.
    """
    chunks = chunk_text(context, chunk_size=2000, overlap=200)

    print(f"Searching through {len(chunks)} text chunks...")

    best_answer = None
    best_score = 0

    # Try each chunk
    for i, chunk in enumerate(chunks):
        try:
            result = qa_pipeline(question=question, context=chunk)
        except Exception as e:
            # Best-effort search: one failing chunk must not abort the rest,
            # but report it so a systematic failure is not mistaken for
            # "no answer in the document".
            print(f"  → Chunk {i+1}: Skipped due to error: {e}")
            continue

        # Keep only the highest-confidence answer seen so far
        if result['score'] > best_score:
            best_score = result['score']
            best_answer = result
            print(f"  → Chunk {i+1}: Found answer with {result['score']*100:.1f}% confidence")

    if best_answer:
        print(f"Best answer found with {best_answer['score']*100:.1f}% confidence")
    else:
        print(" No answer found in any chunk")

    return best_answer



# Upload a PDF and Ask a Question

In [8]:
# Extracted PDF text shared by the upload and question handlers
pdf_context = None

# --- Interface widgets ---

# PDF picker (single file only)
file_upload = FileUpload(accept=".pdf", multiple=False, description="Upload PDF")

# Free-text box for the user's question
question_input = Textarea(
    description="Question:",
    placeholder='Type your question here (e.g., "What is the definition of photosynthesis?")',
    layout=Layout(height="80px", width="100%"),
)

# Button that triggers the search
ask_button = Button(description="Get Answer", icon="search", button_style="primary")

# Status messages and answers are rendered here as HTML
output_area = HTML()

def chunk_text(text, chunk_size=2000, overlap=200):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunks in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Without this check the window would
            never advance and the loop would not terminate.
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "chunk_size must be positive and overlap must be smaller than "
            f"chunk_size (got chunk_size={chunk_size}, overlap={overlap})"
        )

    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

def answer_question_improved(context, question):
    """
    Answer question by searching through text chunks and returning best answer.

    Args:
        context: Full text to search.
        question: The question to answer.

    Returns:
        The ``qa_pipeline`` result dict with the highest ``score``, or
        ``None`` if no chunk produced an answer.
    """
    import logging

    # Split context into chunks
    chunks = chunk_text(context, chunk_size=2000, overlap=200)

    best_answer = None
    best_score = 0

    for i, chunk in enumerate(chunks):
        try:
            result = qa_pipeline(question=question, context=chunk)
        except Exception as exc:
            # Best-effort search: keep going past a failing chunk, but log it
            # (rather than print, to keep the widget output clean) so that a
            # systematic failure is not mistaken for "no answer found".
            logging.getLogger(__name__).warning(
                "Skipping chunk %d/%d: %s", i + 1, len(chunks), exc
            )
            continue

        # Keep only the highest-confidence answer seen so far
        if result['score'] > best_score:
            best_score = result['score']
            best_answer = result

    return best_answer

# File upload handler
def on_file_upload(change):
    """Extract text from a newly uploaded PDF and report the outcome.

    Stores the extracted text in the module-level ``pdf_context`` (``None``
    when extraction fails or yields no text) and writes a status message to
    ``output_area``.

    Args:
        change: Change notification passed by the widget observer; unused,
            the current ``file_upload.value`` is read directly.
    """
    from html import escape

    global pdf_context

    uploads = file_upload.value
    if not uploads:
        return

    # Get the uploaded file. ipywidgets 7 exposes a dict keyed by filename;
    # ipywidgets 8 exposes a tuple of dicts that carry their own 'name'.
    if isinstance(uploads, dict):
        filename, uploaded_file = next(iter(uploads.items()))
    else:
        uploaded_file = uploads[0]
        filename = uploaded_file['name']

    # Normalise to bytes (ipywidgets 8 hands over a memoryview).
    pdf_bytes = bytes(uploaded_file['content'])

    # The filename is user-controlled; escape it before embedding in HTML.
    safe_filename = escape(str(filename))

    output_area.value = f"<p style='color: blue;'>Processing {safe_filename}...</p>"

    # Extract text; collapse "no text extracted" to None so the question
    # handler's "upload a PDF first" check also covers an empty document.
    pdf_context = extract_text_from_pdf(pdf_bytes) or None

    if pdf_context is None:
        output_area.value = "<p style='color: red;'>Failed to extract text from PDF. Please try another file.</p>"
        return

    # Calculate chunks
    num_chunks = len(chunk_text(pdf_context))

    output_area.value = f"""
            <div style='background-color: #d4edda; padding: 15px; border-radius: 5px; border: 1px solid #c3e6cb;'>
                <h4 style='color: #155724; margin-top: 0;'>PDF Loaded Successfully</h4>
                <p style='color: #155724;'>
                    <strong>File:</strong> {safe_filename}<br>
                    <strong>Characters extracted:</strong> {len(pdf_context):,}<br>
                    <strong>Text chunks created:</strong> {num_chunks}<br>
                    <strong>Ready for questions.</strong> Type your question below and click "Get Answer".
                </p>
            </div>
            """

# Question answering handler
def on_ask_button_click(b):
    """Answer the typed question against the uploaded PDF and show the result.

    Writes a prompt to ``output_area`` when no PDF text is available or the
    question is blank; otherwise runs ``answer_question_improved`` and
    renders the answer, or a "could not find" message.

    Args:
        b: The clicked button instance passed by ``Button.on_click``; unused.
    """
    from html import escape

    # Covers both "nothing uploaded yet" (None) and an upload that produced
    # no text (empty string).
    if not pdf_context:
        output_area.value = "<p style='color: red;'>Please upload a PDF first.</p>"
        return

    question = question_input.value.strip()

    if not question:
        output_area.value = "<p style='color: red;'>Please enter a question.</p>"
        return

    output_area.value = "<p style='color: blue;'>Searching for answer...</p>"

    result = answer_question_improved(pdf_context, question)

    if not result:
        output_area.value = "<p style='color: red;'>Could not find an answer. Try rephrasing your question.</p>"
        return

    # Both strings are free text (user input / PDF content); escape them so
    # characters such as '<' or '&' display literally instead of being
    # parsed as markup.
    safe_question = escape(question)
    safe_answer = escape(str(result['answer']))

    output_area.value = f"""
        <div style='background-color: #f8f9fa; padding: 15px; border-radius: 5px; border: 1px solid #dee2e6;'>
            <h4 style='color: #212529; margin-top: 0;'>Answer Found</h4>
            <p style='color: #212529;'>
                <strong>Question:</strong> {safe_question}<br><br>
                <strong>Answer:</strong> <span style='font-size: 1.1em;'>{safe_answer}</span>
            </p>
        </div>
        """

# Connect handlers
file_upload.observe(on_file_upload, names='value')
ask_button.on_click(on_ask_button_click)

# Display interface
display(VBox([
    HTML("<h2>Study Buddy - PDF Quizzer</h2>"),
    HTML("<p>Upload a PDF textbook chapter and ask questions to get instant answers.</p>"),
    HTML("<hr>"),
    HTML("<h3>Step 1: Upload Your PDF</h3>"),
    file_upload,
    HTML("<h3>Step 2: Ask Your Question</h3>"),
    question_input,
    ask_button,
    HTML("<hr>"),
    output_area
]))


VBox(children=(HTML(value='<h2>Study Buddy - PDF Quizzer</h2>'), HTML(value='<p>Upload a PDF textbook chapter …