
# Workflow for AI Quiz Generator


In [None]:
# Imports
import os
from pathlib import Path
import fitz  # PyMuPDF
import json
import re
import unicodedata
from collections import Counter

### Setup Output Structure

In [None]:
def setup_output_structure(pdf_name, display_path=False):
    """
    Prepare the per-PDF output directory and describe the files kept in it.

    The directory ./outputs/<pdf_name>/ is created if it does not exist yet.
    Only the directory is created here; the files themselves are written by
    later steps. Layout:
        ./outputs/<pdf_name>/
            - raw_text.txt
            - chunks.json
            - questions.json
            - run_metadata.json

    Args:
        pdf_name: Name of the sub-folder created under ./outputs.
        display_path: When True, print the resolved locations.

    Returns:
        dict with the keys "base", "raw_text", "chunks_json",
        "questions_json" and "metadata_json", each mapped to a Path.
    """
    root = Path("./outputs") / pdf_name
    root.mkdir(parents=True, exist_ok=True)

    # Dictionary key -> file name inside the output directory.
    file_names = {
        "raw_text": "raw_text.txt",
        "chunks_json": "chunks.json",
        "questions_json": "questions.json",
        "metadata_json": "run_metadata.json",
    }

    output_paths = {"base": root}
    for key, file_name in file_names.items():
        output_paths[key] = root / file_name

    if display_path:
        print(f"\nOutput structure will be created as follows: {root}")
        for key, file_name in file_names.items():
            print(f"  - {file_name}: {output_paths[key]}")
        print('\n')

    return output_paths

### Extract PDF Text

In [None]:
def extract_text_from_pdf(pdf_path, output_paths, return_pages=False, save_pages=False):
    """
    Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Location of the PDF (str or Path).
        output_paths: Mapping of output locations. Only the "raw_text" entry
            is read, and only when save_pages is True.
        return_pages: When True, return the list of per-page strings.
        save_pages: When True, write all pages, separated by blank lines,
            to output_paths["raw_text"].

    Returns:
        A list with one string per page when return_pages is True,
        otherwise None.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")

    # The context manager closes the document even if a page fails to extract.
    with fitz.open(pdf_path) as doc:
        pages_text = [page.get_text("text") for page in doc]

    print(f"Extracted {len(pages_text)} pages")
    # Guard the preview so a document without pages does not raise IndexError.
    if pages_text:
        print("--- Page 1 preview ---")
        print(pages_text[0][:500])

    # Save extracted text to the raw text file
    if save_pages:
        raw_text_file = output_paths["raw_text"]
        with open(raw_text_file, "w", encoding="utf-8") as f:
            f.write("\n\n".join(pages_text))
        print(f"Saved extracted text to {raw_text_file}")

    return pages_text if return_pages else None

### Cleaning the Text

In [None]:
def clean_text(text):
    """
    Apply light, non-heuristic cleanup to text extracted from a PDF.

    Steps, in order:
    1. NFKC Unicode normalization (expands ligatures such as ﬁ and ﬀ)
    2. Convert all line endings to "\\n"
    3. Collapse runs of spaces/tabs to one space and drop trailing whitespace
    4. Drop lines that contain only a number of up to four digits
       (standalone page numbers such as '15' or '203')
    5. Reduce three or more consecutive newlines to two

    Deliberately out of scope (may be added later):
    - header/footer detection
    - paragraph restructuring
    - figure/table removal
    - heuristic guessing

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    normalized = unicodedata.normalize("NFKC", text)
    normalized = normalized.replace("\r\n", "\n").replace("\r", "\n")
    normalized = re.sub(r"[ \t]+", " ", normalized)

    def looks_like_page_number(line):
        # A line holding nothing but 1-4 digits is treated as a page number.
        token = line.strip()
        return len(token) <= 4 and token.isdigit()

    trimmed_lines = (raw_line.rstrip() for raw_line in normalized.split("\n"))
    kept_lines = [line for line in trimmed_lines if not looks_like_page_number(line)]

    collapsed = re.sub(r"\n{3,}", "\n\n", "\n".join(kept_lines))
    return collapsed.strip()

### Chunking Text

In [None]:
def chunk_fixed_size(text, output_paths, chunk_size=4000, overlap=400, min_chunk_size=800, save_pages=False):
    """
    Split text into fixed-size character chunks with overlap.

    Windows of chunk_size characters are taken every
    (chunk_size - overlap) characters. Each window is stripped, and windows
    whose stripped length is below min_chunk_size are not emitted on their
    own. No text is lost at the end of the document:
    - a too-short final window is merged into the previous chunk, which may
      therefore be longer than chunk_size;
    - a document that fits in a single window is always returned as one
      chunk, even when it is shorter than min_chunk_size.

    Args:
        text: The text to split.
        output_paths: Mapping of output locations. Only "chunks_json" is
            read, and only when save_pages is True.
        chunk_size: Window length in characters; must be positive.
        overlap: Characters shared by consecutive windows; must be >= 0.
            If it is >= chunk_size the window advances one character at a time.
        min_chunk_size: Minimum stripped length for a standalone chunk.
        save_pages: When True, write the chunks as JSON to
            output_paths["chunks_json"].

    Returns:
      List of chunks:
      {
        "chunk_id": int,
        "text": str,
        "char_len": int
      }

    Raises:
        ValueError: If chunk_size is not positive or overlap is negative.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0:
        raise ValueError(f"overlap must not be negative, got {overlap}")

    text_len = len(text)
    # Always advance by at least one character so the loop terminates.
    step = max(1, chunk_size - overlap)

    pieces = []
    last_kept_start = None  # start offset of the most recently kept window
    start = 0

    while start < text_len:
        end = start + chunk_size
        piece = text[start:end].strip()
        reached_end = end >= text_len

        if len(piece) >= min_chunk_size:
            pieces.append(piece)
            last_kept_start = start
        elif reached_end and piece:
            if last_kept_start is not None:
                # Short tail: extend the previous chunk to the end of the
                # text instead of silently dropping the remaining content.
                pieces[-1] = text[last_kept_start:].strip()
            elif start == 0:
                # The whole document is shorter than min_chunk_size.
                pieces.append(piece)

        if reached_end:
            # Later windows would only repeat a suffix of this one.
            break
        start += step

    chunks = [
        {"chunk_id": chunk_id, "text": piece, "char_len": len(piece)}
        for chunk_id, piece in enumerate(pieces)
    ]

    # Save chunks to JSON
    if save_pages:
        chunks_file = output_paths["chunks_json"]
        with open(chunks_file, "w", encoding="utf-8") as f:
            json.dump(chunks, f, indent=2, ensure_ascii=False)
        print(f"Saved chunks to {chunks_file}")

    return chunks


### Generate Questions

##### Generate Prompt

In [None]:
num_questions = 10

def question_prompt(chunk_text, num_questions=10):
    '''
    Build the LLM prompt that asks for multiple-choice quiz questions about a text chunk.

    Args:
        chunk_text: The excerpt the questions must be based on.
        num_questions: How many questions to request.

    Returns:
        The prompt as a single string. It asks for a JSON object holding a
        "questions" list whose entries have the keys "question_id",
        "question", "options" (four entries) and "correct_answer".
    '''

    # The example below must itself be valid JSON (double-quoted keys and
    # strings, commas between all fields) so that a model copying the format
    # produces output json.loads can parse.
    return f"""
    You are an expert at creating quiz questions that test deep understanding of the material.
    Generate {num_questions} challenging quiz questions of medium difficulty based on the following excerpt.
    The excerpt may contain PDF artifacts (page numbers, figure labels, axis ticks, glossary/margin terms, broken line wraps).
    Ignore non-explanatory artifacts and focus on the conceptual content.

    Each question should have 4 answer options (A, B, C, D) with only one correct answer.
    The questions should require critical thinking and not be answerable by simple keyword matching.

    Generate {num_questions} total questions and return them as a single valid JSON object in the following format, with no text before or after it:
    {{
        "questions": [
            {{
                "question_id": 1,
                "question": "<question text>",
                "options": ["<option A>", "<option B>", "<option C>", "<option D>"],
                "correct_answer": "A"
            }},
            {{
                "question_id": 2,
                "question": "<question text>",
                "options": ["<option A>", "<option B>", "<option C>", "<option D>"],
                "correct_answer": "C"
            }}
        ]
    }}

    Excerpt:
    \"\"\"\n{chunk_text}\n\"\"\"
    """.strip()

Potential improvements to consider:
- Add a difficulty option to the prompt.
- Add a validation function that checks the questions and the answer format returned by the LLM.

### Main Execution Workflow

In [None]:
# Input PDF for this run. Alternative sample: Path("test_pdfs/islr_chap_2.pdf")
pdf_file = Path("test_pdfs/multi_column_research_paper.pdf")
pdf_name = pdf_file.stem  # file name without its extension

if not pdf_file.exists():
    print(f"Please place your PDF at: {pdf_file}")
else:
    # Step 1: create ./outputs/<pdf_name>/ and show where files will go
    output_paths = setup_output_structure(pdf_name, display_path=True)

    # Step 2: extract per-page text (also saved to raw_text.txt) and join it
    pages = extract_text_from_pdf(
        pdf_file,
        output_paths,
        return_pages=True,
        save_pages=True,
    )
    text_pages = "\n\n".join(pages)

    # Step 3: clean the text and preview the first 1500 characters
    text_cleaned = clean_text(text_pages)
    print('\n' + text_cleaned[:1500])

    # Step 4: split into overlapping chunks (also saved to chunks.json)
    text_chunked = chunk_fixed_size(
        text_cleaned,
        output_paths,
        chunk_size=4000,
        overlap=400,
        min_chunk_size=800,
        save_pages=True,
    )
    print(f"Created {len(text_chunked)} chunks")

    # Step 5: generate questions (not implemented yet)