
# Workflow for AI Quiz Generator


In [None]:
# Imports
import os
from pathlib import Path
import fitz  # PyMuPDF
import json
import re
import unicodedata
from collections import Counter
from dotenv import load_dotenv
from openai import OpenAI
from datetime import datetime
import uuid

### Setup Output Structure

In [None]:
def setup_output_structure(pdf_name, display_path=False):
    """
    Prepare the per-PDF output directory and return the paths used by the pipeline.

    Layout:
    ./outputs/<pdf_name>/
        - raw_text.txt
        - chunks.json
        - questions.json
        - run_metadata.json

    Only the directory is created here; the files are written by later steps.

    Args:
        pdf_name: Folder name to create under ./outputs.
        display_path: When True, print the resolved paths.

    Returns:
        Dict with keys "base", "raw_text", "chunks_json", "questions_json"
        and "metadata_json", each mapped to a Path.
    """
    root = Path("./outputs") / pdf_name
    root.mkdir(parents=True, exist_ok=True)

    # Key in the returned dict -> file name inside the output folder.
    file_names = {
        "raw_text": "raw_text.txt",
        "chunks_json": "chunks.json",
        "questions_json": "questions.json",
        "metadata_json": "run_metadata.json",
    }

    output_paths = {"base": root}
    for key, file_name in file_names.items():
        output_paths[key] = root / file_name

    if display_path:
        print(f"\nOutput structure will be created as follows: {root}")
        for key, file_name in file_names.items():
            print(f"  - {file_name}: {output_paths[key]}")
        print('\n')

    return output_paths

### Extract PDF Text

In [None]:
def extract_text_from_pdf(pdf_path, output_paths, return_pages=False, save_pages=False):
    """
    Extract the plain text of every page of a PDF file with PyMuPDF.

    Args:
        pdf_path: Path (str or Path) of the PDF to read.
        output_paths: Mapping that provides the "raw_text" destination path;
            only read when save_pages is True.
        return_pages: When True, return the list of per-page texts.
        save_pages: When True, write all pages (joined by a blank line) to
            output_paths["raw_text"].

    Returns:
        List of page texts (one string per page) if return_pages is True,
        otherwise None.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")

    doc = fitz.open(pdf_path)
    try:
        pages_text = [page.get_text("text") for page in doc]
    finally:
        # Release the document handle even if a page fails to extract.
        doc.close()

    # Save extracted text to raw_text.txt
    if save_pages:
        raw_text_file = output_paths["raw_text"]
        with open(raw_text_file, "w", encoding="utf-8") as f:
            f.write("\n\n".join(pages_text))
        print(f"Saved raw_text.txt to {raw_text_file}")

    if return_pages:
        return pages_text
    return None

### Cleaning the Text

In [None]:
def clean_text(text):
    """
    Apply minimal, conservative cleanup to PDF-extracted text.

    Steps performed, in order:
    1. Unicode NFKC normalization (fixes ligatures like ﬁ, ﬀ, etc.)
    2. Convert every line ending to "\\n"
    3. Squeeze runs of spaces/tabs to one space and drop trailing whitespace per line
    4. Drop lines that consist only of a 1-4 digit number (standalone page numbers)
    5. Reduce runs of 3+ newlines to a single blank line, then trim the ends

    Deliberately out of scope (may implement later):
    - header/footer detection
    - paragraph restructuring
    - figure/table removal
    - heuristic guessing
    """
    normalized = unicodedata.normalize("NFKC", text)
    normalized = normalized.replace("\r\n", "\n").replace("\r", "\n")
    normalized = re.sub(r"[ \t]+", " ", normalized)

    kept_lines = []
    for raw_line in normalized.split("\n"):
        line = raw_line.rstrip()
        token = line.strip()
        is_page_number = token.isdigit() and len(token) <= 4
        if not is_page_number:
            kept_lines.append(line)

    # Removing page-number lines can leave long runs of blank lines behind.
    collapsed = re.sub(r"\n{3,}", "\n\n", "\n".join(kept_lines))
    return collapsed.strip()

### Chunking Text

In [None]:
def chunk_fixed_size(text, output_paths, chunk_size = 4000, overlap= 400, min_chunk_size = 800, save_pages=False):
    """
    Splits text into fixed-size character chunks with overlap.

    A window of chunk_size characters slides over the text, advancing by
    chunk_size - overlap characters (at least 1) each step. Each window is
    stripped of surrounding whitespace and kept only if at least
    min_chunk_size characters remain, so a short trailing remainder is
    dropped. Chunking stops as soon as a window reaches the end of the text;
    any later window would only repeat text already covered by that window.

    Args:
        text: The full text to split.
        output_paths: Mapping that provides the "chunks_json" destination
            path; only read when save_pages is True.
        chunk_size: Window length in characters.
        overlap: Number of characters shared by consecutive windows.
        min_chunk_size: Minimum stripped length for a chunk to be kept.
        save_pages: When True, write the chunks to output_paths["chunks_json"].

    Returns:
      List of chunks:
      {
        "chunk_id": int,
        "text": str,
        "char_len": int
      }
    """
    chunks = []
    text_len = len(text)
    # max(1, ...) guarantees forward progress even when overlap >= chunk_size.
    step = max(1, chunk_size - overlap)
    start = 0

    while start < text_len:
        end = start + chunk_size
        chunk_text = text[start:end].strip()

        if len(chunk_text) >= min_chunk_size:
            chunks.append({
                "chunk_id": len(chunks),
                "text": chunk_text,
                "char_len": len(chunk_text)
            })

        # This window already covers the tail of the text; further windows
        # would be pure duplicates of its suffix.
        if end >= text_len:
            break

        start += step

    # Save chunks to JSON
    if save_pages:
        chunks_file = output_paths["chunks_json"]
        with open(chunks_file, "w", encoding="utf-8") as f:
            json.dump(chunks, f, indent=2, ensure_ascii=False)
        print(f"Saved chunks to {chunks_file}")

    return chunks


### Generate Questions

#### Simpler Prompt for Testing

In [None]:
def build_simple_prompt(chunk_text, n_questions = 2):
    """
    Build the lightweight test prompt asking for multiple-choice questions.

    Args:
        chunk_text: Text the questions should be generated from; it is
            embedded at the end of the prompt between triple double quotes.
        n_questions: Number of questions to request.

    Returns:
        The prompt string with leading/trailing whitespace removed.
    """
    prompt = f"""
Generate {n_questions} quiz questions from the following text.
Ignore formatting artifacts from PDFs such as page numbers, figure labels,
axis ticks, or broken line wraps.

Generate {n_questions} total questions and return them in the following format:
    {{
        questions:[
            {{
                question_id: 1,
                question: question_text,
                options: ['A', 'B', 'C', 'D']
                correct_answer: 'A'
            }},

            {{
                question_id: 2,
                question: question_text,
                options: ['A', 'B', 'C', 'D']
                correct_answer: 'C'
            }}
        ]
    }}

Text:
\"\"\"\n{chunk_text}\n\"\"\"
"""
    return prompt.strip()

In [None]:
def call_llm(client, prompt):
    """
    Send a prompt to the chat completions endpoint and return the reply text.

    Args:
        client: Object exposing chat.completions.create (an OpenAI client).
        prompt: User message content.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the message carries no content.
    """
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": "You generate clear quiz questions from textbook content."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.4,
        max_tokens=600
    )
    # message.content may be None (e.g. a refusal); avoid calling .strip() on None.
    content = response.choices[0].message.content
    return content.strip() if content else ""

In [None]:
def generate_questions_simple(chunks, questions_per_chunk = 2, max_chunks= 5):
    """
    Ask the model for questions on the first max_chunks chunks using the simple prompt.

    Args:
        chunks: Sequence of chunk dicts providing "text" and "chunk_id".
        questions_per_chunk: Number of questions requested per chunk.
        max_chunks: Upper bound on how many chunks are sent to the model.

    Returns:
        List of {"chunk_id": ..., "questions_text": <raw model reply>} dicts,
        one per processed chunk. The reply is stored as-is, without parsing.
    """
    client = OpenAI()
    collected = []

    for item in chunks[:max_chunks]:
        reply = call_llm(
            client,
            build_simple_prompt(item["text"], n_questions=questions_per_chunk),
        )
        collected.append({"chunk_id": item["chunk_id"], "questions_text": reply})

    return collected

#### Main Model

##### Generate Prompt

In [None]:
def build_question_prompt(chunk_text, num_questions=10):
    """
    Generate a prompt for creating quiz questions from a text chunk.

    The prompt asks for num_questions multiple-choice questions returned as a
    JSON object with a "questions" list whose items carry "type",
    "question_id", "question", "choices" and "answer_index".

    Args:
        chunk_text: Excerpt the questions must be based on; it is embedded at
            the end of the prompt between triple double quotes.
        num_questions: Number of questions to request.

    Returns:
        The prompt string with leading/trailing whitespace removed.
    """
    return f"""
You are an expert at creating quiz questions that test deep understanding of the material.
Generate {num_questions} challenging quiz questions of medium difficulty based on the following excerpt.
The excerpt may contain PDF artifacts (page numbers, figure labels, axis ticks, glossary/margin terms, broken line wraps).
Ignore non-explanatory artifacts and focus on the conceptual content.

Each question should have 4 answer options (A, B, C, D) with only one correct answer.
The questions should require critical thinking and not be answerable by simple keyword matching.

Generate {num_questions} total questions and return them in the following format:
{{
    "questions":[
        {{
            "type": "mcq",
            "question_id": 1,
            "question": "What is the main topic of this excerpt?",
            "choices": ["A", "B", "C", "D"],
            "answer_index": 0
        }},

        {{
            "type": "mcq",
            "question_id": 2,
            "question": "What is the main topic of this excerpt?",
            "choices": ["A", "B", "C", "D"],
            "answer_index": 2
        }}
    ]
}}

Rules:
- choices must be exactly 4 items
- answer_index must be an integer 0..3
- questions must be answerable from the given text (no outside knowledge)
- avoid trivial questions (e.g., “What is the chapter number?”)

Excerpt:
\"\"\"\n{chunk_text}\n\"\"\"
""".strip()

Potential improvements to consider:
- Add difficulty option later
- Add a questions validation function later on, to validate questions and answer format generated by LLM

In [None]:
def call_openai_text(client, prompt, model = "gpt-4.1-mini", *, temperature=0.3, max_output_tokens=900):
    """
    Call the OpenAI API with a text prompt and return the response text

    Args:
        client: Object exposing responses.create (an OpenAI client).
        prompt: Text passed as the request input.
        model: Model name to use.
        temperature: Sampling temperature forwarded to the API (keyword-only).
        max_output_tokens: Cap on generated tokens forwarded to the API
            (keyword-only). Raise it when requesting many questions per call
            so the JSON reply is not cut off.

    Returns:
        The response's output_text with surrounding whitespace removed.
    """
    resp = client.responses.create(
        model=model,
        input=prompt,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
    )
    return resp.output_text.strip()

##### Validate generated JSON Structure

In [None]:
def safe_json_loads(text):
    """
    Parse text as JSON, tolerating extra text around a JSON object.

    The whole string is parsed first. If that fails, the span from the first
    "{" to the last "}" is parsed instead (useful when the model wraps its
    JSON in prose or code fences). Note that when the whole string is valid
    JSON the parsed value is returned as-is, whatever its type.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON and either no
            {...} span exists or that span is not valid JSON.
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        first_brace = text.find("{")
        last_brace = text.rfind("}")
        # No usable {...} span: surface the original decode error.
        if first_brace == -1 or last_brace <= first_brace:
            raise
        return json.loads(text[first_brace:last_brace + 1])
    return parsed

def validate_json(q):
    """
    Validate one generated question object.

    Args:
        q: Dict describing a question, expected to hold "type", "question",
            "choices", "answer_index" and optionally "explanation".

    Returns:
        None when the question is valid, otherwise a short string describing
        the first problem found.
    """
    question_type = q.get("type")
    if question_type != "mcq":
        return "type must be 'mcq'"

    question = q.get("question")
    if not isinstance(question, str) or not question.strip():
        return "missing/invalid question"

    choices = q.get("choices")
    if not isinstance(choices, list) or len(choices) != 4 or not all(isinstance(c, str) and c.strip() for c in choices):
        return "choices must be a list of exactly 4 non-empty strings"

    answer = q.get("answer_index")
    # bool is a subclass of int, so JSON true/false would otherwise slip
    # through as index 1/0.
    if isinstance(answer, bool) or not isinstance(answer, int) or not (0 <= answer <= 3):
        return "answer_index must be an int in [0..3]"

    # Optional explanation
    explanation = q.get("explanation")
    if explanation is not None and not isinstance(explanation, str):
        return "explanation must be a string if present"

    # Light anti-noise check (optional but helpful)
    bad_markers = ["doi.org", "©", "springer", "page "]
    low_q = question.lower()
    if any(m in low_q for m in bad_markers):
        return "question looks like it used PDF boilerplate"

    return None

In [None]:
def generate_mcqs_over_chunks(chunks, pdf_name, output_paths, model = "gpt-4.1-mini", mcqs_per_chunk = 2, max_chunks = None):
    """
    Reads the chunks, generates MCQs, validates them, and saves them

    For each processed chunk the model is asked for mcqs_per_chunk questions.
    A chunk whose reply cannot be parsed, is not a JSON object, or does not
    contain exactly mcqs_per_chunk questions is recorded as one invalid item
    (including the raw reply, when one was received). Otherwise each question
    is validated individually; valid ones get a fresh UUID "question_id", the
    source "chunk_id" and a convenience "answer" field.

    Args:
        chunks: List of chunk dicts providing "text" and "chunk_id".
        pdf_name: Name recorded in the run metadata.
        output_paths: Mapping providing the "questions_json" and
            "metadata_json" destination paths.
        model: Model name passed to the API call and recorded in metadata.
        mcqs_per_chunk: Number of questions requested per chunk.
        max_chunks: If not None, only the first max_chunks chunks are processed.

    Returns:
        Dict with "questions_path", "metadata_path" (strings),
        "valid_questions" (list of question dicts) and "invalid_items"
        (list of {"chunk_id", "error", "raw_value"} dicts).
    """

    run_ts = datetime.now().astimezone().isoformat(timespec="seconds")

    questions_file = output_paths["questions_json"]
    metadata_file = output_paths["metadata_json"]

    subset = chunks if max_chunks is None else chunks[:max_chunks]

    # Maintain list of valid and invalid questions/results
    all_valid = []
    all_invalid = []

    client = OpenAI()

    for i, chunk in enumerate(subset, 1):
        prompt = build_question_prompt(chunk["text"], num_questions=mcqs_per_chunk)
        # Held outside the try so a failure record can include whatever the
        # model returned before parsing/validation failed.
        raw = None

        try:
            raw = call_openai_text(client, prompt, model=model)
            data = safe_json_loads(raw)
            if not isinstance(data, dict):
                raise ValueError(f"Expected a JSON object, got {type(data).__name__}")
            qs = data.get("questions", [])

            if not isinstance(qs, list) or len(qs) != mcqs_per_chunk:
                raise ValueError(f"Expected {mcqs_per_chunk} questions, got {len(qs) if isinstance(qs, list) else 'non-list'}")

            for q in qs:
                if not isinstance(q, dict):
                    all_invalid.append({
                        "chunk_id": chunk["chunk_id"],
                        "error": "question is not an object",
                        "raw_value": q
                    })
                    continue

                err = validate_json(q)
                if err:
                    all_invalid.append({
                        "chunk_id": chunk["chunk_id"],
                        "error": err,
                        "raw_value": q
                    })
                    continue

                q_out = dict(q)
                # Replace the model's per-chunk numbering with a globally unique id.
                q_out["question_id"] = str(uuid.uuid4())
                q_out["chunk_id"] = chunk["chunk_id"]
                # convenience field
                q_out["answer"] = q_out["choices"][q_out["answer_index"]]
                all_valid.append(q_out)

            print(f"[{i}/{len(subset)}] chunk_id={chunk['chunk_id']} ✓ valid so far={len(all_valid)} invalid so far={len(all_invalid)}")

        except Exception as e:
            # Best-effort: one failing chunk must not abort the whole run.
            all_invalid.append({
                "chunk_id": chunk["chunk_id"],
                "error": f"generation/parsing failed: {e}",
                "raw_value": raw
            })
            print(f"[{i}/{len(subset)}] chunk_id={chunk['chunk_id']} ✗ failed: {e}")

    # Write questions.json
    with open(questions_file, "w", encoding="utf-8") as f:
        json.dump(all_valid, f, ensure_ascii=False, indent=2)

    # Write run_metadata.json (repro + debugging)
    metadata = {
        "pdf_name": pdf_name,
        "timestamp": run_ts,
        "inputs": {
            "num_chunks_available": len(chunks),
            "num_chunks_processed": len(subset),
        },
        "chunk_schema": {
            "fields": ["chunk_id", "text", "char_len"]
        },
        "generation": {
            "model": model,
            "mcqs_per_chunk": mcqs_per_chunk,
            # These mirror the values call_openai_text uses by default;
            # keep them in sync if those change.
            "temperature": 0.3,
            "max_output_tokens": 900
        },
        "outputs": {
            "questions_path": str(questions_file),
            "metadata_path": str(metadata_file),
        },
        "results": {
            "valid_questions": len(all_valid),
            "invalid_items": len(all_invalid),
        },
        # Keep some invalid samples for debugging without bloating file
        "invalid_samples": all_invalid[:10],
    }

    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    return {
        "questions_path": str(questions_file),
        "metadata_path": str(metadata_file),
        "valid_questions": all_valid,
        "invalid_items": all_invalid,
    }

### Main Execution Workflow

In [None]:
# End-to-end run: extract -> clean -> chunk -> generate and validate MCQs.

# Load environment variables from a local .env file (if one exists) before
# the OpenAI client is created; presumably this is where OPENAI_API_KEY lives.
load_dotenv()

# pdf_file = Path("test_pdfs/islr_chap_2.pdf")
pdf_file = Path("test_pdfs/multi_column_research_paper.pdf")
pdf_name = pdf_file.stem.lower().replace(' ', '_')  # Get filename without extension

if pdf_file.exists():
    # Setup output structure
    output_paths = setup_output_structure(pdf_name, display_path=True)

    # Extract text from PDF
    pages = extract_text_from_pdf(pdf_file, output_paths, return_pages=True, save_pages=True)
    text_pages = "\n\n".join(pages)

    # print(f"Extracted {len(pages)} pages")
    # print("--- Page 1 preview ---")
    # print(pages[0][:500])

    # Text cleaning
    text_cleaned = clean_text(text_pages)
    # print('\n' + text_cleaned[:1500])

    # Text chunking
    text_chunked = chunk_fixed_size(
        text_cleaned,
        output_paths,
        chunk_size=4000,
        overlap=400,
        min_chunk_size=800,
        save_pages=True,
    )
    # print(f"Created {len(text_chunked)} chunks")

    # Generate Questions

    # Simple initial model
    # num_questions = 10
    # results = generate_questions_simple(text_chunked, questions_per_chunk=5, max_chunks=5)

    # print(results[0]["questions_text"])

    # More advanced prompt + validation
    result = generate_mcqs_over_chunks(
        text_chunked,
        pdf_name,
        output_paths,
        model="gpt-4.1-mini",
        mcqs_per_chunk=2,
        max_chunks=4,  # keep small while testing cost/quality
    )

    print(result["questions_path"])
    print(result["metadata_path"])
    print("valid:", len(result["valid_questions"]), "invalid:", len(result["invalid_items"]))


else:
    print(f"Please place your PDF at: {pdf_file}")

In [None]:
# Show the questions that passed validation (rendered as the cell's output).
result["valid_questions"]

In [None]:
# Show the rejected items with their error messages (rendered as the cell's output).
result["invalid_items"]