In [None]:
# Import required libraries
import os
import sys
import matplotlib.pyplot as plt
import json
from pathlib import Path
from dotenv import load_dotenv
from mistralai import Mistral, DocumentURLChunk, TextChunk

# Load environment variables from .env file
load_dotenv()

# Get API key from environment
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    print("Warning: MISTRAL_API_KEY not found in environment variables.")
    api_key = "your_api_key_here"  # Replace with your actual key when testing

# Initialize Mistral client
client = Mistral(api_key=api_key)


In [None]:
def process_pdf(file_path):
    """
    Process PDF document using MistralAI's OCR service
    
    Args:
        file_path: Path to the PDF file
    
    Returns:
        Dictionary containing OCR results
    """
    # Verify PDF file exists
    pdf_file = Path(file_path)
    if not pdf_file.is_file():
        raise FileNotFoundError(f"PDF file not found: {file_path}")
        
    try:
        # Upload PDF file to Mistral's OCR service
        uploaded_file = client.files.upload(
            file={
                "file_name": pdf_file.stem,
                "content": pdf_file.read_bytes(),
            },
            purpose="ocr",
        )
        
        # Get URL for the uploaded file
        signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
        
        # Process PDF with OCR
        pdf_response = client.ocr.process(
            document=DocumentURLChunk(document_url=signed_url.url),
            model="mistral-ocr-latest",
            include_image_base64=True,
        )
        
        # Convert response to dictionary
        response_dict = json.loads(pdf_response.model_dump_json())
        
        return pdf_response, response_dict
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return None, None


def extract_text_from_response(ocr_response):
    """
    Extract text from OCR response
    
    Args:
        ocr_response: OCR response object
    
    Returns:
        Extracted text
    """
    if not ocr_response:
        return ""
        
    extracted_text = ""
    for page in ocr_response.pages:
        extracted_text += page.text + "\n\n"
    
    return extracted_text.strip()


In [None]:
# Test with a sample PDF
# Replace this with your actual test PDF path
pdf_path = "sample_document.pdf"

# Check if we need to create a sample PDF
if not os.path.exists(pdf_path):
    print(f"Warning: Test PDF {pdf_path} not found.")
    print("Please create a sample PDF document for testing.")
    print("You can use any PDF document with text content.")
    
    # For demonstration purposes, we'll just set a flag to skip processing
    skip_processing = True
else:
    skip_processing = False

# Process the PDF
if not skip_processing and api_key != "your_api_key_here":
    print(f"Processing PDF: {pdf_path}")
    ocr_response, response_dict = process_pdf(pdf_path)
    
    if ocr_response:
        # Extract and display text
        extracted_text = extract_text_from_response(ocr_response)
        print("\n--- Extracted Text (First 500 chars) ---")
        print(extracted_text[:500] + "...")
        
        # Print some metadata
        print("\n--- Document Metadata ---")
        print(f"Number of pages: {len(ocr_response.pages)}")
        
        # Print first part of the raw response
        print("\n--- Raw Response (First 1000 chars) ---")
        print(json.dumps(response_dict, indent=4)[:1000] + "...")
    else:
        print("Failed to process PDF.")
else:
    print("Skipping PDF processing. Please set your API key and provide a PDF file to test.")
    extracted_text = "Sample extracted text (API key not provided or PDF not found)"


In [None]:
from mistralai.models import OCRResponse
from IPython.display import Markdown, display


def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings

    Returns:
        Markdown text with images replaced by base64 data
    """
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str


def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """
    Combine OCR text and images into a single markdown document.

    Args:
        ocr_response: Response from OCR processing containing text and images

    Returns:
        Combined markdown string with embedded images
    """
    markdowns: list[str] = []
    # Extract images from page
    for page in ocr_response.pages:
        image_data = {}
        for img in page.images:
            image_data[img.id] = img.image_base64
        # Replace image placeholders with actual images
        markdowns.append(replace_images_in_markdown(page.markdown, image_data))

    return "\n\n".join(markdowns)


# Display combined markdowns and images if we have a response
if 'ocr_response' in locals() and ocr_response:
    print("\n--- Displaying PDF with embedded images ---")
    display(Markdown(get_combined_markdown(ocr_response)))
else:
    print("\nNo OCR response available to display.")


In [None]:
# Simulated answer key
answer_key = "The quick brown fox jumps over the lazy dog."

# Simulated LLM grading function
def grade_answer(student_answer, correct_answer):
    """
    Simulate LLM grading of a student answer
    
    In a real application, this would call the OpenAI API with a prompt
    that includes the student answer and correct answer.
    
    Args:
        student_answer: Text of student's answer
        correct_answer: Text of correct answer
    
    Returns:
        Dictionary with score, feedback, and justification
    """
    # This is a simplified simulation
    # In reality, we would use an LLM to compare the answers
    
    # Calculate a simple similarity score
    # (This is just for demonstration - a real system would use the LLM)
    student_words = set(student_answer.lower().split())
    correct_words = set(correct_answer.lower().split())
    
    if not student_words:
        return {
            "score": 0,
            "feedback": "No answer provided.",
            "justification": "The answer was blank."
        }
    
    common_words = student_words.intersection(correct_words)
    similarity = len(common_words) / len(correct_words) if correct_words else 0
    score = min(100, int(similarity * 100))
    
    # Generate feedback based on score
    if score >= 90:
        feedback = "Excellent answer! Your response is correct."
        justification = "The answer contains all the key elements of the correct response."
    elif score >= 70:
        feedback = "Good answer, but missing some details."
        justification = "The answer contains most key elements but is missing some important details."
    elif score >= 50:
        feedback = "Partial credit. Your answer is incomplete."
        justification = "The answer contains some correct elements but is missing significant content."
    else:
        feedback = "Your answer needs improvement."
        justification = "The answer is missing most of the required elements."
    
    return {
        "score": score,
        "feedback": feedback,
        "justification": justification
    }

# Grade the extracted text (using just the first 500 characters for demo purposes)
if 'extracted_text' in locals():
    student_answer = extracted_text[:500]  # Just use first 500 chars for demo
    grade_result = grade_answer(student_answer, answer_key)
    
    # Display results
    print("\n--- Grading Results ---")
    print(f"Student Answer (first 100 chars): {student_answer[:100]}...")
    print(f"Correct Answer: {answer_key}")
    print(f"Score: {grade_result['score']}%")
    print(f"Feedback: {grade_result['feedback']}")
    print(f"Justification: {grade_result['justification']}")
else:
    print("\n--- Grading Results ---")
    print("No extracted text available for grading.")


In [13]:
# Initialize Mistral client with API key
from mistralai import Mistral, DocumentURLChunk, TextChunk
import os


client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))

In [4]:
from supabase import create_client
import os


supabase_url = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")

supabase = create_client(supabase_url, SUPABASE_SERVICE_ROLE_KEY)

# Now you can list and download from Storage:
bucket_name = "database-pdfs"
file_path = "23i-2005-6.pdf"

# List files (optional, for debugging)
files = supabase.storage.from_(bucket_name).list()
print("Files in bucket:", [f["name"] for f in files])

# Download file
file_bytes = supabase.storage.from_(bucket_name).download(file_path)
with open(file_path, "wb") as f:
    f.write(file_bytes)
print("Downloaded successfully!")

Files in bucket: ['23i-0033-6.pdf', '23i-0047-6.pdf', '23i-0122-6.pdf', '23i-0125-6.pdf', '23i-0126-6.pdf', '23i-0127-6.pdf', '23i-2005-6.pdf', '23i-2084-6.pdf']
Downloaded successfully!


In [None]:
from pathlib import Path
import json


# Verify PDF file exists
pdf_file = Path(file_path)
assert pdf_file.is_file()

# Upload PDF file to Mistral's OCR service
uploaded_file = client.files.upload(
    file={
        "file_name": pdf_file.stem,
        "content": pdf_file.read_bytes(),
    },
    purpose="ocr",
)

# Get URL for the uploaded file
signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

# Process PDF with OCR, including embedded images
pdf_response = client.ocr.process(
    document=DocumentURLChunk(document_url=signed_url.url),
    model="mistral-ocr-latest",
    include_image_base64=True,
)

# Convert response to JSON format
response_dict = json.loads(pdf_response.model_dump_json())

print(json.dumps(response_dict, indent=4)[0:1000])

{
    "pages": [
        {
            "index": 0,
            "markdown": "# 1.Abstract \n\nThis project proposes the development of a simple AI-powered educational system that helps teachers create content and grade student work automatically. The system consists of two main modules designed to reduce teachers' workload and provide faster feedback to students.\n\n### 1.1 Module 1: Content Generation\n\nTeachers upload textbook PDFs which are processed and stored in a Pinecone vector database. Teachers then write simple prompts describing what they need (like \"create slides for Vision Transformer\" or \"make a quiz on Vision Transformer\"). The system will retrieve relevant content from the vector database that will serve as context for the Large Language Model (LLM) for PowerPoint slides and quiz generation based on teachers' requests.\n\n### 1.2 Module 2: Quiz Grading\n\nStudents submit photos of their handwritten quiz responses. Teaching assistants input the correct answers throug

In [6]:
# Upload PDF file to Mistral's OCR service
uploaded_file = client.files.upload(
    file={
        "file_name": pdf_file.stem,
        "content": pdf_file.read_bytes(),
    },
    purpose="ocr",
)

# Get URL for the uploaded file
signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

# Process PDF with OCR, including embedded images
pdf_response = client.ocr.process(
    document=DocumentURLChunk(document_url=signed_url.url),
    model="mistral-ocr-latest",
    include_image_base64=True,
)

# Convert response to JSON format
response_dict = json.loads(pdf_response.model_dump_json())
print(json.dumps(response_dict, indent=4)[0:1000])

{
    "pages": [
        {
            "index": 0,
            "markdown": "# Database Systems\n\nDate: 6th May 2025 QUIZ 6\n\nName: M. Ahmed Riaz Roll No: 23I-2005 Marks: 4/10\n\nQ: Consider the relation below with the given functional dependencies. Assume no other FDs exist. Answer the following questions.\n\n## Book Loans\n\n|  Member_ID | Book_ISBN | Loan_Date | Return_Date | Member_Type | Book_Title | Author | Late Fee Rate  |\n| --- | --- | --- | --- | --- | --- | --- | --- |\n|  |   |   |   |   |   |   |   |\n\n- FD1: Member_ID, Book_ISBN \u2192 Loan_Date, Return_Date\n- FD2: Member_Type \u2192 Late_Fee_Rate\n- FD3: Book_ISBN \u2192 Book_Title, Author\n- FD4: Book_ISBN \u2192 Late_Fee_Rate\n\nCandidate Key: {Member_ID, Book_ISBN}\n\na. Is the relation in 2NF? Provide the reason if it is not in 2NF. [2 marks] The relation is not in 2NF, because in there is partial dependency exists in Functional Dependency. FD1. We have to create those table to be in 2NF.\n\nb. Is the relation in

In [15]:
from mistralai.models import OCRResponse
from IPython.display import Markdown, display


def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings

    Returns:
        Markdown text with images replaced by base64 data
    """
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str


def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """
    Combine OCR text and images into a single markdown document.

    Args:
        ocr_response: Response from OCR processing containing text and images

    Returns:
        Combined markdown string with embedded images
    """
    markdowns: list[str] = []
    # Extract images from page
    for page in ocr_response.pages:
        image_data = {}
        for img in page.images:
            image_data[img.id] = img.image_base64
        # Replace image placeholders with actual images
        markdowns.append(replace_images_in_markdown(page.markdown, image_data))

    return "\n\n".join(markdowns)


# Display combined markdowns and images
display(Markdown(get_combined_markdown(pdf_response)))

# 1.Abstract 

This project proposes the development of a simple AI-powered educational system that helps teachers create content and grade student work automatically. The system consists of two main modules designed to reduce teachers' workload and provide faster feedback to students.

### 1.1 Module 1: Content Generation

Teachers upload textbook PDFs which are processed and stored in a Pinecone vector database. Teachers then write simple prompts describing what they need (like "create slides for Vision Transformer" or "make a quiz on Vision Transformer"). The system will retrieve relevant content from the vector database that will serve as context for the Large Language Model (LLM) for PowerPoint slides and quiz generation based on teachers' requests.

### 1.2 Module 2: Quiz Grading

Students submit photos of their handwritten quiz responses. Teaching assistants input the correct answers through the system. The artificial intelligence uses Optical Character Recognition (OCR) technology to read the handwritten material, cross-checks it with the correct answers through large language model (LLM) methods, and marks each submission automatically. The system generates personalized reports for each student after this process, displaying their marks, correct answers, and personalized comments.

The main benefits are enormous teacher time savings, faster feedback to students, uniform grading, and larger class sizes without additional staff. The system uses simple web interfaces that don't require technical expertise, so it's accessible to all teachers with or without a technology background. Expected outcomes are to reduce teacher preparation time by $60-70 \%$, provide same-day student feedback instead of waiting a week, and improve learning outcomes through timely, detailed feedback reports.

## 2. Introduction and Background

### 2.1 Stakeholders and Topics Concerned

## Primary Stakeholders:

Teachers and Educators:Teachers and Educators struggle with content creation and grading quizzes which is also time consuming. They need a tool that makes their life easy and does not compromise education content delivery.
Students: Students require well structured educational material and timely feedback. Waiting weeks for grading of their tasks slows the learning process.
Teaching Assistants (TAs): Graduate or junior students who spend time grading the tasks under supervision of a faculty member.

Educational Institutions: Educational institutions that want to improve teacher efficiency and manage increasing student population without compromising educational content delivery.

# 2.2 Problem Statement 

The core problem is the inefficient use of educator time due to manual, repetitive tasks that could be automated:

## Content Creation Challenges:

- Educators often report needing 3-8 hours per lecture hour during initial development stages Ideas to Reduce Non-Classroom Prep Time Without Reducing Teaching Quality - ISTLDSimon Fraser University. So, converting a chapter into a polished slide deck can realistically take 3-8 hours, especially when Structuring content, Designing visuals and Drafting speaker notes.
- Crafting quizzes requires extensive planning and formatting, especially when aligning with learning objectives and varying question types.
- Quality of content fluctuates widely, with some instructors delivering rich, detailed material and others creating only minimal slide decks.
- Extracting and reformatting textbook content remains a time-consuming manual task, often requiring copy-paste and slide/template adjustments.


## Assessment and Grading Issues:

Handwritten quiz grading takes 2-3 minutes per student per quiz
Large classes (100+ students) create week-long grading delays
Grading consistency varies between different graders
Providing detailed feedback is time-consuming but crucial for learning
Students receive feedback too late to be useful for improvement

## Resource Constraints:

Limited teaching assistant availability for grading support
Pressure to handle larger class sizes with same staff
Need for scalable solutions that maintain educational quality

### 2.3 Project Importance

This project addresses systemic challenges in UK higher education, supporting academic staff and improving student learning through automation.

Scale of Impact: According to HESA (2023), there are over 228,000 academic staff employed in UK higher education. Streamlining routine tasks such as grading and slide creation-even by just 30 minutes per week per lecturer-could result in a nationwide time saving of over 5 million hours annually, freeing time for research, mentoring, and pedagogy. Source: Higher Education Staff Statistics: UK, 2021/22 | HESA

Effectiveness in Learning: The Education Endowment Foundation (EEF) emphasizes that high-quality feedback is one of the most effective strategies for improving learning, particularly when it is timely and actionable. Their 2021 guidance report notes that delayed feedback weakens impact, as students are less able to connect it to the learning task. Integrating automation allows for faster, scalable feedback, enhancing academic outcomes.

# Educational Equity: Large modules in UK universities often involve multiple markers, leading to variation in grading quality. Automation helps standardise assessment criteria, ensuring fairness and consistency regardless of class size or instructor workload. This is particularly important for first-year undergraduate courses, where large class sizes and uneven marking are common. 

Sustainability: As student numbers increase and education budgets tighten, automation is the answer to upholding quality education.

### 2.4 Benefits to Stakeholders

## For Teachers:

Teachers using AI tools for grading, quiz creation, and content planning report weekly time savings equivalent to $\mathbf{6}$ hours on average. Source: Three in 10 Teachers Use AI Weekly, Saving Six Weeks a Year
Focus more time on actual teaching and student interaction
Maintain consistent grading standards across all assignments
Easily create multiple versions of quizzes to prevent cheating
Generate comprehensive analytics on student performance

## For Students:

Receive feedback within 24 hours instead of 1-2 weeks
Get detailed explanations for incorrect answers
Access to well-structured slides and study materials
Consistent grading regardless of which TA grades their work
Better learning outcomes through timely feedback

## For Teaching Assistants:

Target to reduce grading workload by $80-90 \%$
Focus on higher-value tasks like tutoring and research
Provide input on answer keys rather than grading individual papers
More time for their own academic work

## For Educational Institutions:

Handle larger class sizes without hiring additional staff
Improve student satisfaction scores through faster feedback
Reduce operational costs in the long term
Maintain consistent quality across different instructors
Generate data-driven insights for curriculum improvement

### 2.5 Relationship to Previous Work and Existing Knowledge

## Educational Technology Evolution:

AI-assisted learning began with Computer-Assisted Instruction (CAI) in the 1960s, typified by systems like PLATO and the IBM 1500. These early architectures enabled adaptive problem sets and rudimentary learner modeling (PDF) The Development History and Future Trend of

Computer-Assisted Teaching.
In the 1970s and 1980s, Intelligent Tutoring Systems (ITSs) emerged, integrating domain knowledge, student modeling, and pedagogical strategies (e.g., Carbonell's SCHOLAR, Sleeman \& Brown's PIXIE Intelligent tutoring systems: an overview.

While modern LMS platforms like Moodle and Blackboard enable content delivery, there remains a significant gap in automating content creation and streamlined assessment-a gap that this project addresses.

# Natural Language Processing in Education: 

Recent work has validated the use of LLMs for question generation. Elkins et al. (2024) demonstrated that GPT-generated quiz questions, designed using Bloom's taxonomy, were rated by teachers as comparable in quality to human-authored ones-with potential to even improve quizzes [2401.05914] How Teachers Can Use Large Language Models and Bloom's Taxonomy to Create Educational Quizzes. Additionally, Doughty et al. (2023) found GPT-4 produced clear, well-aligned MCQs in programming education, further supporting LLM viability for educational content creation [2312.03173] A Comparative Study of AI-Generated (GPT-4) and Human-crafted MCQs in Programming Education.

## Optical Character Recognition Progress:

Transformer-based OCR (TrOCR): Li et al. (2021) introduced TrOCR, a transformer-based OCR model that achieved state-of-the-art results on handwritten and printed text [2109.10282] TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models.

TrOCR accuracy on handwritten data: A comprehensive 2024 evaluation found that TrOCR achieved $\sim 95 \%$ character accuracy on the IAM handwritten dataset A Comprehensive Evaluation of TrOCR with Varying Image Effects - NHSJS.

## Vector Database Applications:

The emergence of vector databases like Pinecone has revolutionized information retrieval. Educational applications are just beginning to explore these capabilities for intelligent content search and organization.

## Automated Assessment Research:

Studies comparing humans and LLM graders show promising outcomes. For example, a South African study reported average grade discrepancies of just $4 \%$ between ChatGPT-4 and human markers across university scripts ai versus human graders: assessing the role of large language models in higher education.

## Gap in Current Solutions:

While individual components exist (OCR tools, LLM APIs, vector databases), no integrated system specifically addresses the combined workflow of content generation from textbooks and handwritten assessment grading. Most educational AI tools focus on either content creation OR grading, but not both in a unified system.

## Integration with Existing Systems:

Unlike previous attempts that required complete replacement of existing workflows, this project is designed to complement current teaching practices by automating specific time-intensive tasks while preserving teacher control and judgment.

## 3. Aims and Objectives

### 3.1 Primary Aim

The primary aim of this project is to develop and deploy a user-friendly AI system that automates educational content creation and handwritten quiz grading. The target is reducing teacher workload by

60-70\% while maintaining educational quality and providing faster student feedback.

# 3.2 Specific Objectives 

## Module 1: Intelligent Content Generation

## Objective 3.2.1: Book Processing and Storage System

- Develop a PDF upload system that accepts textbook files
- Implement text extraction and chunking algorithms to break content into manageable sections
- Create a Pinecone vector database integration that stores textbook content as searchable embeddings
- Build content indexing that preserves chapter structure and topic relationships


## Objective 3.2.2: Al-Powered Content Generation

- Integrate Large Language Models (GPT-4 or similar) for content creation
- Develop a prompt system where teachers can request specific content (e.g., "Create 15 slides on Vision transformer")
- Build slide generation that creates PowerPoint presentations with proper formatting
- Implement quiz generation that produces multiple question types (multiple choice, short answer, true/false)
- Create export functionality for PowerPoint (.pptx) and Word (.docx) formats


## Module 2: Automated Quiz Assessment

## Objective 3.2.3: Handwritten Quiz Processing

- Develop image upload system for handwritten quiz photos
- Implement OCR (Optical Character Recognition) to convert handwriting to digital text
- Create image preprocessing to improve handwriting recognition accuracy
- Build confidence scoring to identify unclear handwriting that needs human review


## Objective 3.2.4: Al-Driven Grading System

- Develop answer key input system for teaching assistants
- Implement LLM-based comparison between student answers and correct answers
- Create intelligent grading that understands context, not just exact word matches
- Build scoring algorithms that assign partial credit appropriately
- Generate detailed feedback explaining correct answers and student mistakes


## Objective 3.2.5: Reporting and Analytics

- Create individual student reports showing scores, correct answers, and feedback
- Develop class-wide analytics showing common mistakes and performance trends
- Implement downloadable reports in PDF format
- Build email notification system for automatic report distribution

# 3.3 System Integration and User Experience 

Objective 3.3.1: User Interface Development

Design simple web interface requiring no technical expertise
Create separate dashboards for teachers, TAs, and students
Implement user authentication and role-based access control
Build mobile-responsive design for accessibility
Objective 3.3.2: System Performance and Reliability
Ensure processing time under 5 minutes for content generation
Achieve 24-hour turnaround for quiz grading
Implement $99.5 \%$ system uptime
Create backup and recovery systems

### 3.4 Intermediate Deliverables

## Phase 1: Foundation Components

Basic PDF upload and text extraction system
Pinecone database setup with sample textbook content
Simple OCR integration for handwriting recognition
Basic user authentication system

## Phase 2: Core Functionality

Working slide generation from textbook content
Quiz creation with multiple question types
Handwritten text recognition with $90 \%+$ accuracy
Basic grading algorithm comparing answers

## Phase 3: Integration and Enhancement

Fully integrated content generation module
Complete grading system with detailed feedback
Web interface for all user types
Export functionality for slides and reports

## Phase 4: Testing and Deployment

Beta testing with real teachers and students
Performance optimization and bug fixes
Complete documentation and user training materials
Production deployment with monitoring systems

### 3.5 Enabling Deliverables

# Technical Infrastructure: 

Cloud server setup (AWS/Google Cloud) for hosting the system
Database architecture supporting both vector and traditional data
API endpoints connecting frontend interface to AI services
Security implementation protecting student data and educational content

## AI Model Integration:

Fine-tuned prompts for educational content generation
Handwriting recognition model optimized for student work
Grading algorithms calibrated with teacher feedback
Error handling for edge cases in AI processing

## Data Pipeline:

Automated text processing workflow from PDF to searchable content
Image processing pipeline for handwritten quiz optimization
Report generation templates for consistent formatting
Backup systems ensuring no data loss

## User Support Systems:

User documentation and tutorial videos
Help desk system for technical support
Training materials for teachers and TAs
Feedback collection system for continuous improvement

### 3.6 Success Metrics

## Efficiency Metrics:

Reduce slide creation time from hours to minutes per chapter
Decrease quiz grading time from 2-3 minutes to 10 seconds per student
Achieve same-day feedback delivery instead of 7-14 day delays

## Quality Metrics:

Maintain 90\%+ teacher satisfaction with generated content quality
Achieve 85\%+ correlation between AI grades and human grader assessment
Reach $95 \%+$ accuracy in handwriting recognition

# 4. Literature Review 

### 4.1 Al in Educational Content Generation

Quiz question generation with GPT-4: Elkins et al. (2024) showed that GPT-4 can produce multiple-choice questions aligned to Bloom's Taxonomy, with educator ratings indicating comparable quality to human-authored quizzes ChatGPT prompts for generating multiple-choice questions in medical education and evidence on their validity: a literature review.

LLMs in programming education: Doughty et al. (2023) compared GPT-4-generated questions with human-crafted ones across 6 Python modules; they found Al questions to be clear, well-formed, and aligned with learning objectives [2312.03173] A Comparative Study of Al-Generated (GPT-4) and Human-crafted MCQs in Programming Education.

Automated slide generation: Shreewastav et al. (2024) introduced "Presently," a T5-based model converting research papers into PowerPoint slides. Compared to manual methods, Al-generated slides required less teacher intervention and were rated similarly in academic validity (PDF) Presently: Automated Presentation Slide Generation from Research Papers using NLP and Deep Learning (May 2024).

### 4.2 Vector Databases and Information Retrieval

Document-to-slide structuring: Mohan et al. (2021) created the "D2S" system using title-based retrieval plus summarization to convert academic papers into slides; human evaluation showed improved ROUGE scores D2S: Document-to-Slide Generation Via Query-Based Text Summarization - ACL Anthology.

Pinecone-style databases: Though educational IR remains emergent, generic benchmarks (e.g. Kumar et al. 2023) show sub-100 ms retrieval latency over 10k+ document embeddings—promising for real-time educational workflows.

### 4.3 OCR Technology and Handwriting Recognition

Child handwriting evaluation: Alheraki et al. (2023) applied CNNs to child handwriting in Arabic, reaching high character recognition rates ( $\sim 97 \%$ ) and demonstrating feasibility in varied educational contexts Handwritten Arabic Character Recognition for Children Writing Using Convolutional Neural Network and Stroke Identification | Human-Centric Intelligent Systems.

### 4.4 Automated Assessment and Grading Systems

RATAS framework (Rubric Automated Tree-based Answer Scoring): Safilian et al. (2025) introduce a generative-Al-based system achieving high accuracy and interpretable, rubric-based grading on textual exam responses, closely matching human evaluation [2505.23818] Ratas framework: A comprehensive genai-based approach to rubric-based marking of real-world textual exams.

Systematic review of automatic text assessment: Gao et al. (2023) analysed 93 studies on automatic grading of open-ended responses in higher education, demonstrating growing maturity and consistency in such systems [2308.16151] Automatic assessment of text-based responses in post-secondary education: A systematic review.

### 4.5 Educational Technology Integration

Teacher-centered AI integration (K-12): A study by researchers Ozan Filiz et al. (2025)

identified key success factors for Al adoption, including usability and minimal disruption to routine workflows Teachers and Al: Understanding the factors influencing Al integration in K-12 education | Education and Information Technologies

# 4.6 Gaps in Current Research 

While individual components of educational Al systems have been extensively studied, several gaps exist in current research:

Integrated Systems: Most studies focus on isolated components (either content generation OR grading) rather than comprehensive systems that handle both tasks in a unified workflow.

Handwritten Assessment Focus: Limited research exists on complete pipelines from handwritten quiz photos to final graded reports, particularly for diverse student populations.

Real-world Implementation: Many studies report laboratory results but lack data on actual classroom implementation and long-term usage patterns.

Teacher Workflow Integration: Few studies examine how Al tools integrate with existing teaching practices and institutional systems.

### 4.7 Foundation for Current Project

This literature review demonstrates that:

1. Al content generation is technically feasible and educationally valuable
2. Vector databases provide the necessary infrastructure for fast, relevant content retrieval
3. OCR technology has reached sufficient accuracy for practical handwriting recognition
4. Automated grading can achieve human-level performance with proper implementation
5. Integration challenges require careful attention to user experience and workflow design

The current project builds on these established foundations while addressing the identified gaps through:

- Creating an integrated system combining content generation and assessment
- Focusing specifically on handwritten quiz processing workflows
- Designing for real classroom implementation rather than laboratory testing
- Prioritizing teacher control and workflow compatibility

This research foundation provides confidence that the proposed system is both technically achievable and educationally beneficial, while identifying specific areas where innovation and careful implementation will be crucial for success.

## 5. Technologies and Resources

### 5.1 Core Technologies

Artificial Intelligence and Machine LearningLarge Language Models (LLMs):

OpenAI GPT-4 API for content generation and quiz creation
Claude API (Anthropic) as backup option for content generation
Used for: Creating slides from textbook content, generating quiz questions, grading student answers, providing feedback

# Optical Character Recognition (OCR): 

Google Cloud Vision API for handwritten text recognition
TrOCR (Transformer-based OCR) for backup processing
Used for: Converting handwritten quiz photos to digital text

## Embedding Models:

OpenAI text-embedding-ada-002 for creating text embeddings
Sentence-BERT as alternative option
Used for: Converting textbook content into searchable vectors

### 5.2 Database and Storage Systems

## Vector Database:

Pinecone for storing and querying textbook embeddings
Handles similarity search for content retrieval
Estimated storage: 2 textbooks initially

## Traditional Database:

PostgreSQL for user accounts, grades, and structured data
Stores user information, quiz results, system logs

### 5.3 Web Development Framework

## Backend Development:

Python with FastAPI framework for building APIs
Handles file uploads, Al integration, and data processing
Automatic API documentation generation

## Frontend Development:

React.js with JavaScript for user interface
Bootstrap CSS for responsive design
Simple, user-friendly interface requiring no technical knowledge

## Authentication and Security:

JWT (JSON Web Tokens) for user authentication

# 5.4 Cloud Infrastructure 

## Hosting Platform:

Amazon Web Services (AWS) for cloud hosting
Auto-scaling to handle varying user loads
$99.9 \%$ uptime guarantee

## Server Specifications:

4 vCPU, 16GB RAM for initial deployment
500GB SSD storage for databases
Additional GPU instances for AI processing when needed

## Development Tools:

Visual Studio Code for coding
Git/GitHub for version control
Docker for application packaging
GitHub Actions for automated testing and deployment

### 5.5 Data Sources and Requirements

## Primary Data Sources:Educational Textbooks:

Digital PDF versions of academic textbooks
Target: 2 textbooks initially
Sources: Open educational resources, partner institutions, public domain books
Subjects: Machine Learning and Deep Learning

## Handwriting Samples for Training:

Handwritten quiz responses from volunteer students
Diverse handwriting styles and quality levels
Collected with proper consent and privacy protection

## Assessment Standards:

Grading rubrics from educational institutions
Sample answer keys for various subjects
Feedback templates for different question types
Grading algorithm will be tested for bias using different handwritings.

## Quality Requirements:

High-resolution PDF scans (300 DPI minimum)
Clear handwriting samples with good lighting
Diverse content covering multiple educational levels
All data properly anonymized for privacy

# 5.6 Data Privacy and Compliance Resources 

## Legal and Privacy:

Privacy policy and terms of service documentation
GDPR compliance consultation
Data encryption and security audit tools
Backup and disaster recovery systems

## Quality Assurance:

Automated testing frameworks (pytest, Jest)
Performance monitoring tools
User feedback collection systems
Content quality review processes

### 5.7 Risk Mitigation Resources

## Backup Technologies:

Alternative AI providers (Claude, local models) if OpenAI becomes unavailable
Multiple OCR services to ensure handwriting recognition reliability
Backup hosting providers in case of AWS issues

## Data Protection:

Multiple backup systems across different geographic regions
Version control for all code and configurations
Regular security audits and penetration testing

## Scalability Planning:

Auto-scaling cloud infrastructure to handle growth
Modular architecture allowing easy component upgrades
Performance monitoring to identify bottlenecks early

This technology stack provides a robust, scalable foundation for both content generation and automated grading while maintaining simplicity in user interaction and system administration.

# 6. Method and Workplan 

### 6.1 Development Methodology

This project will use an Agile development approach with 3-week sprints, allowing for continuous testing and improvement. The development will be divided into 4 major phases over 12 months, with each phase building upon the previous one.Key Principles: Build working prototypes early for teacher feedback
Test each component with real users before moving forward
Keep the system simple and user-friendly throughout development
Regular stakeholder reviews to ensure educational needs are met

### 6.2 Phase 1: Foundation Setup

Goals: Establish basic infrastructure and test core technologies
Sprint 1: System Design and Setup
Set up AWS cloud infrastructure and databases
Create basic user authentication system
Design database schemas for users, content, and grades
Set up development environment and code repositories

## Sprint 2: PDF Processing and Storage

Build PDF upload system for textbook files
Implement text extraction from PDF documents
Create Pinecone vector database integration
Test with 5-10 sample textbook chapters

## Sprint 3: OCR Integration and Testing

$\square$ Integrate Google Cloud Vision API for handwriting recognition
Build image upload system for quiz photos
Test OCR accuracy with 100+ handwriting samples
Create confidence scoring for unclear handwriting

## Sprint : Basic AI Integration

Connect OpenAI GPT-4 API for content generation
Build simple prompt system for slide creation
Test LLM integration with sample textbook content
Create basic grading comparison algorithms

Working cloud infrastructure
PDF upload and text extraction system
Basic OCR handwriting recognition
Simple AI content generation prototype
$50+$ handwriting samples collected for testing

# 6.3 Phase 2: Core Module Development 

Goals: Build the two main system modules with basic functionality
Sprint 5: Content Generation Module
Implement vector search for relevant textbook content
Build slide generation with proper PowerPoint formatting
Create quiz question generation (multiple choice, short answer)
Add content difficulty assessment

## Sprint 6 : Export and Template Systems

Develop PowerPoint and Word document export
Create customizable slide templates
Build quiz formatting for different question types
Test content generation with 3 pilot teachers

## Sprint 7: Grading System Development

Build answer key input system for teaching assistants
Implement LLM-based answer comparison algorithms
Create partial credit assignment logic
Test grading accuracy with sample quiz responses

## Sprint 8: Report Generation

Develop individual student report templates
Create class analytics and performance summaries
Build PDF report generation system
Test reporting with pilot student groups

## Phase 2 Deliverables:

Functional content generation creating slides and quizzes
Working handwriting recognition and grading system
Basic report generation for student feedback
Testing results from 3 pilot teachers and 20+ students

### 6.4 Phase 3: Integration and User Interface (Months 4-5)

Goals: Create complete user experience and integrate all components
Sprint 9: Web Interface Development
Build React.js frontend for teachers, TAs, and students
Create separate dashboards for different user roles
Implement file upload interfaces and progress tracking
Design mobile-responsive interface

# Sprint 10: System Integration 

Connect frontend to all backend services
Implement real-time processing status updates
Create error handling and user feedback systems
Build notification system for completed tasks

## Sprint 11: Advanced Features

Add customizable prompts for content generation
Implement batch processing for multiple quizzes
Create teacher review and editing capabilities
Build analytics dashboard for institutional use

## Sprint 12: Beta Testing

Deploy beta version to pilot schools
Conduct user training sessions with teachers
Collect feedback and usage analytics
Fix bugs and improve user experience

## Phase 3 Deliverables:

Complete web interface for all user types
Fully integrated system with all features working
Beta testing results from 5+ teachers and 100+ students
User training materials and documentation

### 6.5 Phase 4: Testing and Deployment (Month 6)

Goals: Optimize performance, ensure reliability, and deploy production system
Sprint 13: Performance Optimization
Optimize database queries and Al processing speed
Implement caching for frequently accessed content
Test system with larger user loads
Improve response times and system reliability
Sprint 14: Security and Compliance
Implement comprehensive data encryption
Conduct security audit and penetration testing
Ensure FERPA compliance for student data

# Sprint 15: Production Deployment 

Deploy production system with monitoring
Set up automated backup and scaling systems
Create admin dashboard for system management
Implement usage analytics and reporting

## Sprint 16: Documentation and Training

Complete user documentation and video tutorials
Train support staff for help desk operations
Create administrator guides for system maintenance
Conduct final user acceptance testing

## Phase 4 Deliverables:

Production-ready system with $99.5 \%$ uptime
Complete security audit and compliance certification
User training materials and support documentation
Successfully processed 500+ student assignments

### 6.6 Project Timeline and Major Milestones

|  | Week 1 | Week 2 | Week 3 | Week 4 | Week 5 | Week 6 |
| :-- | :-- | :-- | :-- | :-- | :-- | :-- |
| Phase 1: <br> Foundation <br> Setup |  |  |  |  |  |  |
| Phase 2: Core <br> Module <br> Development |  |  |  |  |  |  |
| Phase 3: <br> Integration <br> and User <br> Interface |  |  |  |  |  |  |
| Phase 4: <br> Testing and <br> Deployment |  |  |  |  |  |  |

## Major Milestones:

## Milestone

M1: Foundation

Month
Success Criteria

1 PDF processing, OCR working, basic AI

# Complete Integration

M2: Core Features
Ready
3
Content generation and grading
modules functional

M3: Beta System
Live
5
Complete user interface, pilot testing
underway

M4: Production
Launch
6
Full deployment, user training complete

6.7 Risk Analysis and Contingency Plans

High-Risk Items:

Risk 1: OCR Accuracy Below 90%

*   *Impact:* Grading system unreliable, requires too much human correction
*   *Probability:* Medium (30%)
*   *Mitigation:* Test multiple OCR services, implement confidence scoring
*   *Contingency:* Use crowd-sourced correction, partner with specialized OCR providers

Risk 2: AI Content Quality Inconsistent

*   *Impact:* Teachers reject generated slides/quizzes, system adoption fails
*   *Probability:* Medium (25%)
*   *Mitigation:* Extensive prompt engineering, teacher feedback loops
*   *Contingency:* Implement human review step, create template-based fallbacks

Risk 3: System Performance Too Slow

*   *Impact:* User frustration, abandonment of system
*   *Probability:* Low (15%)
*   *Mitigation:* Regular performance testing, cloud auto-scaling
*   *Contingency:* Optimize algorithms, upgrade infrastructure, implement queuing

Medium-Risk Items:
Risk 4: Teacher Adoption Resistance

*   *Impact:* Limited user base, project success reduced
*   *Probability:* Medium (40%)
*   *Mitigation:* Involve teachers in design, provide extensive training
*   *Contingency:* Adjust interface based on feedback, offer incentives

# Risk 5: Data Privacy Compliance Issues 

Impact: Legal problems, inability to deploy in schools
Probability: Low (20\%)
Mitigation: Early legal consultation, privacy-by-design approach
Contingency: Implement additional security measures, on-premises options
Risk 6: Technology Changes (API Changes)
Impact: System components break, require rebuilding
Probability: Medium (30\%)
Mitigation: Use stable APIs, maintain multiple provider options
Contingency: Quick migration to alternative services, maintain local backups

### 6.8 Contingency Timeline

## If Major Delays Occur:3-Month Extension Plan:

Focus on core content generation first (months 13-15)
Simplify grading system to basic functionality
Deploy minimal viable product for pilot testing

### 6.9 Quality Assurance Strategy

## Continuous Testing:

Weekly automated testing of all system components
Monthly user testing sessions with pilot teachers
Quarterly security and performance audits

## Success Metrics:

Content generation: $90 \%$ teacher satisfaction with quality
Grading accuracy: $85 \%$ correlation with human graders
System performance: $<5$ minute processing for typical tasks
User adoption: $80 \%$ of pilot teachers actively using system

## Feedback Integration:

Bi-weekly teacher feedback sessions during beta testing
Monthly student surveys on report quality
Continuous monitoring of system usage analytics
This work plan provides a realistic path to building a functional, user-friendly system while maintaining flexibility to address challenges and incorporate stakeholder feedback throughout the development process.

# 7. Ethics, Legal, Data Protection and Safety Aspects 

### 7.1 Ethical Considerations

## Fairness in AI Grading

The grading system must give fair treatment to all students while grading. Teachers will provide answers or answers will be generated from the LLM with the questions and every student's answer will be matched with the stored answer. Through this we will achieve no bias. Teachers will have the final authority over the grades as the system will serve as an assistant not the replacement.

## Academic Integrity

The teacher will have the authority over content and quizzes to ensure academic integrity. The teacher can also generate multiple quizzes to reduce cheating.

## Transparency

The students and teachers will be provided a clear explanation the grading criteria and how the grades of students are calculated. A clear feedback will be provided to both teacher and students

### 7.2 Legal Compliance

## Educational Privacy Laws

The system complies with UK GDPR and the Data Protection Act 2018, using strict access controls, audit logs, and encrypted data handling. Only authorized staff (e.g., lecturers, TAs) can access student data. Parental access is provided where legally appropriate.

## International Compliance

The platform supports UK and EU GDPR, ensuring rights to access, deletion, and data portability. It follows the ICO Children's Code, requiring parental consent for users under 13 and defaulting to high privacy settings for minors.

### 7.3 Data Protection and Security

## Data Encryption

All data is encrypted in transit (HTTPS) and at rest (AES-256). Student personally identifiable information receives the highest protection level with field-level encryption in databases.

## Access Controls

Role-based access ensures students only see their own data, teachers access their class information, and administrators have limited system-wide access. Multi-factor authentication is required for all accounts.

## Data Retention

Student data is retained only as long as educationally necessary, with automatic deletion after course completion unless explicitly retained by the institution. Clear data retention policies align with institutional requirements.

### 7.4 Safety and Reliability

# System Reliability 

The system maintains $99.5 \%$ uptime with automated backups and disaster recovery procedures. Multiple server regions ensure continued operation during outages.

## Content Safety

AI-generated content is filtered for appropriateness, with human review mechanisms for questionable material. The system includes reporting tools for inappropriate content.

## Error Handling

Clear error messages help users understand system limitations. Low-confidence OCR results are flagged for human review, and technical failures trigger automatic notifications to administrators.

### 7.5 User Rights and Protections

## Student Rights

Students can request to see their data, correct errors, and opt out of AI grading in favor of human-only assessment. Parents can access their children's educational records and system usage.

## Teacher Autonomy

Teachers retain complete control over their educational content and grading decisions. The system cannot override teacher judgment or automatically assign final grades without approval.

## Institutional Control

Educational institutions maintain ownership of their data and can export or delete all information at any time. Local deployment options are available for sensitive environments.

This framework ensures the system enhances education while protecting all stakeholders' rights, privacy, and academic integrity.

## References

Alheraki, F., Alnuaim, A., \& Almudaires, N. (2023). Handwritten Arabic character recognition for children writing using convolutional neural network and stroke identification. Human-Centric Intelligent Systems. Handwritten Arabic Character Recognition for Children Writing Using Convolutional Neural Network and Stroke Identification | Human-Centric Intelligent Systems.

Doughty, C., Hussein, M., \& Li, S. (2023). A comparative study of AI-generated (GPT-4) and human-crafted MCQs in programming education. arXiv preprint [arXiv:2312.03173].
https://arxiv.org/abs/2312.03173
Elkins, D., Rivera, L., \& Martin, K. (2024). How teachers can use large language models and Bloom's taxonomy to create educational quizzes. arXiv preprint [arXiv:2401.05914].
https://arxiv.org/abs/2401.05914
Education Endowment Foundation (EEF). (2021). Teacher feedback to improve pupil learning: Guidance report. Teacher Feedback to Improve Pupil Learning | EEF

HESA (2023). Higher Education Staff Statistics: UK, 2021/22. Higher Education Statistics Agency. https://www.hesa.ac.uk/news/23-02-2023/staff-statistics

Mohan, M., Kamath, S., \& Chakrabarti, S. (2021). D2S: Document-to-slide generation via query-based text summarization. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP). ACL Anthology. Improving Simultaneous Translation by Incorporating Pseudo-References with Fewer Reorderings - ACL Anthology

Safilian, S., Baines, M., \& Anvari, A. (2025). RATAS framework: A comprehensive genAI-based approach to rubric-based marking of real-world textual exams. arXiv preprint [arXiv:2505.23818]. https://arxiv.org/abs/2505.23818

Shreewastav, A., Joshi, A., \& Das, P. (2024). Presently: Automated presentation slide generation from research papers using NLP and deep learning. Preprint. (PDF) Presently: Automated Presentation Slide Generation from Research Papers using NLP and Deep Learning (May 2024)

Gao, Y., Chang, L., \& Uddin, M. (2023). Automatic assessment of text-based responses in post-secondary education: A systematic review. arXiv preprint [arXiv:2308.16151].
https://arxiv.org/abs/2308.16151
Ozan, B., Filiz, O., \& Kaya, R. (2025). Teachers and AI: Understanding the factors influencing AI integration in K-12 education. Education and Information Technologies. Teachers and AI: Understanding the factors influencing AI integration in K-12 education I Education and Information Technologies

Simon Fraser University. (n.d.). Strategies to reduce lecture preparation time. Institute for the Study of Teaching and Learning in the Disciplines (ISTLD), SFU. Retrieved July 4, 2025, from Ideas to Reduce Non-Classroom Prep Time Without Reducing Teaching Quality - ISTLD - Simon Fraser University

Yale Graduate Teaching Center. (2013). The diminishing returns of lecture preparation. Yale University. Retrieved July 4, 2025, from How to spend less time preparing better lectures

In [8]:
prompt_v1 = """
<instructions>
You are a "Quiz Content Normalizer" agent. Your primary task is to process Markdown-formatted quiz content, extract the questions and student answers, and convert this information into a structured JSON object. A critical secondary task is to identify and remove "teacher remarks" from the content. These remarks are feedback or corrections provided by an instructor and should NOT be included in the final JSON output.

You must be highly intelligent and discerning in identifying teacher remarks. They are distinct from the student's actual answers and the original question statements. Your goal is to preserve the complete and accurate student responses while eliminating only the teacher's annotations.
</instructions>

<input_format>
The input will be raw Markdown text representing a quiz. It will contain questions (e.g., "a. Is the relation in 2NF?...") and the student's written answers. The original source had teacher remarks visually marked in red, but this visual cue is LOST in the Markdown conversion. You must identify them based on textual patterns and context.
</input_format>

<output_format>
Produce a JSON array of objects. Each object in the array should represent a single question-answer pair from the quiz.
Each object must have the following structure:
```json
{
  "question_number": "string", // e.g., "a", "b", "c"
  "question_text": "string",   // The full text of the question
  "student_answer": "string"   // The student's complete answer, with ALL teacher remarks removed.
}
```
</output_format>

<guidelines_for_identifying_teacher_remarks>
Teacher remarks are typically short, evaluative, corrective, or annotative comments. They are NOT part of the student's intended answer or the original question. Apply the following heuristics carefully:

1.  **Short, Non-Syntactic Lines/Phrases:** Look for very short lines or phrases (e.g., under 20-30 characters) that appear as interjections, marginalia, or brief feedback, and do not form a grammatically coherent part of the student's main explanation.
2.  **Evaluative/Corrective Language:** Identify phrases containing direct feedback, judgment, or correction. Common indicators include:
    *   "not valid", "not in XNF" (when used as a standalone correction, not part of student's argument), "sorry :|", "do not", "wrong", "correct", "no", "yes", "partial dependency exists" (when used as a direct correction to a student's incorrect statement).
    *   Phrases like "How can it be 3NF if it is not even 2NF?"
    *   Phrases like "no decomposition.", "no yes/no", "functional dependency this wouldn't be a relation if there was not any functional dependencies."
    *   Phrases like "These new table will be created" (as an instruction/correction).
3.  **Numerical/Symbolic Annotations:** Lines consisting solely of a single digit, a circled digit (e.g., '①', '②', '⓪'), or simple symbols like 'X' or '✓' that indicate a mark or score.
4.  **Syntactic Disruption:** Remarks often appear to break the flow of the student's answer, either on a new line or as an inserted comment that doesn't logically connect to the surrounding student text.

**CRITICAL EXCEPTION:** Be extremely careful not to remove legitimate parts of the student's answer that might *contain* words like "no" or "not" if they are part of their technical explanation (e.g., "The relation is **not** in 2NF because **no** partial dependency exists..."). Distinguish between student's reasoning and teacher's direct feedback. If a phrase like "no partial dependency exists" is presented as a student's justification, it should be kept. If it's a short, isolated correction from the teacher, it should be removed. Context is key.
</guidelines_for_identifying_teacher_remarks>

<chain_of_thought>
Before generating the JSON, perform the following steps mentally or explicitly:
1.  **Parse Quiz Structure:** Identify the distinct sections for each question (a, b, c, d...).
2.  **Extract Question Text:** For each section, isolate the exact question statement.
3.  **Extract Raw Student Answer:** Collect all text associated with the student's response for that question.
4.  **Filter Teacher Remarks:** Carefully review the raw student answer text. Apply the `guidelines_for_identifying_teacher_remarks` to remove all identified remarks. This step requires nuanced understanding to differentiate between student content and teacher annotations.
5.  **Construct JSON:** Assemble the extracted question number, question text, and the cleaned student answer into the specified JSON format.
</chain_of_thought>
"""


Cleaning up


In [11]:
from pydantic import BaseModel


class QuizQuestion(BaseModel):
    id: str
    text: str
    type: str
    options: list[dict] = []
    parts: list[dict] = []

class StructuredQuiz(BaseModel):
    quiz_number: int
    name: str
    roll_number: str
    questions: list[QuizQuestion]

prompt = (
    "You are an expert JSON conversion agent. "
    "Convert the markdown quiz provided in `<markdown_quiz>` tags into a JSON object. "
    "Adhere strictly to the specified schema, ensuring consistency while accommodating variations in question counts, sub-points, and multiple-choice formats. "
    "Infer question `type` (e.g., `multiple_choice`, `open_ended`) based on content.\n\n"
    "Output must conform to this JSON Schema:\n"
    "{\n"
    "  \"quiz_number\": \"integer\",\n"
    "  \"name\": \"string\",\n"
    "  \"roll_number\": \"string\",\n"
    "  \"questions\": [\n"
    "    {\n"
    "      \"id\": \"string\",\n"
    "      \"text\": \"string\",\n"
    "      \"type\": \"string\",\n"
    "      \"options\": [ {\"label\": \"string\", \"text\": \"string\"} ],\n"
    "      \"parts\": [ {\"label\": \"string\", \"text\": \"string\"} ]\n"
    "    }\n"
    "  ]\n"
    "}\n"
)

quiz_markdown = pdf_response.pages[0].markdown

# Compose the chat request
chat_response = client.chat.parse(
    model="ministral-8b-latest",
    messages=[
        {
            "role": "user",
            "content": [
                TextChunk(
                    text=(
                        f"{prompt}\n"
                        "<markdown_quiz>\n"
                        f"{quiz_markdown}\n"
                        "</markdown_quiz>"
                    )
                ),
            ],
        }
    ],
    response_format=StructuredQuiz,
    temperature=0,
)


def extract_quiz(response):
    # If response is a dict and has OpenAI/Mistral completion keys:
    if isinstance(response, dict):
        content = response.get("choices", [{}])[0].get("message", {}).get("content", None)
        if content:
            try:
                return json.loads(content)
            except Exception:
                return content  # It may already be a dict
    # If response is a Pydantic model:
    if hasattr(response, 'model_dump'):
        return response.model_dump()
    # If already dict:
    if isinstance(response, dict):
        return response
    # Fallback: just return as-is
    return response

quiz_json = extract_quiz(chat_response)
print(json.dumps(quiz_json, indent=4))

{
    "id": "3e61ea3d72274a5da15f526cf4838b25",
    "object": "chat.completion",
    "model": "ministral-8b-latest",
    "usage": {
        "prompt_tokens": 666,
        "completion_tokens": 448,
        "total_tokens": 1114
    },
    "created": 1753165301,
    "choices": [
        {
            "index": 0,
            "message": {
                "content": "{\n  \"quiz_number\": 6,\n  \"name\": \"M. Ahmed Riaz\",\n  \"roll_number\": \"23I-2005\",\n  \"questions\": [\n    {\n      \"id\": \"q1\",\n      \"text\": \"Is the relation in 2NF? Provide the reason if it is not in 2NF. [2 marks] The relation is not in 2NF, because in there is partial dependency exists in Functional Dependency. FD1. We have to create those table to be in 2NF.\",\n      \"type\": \"open_ended\",\n      \"options\": [],\n      \"parts\": []\n    },\n    {\n      \"id\": \"q2\",\n      \"text\": \"Is the relation in 3NF? Provide the reason if it is not in 3NF. [2 marks] The relation is not in 3NF form because th

In [None]:
import time

def is_valid_quiz_format(response_dict):
    # Top-level keys and types
    if not isinstance(response_dict, dict):
        return False
    for key in ["quiz_number", "name", "roll_number", "questions"]:
        if key not in response_dict:
            return False
    if not isinstance(response_dict["questions"], list):
        return False
    # Each question object
    for q in response_dict["questions"]:
        if not isinstance(q, dict):
            return False
        for key in ["id", "text", "type", "parts"]:
            if key not in q:
                return False
        if "options" in q and not isinstance(q["options"], list):
            return False
        if not isinstance(q["parts"], list):
            return False
        # Option objects (if present)
        if "options" in q:
            for opt in q["options"]:
                if not isinstance(opt, dict):
                    return False
                if "label" not in opt or "text" not in opt:
                    return False
        # Part objects (if present)
        for part in q["parts"]:
            if not isinstance(part, dict):
                return False
            if "label" not in part or "text" not in part:
                return False
    return True

max_attempts = 5
attempt = 0
while attempt < max_attempts:
    attempt += 1
    pdf_markdown = pdf_response.pages[0].markdown

    chat_response = client.chat.complete(
        model="ministral-8b-latest",
        messages=[
            {   # type: ignore
                "role": "user",
                "content": [
                    TextChunk(
                        text=f"{prompt}\n<markdown_quiz>\n{pdf_markdown}\n</markdown_quiz>"
                    ),
                ],
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )

    try:
        response_dict = json.loads(chat_response.choices[0].message.content)
        if is_valid_quiz_format(response_dict):
            print(f"VALID: {json.dumps(response_dict, indent=4)}")
        else:
            print(f"Attempt {attempt}: Output format invalid, retrying...")
            print(json.dumps(response_dict, indent=4))
    except Exception as e:
        print(f"Attempt {attempt}: JSON parsing failed, retrying... {e}")
    time.sleep(1)
else:
    print("Failed to get valid output after several attempts.")