### Extract Text from PDF

In [1]:
from PyPDF2 import PdfReader

def extract_pdf_pages(pdf_path: str) -> list:
    """
    Read a PDF and collect the extracted text of every page.

    Returns a list with one string per page, in document order. A page for
    which extraction yields nothing (None or "") contributes an empty string.
    """
    # `or ""` maps a falsy extraction result to an empty string.
    return [(page.extract_text() or "") for page in PdfReader(pdf_path).pages]


In [2]:
def create_page_chunks(
    pages: list,
    chunk_size: int = 3,
    overlap: int = 1
) -> list:
    """
    Groups consecutive pages into overlapping chunks.

    Example:
    chunk_size=3, overlap=1
    Pages: [1,2,3] → [3,4,5] → ...

    Args:
        pages: Page texts in document order (one string per page).
        chunk_size: Maximum number of pages per chunk; must be >= 1.
        overlap: Number of pages each chunk shares with the previous one;
            must satisfy 0 <= overlap < chunk_size so the window always
            moves forward and no page is skipped.

    Returns:
        A list of dicts with keys "chunk_number" (1-based), "pages"
        (1-based page numbers in the chunk) and "text" (the page texts
        joined with newlines). An empty `pages` list yields [].

    Raises:
        ValueError: If chunk_size or overlap is out of range.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would make the step 0 (range() rejects it),
        # a larger overlap would silently produce no chunks, and a negative
        # overlap would silently skip pages between chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    total_pages = len(pages)
    step = chunk_size - overlap
    chunks = []

    for chunk_number, start in enumerate(range(0, total_pages, step), start=1):
        end = min(start + chunk_size, total_pages)

        chunks.append({
            "chunk_number": chunk_number,
            "pages": list(range(start + 1, end + 1)),
            "text": "\n".join(pages[start:end])
        })

        # Once a chunk reaches the last page, any further window would only
        # repeat pages that are already covered.
        if end >= total_pages:
            break

    return chunks


import json

def save_chunks(chunks: list, output_path: str):
    """
    Write the chunk list to `output_path` as JSON.

    The file is UTF-8 encoded, indented by two spaces, and keeps non-ASCII
    characters as-is instead of escaping them.
    """
    with open(output_path, mode="w", encoding="utf-8") as out_file:
        json.dump(
            chunks,
            out_file,
            ensure_ascii=False,
            indent=2,
        )



def load_chunks(path: str) -> list:
    """
    Read previously saved chunks back from a UTF-8 JSON file.

    Returns whatever JSON value the file contains.
    """
    with open(path, mode="r", encoding="utf-8") as in_file:
        stored = json.load(in_file)
    return stored


### Use an LLM to generate MCQs

In [3]:
def build_mcq_prompt(text: str, num_questions: "int | None" = None) -> str:
    """
    Builds the instruction prompt asking the model for MCQs as a JSON array.

    Args:
        text: Lecture content the questions must be based on; it is embedded
            verbatim between the `<<<` and `>>>` markers.
        num_questions: Optional number of questions to request. When omitted,
            no count is stated and the prompt text is unchanged from the
            count-less wording (including its double space after "Generate").

    Returns:
        The full prompt string.

    Raises:
        ValueError: If num_questions is given but is less than 1.
    """
    if num_questions is not None and num_questions < 1:
        raise ValueError(f"num_questions must be >= 1, got {num_questions}")

    # An empty count keeps the default prompt byte-identical to the
    # count-less wording, so existing callers get the same prompt.
    count = "" if num_questions is None else str(num_questions)

    return f"""
You are an expert university-level instructor and question setter.

Generate {count} high-quality multiple-choice questions (MCQs)
STRICTLY based on the lecture content below.

CRITICAL RULES (NO EXCEPTIONS):
1. Use ONLY the provided lecture content.
2. Each question must be conceptually meaningful.
3. Each question must have EXACTLY 4 options.
4. EXACTLY one option must be correct.
5. DO NOT add explanations, comments, or extra text.
6. OUTPUT MUST BE VALID JSON ONLY.

OUTPUT FORMAT (STRICT JSON ARRAY):
[
  {{
    "question": "question text",
    "options": {{
      "A": "option text",
      "B": "option text",
      "C": "option text",
      "D": "option text"
    }},
    "answer": "A"
  }}
]

EXAMPLE OUTPUT (FORMAT ONLY):
[
  {{
    "question": "What is gradient descent?",
    "options": {{
      "A": "An optimization algorithm",
      "B": "A loss function",
      "C": "A neural network layer",
      "D": "A regularization method"
    }},
    "answer": "A"
  }}
]

LECTURE CONTENT:
<<<
{text}
>>>

RETURN ONLY THE JSON ARRAY. NO MARKDOWN. NO TEXT OUTSIDE JSON.
"""






In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


# Hub identifier passed to both from_pretrained calls below.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# `tokenizer` and `model` are module-level names; the generation cell later
# passes them into generate_mcqs_from_chunks.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Request half-precision (float16) weights.
    torch_dtype=torch.float16,
    # "auto" leaves device placement to the library
    # NOTE(review): presumably requires the `accelerate` package — confirm.
    device_map="auto"
)



2025-12-26 14:41:55.508734: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-26 14:41:55.523042: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-26 14:41:55.540110: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-26 14:41:55.545069: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-26 14:41:55.557930: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
def mistral_generate(tokenizer, model, prompt: str, max_tokens: int = 1024):
    """
    Runs one greedy generation for `prompt` and returns only the new text.

    Args:
        tokenizer: Callable tokenizer that returns a mapping with `input_ids`
            and provides `eos_token_id` and `decode`.
        model: Model exposing `device` and `generate`.
        prompt: Instruction text; it is wrapped in the `[INST] ... [/INST]`
            template before tokenization.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded completion, without the prompt and without special tokens.
    """
    # The template already spells out the BOS marker ("<s>").
    messages = f"<s>[INST] {prompt} [/INST]"

    # add_special_tokens=False stops the tokenizer from prepending a second
    # BOS token on top of the literal "<s>" in the template.
    inputs = tokenizer(
        messages,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,  # greedy decoding -> deterministic output
        pad_token_id=tokenizer.eos_token_id,  # use the EOS id as the padding id
    )

    # generate() returns prompt + completion; drop the prompt tokens.
    prompt_length = inputs.input_ids.shape[1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


In [None]:
import json
import re
import csv

def clean_json_response(response: str) -> str:
    """
    Trim an LLM reply down to the part that should be a JSON array.

    Markdown code-fence markers are removed and surrounding whitespace is
    stripped. If the result contains both a "[" and a "]", only the span from
    the first "[" to the last "]" (inclusive) is returned; otherwise the
    stripped text is returned unchanged.
    """
    without_fences = re.sub(r"```(?:json)?", "", response).strip()

    first_open = without_fences.find("[")
    last_close = without_fences.rfind("]")

    # No complete bracket pair: hand back the stripped text as-is.
    if first_open == -1 or last_close == -1:
        return without_fences

    return without_fences[first_open : last_close + 1]

# Column order of the output CSV; every data row must match it one-to-one.
_MCQ_CSV_COLUMNS = [
    "pdf_name",
    "chunk_number",
    "total_chunks",
    "pages",
    "question",
    "option_A",
    "option_B",
    "option_C",
    "option_D",
    "correct_answer",
]

# The only option labels that have a CSV column.
_MCQ_OPTION_KEYS = ("A", "B", "C", "D")


def _mcq_to_row(mcq, pdf_name, chunk_number, total_chunks, pages_str):
    """Validate one parsed MCQ; return (row, None) or (None, skip_reason)."""
    if not isinstance(mcq, dict):
        return None, "is not a dict"

    if "question" not in mcq or "options" not in mcq or "answer" not in mcq:
        return None, "missing required fields"

    options = mcq["options"]
    if not isinstance(options, dict) or not all(key in options for key in _MCQ_OPTION_KEYS):
        return None, "has invalid options structure"

    # The answer must be one of A-D: an extra key such as "E" has no option
    # column, so the row would name an answer that is not in the row.
    answer_key = mcq["answer"]
    if answer_key not in _MCQ_OPTION_KEYS:
        return None, f"has invalid answer key '{answer_key}'"

    row = [
        pdf_name,
        chunk_number,
        total_chunks,
        pages_str,
        mcq["question"],
        options["A"],
        options["B"],
        options["C"],
        options["D"],
        f"{answer_key}) {options[answer_key]}",  # e.g. "D) Option Text"
    ]
    return row, None


def generate_mcqs_from_chunks(
    chunks: list,
    tokenizer,
    model,
    csv_path: str,
    pdf_name: str
):
    """
    Generates MCQs for every chunk and appends them to a CSV as they arrive.

    The CSV at `csv_path` is (re)created with a header row first. For each
    chunk the model is prompted, its reply is parsed as a JSON array, and
    every valid MCQ is written as one row. Chunks whose reply cannot be
    parsed, and individual MCQs that fail validation, are reported on stdout
    and skipped rather than aborting the run.

    Args:
        chunks: Chunk dicts with "text" and "pages"; "chunk_number" is used
            when present, otherwise the 1-based position in the list.
        tokenizer: Tokenizer forwarded to `mistral_generate`.
        model: Model forwarded to `mistral_generate`.
        csv_path: Destination CSV file; overwritten if it exists.
        pdf_name: Value written to the "pdf_name" column of every row.

    Returns:
        The number of MCQs written to the CSV.
    """
    total_chunks = len(chunks)

    # Write CSV header first (this also truncates any previous file).
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(_MCQ_CSV_COLUMNS)

    total_mcqs = 0
    skipped_mcqs = 0

    for i, chunk in enumerate(chunks):
        chunk_number = chunk.get("chunk_number", i + 1)  # Use chunk_number from JSON or fallback
        print(f"\nProcessing chunk {chunk_number}/{total_chunks}...")

        try:
            prompt = build_mcq_prompt(
                text=chunk["text"]
            )

            response = mistral_generate(tokenizer, model, prompt)

            # --- JSON SAFETY ---
            try:
                cleaned_response = clean_json_response(response)
                mcq_list = json.loads(cleaned_response)
            except json.JSONDecodeError as e:
                print(f"[WARN] JSON parse failed for chunk {chunk_number}")
                print(f"ERROR: {e}")
                print(f"Skipping chunk {chunk_number} and continuing...")
                continue

            # Ensure mcq_list is actually a list
            if not isinstance(mcq_list, list):
                print(f"[WARN] Response is not a list for chunk {chunk_number}")
                print(f"Skipping chunk {chunk_number} and continuing...")
                continue

            # Same for every MCQ of this chunk, so build it once.
            pages_str = ", ".join(map(str, chunk["pages"]))

            # Re-open in append mode per chunk so finished rows are on disk
            # before the next (slow) generation starts.
            with open(csv_path, "a", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)

                for mcq_idx, mcq in enumerate(mcq_list):
                    try:
                        row, skip_reason = _mcq_to_row(
                            mcq, pdf_name, chunk_number, total_chunks, pages_str
                        )
                        if row is None:
                            print(f"  [SKIP] MCQ {mcq_idx+1} {skip_reason}, skipping...")
                            skipped_mcqs += 1
                            continue

                        writer.writerow(row)
                        total_mcqs += 1

                    except Exception as e:
                        print(f"  [ERROR] Failed to process MCQ {mcq_idx+1}: {e}")
                        print(f"  Skipping this MCQ and continuing...")
                        skipped_mcqs += 1
                        continue

        except Exception as e:
            print(f"[ERROR] Failed to process chunk {chunk_number}: {e}")
            print(f"Skipping chunk {chunk_number} and continuing...")
            continue

    print(f"\n✅ Completed!")
    print(f"   Generated: {total_mcqs} MCQs")
    print(f"   Skipped: {skipped_mcqs} MCQs")
    print(f"   Processed: {total_chunks} chunks")
    return total_mcqs



#### Run inference with the model

In [7]:
# Source document for this run
pdf_name = "LLM_cs124_week7_2025.pdf"

# 1) Pull the text of every page out of the PDF
pages = extract_pdf_pages(pdf_name)

# 2) Group pages into 3-page chunks that share 1 page with the previous chunk
chunks = create_page_chunks(pages, chunk_size=3, overlap=1)

# 3) Persist the chunks as JSON so they can be reloaded without re-parsing
save_chunks(chunks, "pdf_chunks.json")

print(f"Extracted {len(pages)} pages")
print(f"Created {len(chunks)} chunks")


Extracted 114 pages
Created 57 chunks


In [8]:
pdf_name = "LLM_cs124_week7_2025.pdf"

# Reload the chunks that were persisted as JSON
chunks = load_chunks("pdf_chunks.json")

# Run generation over every chunk; rows are appended to the CSV chunk by chunk
total_mcqs = generate_mcqs_from_chunks(
    chunks, tokenizer, model, csv_path="mcq_dataset.csv", pdf_name=pdf_name
)



Processing chunk 1/57...


  ✓ Generated MCQ #1
  ✓ Generated MCQ #2
  ✓ Generated MCQ #3
  ✓ Generated MCQ #4

Processing chunk 2/57...
  ✓ Generated MCQ #5
  ✓ Generated MCQ #6
  ✓ Generated MCQ #7
  ✓ Generated MCQ #8

Processing chunk 3/57...
  ✓ Generated MCQ #9
  ✓ Generated MCQ #10
  ✓ Generated MCQ #11
  ✓ Generated MCQ #12

Processing chunk 4/57...
  ✓ Generated MCQ #13
  ✓ Generated MCQ #14
  ✓ Generated MCQ #15
  ✓ Generated MCQ #16

Processing chunk 5/57...
  ✓ Generated MCQ #17
  ✓ Generated MCQ #18
  ✓ Generated MCQ #19
  ✓ Generated MCQ #20

Processing chunk 6/57...
  [SKIP] MCQ 1 has invalid answer key 'A, B, C', skipping...
  ✓ Generated MCQ #21
  ✓ Generated MCQ #22
  ✓ Generated MCQ #23

Processing chunk 7/57...
  ✓ Generated MCQ #24
  ✓ Generated MCQ #25

Processing chunk 8/57...
  ✓ Generated MCQ #26
  ✓ Generated MCQ #27
  ✓ Generated MCQ #28
  ✓ Generated MCQ #29

Processing chunk 9/57...
  ✓ Generated MCQ #30
  ✓ Generated MCQ #31
  ✓ Generated MCQ #32
  ✓ Generated MCQ #33
  ✓ Generated 