### Extract Text from PDF

In [5]:
from PyPDF2 import PdfReader

def extract_pdf_pages(pdf_path: str) -> list:
    """
    Extracts text from each page of a PDF.
    Returns a list where each element is text from one page.
    """
    reader = PdfReader(pdf_path)
    pages = []

    for page in reader.pages:
        text = page.extract_text()
        pages.append(text if text else "")

    return pages


In [6]:
def create_page_chunks(
    pages: list,
    chunk_size: int = 3,
    overlap: int = 1
) -> list:
    """
    Creates page chunks with overlap.
    
    Example:
    chunk_size=3, overlap=1
    Pages: [1,2,3] → [3,4,5] → ...
    """
    chunks = []
    step = chunk_size - overlap

    chunk_number = 1
    for start in range(0, len(pages), step):
        end = start + chunk_size
        chunk_pages = pages[start:end]

        if not chunk_pages:
            break

        chunks.append({
            "chunk_number": chunk_number,
            "pages": list(range(start + 1, min(end + 1, len(pages) + 1))),
            "text": "\n".join(chunk_pages)
        })

        chunk_number += 1

        if end >= len(pages):
            break

    return chunks


import json

def save_chunks(chunks: list, output_path: str):
    """
    Saves chunks to a JSON file.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chunks, f, indent=2, ensure_ascii=False)



def load_chunks(path: str) -> list:
    """
    Loads stored PDF chunks.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


### Use LLM to generate MCQ

In [7]:
def build_mcq_prompt(text: str) -> str:
    return f"""
You are an expert university-level instructor and question setter.

Generate high-quality multiple-choice questions (MCQs)
STRICTLY based on the lecture content below.

CRITICAL RULES (NO EXCEPTIONS):
1. Use ONLY the provided lecture content.
2. Each question must be conceptually meaningful and **non-generic**.
3. Each question must be an **actual MCQ**, not a definition recall question.
4. Avoid generic prompts such as "What is X?" or "Define Y".
5. Questions must test **application, reasoning, interpretation, comparison, edge cases, numerical reasoning, or scenario-based understanding** explicitly grounded in the lecture.
6. Each question must have EXACTLY 4 options.
7. EXACTLY one option must be correct.
8. Distractors must be **plausible and content-specific**, not obviously incorrect.
9. DO NOT add explanations, comments, hints, or extra text.
10. OUTPUT MUST BE VALID JSON ONLY.


OUTPUT FORMAT (STRICT JSON ARRAY):
[
  {{
    "question": "question text",
    "options": {{
      "A": "option text",
      "B": "option text",
      "C": "option text",
      "D": "option text"
    }},
    "answer": "A"
  }}
]

EXAMPLE OUTPUT (FORMAT ONLY):
[
  {{
    "question": "Who invented Gravity?",
    "options": {{
      "A": "Isaac Newton",
      "B": "Albert Einstein",
      "C": "Galileo Galilei",
      "D": "Marrie Curie"
    }},
    "answer": "A"
  }}
]

LECTURE CONTENT:
<<<
{text}
>>>

RETURN ONLY THE JSON ARRAY. NO MARKDOWN. NO TEXT OUTSIDE JSON.
"""


<h3 style="color:red;">1) Mistral 7B Instruct</h3>


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)



In [None]:
def llm_generate(tokenizer, model, prompt: str, max_tokens: int = 1024):
    messages = f"<s>[INST] {prompt} [/INST]"

    inputs = tokenizer(messages, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id     # deterministic output
    )

    # Slice to get only the generated tokens (exclude prompt)
    generated_ids = outputs[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


<h3 style="color:red;">2) LLama 8B</h3>


In [3]:
# %%
print("Loading llama8b model...")
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from transformers import pipeline
import torch


model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading llama8b model...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [6]:
def llm_generate(tokenizer, model, prompt: str, max_tokens: int = 1024):
    messages = [{"role": "user", "content": prompt}]

    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    input_ids = encoded.to(model.device)
    attention_mask = torch.ones_like(input_ids).to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,   # ✅ FIX
        max_new_tokens=max_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_ids = outputs[0][input_ids.shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


<h3 style="color:red;">3) Phi 14B</h3>


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "microsoft/Phi-3-medium-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)


2026-01-19 22:51:14.169724: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-19 22:51:14.183787: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-19 22:51:14.200138: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-19 22:51:14.204915: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-19 22:51:14.217491: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [5]:
def llm_generate(tokenizer, model, prompt: str, max_tokens: int = 1024):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    attention_mask = torch.ones_like(input_ids).to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,      # ✅ important
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_ids = outputs[0][input_ids.shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


<h3 style="color:red;">4) Qween 7B</h3>


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)



2026-01-25 21:22:24.105817: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-25 21:22:24.120781: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-25 21:22:24.182414: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-25 21:22:24.195164: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-25 21:22:24.235392: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
import torch
def llm_generate(tokenizer, model, prompt: str, max_tokens: int = 1024):
    # Build chat-style messages
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    # Tokenize using chat template
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Attention mask (important for some models)
    attention_mask = torch.ones_like(input_ids).to(model.device)

    # Generate
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    # Extract only newly generated tokens
    generated_ids = outputs[0][input_ids.shape[1]:]

    # Decode response
    response = tokenizer.decode(
        generated_ids,
        skip_special_tokens=True
    )

    return response.strip()


<h3 style="color:red;">5) Falcon 7B</h3>


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(
    "tiiuae/falcon-7b-instruct"
)

model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b-instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)



2026-01-25 22:52:03.711990: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-25 22:52:03.726094: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-25 22:52:03.740293: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-25 22:52:03.744536: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-25 22:52:03.757508: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def llm_generate(tokenizer, model, prompt: str, max_tokens: int = 1024):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        use_cache=True,                    
        pad_token_id=tokenizer.eos_token_id
    )

    generated_ids = outputs[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


In [4]:
llm_generate(tokenizer, model, "pm of india")

'\n- (a) the Prime Minister of India;\n- (b) the Prime Minister of the State of India;\n- (c) the Prime Minister of the Union Territory of Ladakh;\n- (d) the Prime Minister of the Union Territory of Jammu and Kashmir;\n- (e) the Prime Minister of the Union Territory of Dadra and Nagar Haveli and Daman and Diu;\n- (f) the Prime Minister of the Union Territory of Lakshadweep;\n- (g) the Prime Minister of the Union Territory of Puducherry;\n- (h) the Prime Minister of the Union Territory of Dadra and Nagar Haveli and Daman and Diu;\n- (i) the Prime Minister of the Union Territory of Jammu and Kashmir;\n- (j) the Prime Minister of the Union Territory of Ladakh;\n- (k) the Prime Minister of the Union Territory of Puducherry;\n- (l) the Prime Minister of the Union Territory of Dadra and Nagar Haveli and Daman and Diu;\n- (m) the Prime Minister of the Union Territory of Jammu and Kashmir;\n- (n) the Prime Minister of the Union Territory of Ladakh;\n- (o) the Prime Minister of the Union Territ

<h3 style="color:blue;">JSON Formatiing</h3>


In [8]:
import json
import re
import csv

def clean_json_response(response: str) -> str:
    """
    Cleans the LLM response to ensure it's valid JSON.
    """
    # Remove markdown code fences
    cleaned = re.sub(r"```(?:json)?", "", response).strip()
    
    # Try to find the JSON array list
    start = cleaned.find("[")
    end = cleaned.rfind("]")
    
    if start != -1 and end != -1:
        cleaned = cleaned[start : end + 1]
        
    return cleaned

def generate_mcqs_from_chunks(
    chunks: list,
    tokenizer,
    model,
    csv_path: str,
    pdf_name: str
):
    """Generate MCQs and write to CSV immediately after each MCQ"""
    
    total_chunks = len(chunks)
    
    # Write CSV header first
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "pdf_name",
            "chunk_number",
            "total_chunks",
            "pages",
            "question",
            "option_A",
            "option_B",
            "option_C",
            "option_D",
            "correct_answer"
        ])
    
    total_mcqs = 0
    skipped_mcqs = 0
    
    for i, chunk in enumerate(chunks):
        chunk_number = chunk.get("chunk_number", i + 1)  # Use chunk_number from JSON or fallback
        print(f"\nProcessing chunk {chunk_number}/{total_chunks}...")

        try:
            prompt = build_mcq_prompt(
                text=chunk["text"]
            )

            response = llm_generate(tokenizer, model, prompt)

            # --- JSON SAFETY ---
            try:
                cleaned_response = clean_json_response(response)
                mcq_list = json.loads(cleaned_response)
            except json.JSONDecodeError as e:
                print(f"[WARN] JSON parse failed for chunk {chunk_number}")
                print(f"ERROR: {e}")
                print(f"RAW RESPONSE:\n{response}\n")
                print(f"Skipping chunk {chunk_number} and continuing...")
                continue

            # Ensure mcq_list is actually a list
            if not isinstance(mcq_list, list):
                print(f"[WARN] Response is not a list for chunk {chunk_number}")
                print(f"Skipping chunk {chunk_number} and continuing...")
                continue

            # Write each MCQ immediately to CSV
            with open(csv_path, "a", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                
                for mcq_idx, mcq in enumerate(mcq_list):
                    try:
                        # Validate MCQ structure
                        if not isinstance(mcq, dict):
                            print(f"  [SKIP] MCQ {mcq_idx+1} is not a dict, skipping...")
                            skipped_mcqs += 1
                            continue
                        
                        # Check required fields
                        if "question" not in mcq or "options" not in mcq or "answer" not in mcq:
                            print(f"  [SKIP] MCQ {mcq_idx+1} missing required fields, skipping...")
                            skipped_mcqs += 1
                            continue
                        
                        # Validate options
                        options = mcq["options"]
                        if not isinstance(options, dict) or not all(key in options for key in ["A", "B", "C", "D"]):
                            print(f"  [SKIP] MCQ {mcq_idx+1} has invalid options structure, skipping...")
                            skipped_mcqs += 1
                            continue
                        
                        # Validate answer key
                        answer_key = mcq["answer"]
                        if answer_key not in options:
                            print(f"  [SKIP] MCQ {mcq_idx+1} has invalid answer key '{answer_key}', skipping...")
                            skipped_mcqs += 1
                            continue
                        
                        # Format correct answer as "D) Option Text"
                        answer_text = options[answer_key]
                        formatted_answer = f"{answer_key}) {answer_text}"
                        
                        # Convert pages list to string
                        pages_str = ", ".join(map(str, chunk["pages"]))
                        
                        writer.writerow([
                            pdf_name,
                            chunk_number,
                            pages_str,
                            mcq["question"],
                            options["A"],
                            options["B"],
                            options["C"],
                            options["D"],
                            formatted_answer
                        ])
                        
                        total_mcqs += 1
                        # print(f"  ✓ Generated MCQ #{total_mcqs}")
                        
                    except Exception as e:
                        print(f"  [ERROR] Failed to process MCQ {mcq_idx+1}: {e}")
                        print(f"  Skipping this MCQ and continuing...")
                        skipped_mcqs += 1
                        continue
        
        except Exception as e:
            print(f"[ERROR] Failed to process chunk {chunk_number}: {e}")
            print(f"Skipping chunk {chunk_number} and continuing...")
            continue
    
    print(f"\n✅ Completed!")
    print(f"   Generated: {total_mcqs} MCQs")
    print(f"   Skipped: {skipped_mcqs} MCQs")
    print(f"   Processed: {total_chunks} chunks")
    return total_mcqs



#### Inference the model

In [10]:
# Step 1: Extract pages from the PDF
pdf_name = "notes/LLM_cs124_week7_2025.pdf"
pages = extract_pdf_pages(pdf_name)

# Step 2: Chunk (3 pages, 1-page overlap)
chunks = create_page_chunks(
    pages,
    chunk_size=3,
    overlap=1
)

# Step 3: Store to JSON
save_chunks(chunks, "results/without_image/metdata/falcon_pdf_chunks.json")

print(f"Extracted {len(pages)} pages")
print(f"Created {len(chunks)} chunks")


Extracted 114 pages
Created 57 chunks


In [11]:

chunks = load_chunks( "results/without_image/metdata/falcon_pdf_chunks.json")

# Generate MCQs and write to CSV on the go
total_mcqs = generate_mcqs_from_chunks(
    chunks,
    tokenizer,
    model,
    csv_path="results/without_image/falcon_mcq_dataset.csv",
    pdf_name=pdf_name
)



Processing chunk 1/57...


[WARN] JSON parse failed for chunk 1
ERROR: Expecting value: line 1 column 1 (char 0)
RAW RESPONSE:

The output should be a JSON array of JSON objects, where each object represents a question and its options. Each object should have a "question" key, which should be the question text, and a "options" key, which should be an array of JSON objects representing the options. Each object should have a "question" key, which should be the question text, and a "options" key, which should be an array of JSON objects representing the options. Each object should have a "question" key, which should be the question text, and a "options" key, which should be an array of JSON objects representing the options. Each object should have a "question" key, which should be the question text, and a "options" key, which should be an array of JSON objects representing the options. Each object should have a "question" key, which should be the question text, and a "options" key, which should be an array of JSON 

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Token indices sequence length is longer than the specified maximum sequence length for this model (2651 > 2048). Running this sequence through the model will result in indexing errors


[WARN] JSON parse failed for chunk 5
ERROR: Extra data: line 14 column 1 (char 233)
RAW RESPONSE:
The output of the previous example is:
[
  {
    "question": "Who wrote the book 'The Origin of Species'?",
    "options": {
      "A": "Charles Darwin",
      "B": "Charles Darwin",
      "C": "Charles Darwin",
      "D": "Charles Darwin"
    },
    "answer": "A"
  }
]

The output of the previous example is:
[
  {
    "question": "Who wrote the book 'The Origin of Species'?",
    "options": {
      "A": "Charles Darwin",
      "B": "Charles Darwin",
      "C": "Charles Darwin",
      "D": "Charles Darwin"
    },
    "answer": "A"
  }
]

The output of the previous example is:
[
  {
    "question": "Who wrote the book 'The Origin of Species'?",
    "options": {
      "A": "Charles Darwin",
      "B": "Charles Darwin",
      "C": "Charles Darwin",
      "D": "Charles Darwin"
    },
    "answer": "A"
  }
]

The output of the previous example is:
[
  {
    "question": "Who wrote the book 'The 