### Libraries

In [None]:
from dotenv import load_dotenv

# Load API keys from .env file 
load_dotenv(override=True)

# Step 1: Starter Code

New test
This code does two things:
1. **Diagnoses your inspiration corpus**
2. **Builds a retriever filtered by folder name**


In [1]:
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from tqdm.notebook import tqdm

# === Load All PDFs from Subfolders ===
def load_all_pdfs_with_metadata(folder_path):
    all_docs = []
    pdf_paths = list(Path(folder_path).rglob("*.pdf"))
    for path in tqdm(pdf_paths, desc="Loading PDFs from inspiration folders"):
        loader = PyMuPDFLoader(str(path))
        docs = loader.load()
        for doc in docs:
            doc.metadata["filename"] = path.name
            doc.metadata["source_folder"] = path.parent.name
        all_docs.extend(docs)
    return all_docs

docs = load_all_pdfs_with_metadata("documents/ToSort/Inspiration_folder")

# === Diagnostics ===
print(f"✅ Total documents loaded: {len(docs)}\\n")

# Preview a sample
print("📄 Sample document preview:")
print(docs[0].page_content[:500])

# Show folder distribution
from collections import Counter
folders = [doc.metadata.get("source_folder", "Unknown") for doc in docs]
folder_counts = Counter(folders)
print("\\n📁 Folder distribution in corpus:")
for folder, count in folder_counts.items():
    print(f"  - {folder}: {count} documents")

# === Filter by a folder (e.g., OpenSciEd) ===
target_folder = "OpenSciEd"  # <- Change this to the folder you want to test

filtered_docs = [doc for doc in docs if doc.metadata.get("source_folder") == target_folder]

print(f"\\n🔎 Using {len(filtered_docs)} documents from folder: '{target_folder}'")

# === Continue: Split, Embed, Store ===
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = splitter.split_documents(filtered_docs)

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(split_docs, embeddings)
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 4})

print("✅ Filtered retriever is ready.")

Loading PDFs from inspiration folders:   0%|          | 0/106 [00:00<?, ?it/s]

✅ Total documents loaded: 13845\n
📄 Sample document preview:
EQuIP RUBRIC FOR SCIENCE EVALUATION  
 
  
 
How Do Eggs Become 
Chickens or Other 
Living Things? 
DEVELOPER: University of Illinois, College of Education 
GRADE: 6–7 | DATE OF REVIEW: September 2020
\n📁 Folder distribution in corpus:
  - Inspiration_folder: 114 documents
  - CRP: 208 documents
  - NGSSAligned: 1118 documents
  - OpenSciEd: 7674 documents
  - Research: 327 documents
  - Chicken: 207 documents
  - Heartworm: 105 documents
  - Pdf: 61 documents
  - Pdfs: 66 documents
  - Student: 1037 documents
  - Teacher: 2601 documents
  - Curriculum Development: 184 documents
  - ECE: 143 documents
\n🔎 Using 7674 documents from folder: 'OpenSciEd'
✅ Filtered retriever is ready.


Save File

In [2]:
with open("rag_output.txt", "w", encoding="utf-8") as f:
    f.write("QUESTION:\n")
    f.write(question + "\n\n")
    f.write("RESPONSE:\n")
    f.write(response)


NameError: name 'question' is not defined

Save Multiple Outputs (Append Mode)
If you're doing multiple runs and want to keep them all, use "a" (append mode):

In [3]:
with open("rag_log.txt", "a", encoding="utf-8") as f:
    f.write("===" * 20 + "\n")
    f.write("QUESTION:\n" + question + "\n\n")
    f.write("RESPONSE:\n" + response + "\n\n")


NameError: name 'question' is not defined

Save with Metadata (as JSON)
If you're building for research or prototyping teacher tools, consider saving as a .json:

In [4]:
import json
from datetime import datetime

log = {
    "timestamp": datetime.now().isoformat(),
    "question": question,
    "response": response
}

with open("rag_outputs.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(log) + "\n")


NameError: name 'question' is not defined

Test 1

In [5]:
data = {
    "topic": "climate change",
    "student_context": "Black middle school students in Oakland",
    "grade_level": "8th grade",
    "local_community_assets": "local parks, environmental justice organizations, and community gardens"
}

response = rag_chain.invoke(data)
print(response)


NameError: name 'rag_chain' is not defined

Test 2

In [6]:
data = {
    "topic": "plate tectonics and earthquakes",
    "student_context": "multilingual students in East San Jose",
    "grade_level": "7th grade"
}


In [7]:
response = rag_chain.invoke(data)
print(response)

NameError: name 'rag_chain' is not defined

In [8]:
with open("generated_unit_plan.txt", "w", encoding="utf-8") as f:
    f.write(response)


NameError: name 'response' is not defined

In [16]:
# ✅ Structured RAG Chain with Path 2 (Dictionary Input)

from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap
from langchain_core.output_parsers import StrOutputParser

# 1. Define your structured prompt (no Jinja-style conditionals in LangChain PromptTemplate)
prompt = PromptTemplate.from_template("""
You are a thinking partner for a middle school science teacher designing a new unit plan.

Design a unit on the topic of: {topic}
For students: {student_context}
Grade Level: {grade_level}

Use the retrieved instructional materials to help generate:
1. Anchoring Phenomenon
2. Driving Question
3. Learning Objectives
4. Lesson Sequence Summary (3–5 lessons)
5. Assessment Ideas
6. Relevance to Students in {student_context}

Connections to local community assets (if applicable): {local_community_assets}

# Context:
{context}

# Unit Plan:
""")

# 2. Create a structured RAG chain
rag_chain = (
    RunnableMap({
        # Retriever needs a string: use topic only
        "context": lambda x: retriever.invoke(x["topic"]),
        # Pass these to fill the template
        "topic": lambda x: x["topic"],
        "student_context": lambda x: x["student_context"],
        "grade_level": lambda x: x.get("grade_level", "middle school"),
        "local_community_assets": lambda x: x.get("local_community_assets", "None specified")
    })
    | prompt
    | llm  # Your language model (ChatOpenAI or HuggingFacePipeline)
    | StrOutputParser()
)

# 3. Run it with structured input
data = {
    "topic": "climate change",
    "student_context": "Black middle school students in Oakland",
    "grade_level": "8th grade",
    "local_community_assets": "local parks, environmental justice organizations, and community gardens"
}

response = rag_chain.invoke(data)
print(response)


## Unit Plan: Climate Change

### Anchoring Phenomenon:
The anchoring phenomenon for this unit will be the impact of climate change on local parks and community gardens in Oakland. Students will explore how rising temperatures, changing precipitation patterns, and extreme weather events are affecting these natural spaces that they may visit or be familiar with.

### Driving Question:
How is climate change impacting our local community and what can we do to address it?

### Learning Objectives:
1. Understand the causes and effects of climate change, including how human activities contribute to global warming.
2. Analyze the impact of climate change on local ecosystems, such as parks and community gardens.
3. Evaluate potential solutions and actions that can mitigate the effects of climate change on our community.
4. Develop a deeper understanding of environmental justice issues related to climate change and its disproportionate impact on marginalized communities.

### Lesson Sequence Su

Step 1

In [10]:
# ✅ Structured RAG Chain – Step 1: Generate High-Level Unit Outline

from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap
from langchain_core.output_parsers import StrOutputParser

# 1. Define the UNIT OUTLINE prompt
unit_outline_prompt = PromptTemplate.from_template("""
You are a curriculum design thinking partner for a middle school science teacher.
Use the retrieved examples from high-quality units to help generate a general outline for a new unit.

Design a unit on the topic of: {topic}
Grade Level: {grade_level}
Student Context: {student_context}

The output should include:
1. Title of the Unit
2. Anchoring Phenomenon
3. Driving Question
4. Summary of the storyline arc (3–5 sentence description)
5. List of 3–5 Lesson Sets (just short 1–2 sentence summaries)
6. NGSS Performance Expectations (if known or retrievable)

# Inspiration Context:
{context}

# Draft Unit Outline:
""")

# 2. Create the structured RAG chain for outline generation
rag_chain = (
    RunnableMap({
        "context": lambda x: retriever.invoke(x["topic"]),
        "topic": lambda x: x["topic"],
        "student_context": lambda x: x["student_context"],
        "grade_level": lambda x: x.get("grade_level", "middle school")
    })
    | unit_outline_prompt
    | llm  # Your language model (ChatOpenAI or HuggingFacePipeline)
    | StrOutputParser()
)

# 3. Example Run
example_data = {
    "topic": "ecosystems and human impact",
    "student_context": "Black and Latinx middle school students in Los Angeles",
    "grade_level": "7th grade"
}

response = rag_chain.invoke(example_data)
print(response)


NameError: name 'llm' is not defined

Step 2

In [11]:
# ✅ Structured RAG Chain – Step 1: Generate High-Level Unit Outline

from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap
from langchain_core.output_parsers import StrOutputParser

# 1. Define the UNIT OUTLINE prompt
unit_outline_prompt = PromptTemplate.from_template("""
You are a curriculum design thinking partner for a middle school science teacher.
Use the retrieved examples from high-quality units to help generate a general outline for a new unit.

Design a unit on the topic of: {topic}
Grade Level: {grade_level}
Student Context: {student_context}

The output should include:
1. Title of the Unit
2. Anchoring Phenomenon
3. Driving Question
4. Summary of the storyline arc (3–5 sentence description)
5. List of 3–5 Lesson Sets (just short 1–2 sentence summaries)
6. NGSS Performance Expectations (if known or retrievable)

# Inspiration Context:
{context}

# Draft Unit Outline:
""")

# 2. Define the LESSON EXPANSION prompt
lesson_expansion_prompt = PromptTemplate.from_template("""
You are continuing the collaborative curriculum design process for a middle school science teacher.
Expand the following lesson set description into a complete lesson sequence.

Student Context: {student_context}
Grade Level: {grade_level}

Lesson Set Description: {lesson_summary}

Use examples from high-quality instructional materials to generate:
1. Lesson Titles
2. Learning Objectives
3. Key Activities or Investigations (briefly described)
4. Instructional Strategies and Supports
5. Assessment Opportunities (formal or informal)

# Related Context:
{context}

# Expanded Lesson Set:
""")

# 3. Create the structured RAG chain for outline generation
rag_chain = (
    RunnableMap({
        "context": lambda x: retriever.invoke(x["topic"]),
        "topic": lambda x: x["topic"],
        "student_context": lambda x: x["student_context"],
        "grade_level": lambda x: x.get("grade_level", "middle school")
    })
    | unit_outline_prompt
    | llm  = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) # Your language model (ChatOpenAI or HuggingFacePipeline)
    | StrOutputParser()
)

# 4. Example Run for Unit Outline
data_outline = {
    "topic": "ecosystems and human impact",
    "student_context": "Black and Latinx middle school students in Los Angeles",
    "grade_level": "7th grade"
}

response = rag_chain.invoke(data_outline)
print(response)

# 5. Example Run for Lesson Set Expansion (you can run this after generating the unit outline)
lesson_data = {
    "lesson_summary": "Students analyze how pollution and development affect food webs in local ecosystems.",
    "student_context": "Black and Latinx middle school students in Los Angeles",
    "grade_level": "7th grade",
    "context": retriever.invoke("ecosystems human impact food webs")
}

lesson_expansion_chain = (
    RunnableMap({
        "lesson_summary": lambda x: x["lesson_summary"],
        "student_context": lambda x: x["student_context"],
        "grade_level": lambda x: x["grade_level"],
        "context": lambda x: x["context"]
    })
    | lesson_expansion_prompt
    | llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) 
    | StrOutputParser()
)

lesson_response = lesson_expansion_chain.invoke(lesson_data)
print(lesson_response)


SyntaxError: cannot assign to expression here. Maybe you meant '==' instead of '='? (4216505224.py, line 55)