### Libraries

# Starter Code

In [1]:
# 📦 1. Imports and setup
import os
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableMap
from langchain_core.output_parsers import StrOutputParser
from tqdm import tqdm

from dotenv import load_dotenv

# Load API keys from .env file 
load_dotenv(override=True)

# Set API key (or use os.environ if you prefer)
#os.environ["OPENAI_API_KEY"] = "your-openai-key-here"

# 📂 2. Load all PDFs from a folder and subfolders
def load_all_pdfs(folder_path):
    all_docs = []
    pdf_paths = list(Path(folder_path).rglob("*.pdf"))  # searches subfolders too
    for path in tqdm(pdf_paths, desc="Loading PDFs from all folders"):
        loader = PyMuPDFLoader(str(path))
        loaded = loader.load()
        for doc in loaded:
            doc.metadata["source"] = path.name  # add filename as metadata
        all_docs.extend(loaded)
    return all_docs

docs = load_all_pdfs("documents/ToSort/Inspiration_folder")  # replace with your folder

# 📎 3. Split documents
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", " ", ""]
)
#split_docs = splitter.split_documents(docs)
split_docs = []
for doc in tqdm(docs, desc="Splitting docs"):
    split_docs.extend(splitter.split_documents([doc]))


# 🧠 4. Embed and store in FAISS
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(split_docs, embeddings)

# 🧲 5. Set up retriever
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 4})

# 💬 6. Build your teacher thinking partner prompt
prompt = PromptTemplate.from_template("""
You are an experienced curriculum designer and thinking partner for middle school science teachers.

Using the retrieved context below, help the teacher generate a complete science unit plan on the topic of {topic}, tailored for {student_context}.

Include the following sections:
- **Anchoring Phenomenon**
- **Driving Question**
- **Learning Objectives**
- **Key Science and Literacy Practices**
- **Lesson Summaries** (3–5)
- **Assessment Ideas**
- **Cultural/Local Relevance for {student_context}**

# Context:
{context}

# Unit Plan:
""")

# 🤖 7. Initialize LLM (you can change to gpt-4 or local model later)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# 🔗 8. Build the RAG chain
# Combine retriever with a "question" key only for embedding
rag_chain = (
    RunnableMap({
        "context": lambda x: retriever.invoke(x["topic"]),  # Use only topic for retrieval
        "topic": lambda x: x["topic"],
        "student_context": lambda x: x["student_context"]
    })
    | prompt
    | llm
    | StrOutputParser()
)

# 🧪 9. Try it out!
question = {
    "topic": "climate change",
    "student_context": "Black middle school students in Oakland"
}

response = rag_chain.invoke(question)
print(response)


Loading PDFs from all folders: 100%|██████████| 106/106 [00:32<00:00,  3.24it/s]
Splitting docs: 100%|██████████| 13845/13845 [00:01<00:00, 9375.88it/s] 


# Unit Plan: Climate Change and Its Impacts

## Anchoring Phenomenon:
The increase in extreme weather events, such as hurricanes and wildfires, and the impact on communities and ecosystems.

## Driving Question:
How does climate change affect our local community in Oakland and what can we do about it?

## Learning Objectives:
1. Understand the difference between climate change and global warming.
2. Identify the causes and effects of climate change on ecosystems and society.
3. Analyze the impact of climate change on marginalized communities, including Black communities in Oakland.
4. Explore solutions and actions that can mitigate the effects of climate change.

## Key Science and Literacy Practices:
1. Analyzing data on temperature changes and extreme weather events.
2. Constructing arguments supported by evidence on the causes and effects of climate change.
3. Engaging in discussions on climate justice and environmental racism.
4. Researching and presenting solutions to address clim

In [4]:
print(f"Total documents loaded: {len(docs)}")


Total documents loaded: 8161


Save File

In [None]:
with open("rag_output.txt", "w", encoding="utf-8") as f:
    f.write("QUESTION:\n")
    f.write(question + "\n\n")
    f.write("RESPONSE:\n")
    f.write(response)


Save Multiple Outputs (Append Mode)
If you're doing multiple runs and want to keep them all, use "a" (append mode):

In [19]:
with open("rag_log.txt", "a", encoding="utf-8") as f:
    f.write("===" * 20 + "\n")
    f.write("QUESTION:\n" + question + "\n\n")
    f.write("RESPONSE:\n" + response + "\n\n")


TypeError: can only concatenate str (not "dict") to str

Save with Metadata (as JSON)
If you're building for research or prototyping teacher tools, consider saving as a .json:

In [25]:
import json
from datetime import datetime

log = {
    "timestamp": datetime.now().isoformat(),
    "question": question,
    "response": response
}

with open("rag_outputs.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(log) + "\n")


New test
This code does two things:
1. **Diagnoses your inspiration corpus**
2. **Builds a retriever filtered by folder name**


In [11]:
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from tqdm.notebook import tqdm

# === Load All PDFs from Subfolders ===
def load_all_pdfs_with_metadata(folder_path):
    all_docs = []
    pdf_paths = list(Path(folder_path).rglob("*.pdf"))
    for path in tqdm(pdf_paths, desc="Loading PDFs from inspiration folders"):
        loader = PyMuPDFLoader(str(path))
        docs = loader.load()
        for doc in docs:
            doc.metadata["filename"] = path.name
            doc.metadata["source_folder"] = path.parent.name
        all_docs.extend(docs)
    return all_docs

docs = load_all_pdfs_with_metadata("documents/ToSort/Inspiration_folder")

# === Diagnostics ===
print(f"✅ Total documents loaded: {len(docs)}\\n")

# Preview a sample
print("📄 Sample document preview:")
print(docs[0].page_content[:500])

# Show folder distribution
from collections import Counter
folders = [doc.metadata.get("source_folder", "Unknown") for doc in docs]
folder_counts = Counter(folders)
print("\\n📁 Folder distribution in corpus:")
for folder, count in folder_counts.items():
    print(f"  - {folder}: {count} documents")

# === Filter by a folder (e.g., OpenSciEd) ===
target_folder = "OpenSciEd"  # <- Change this to the folder you want to test

filtered_docs = [doc for doc in docs if doc.metadata.get("source_folder") == target_folder]

print(f"\\n🔎 Using {len(filtered_docs)} documents from folder: '{target_folder}'")

# === Continue: Split, Embed, Store ===
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = splitter.split_documents(filtered_docs)

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(split_docs, embeddings)
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 4})

print("✅ Filtered retriever is ready.")

Loading PDFs from inspiration folders:   0%|          | 0/106 [00:00<?, ?it/s]

✅ Total documents loaded: 13845\n
📄 Sample document preview:
EQuIP RUBRIC FOR SCIENCE EVALUATION  
 
  
 
How Do Eggs Become 
Chickens or Other 
Living Things? 
DEVELOPER: University of Illinois, College of Education 
GRADE: 6–7 | DATE OF REVIEW: September 2020
\n📁 Folder distribution in corpus:
  - Inspiration_folder: 114 documents
  - CRP: 208 documents
  - NGSSAligned: 1118 documents
  - OpenSciEd: 7674 documents
  - Research: 327 documents
  - Chicken: 207 documents
  - Heartworm: 105 documents
  - Pdf: 61 documents
  - Pdfs: 66 documents
  - Student: 1037 documents
  - Teacher: 2601 documents
  - Curriculum Development: 184 documents
  - ECE: 143 documents
\n🔎 Using 7674 documents from folder: 'OpenSciEd'
✅ Filtered retriever is ready.


Test 1

In [20]:
data = {
    "topic": "climate change",
    "student_context": "Black middle school students in Oakland",
    "grade_level": "8th grade",
    "local_community_assets": "local parks, environmental justice organizations, and community gardens"
}

response = rag_chain.invoke(data)
print(response)


## Unit Plan: Climate Change

### Anchoring Phenomenon:
The anchoring phenomenon for this unit will be the impact of climate change on local parks and community gardens in Oakland. Students will explore how rising temperatures, changing precipitation patterns, and extreme weather events are affecting these natural spaces that they may visit or be familiar with.

### Driving Question:
How can we as a community address the impacts of climate change on our local environment and work towards a more sustainable future?

### Learning Objectives:
1. Understand the causes and effects of climate change, including how human activities contribute to global warming.
2. Analyze the impact of climate change on local ecosystems, such as parks and community gardens.
3. Explore solutions and actions that can be taken to mitigate and adapt to climate change at the local level.
4. Develop critical thinking skills to evaluate scientific evidence and communicate findings effectively.

### Lesson Sequence S

Test 2

In [21]:
data = {
    "topic": "plate tectonics and earthquakes",
    "student_context": "multilingual students in East San Jose",
    "grade_level": "7th grade"
}


In [22]:
response = rag_chain.invoke(data)
print(response)

## Anchoring Phenomenon:
The anchoring phenomenon for this unit could be a video or news article showing the aftermath of a recent earthquake in a region with active plate tectonics. This could include footage of buildings damaged, people affected, and the geological changes caused by the earthquake.

## Driving Question:
The driving question for this unit could be: "How do plate tectonics and earthquakes shape the Earth's surface and impact human communities?"

## Learning Objectives:
1. Understand the concept of plate tectonics and how it influences the Earth's surface.
2. Explain the relationship between plate boundaries and earthquakes.
3. Analyze the impact of earthquakes on human communities and the environment.
4. Develop models to demonstrate the movement of tectonic plates and the occurrence of earthquakes.

## Lesson Sequence Summary:
1. Lesson 1: Introduction to Plate Tectonics
   - Explore the concept of plate tectonics through interactive activities and discussions.
   - I

In [17]:
with open("generated_unit_plan.txt", "w", encoding="utf-8") as f:
    f.write(response)


In [16]:
# ✅ Structured RAG Chain with Path 2 (Dictionary Input)

from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap
from langchain_core.output_parsers import StrOutputParser

# 1. Define your structured prompt (no Jinja-style conditionals in LangChain PromptTemplate)
prompt = PromptTemplate.from_template("""
You are a thinking partner for a middle school science teacher designing a new unit plan.

Design a unit on the topic of: {topic}
For students: {student_context}
Grade Level: {grade_level}

Use the retrieved instructional materials to help generate:
1. Anchoring Phenomenon
2. Driving Question
3. Learning Objectives
4. Lesson Sequence Summary (3–5 lessons)
5. Assessment Ideas
6. Relevance to Students in {student_context}

Connections to local community assets (if applicable): {local_community_assets}

# Context:
{context}

# Unit Plan:
""")

# 2. Create a structured RAG chain
rag_chain = (
    RunnableMap({
        # Retriever needs a string: use topic only
        "context": lambda x: retriever.invoke(x["topic"]),
        # Pass these to fill the template
        "topic": lambda x: x["topic"],
        "student_context": lambda x: x["student_context"],
        "grade_level": lambda x: x.get("grade_level", "middle school"),
        "local_community_assets": lambda x: x.get("local_community_assets", "None specified")
    })
    | prompt
    | llm  # Your language model (ChatOpenAI or HuggingFacePipeline)
    | StrOutputParser()
)

# 3. Run it with structured input
data = {
    "topic": "climate change",
    "student_context": "Black middle school students in Oakland",
    "grade_level": "8th grade",
    "local_community_assets": "local parks, environmental justice organizations, and community gardens"
}

response = rag_chain.invoke(data)
print(response)


## Unit Plan: Climate Change

### Anchoring Phenomenon:
The anchoring phenomenon for this unit will be the impact of climate change on local parks and community gardens in Oakland. Students will explore how rising temperatures, changing precipitation patterns, and extreme weather events are affecting these natural spaces that they may visit or be familiar with.

### Driving Question:
How is climate change impacting our local community and what can we do to address it?

### Learning Objectives:
1. Understand the causes and effects of climate change, including how human activities contribute to global warming.
2. Analyze the impact of climate change on local ecosystems, such as parks and community gardens.
3. Evaluate potential solutions and actions that can mitigate the effects of climate change on our community.
4. Develop a deeper understanding of environmental justice issues related to climate change and its disproportionate impact on marginalized communities.

### Lesson Sequence Su

Step 1

In [24]:
# ✅ Structured RAG Chain – Step 1: Generate High-Level Unit Outline

from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap
from langchain_core.output_parsers import StrOutputParser

# 1. Define the UNIT OUTLINE prompt
unit_outline_prompt = PromptTemplate.from_template("""
You are a curriculum design thinking partner for a middle school science teacher.
Use the retrieved examples from high-quality units to help generate a general outline for a new unit.

Design a unit on the topic of: {topic}
Grade Level: {grade_level}
Student Context: {student_context}

The output should include:
1. Title of the Unit
2. Anchoring Phenomenon
3. Driving Question
4. Summary of the storyline arc (3–5 sentence description)
5. List of 3–5 Lesson Sets (just short 1–2 sentence summaries)
6. NGSS Performance Expectations (if known or retrievable)

# Inspiration Context:
{context}

# Draft Unit Outline:
""")

# 2. Create the structured RAG chain for outline generation
rag_chain = (
    RunnableMap({
        "context": lambda x: retriever.invoke(x["topic"]),
        "topic": lambda x: x["topic"],
        "student_context": lambda x: x["student_context"],
        "grade_level": lambda x: x.get("grade_level", "middle school")
    })
    | unit_outline_prompt
    | llm  # Your language model (ChatOpenAI or HuggingFacePipeline)
    | StrOutputParser()
)

# 3. Example Run
example_data = {
    "topic": "ecosystems and human impact",
    "student_context": "Black and Latinx middle school students in Los Angeles",
    "grade_level": "7th grade"
}

response = rag_chain.invoke(example_data)
print(response)


1. Title of the Unit: "Balancing Act: Ecosystems and Human Impact"

2. Anchoring Phenomenon: The disappearance of bees in a local community garden and its impact on plant pollination.

3. Driving Question: How do human activities affect the balance of ecosystems and what can we do to mitigate these impacts?

4. Summary of the storyline arc: Students will investigate the interconnectedness of ecosystems and human activities, exploring the effects of overpopulation, habitat destruction, pollution, and climate change on biodiversity. They will analyze real-world examples of human impact on ecosystems and collaborate to develop solutions for sustainable coexistence.

5. List of Lesson Sets:
   a. Lesson Set 1: Introduction to Ecosystems - Students will explore the components of ecosystems and the concept of biodiversity.
   b. Lesson Set 2: Human Impact on Ecosystems - Students will investigate the various ways in which human activities can disrupt ecosystems.
   c. Lesson Set 3: Case Stud

Step 2

In [26]:
# ✅ Structured RAG Chain – Step 1: Generate High-Level Unit Outline

from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap
from langchain_core.output_parsers import StrOutputParser

# 1. Define the UNIT OUTLINE prompt
unit_outline_prompt = PromptTemplate.from_template("""
You are a curriculum design thinking partner for a middle school science teacher.
Use the retrieved examples from high-quality units to help generate a general outline for a new unit.

Design a unit on the topic of: {topic}
Grade Level: {grade_level}
Student Context: {student_context}

The output should include:
1. Title of the Unit
2. Anchoring Phenomenon
3. Driving Question
4. Summary of the storyline arc (3–5 sentence description)
5. List of 3–5 Lesson Sets (just short 1–2 sentence summaries)
6. NGSS Performance Expectations (if known or retrievable)

# Inspiration Context:
{context}

# Draft Unit Outline:
""")

# 2. Define the LESSON EXPANSION prompt
lesson_expansion_prompt = PromptTemplate.from_template("""
You are continuing the collaborative curriculum design process for a middle school science teacher.
Expand the following lesson set description into a complete lesson sequence.

Student Context: {student_context}
Grade Level: {grade_level}

Lesson Set Description: {lesson_summary}

Use examples from high-quality instructional materials to generate:
1. Lesson Titles
2. Learning Objectives
3. Key Activities or Investigations (briefly described)
4. Instructional Strategies and Supports
5. Assessment Opportunities (formal or informal)

# Related Context:
{context}

# Expanded Lesson Set:
""")

# 3. Create the structured RAG chain for outline generation
rag_chain = (
    RunnableMap({
        "context": lambda x: retriever.invoke(x["topic"]),
        "topic": lambda x: x["topic"],
        "student_context": lambda x: x["student_context"],
        "grade_level": lambda x: x.get("grade_level", "middle school")
    })
    | unit_outline_prompt
    | llm  # Your language model (ChatOpenAI or HuggingFacePipeline)
    | StrOutputParser()
)

# 4. Example Run for Unit Outline
data_outline = {
    "topic": "ecosystems and human impact",
    "student_context": "Black and Latinx middle school students in Los Angeles",
    "grade_level": "7th grade"
}

response = rag_chain.invoke(data_outline)
print(response)

# 5. Example Run for Lesson Set Expansion (you can run this after generating the unit outline)
lesson_data = {
    "lesson_summary": "Students analyze how pollution and development affect food webs in local ecosystems.",
    "student_context": "Black and Latinx middle school students in Los Angeles",
    "grade_level": "7th grade",
    "context": retriever.invoke("ecosystems human impact food webs")
}

lesson_expansion_chain = (
    RunnableMap({
        "lesson_summary": lambda x: x["lesson_summary"],
        "student_context": lambda x: x["student_context"],
        "grade_level": lambda x: x["grade_level"],
        "context": lambda x: x["context"]
    })
    | lesson_expansion_prompt
    | llm
    | StrOutputParser()
)

lesson_response = lesson_expansion_chain.invoke(lesson_data)
print(lesson_response)


1. Title of the Unit: "Balancing Act: Ecosystems and Human Impact"

2. Anchoring Phenomenon: The disappearance of bees in a local community garden and its impact on plant pollination.

3. Driving Question: How do human activities affect the balance of ecosystems and what can we do to mitigate these impacts?

4. Summary of the storyline arc: Students will investigate the interconnectedness of ecosystems and human activities, exploring the effects of overpopulation, habitat destruction, pollution, and climate change on biodiversity. They will analyze real-world examples of human impact on ecosystems and collaborate to develop solutions for sustainable coexistence.

5. List of Lesson Sets:
   a. Lesson Set 1: Introduction to Ecosystems and Human Impact - Students will explore the concept of ecosystems and how human activities can disrupt the balance.
   b. Lesson Set 2: Biodiversity and Its Importance - Students will investigate the importance of biodiversity in maintaining ecosystem func