In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
import nest_asyncio

# Patch asyncio so the event loop can be re-entered (the notebook kernel
# already runs one).
nest_asyncio.apply()

from pathlib import Path
# Three directory levels above the notebook's working directory
# (presumably the repository root -- confirm against the checkout layout).
BASE_PATH = Path(".").absolute().parent.parent.parent

# Make packages that live under BASE_PATH importable from this notebook.
sys.path.append(str(BASE_PATH))

# Locations of the 1917 JPS Wikisource material used below.
DATA_PATH = BASE_PATH / "sources" / "1917jps-wikisource"
TEXT_PATH = DATA_PATH / "text"
CREDITS_PATH = DATA_PATH / "credits"

# The API key is read from a local file; surrounding whitespace (such as a
# trailing newline) is stripped.
with open(BASE_PATH / "opensiddur" / "private" / "api_key.txt", "r") as f:
    API_KEY = f.read().strip()


In [None]:
# Show the notebook's working directory (BASE_PATH is derived from it).
Path(".").absolute()

In [None]:
import operator
import time
from typing import Annotated, Literal, TypedDict, Optional
from pydantic import BaseModel, Field

from langgraph.graph import END, add_messages
from openai import APITimeoutError


# One page of the source text: its sequence number together with the text
# read from that page's file.
class Page(BaseModel):
    number: int = Field(description = "Page sequence number")
    content: str = Field(description = "Page content")

def get_page(page_number: str | int) -> Optional[Page]:
    """Load one page of the source text from ``TEXT_PATH``.

    Pages are stored one per file, named by the zero-padded four-digit page
    number (for example ``0007.txt``).

    Args:
        page_number: Page sequence number, as an int or a numeric string.

    Returns:
        A ``Page`` holding the number and the file's text, or ``None`` when
        no file exists for that page.

    Raises:
        ValueError: If ``page_number`` is a string that is not an integer.
    """
    page_num = int(page_number)
    page_file = TEXT_PATH / f"{page_num:04d}.txt"
    try:
        # Decode explicitly instead of relying on the platform's locale
        # encoding (the Wikisource text is assumed to be stored as UTF-8).
        content = page_file.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
    return Page(number=page_num, content=content)


# One entry of the book outline: a section title and the page on which that
# section starts.
class OutlineItem(BaseModel):
    section_title: str = Field(description="Title of the section")
    start_page: int = Field(description="Page number of the first page of the section")
    
def reduce_outline_items(
    old: list[OutlineItem],
    new: list[OutlineItem]) -> list[OutlineItem]:
    """Merge two outline fragments into one list ordered by start page.

    After sorting, every item is compared with its immediate successor. If
    both carry the same title, the earlier item is dropped, so the surviving
    entry is the one with the later start page. Items with equal titles that
    are not adjacent are left alone.
    """
    merged = old + new
    merged.sort(key=lambda entry: entry.start_page)

    # An item survives unless the item right after it repeats its title.
    survivors = [
        current
        for current, following in zip(merged, merged[1:])
        if current.section_title != following.section_title
    ]
    # The final item has no successor, so it is always kept.
    survivors.extend(merged[-1:])
    return survivors

# Wrapper model holding a complete list of outline items.
# NOTE(review): not referenced elsewhere in the visible code.
class Outline(BaseModel):
    outline: list[OutlineItem] = Field(description="Outline of the book")

# State dictionary shared by the graph nodes.
# NOTE(review): TypedDict does not interpret the `Field(...)` values assigned
# below; they presumably serve as documentation only -- confirm.
class AgentState(TypedDict):
    # Page the graph is currently looking at.
    current_page: Page = Field(description="Page currently being processed.")
   
    # Outline collected so far; annotated with reduce_outline_items, which is
    # the function used to combine existing items with newly returned ones.
    outline: Annotated[list[OutlineItem], reduce_outline_items] = Field(description="Outline of the book")

    # Routing value read by conditional_edge to choose the next node.
    next_tool: Literal["outline", "turn_page", "done"] = Field(description="The next tool to call")

from langchain.tools import tool



def turn_page_forward(state: AgentState) -> AgentState:
    """Advance the state to the page that follows ``state["current_page"]``.

    Returns a partial state update. When the next page exists it becomes
    ``current_page`` and ``next_tool`` is set to ``"outline"``. When it does
    not exist, only ``next_tool = "done"`` is returned, so ``current_page``
    keeps holding the last real ``Page`` rather than being overwritten with
    a non-``Page`` placeholder.
    """
    next_page = get_page(state["current_page"].number + 1)
    if next_page is None:
        # No further page file: stop without touching current_page.
        return {"next_tool": "done"}
    return {"current_page": next_page, "next_tool": "outline"}

def printable_state(state: AgentState) -> dict:
    """Turn the agent state into the plain values substituted into the prompt.

    The outline becomes one ``* title (start_page)`` line per item, each
    terminated by a newline; the current page contributes its number and
    its content.
    """
    bullet_lines = [
        f"* {entry.section_title} ({entry.start_page})\n"
        for entry in state["outline"]
    ]
    page = state["current_page"]
    return {
        "outline": "".join(bullet_lines),
        "current_page_number": page.number,
        "current_page": page.content,
    }

# List of tool callables. NOTE(review): not referenced again in the visible
# code -- confirm whether it is still needed.
outline_tools = [turn_page_forward]

from langchain_openai.chat_models.base import ChatOpenAI

from langchain.prompts import ChatPromptTemplate

# Structured-output schema requested from the LLM in outline(): the outline
# items to add for the current page (an empty list when nothing new starts).
class OutlineResponse(BaseModel):
    outline: list[OutlineItem] = Field(description="Next outline items")
    
def outline(state: AgentState) -> AgentState:
    """Ask the LLM whether a new major section starts on the current page.

    Builds a prompt from the current page and the outline collected so far,
    requests an ``OutlineResponse`` as structured output, and retries the
    call when the API times out.

    Args:
        state: Graph state; ``current_page`` and ``outline`` are read.

    Returns:
        A partial state update: ``outline`` holds the items returned by the
        model (possibly an empty list) and ``next_tool`` is ``"turn_page"``.

    Raises:
        APITimeoutError: If the call times out ``max_retries`` times in a
            row. Any other exception propagates immediately.
    """
    print("Reading page", state["current_page"].number)
    # NOTE(review): the Ketuvim list in the system prompt names Ezra and
    # Nehemiah twice and contains "Chronicles" as well as "1 Chronicles" /
    # "2 Chronicles". The text is left untouched because it is sent to the
    # model verbatim -- confirm whether the repetition is intended.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """
# Your role        
You are a superlative expert in understanding MediaWiki markup and you are expert in reading and understanding Jewish scripture and liturgical texts.
You will be given the content of pages from a book that was encoded into Wikisource as MediaWiki markup.
You will direct the generation of an outline of a book.
Each item in the outline is a major section of the book.

The outline has the following components:
# Outline components
Title page
Table of contents
Torah (The Law)
  Genesis
  Exodus
  Leviticus
  Numbers
  Deuteronomy
Nevi'im (The Prophets)
  Joshua
  Judges
  1 Samuel
  2 Samuel
  1 Kings
  2 Kings
  Isaiah
  Jeremiah
  Ezekiel
  The Twelve Minor Prophets
    Hosea
    Joel
    Amos
    Obadiah
    Jonah
    Micah
    Nahum
    Habakkuk
    Zephaniah
    Haggai
    Zechariah
    Malachi
Ketuvim (The Writings)
  Psalms
  Proverbs
  Job  
  Song of Songs
  Ruth
  Lamentations
  Ecclesiastes
  Esther
  Daniel
  Ezra
  Nehemiah
  Chronicles
  Ezra
  Nehemiah
  1 Chronicles
  2 Chronicles

# Instructions
I will provide you with the current page in MediaWiki markup.
The page numbers will be provided for you in the headings above the page content. Do not make up your own page numbers.
When a new major section begins, you will return the section title, and the start page of the section in the outline.
If the last recorded outline item continues on this page, return the outline as an empty list (outline=[]).
"""),
        ("user", """
# Current MediaWiki page = page number {current_page_number}:
{current_page}

# Outline so far:
{outline}
        """)
    ]).partial(**printable_state(state))
    llm = ChatOpenAI(
        model="Qwen/Qwen3-235B-A22B-Instruct-2507", #"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", # "google/gemma-3-27b-it",
        api_key=API_KEY,
        base_url="https://api.deepinfra.com/v1/openai",
        temperature=0.0,
        timeout=10
    )
    # Prompt -> model constrained to return an OutlineResponse.
    chain = prompt | llm.with_structured_output(OutlineResponse)

    max_retries = 5
    for attempt in range(1, max_retries + 1):
        try:
            # All template variables were already bound through .partial()
            # above; the "user" key does not appear to be referenced by the
            # template.
            response = chain.invoke({"user": "Go."})
            break
        except APITimeoutError:
            # Out of attempts: re-raise right away instead of announcing a
            # retry and sleeping first.
            if attempt == max_retries:
                raise
            print(f"Timeout error, retrying {attempt}/{max_retries}...")
            time.sleep(1.0)

    if response.outline:
        print(response)
    return {
        "outline": response.outline,
        "next_tool": "turn_page"
    }



from langgraph.graph import StateGraph

def conditional_edge(state: AgentState) -> str:
    """Return the ``next_tool`` value stored in the state, used for routing."""
    next_node = state["next_tool"]
    return next_node


# Two-node loop: "outline" inspects the current page, "turn_page" loads the
# next one.
workflow = StateGraph(AgentState)
workflow.add_node("outline", outline)
workflow.add_node("turn_page", turn_page_forward)
# After turning the page, route on state["next_tool"]: back to "outline" when
# a page was loaded, or to END when turn_page_forward reported "done".
workflow.add_conditional_edges(
    "turn_page",
    conditional_edge,
    {
        "outline": "outline",
        "done": END
    }
)
# "outline" always hands over to "turn_page"; the run starts at "outline".
workflow.add_edge("outline", "turn_page")
workflow.set_entry_point("outline")

app = workflow.compile()




In [None]:
from IPython.core.display import Image
from IPython.display import display
from langchain_core.runnables.graph import MermaidDrawMethod

# Render the compiled graph as a Mermaid PNG and show it inline.
display(Image(app.get_graph().draw_mermaid_png()))

In [None]:
# Run the graph starting at page 7 with an empty outline. Every page takes two
# node executions (outline + turn_page), hence the raised recursion limit.
# NOTE(review): get_page(7) returns None when the page file is missing, in
# which case outline() fails on `state["current_page"].number`.
response = app.invoke({"current_page": get_page(7), 
"outline": []}, config={"recursion_limit": 5000})

In [None]:
# Print the final outline, one "title start_page" pair per line.
# NOTE(review): this rebinds the global name `outline`, which until now
# referred to the outline() node function defined earlier in the notebook.
outline = response["outline"]
for item in outline:
    print(item.section_title, item.start_page)


In [None]:
# Spot-check: load page 23 and show it as the cell result.
get_page(23)

In [None]:
# NOTE(review): `embeddings` is not defined anywhere in the visible source;
# unless another cell defines it, evaluating this raises NameError.
embeddings