In [1]:
import os
from dotenv import load_dotenv

from notion_client import Client

load_dotenv()

True

In [2]:
NOTION_API_KEY = os.getenv("NOTION_API_KEY")

In [3]:
notion = Client(auth=NOTION_API_KEY)

In [4]:
def get_block_children(notion, block_id):
    children = []
    cursor = None
    while True:
        response = notion.blocks.children.list(block_id=block_id, start_cursor=cursor)
        children.extend(response["results"])
        cursor = response.get("next_cursor")
        if not cursor:
            break
    return children


In [5]:
def extract_text_from_block(block):
    block_type = block["type"]
    text = ""

    if block_type in ["paragraph", "heading_1", "heading_2", "heading_3",
                      "bulleted_list_item", "numbered_list_item", "quote", "callout", "to_do"]:
        for rt in block[block_type].get("rich_text", []):
            text += rt.get("plain_text", "")
    elif block_type == "child_page":
        text += f"[Child Page] {block['child_page']['title']}"
    elif block_type == "child_database":
        text += f"[Database] {block['child_database']['title']}"
    return text.strip()


In [10]:
def traverse_page(notion, block_id, depth=0):
    content = []
    blocks = get_block_children(notion, block_id)

    for block in blocks:
        prefix = "  " * depth
        text = extract_text_from_block(block)
        if text:
            content.append(f"{prefix}- {text}")

        # Recursively handle nested blocks
        if block.get("has_children"):
            content.extend(traverse_page(notion, block["id"], depth + 1))

        # Special case: child_database – query it
        if block["type"] == "child_database":
            db_id = block["id"]
            try:
                db_pages = get_pages_from_database(notion, db_id)
                for p in db_pages:
                    title = extract_page_title(p)
                    content.append(f"{prefix}  - 🗂️ Database Page: {title}")
                    content.extend(traverse_page(notion, p["id"], depth + 2))
            except Exception as e:
                print(str(e))
    
    return content


In [7]:
def get_pages_from_database(notion, database_id):
    pages = []
    cursor = None
    while True:
        response = notion.databases.query(database_id=database_id, start_cursor=cursor)
        pages.extend(response["results"])
        cursor = response.get("next_cursor")
        if not cursor:
            break
    return pages

def extract_page_title(page):
    props = page.get("properties", {})
    for val in props.values():
        if val["type"] == "title":
            return "".join([t["plain_text"] for t in val["title"]])
    return "[No Title]"


In [8]:
page = notion.pages.retrieve(page_id="23115fcdd071815cb64dcd9a3af70648")

# Extract the title from the 'Name' property (adjust for your schema)
title_prop = page["properties"]["Name"]  # replace 'Name' with your title field name
title = "".join([t["plain_text"] for t in title_prop["title"]])

print(f"Title: {title}")

KeyError: 'Name'

In [None]:
extract_page_title()

AttributeError: 'str' object has no attribute 'get'

In [11]:
page_id = "23115fcdd071815cb64dcd9a3af70648"

content = traverse_page(notion, page_id)
print("\n".join(content))


- 🤔 Why: How your data gets into LangChain. Doesn’t always work as expected, recommended for complex multi-doc workflows, LLM chaining.
- 🧠 When Is LangChain Useful for Loading?
  - LangChain’s loaders are fine when:
  - You want a quick prototype and don’t mind complexity.
  - You’re chaining loading + chunking + embedding into one pipeline.
  - You’re already using LangChain heavily.
- 📝 Important libraries:
- 👩‍💻 Code example:


In [40]:
notion.databases.query(database_id="22a15fcdd071808cab19cc42fad5a7a5")

APIResponseError: Could not find database with ID: 22a15fcd-d071-808c-ab19-cc42fad5a7a5. Make sure the relevant pages and databases are shared with your integration.

<h2>Full data collection pipeline

In [None]:
from collections import deque
import os
import logging
from notion_client import Client
from dotenv import load_dotenv

load_dotenv()
logger = logging.getLogger(__name__)

NOTION_API_KEY = os.getenv("NOTION_API_KEY")
notion = Client(auth=NOTION_API_KEY)

class NotionBlockExtractor:
    def __init__(self, client: Client):
        self.client = client

    def gather_nested_page_blocks(self, page_id: str):
        page = self.client.pages.retrieve(page_id=page_id)
        page_title = self._get_page_title(page)
        page_last_edited = page.get("last_edited_time")

        logger.info(f"Gathering nested data from page: {page_title}")
        return self._walk_blocks_bfs(
            root_page_id=page_id,
            root_page_title=page_title,
            root_page_last_edited=page_last_edited
        )

    def _walk_blocks_bfs(self, root_page_id, root_page_title, root_page_last_edited):
        pages_queue = deque()
        flat_blocks = []

        pages_queue.append({
            "page_id": root_page_id,
            "page_title": root_page_title,
            "parent_page_id": None,
            "parent_page_name": None,
            "page_last_edited": root_page_last_edited,
            "depth": 0
        })
        all_pages = []

        while pages_queue:
            current_page = pages_queue.popleft()
            all_pages.append(current_page)

            children = self._get_page_children(current_page["page_id"])
            parent_block_id = None

            for child in children:

                block_record = {
                    "parent_page_id": current_page["parent_page_id"],
                    "parent_block_id": parent_block_id,
                    "page_id": current_page["page_id"],
                    "page_title": current_page["page_title"],
                    "depth": current_page["depth"] + 1
                }

                processed_block = self._process_block_data(child)

                if processed_block:
                    block_record.update(processed_block)
                    flat_blocks.append(block_record)

                    if block_record["type"] == "child_page":
                        pages_queue.append({
                                "page_id": block_record["block_id"],
                                "page_title": child['child_page']['title'],
                                "parent_page_id": current_page["page_id"],
                                "parent_page_name": current_page["page_title"],
                                "page_last_edited": block_record["last_edited_time"],
                                "depth": current_page["depth"] + 1
                            }
                        )

                    elif block_record["type"] == "child_database":
                        print("Skipping DB for now...")
                    

                    parent_block_id = block_record["block_id"]

        return flat_blocks

    def _get_page_children(self, page_id):
        children = []
        cursor = None
        while True:
            response = self.client.blocks.children.list(block_id=page_id, start_cursor=cursor)
            children.extend(response["results"])
            cursor = response.get("next_cursor")
            if not cursor:
                break
        return children

    def _process_block_data(self, block):
        block_type = block["type"]
        block_id = block["id"]
        content = ""

        if block_type in block and "rich_text" in block[block_type]:
            content = "".join(rt.get("plain_text", "") for rt in block[block_type]["rich_text"])
            if not content:
                return

        elif block_type == "child_page":
            content = f"[Child Page] {block['child_page']['title']}"
        elif block_type == "child_database":
            try:
                db_info = self.client.databases.retrieve(block_id)
                db_title = "".join(t.get("plain_text", "") for t in db_info.get("title", []))
                content = f"[Child Database] {db_title}"
            except Exception as e:
                content = "[Child Database: (Error fetching name)]"
                logger.warning(f"Failed to fetch child database title for id {block_id}: {e}")

        block_data = {
            "block_id": block_id,
            "type": block_type,
            "content": content.strip(),
            "last_edited_time": block.get("last_edited_time", None),
        }

        return block_data

    def _get_page_title(self, page):
        for val in page.get("properties", {}).values():
            if val.get("type") == "title":
                return "".join(t["plain_text"] for t in val["title"])
        return "[Untitled]"


In [32]:
extractor = NotionBlockExtractor(notion)

blocks = extractor.gather_nested_page_blocks("21615fcdd071807889b4c29a7f8a54b6")

Skipping DB for now...


Failed to fetch child database title for id 22a15fcd-d071-808c-ab19-cc42fad5a7a5: Could not find database with ID: 22a15fcd-d071-808c-ab19-cc42fad5a7a5. Make sure the relevant pages and databases are shared with your integration.


Skipping DB for now...
Skipping DB for now...


In [34]:
for block in blocks:
    print(block)

{'parent_page_id': None, 'parent_block_id': None, 'page_id': '21615fcdd071807889b4c29a7f8a54b6', 'page_title': 'Learning', 'depth': 1, 'block_id': '22a15fcd-d071-80ef-92c3-c654ed7bcf80', 'type': 'child_page', 'content': '[Child Page] Agenda', 'last_edited_time': '2025-07-17T10:39:00.000Z'}
{'parent_page_id': None, 'parent_block_id': '22a15fcd-d071-80ef-92c3-c654ed7bcf80', 'page_id': '21615fcdd071807889b4c29a7f8a54b6', 'page_title': 'Learning', 'depth': 1, 'block_id': '21615fcd-d071-80a0-8264-eb4fb334c186', 'type': 'child_database', 'content': '[Child Database] Materials', 'last_edited_time': '2025-07-16T17:28:00.000Z'}
{'parent_page_id': None, 'parent_block_id': '21615fcd-d071-80a0-8264-eb4fb334c186', 'page_id': '21615fcdd071807889b4c29a7f8a54b6', 'page_title': 'Learning', 'depth': 1, 'block_id': '22a15fcd-d071-804b-b5f0-c539210b4c44', 'type': 'child_page', 'content': '[Child Page] Notes', 'last_edited_time': '2025-07-16T12:27:00.000Z'}
{'parent_page_id': None, 'parent_block_id': '22a1