In [4]:
from dotenv import load_dotenv
import os
from notion_client import Client

from pprint import pprint

load_dotenv()

notion_token = os.getenv('NOTION_TOKEN')

# Initialize the Notion client
notion = Client(auth=notion_token)

Most of the documentation revolves around retrieving database objects. However, when I try to retrieve all the database objects in my person notion, I only get 2 databases and 0 pages.

In [17]:
# Function to search for databases in the workspace
def get_databases():
    response = notion.search(
        filter={
            "value": "database",
            "property": "object"
        }
    )
    return response

# Get the databases and print them
databases = get_databases()

# Extracting the database IDs
database_ids = [result['id'] for result in databases['results'] if result['object'] == 'database']

In [18]:
databases

{'object': 'list',
 'results': [{'object': 'database',
   'id': 'cc7af67e-48be-4387-8514-1ded52f9bd6b',
   'cover': None,
   'icon': {'type': 'emoji', 'emoji': '📑'},
   'created_time': '2023-02-13T20:40:00.000Z',
   'created_by': {'object': 'user',
    'id': '2d38b5ac-c257-418c-bbf1-b13265b09e92'},
   'last_edited_by': {'object': 'user',
    'id': '2d38b5ac-c257-418c-bbf1-b13265b09e92'},
   'last_edited_time': '2024-09-23T13:23:00.000Z',
   'title': [{'type': 'text',
     'text': {'content': 'ML Library', 'link': None},
     'annotations': {'bold': False,
      'italic': False,
      'strikethrough': False,
      'underline': False,
      'code': False,
      'color': 'default'},
     'plain_text': 'ML Library',
     'href': None}],
   'description': [],
   'is_inline': False,
   'properties': {'URL': {'id': 'IlqV',
     'name': 'URL',
     'type': 'url',
     'url': {}},
    'Tags': {'id': 'lo%3Cz',
     'name': 'Tags',
     'type': 'multi_select',
     'multi_select': {'options': [{'

Now we can get all the pages from each database.

In [6]:
# Function to query pages in a database
def query_database(database_id):
    response = notion.databases.query(database_id=database_id)
    return response

# Initialize an empty list to collect the page dictionaries
pages_db = []

# Loop over each database ID and query its pages
for database_id in database_ids:
    
    # Query the database for pages (assuming it returns a list of dicts)
    pages = query_database(database_id)
    
    # Append the pages (dicts) from the current database to all_pages list
    pages_db.extend(pages["results"])

print(f"Total pages: {len(pages_db)}")
pprint(pages_db)

Total pages: 102
[{'archived': False,
  'cover': None,
  'created_by': {'id': '2d38b5ac-c257-418c-bbf1-b13265b09e92',
                 'object': 'user'},
  'created_time': '2024-04-02T07:39:00.000Z',
  'icon': None,
  'id': '6731349e-a37b-4a75-b100-66facfb2ea09',
  'in_trash': False,
  'last_edited_by': {'id': '2d38b5ac-c257-418c-bbf1-b13265b09e92',
                     'object': 'user'},
  'last_edited_time': '2024-04-02T07:39:00.000Z',
  'object': 'page',
  'parent': {'database_id': 'cc7af67e-48be-4387-8514-1ded52f9bd6b',
             'type': 'database_id'},
  'properties': {'Date added': {'created_time': '2024-04-02T07:39:00.000Z',
                                'id': 'xmb%3D',
                                'type': 'created_time'},
                 'Level': {'id': 'u_%5El',
                           'select': {'color': 'purple',
                                      'id': '803fac75-9bdb-4b53-894e-1edf15f0fb39',
                                      'name': '🤓 🤓 🤓'},
            

This is great! But the output of each database will vary dramatically. This pages in this database simply have the name of a web page, its URL, and some tags.

The question is, how do we extract content from databases that have different structures?

We could start out by taking all the properties for each page.

In [7]:
def extract_properties(pages):
    # Initialize an empty list to store all properties
    all_properties = []

    # Loop through each page and extract its properties
    for page in pages:
        properties = page.get('properties', {})
        all_properties.append(properties)

    print(f"Extracted properties from {len(all_properties)} pages")
    return all_properties

properties_db = extract_properties(pages_db)
pprint(pages_db[0])  # Print the first page's properties as an example


Extracted properties from 102 pages
{'archived': False,
 'cover': None,
 'created_by': {'id': '2d38b5ac-c257-418c-bbf1-b13265b09e92', 'object': 'user'},
 'created_time': '2024-04-02T07:39:00.000Z',
 'icon': None,
 'id': '6731349e-a37b-4a75-b100-66facfb2ea09',
 'in_trash': False,
 'last_edited_by': {'id': '2d38b5ac-c257-418c-bbf1-b13265b09e92',
                    'object': 'user'},
 'last_edited_time': '2024-04-02T07:39:00.000Z',
 'object': 'page',
 'parent': {'database_id': 'cc7af67e-48be-4387-8514-1ded52f9bd6b',
            'type': 'database_id'},
 'properties': {'Date added': {'created_time': '2024-04-02T07:39:00.000Z',
                               'id': 'xmb%3D',
                               'type': 'created_time'},
                'Level': {'id': 'u_%5El',
                          'select': {'color': 'purple',
                                     'id': '803fac75-9bdb-4b53-894e-1edf15f0fb39',
                                     'name': '🤓 🤓 🤓'},
                          'typ

This is great! But only two of my many pages in Notion are databases. The rest are pages with content.
So how do we access the content of these pages? The code below only retrieves pages from a given database (with database id).

In [19]:
# Function to search for pages not in a database
def get_non_database_pages():
    response = notion.search(
        filter={
            "value": "page",  # Search for pages
            "property": "object"
        }
    )
    non_database_pages = []

    # Filter out pages that are not part of a database
    for page in response['results']:
        if page['parent']['type'] != 'database_id':  # Ensure it's not in a database
            non_database_pages.append(page)

    return non_database_pages

# Get the non-database pages
pages = get_non_database_pages()

# Extract properties from non-database pages
properties_pages = extract_properties(pages)

# Print the first page's properties as an example
pprint(properties_pages)


Extracted properties from 13 pages
[{'title': {'id': 'title',
            'title': [{'annotations': {'bold': False,
                                       'code': False,
                                       'color': 'default',
                                       'italic': False,
                                       'strikethrough': False,
                                       'underline': False},
                       'href': None,
                       'plain_text': 'Interviews',
                       'text': {'content': 'Interviews', 'link': None},
                       'type': 'text'}],
            'type': 'title'}},
 {'title': {'id': 'title',
            'title': [{'annotations': {'bold': False,
                                       'code': False,
                                       'color': 'default',
                                       'italic': False,
                                       'strikethrough': False,
                                       'underli

This code gives us the properties for each page (that's not part of a database). However, it doesn't give us the content of the page.

So now we need to get the content of each page by returning the blocks of the page.

In [10]:
# Function to get all blocks for a given page ID
def get_page_blocks(page_id):
    # Retrieve blocks on the page
    blocks = notion.blocks.children.list(block_id=page_id)
    return blocks['results']

page_blocks = []
# Loop over each page and get its blocks
for page in pages:
    # Get all blocks on the page
    blocks = get_page_blocks(page['id'])
    # Append blocks to the list
    page_blocks.extend(blocks)

We can do this and that's great. But parsing the blocks to extract the plain text sucks...

Maybe we have to specify all the different "types" that we want to extract plain text from?

In [12]:
page_blocks[0]

{'object': 'block',
 'id': 'd8e4bad2-19b6-4097-b2ca-e157b4137f1f',
 'parent': {'type': 'page_id',
  'page_id': 'f28b0596-f23f-4591-a958-b39db557bd7a'},
 'created_time': '2023-12-21T21:36:00.000Z',
 'last_edited_time': '2023-12-21T21:57:00.000Z',
 'created_by': {'object': 'user',
  'id': '2d38b5ac-c257-418c-bbf1-b13265b09e92'},
 'last_edited_by': {'object': 'user',
  'id': '2d38b5ac-c257-418c-bbf1-b13265b09e92'},
 'has_children': True,
 'archived': False,
 'in_trash': False,
 'type': 'child_page',
 'child_page': {'title': 'Climatiq'}}

In [13]:
# Function to get plain text from a rich text array
def get_plain_text_from_rich_text(rich_text):
    return "".join([t['plain_text'] for t in rich_text])

# Function to get plain text from different block types
def get_text_from_block(block):
    text = ""

    # Check if the block supports rich_text
    if 'rich_text' in block[block['type']]:
        text = get_plain_text_from_rich_text(block[block['type']]['rich_text'])
    else:
        # Handle non-rich-text block types
        block_type = block['type']
        if block_type == 'unsupported':
            text = "[Unsupported block type]"
        elif block_type == 'bookmark':
            text = block['bookmark']['url']
        elif block_type == 'child_database':
            text = block['child_database']['title']
        elif block_type == 'child_page':
            text = block['child_page']['title']
        # elif block_type in ['embed', 'video', 'file', 'image', 'pdf']:
        #     text = get_media_source_text(block)
        elif block_type == 'equation':
            text = block['equation']['expression']
        elif block_type == 'link_preview':
            text = block['link_preview']['url']
        elif block_type == 'synced_block':
            text = f"Synced with block ID: {block['synced_block'].get('synced_from', {}).get('block_id', 'Unknown')}"
        elif block_type == 'table':
            text = f"Table width: {block['table']['table_width']}"
        elif block_type == 'table_of_contents':
            text = f"ToC color: {block['table_of_contents']['color']}"
        elif block_type in ['breadcrumb', 'column_list', 'divider']:
            text = "No text available"
        else:
            text = "[Needs case added]"

    # Check if block has children
    if block['has_children']:
        text += " (Has children)"

In [15]:
output = [get_text_from_block(block) for block in page_blocks]
pprint(output)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:


    return f"{block['type']}: {text}"

# Function to retrieve block children (first-level)
async def retrieve_block_children(notion, block_id):
    print("Retrieving blocks (async)...")
    blocks = []

    # Iterate through all first-level blocks on the page
    async for block in iterate_paginated_api(notion.blocks.children.list, block_id=block_id):
        blocks.append(block)

    return blocks

# Function to print text from all blocks
def print_block_text(blocks):
    print("Displaying blocks:")
    for block in blocks:
        text = get_text_from_block(block)
        print(text)

# Main function
async def main():
    # Retrieve all block children from the page
    blocks = await retrieve_block_children(notion, page_id)
    # Print plain text for each block
    print_block_text(blocks)

# Run the main function
import asyncio
asyncio.run(main())