<a href="https://colab.research.google.com/github/michaelwnau/ai-academy-machine-learning-2023/blob/main/from_notion_to_neo4j.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install neo4j

In [None]:
import json
import logging
import os
import time
import traceback

import requests
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable

In [None]:
# Notion credentials
# Read secrets from the environment when available so real tokens never have
# to be pasted into (and committed with) the notebook. The placeholders stay
# as fallbacks and can still be edited in place.
token = os.environ.get('NOTION_TOKEN', '<your token>')
database_id = os.environ.get('NOTION_DATABASE_ID', '<your database id>')

# Headers sent with every Notion API request made in this notebook.
headers = {
    "Authorization": f"Bearer {token}",
    "Notion-Version": "2021-08-16",
}

# Neo4j credentials
uri = os.environ.get('NEO4J_URI', "bolt://<your neo4j uri>")
user = os.environ.get('NEO4J_USER', "<your neo4j user>")
password = os.environ.get('NEO4J_PASSWORD', "<your neo4j password>")



In [None]:
# Get a list of all pages from a Notion database
def get_list_of_pages(database_id):
    """Return every page of a Notion database, following pagination.

    Args:
        database_id: ID of the Notion database to query.

    Returns:
        list: The concatenated ``results`` entries of all response batches.

    Raises:
        requests.HTTPError: If Notion answers with an HTTP error status.
    """
    url = f'https://api.notion.com/v1/databases/{database_id}/query'

    pages = []
    payload = None  # the first request is sent without a body
    while True:
        r = requests.post(url, headers=headers, json=payload)
        # Surface the real HTTP error instead of a confusing KeyError on
        # 'results' when the response body is an error object.
        r.raise_for_status()
        result_dict = r.json()
        pages += result_dict['results']
        if not result_dict['has_more']:
            return pages
        # Ask for the next batch, starting where this one ended.
        payload = {"start_cursor": result_dict['next_cursor']}


In [None]:
# Fetch every page of the Notion database up front.
# It can take a while to get all the pages: results arrive in batches and
# each batch is a separate HTTP request.

list_result = get_list_of_pages(database_id)

In [None]:
# Notion has different names for block inside a page
def get_all_values_for_key(input: dict, key: object) -> list:
    """Collect every value stored under ``key`` anywhere in a nested structure.

    Dicts and lists are walked recursively, depth-first, in iteration order.
    A matching value is collected and then searched itself, so matches nested
    inside other matches are returned as well.

    Args:
        input: Nested structure of dicts and lists (e.g. a decoded JSON body).
        key: Dictionary key to look for.

    Returns:
        list: Matching values in the order they were encountered.
    """
    out = []

    def flatten(x):
        # isinstance (rather than an exact type check) so dict/list
        # subclasses such as OrderedDict are traversed too.
        if isinstance(x, dict):
            for k, v in x.items():
                if k == key:
                    out.append(v)
                flatten(v)
        elif isinstance(x, list):
            for a in x:
                flatten(a)

    flatten(input)
    return out

In Notion, you can link to another page in two ways: type @ to mention the other page, or create a new page as a subpage. To create both types of links in Neo4j, set `mention=True` and `child=True`.

In [None]:
# Get all mention links from blocks
def get_mention_id(page_id, mention=True, child=False):
    """Collect the IDs of pages linked from the blocks of a Notion page.

    Args:
        page_id: ID of the page (block) whose children are fetched.
        mention: If true, collect the IDs of pages referenced by page mentions.
        child: If true, collect the IDs of ``child_page`` blocks that report
            ``has_children``.

    Returns:
        dict: ``{'mention_id': [...], 'children_id': [...]}``. Both keys are
        always present; a list stays empty when its flag is false. If
        processing the response fails, the error is logged and whatever was
        collected so far is returned.
    """
    block_url = f'https://api.notion.com/v1/blocks/{page_id}/children'

    r = requests.get(block_url, headers=headers)
    ids = {'mention_id': [], 'children_id': []}
    payload = r.json()  # decode the body once instead of once per use
    all_mentions = get_all_values_for_key(payload, 'mention') if mention else []

    try:
        # The flags are honoured here; previously both kinds of links were
        # always collected regardless of the arguments.
        if child:
            for block in payload['results']:
                if block['has_children'] and block['type'] == 'child_page':
                    ids['children_id'].append(block['id'])

        for m in all_mentions:
            if m['type'] == 'page':
                ids['mention_id'].append(m['page']['id'])

    except Exception as e:
        logging.error(e)

    return ids


In [None]:
# Get note name by id
def get_note(note_id):
    """Fetch a Notion page and describe it as a note record.

    Args:
        note_id: ID of the Notion page to retrieve.

    Returns:
        dict: Keys ``name``, ``id``, ``url``, ``mention_id`` and
        ``children_id``. ``name`` and ``url`` fall back to ``'Unknown'`` when
        they cannot be read from the response.
    """
    page_url = f'https://api.notion.com/v1/pages/{note_id}'
    r = requests.get(page_url, headers=headers)

    # Decode the body once. An undecodable body is logged and treated as an
    # empty page, so both lookups below fall back to 'Unknown'.
    try:
        page = r.json()
    except Exception as e:
        logging.error(f'{e}, {note_id}')
        page = {}

    try:
        name = page['properties']['Name']['title'][0]['plain_text']
    # The "retrieve a page" response can store the title under a different
    # property key, so try that layout before giving up.
    except KeyError:
        try:
            name = page['properties']['title']['title'][0]['plain_text']
        except Exception:  # not a bare except: Ctrl-C must still interrupt
            name = 'Unknown'
    except Exception as e:
        logging.error(f'{e}, {note_id}')
        name = 'Unknown'

    try:
        note_url = page['url']
    except Exception:
        note_url = 'Unknown'

    mentions = get_mention_id(note_id, child=True)

    return {
        'name': name,
        'id': note_id,
        'url': note_url,
        'mention_id': mentions['mention_id'],
        'children_id': mentions['children_id'],
    }


In [None]:
# Gather info from Notion results

def gather_info(list_result):
    """Build a note record for every page returned by a database query.

    Args:
        list_result: Iterable of page objects (dicts) from the Notion API.

    Returns:
        list: One dict per successfully processed page with the keys ``id``,
        ``name``, ``url``, ``mention_id`` and ``children_id``. Pages that
        raise while being processed are logged and left out.
    """
    notes = []
    for page in list_result:
        try:
            note_id = page['id']
            title_parts = page['properties']['Name']['title']
            # An empty title list must not be mistaken for a rate-limit
            # failure (30 s pause + dropped page); use the same 'Unknown'
            # fallback name that get_note uses.
            note_name = title_parts[0]['plain_text'] if title_parts else 'Unknown'
            note_url = page['url']
            mentions = get_mention_id(note_id, mention=True, child=True)
            notes.append({'id': note_id,
                          'name': note_name,
                          'url': note_url,
                          'mention_id': mentions['mention_id'],
                          'children_id': mentions['children_id'],
                          })
        except Exception as exc:
            # Sometimes Notion rate-limits requests: log the page, back off,
            # then carry on with the next one.
            logging.error(f'{exc}, {page}')
            time.sleep(30)
    return notes


In [None]:
# Turn the raw Notion pages into note records (id, name, url, mentions, children).
notes = gather_info(list_result)

In [None]:
class Neo:
    """Small wrapper around a Neo4j driver for storing notes and their links.

    Notes are stored as ``(:Note {notion_id, name, url})`` nodes. Links are
    relationships between two notes whose type is chosen by the caller.
    """

    def __init__(self, uri, user, password):
        """Create a driver for ``uri``, authenticating with ``(user, password)``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    @staticmethod
    def _run_in_transaction(session, mode, work, *args):
        """Run ``work(tx, *args)`` in a managed ``'read'`` or ``'write'`` transaction.

        Prefers ``execute_read``/``execute_write`` and falls back to the older
        ``read_transaction``/``write_transaction`` names when the session
        object does not provide the newer ones.
        """
        runner = getattr(session, f"execute_{mode}", None)
        if runner is None:
            runner = getattr(session, f"{mode}_transaction")
        return runner(work, *args)

    def create_note(self, note):
        """Create a ``Note`` node from a note record.

        Args:
            note: Mapping with at least the keys ``id``, ``name`` and ``url``.

        Returns:
            list: Names of the created nodes.
        """
        with self.driver.session() as session:
            result = self._run_in_transaction(
                session, "write", self._create_and_return_note, note)
            for row in result:
                print("Created note: {row}".format(row=row))
            return result

    @staticmethod
    def _create_and_return_note(tx, note):
        """Transaction function: create the node and return the created names."""
        query = (
            "CREATE (n:Note {notion_id: $notion_id, name: $name, url: $url}) "
            "RETURN n.notion_id AS notion_id, n.name AS name"
        )
        result = tx.run(
            query, notion_id=note['id'], name=note['name'], url=note['url'])
        # Materialise the rows inside the transaction: the driver's result
        # must not be consumed after the transaction function has returned.
        return [row["name"] for row in result]

    def create_link(self, note1_id, note2_id, link_type):
        """Create a ``link_type`` relationship from note 1 to note 2.

        Args:
            note1_id: ``notion_id`` of the source note.
            note2_id: ``notion_id`` of the target note.
            link_type: Relationship type, inserted verbatim into the query.

        Returns:
            list: One ``{'n1': name, 'n2': name}`` dict per created link
            (empty when either note does not exist).
        """
        with self.driver.session() as session:
            result = self._run_in_transaction(
                session, "write", self._create_link, note1_id, note2_id, link_type)
            for row in result:
                print("Created link between: {n1}, {n2}".format(
                    n1=row['n1'], n2=row['n2']))
            return result

    @staticmethod
    def _create_link(tx, note1_id, note2_id, link_type):
        """Transaction function: create the relationship, return the linked names."""
        # Labels can't be parameterized in Cypher. See this issue for details: https://github.com/neo4j/neo4j/issues/4334
        # link_type is therefore concatenated into the query text and must
        # come from trusted code, never from user input.
        query = (
            "MATCH (n1:Note { notion_id: $note1_id }), (n2:Note { notion_id: $note2_id }) "
            "CREATE (n1)-[:" + link_type + "]->(n2) "
            "RETURN n1.name AS n1, n2.name AS n2"
        )
        try:
            result = tx.run(query, note1_id=note1_id, note2_id=note2_id)
            return [{"n1": row["n1"], "n2": row["n2"]} for row in result]
        except ServiceUnavailable as exception:
            logging.error("{query} raised an error: \n {exception}".format(
                query=query, exception=exception))
            raise

    def find_note(self, note_id):
        """Look up notes by ``notion_id``.

        Returns:
            list: Names of the matching notes; empty when none exists.
        """
        with self.driver.session() as session:
            result = self._run_in_transaction(
                session, "read", self._find_and_return_note, note_id)
            for row in result:
                print("Found note: {row}".format(row=row))
            return result

    @staticmethod
    def _find_and_return_note(tx, note_id):
        """Transaction function: return the names of notes with this ``notion_id``."""
        query = (
            "MATCH (n:Note) "
            "WHERE n.notion_id = $note_id "
            "RETURN n.notion_id AS notion_id, n.name AS name"
        )
        result = tx.run(query, note_id=note_id)
        return [row["name"] for row in result]

    def count_links(self, note1_id, note2_id, link_type):
        """Return how many ``link_type`` relationships lead from note 1 to note 2."""
        with self.driver.session() as session:
            return self._run_in_transaction(
                session, "read", self._count_links, note1_id, note2_id, link_type)

    @staticmethod
    def _count_links(tx, note1_id, note2_id, link_type):
        """Transaction function: count matching relationships."""
        # Labels can't be parameterized in Cypher. See this issue for details: https://github.com/neo4j/neo4j/issues/4334
        query = (
            "MATCH (n1:Note { notion_id: $note1_id })-[:" + link_type + "]->(n2:Note { notion_id: $note2_id }) "
            "RETURN count(*) AS count"
        )
        # link_type is already part of the query text, so it is not passed
        # again as a query parameter.
        result = tx.run(query, note1_id=note1_id, note2_id=note2_id)
        return result.single()['count']

In [None]:
# Find Note by id, else create and add links to mentions and children

def find_and_link(neo, id, parent_id, link_type):
    """Make sure note ``id`` is stored and linked from ``parent_id``.

    When the note is not in the graph yet, it is fetched from Notion, created,
    and its own mentions and children are processed recursively. Afterwards a
    ``link_type`` relationship from ``parent_id`` to ``id`` is created unless
    one already exists.
    """
    already_stored = neo.find_note(id)
    if not already_stored:
        note = get_note(id)
        neo.create_note(note)

        # Mentions first, then children.
        for list_key, relation in (('mention_id', 'MENTION'), ('children_id', 'CHILD')):
            for linked_id in note[list_key]:
                find_and_link(neo, linked_id, note['id'], relation)

    # Create link if not exists
    existing_links = neo.count_links(parent_id, id, link_type=link_type)
    if existing_links == 0:
        neo.create_link(parent_id, id, link_type=link_type)


In [None]:
# Add notes and links to database

neo = Neo(uri, user, password)

try:
    for n in notes:

        # Create the note itself unless an earlier step already stored it.
        if not neo.find_note(n['id']):
            neo.create_note(n)

        for m in n['mention_id']:
            find_and_link(neo, m, n['id'], 'MENTION')

        for c in n['children_id']:
            find_and_link(neo, c, n['id'], 'CHILD')

finally:
    # Always close the driver, even when an import step raises.
    neo.close()
