In [1]:
%pip install --quiet pandas neo4j-rust-ext

Note: you may need to restart the kernel to use updated packages.


#### Troubleshooting connections to a local Neo4j instance
If the driver cannot connect to a local database, see the Neo4j knowledge-base article on resolving Python Bolt driver connection errors (one option it describes is connecting with `bolt://127.0.0.1:7687`): https://neo4j.com/developer/kb/resolve-python-bolt-driver-error-connection/

In [1]:
import os
import time

import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Connection settings. Each value may be overridden by an environment variable
# of the same name, so credentials do not have to be typed into (and saved with)
# the notebook. The fallbacks are the notebook's original defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://localhost")  # or neo4j+s://xxxx.databases.neo4j.io
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")  # your password
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

# Create a Neo4j driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [3]:
# Quick connectivity check: list the databases the server reports.
driver.execute_query("show databases;")

EagerResult(records=[<Record name='neo4j' type='standard' aliases=[] access='read-write' address='localhost:7687' role='primary' writer=True requestedStatus='online' currentStatus='online' statusMessage='' default=True home=True constituents=[]>, <Record name='system' type='system' aliases=[] access='read-write' address='localhost:7687' role='primary' writer=True requestedStatus='online' currentStatus='online' statusMessage='' default=False home=False constituents=[]>], summary=<neo4j._work.summary.ResultSummary object at 0x000001E791A51F90>, keys=['name', 'type', 'aliases', 'access', 'address', 'role', 'writer', 'requestedStatus', 'currentStatus', 'statusMessage', 'default', 'home', 'constituents'])

In [9]:
import os
# Capture and display the current working directory. ROOT_DIR is not referenced
# by any other cell visible in this notebook; it is shown for orientation only.
ROOT_DIR = os.getcwd()
ROOT_DIR

'D:\\repos\\graphrag'

In [10]:
# Relative path to the folder holding the GraphRAG parquet artifacts that the
# cells below read. The "20241013-183801" segment looks like a run timestamp,
# so this presumably must be changed to match your own indexing run.
GRAPHRAG_FOLDER = "graphrag_index/output/20241013-183801/artifacts"

# Batched Import
The batched import function takes a Cypher insert statement and a dataframe to import. The statement must refer to each row through the variable `value`. By default, the function sends 1,000 rows at a time to the database as a query parameter.

In [5]:
def batched_import(statement, df, batch_size=1000):
    """
    Import a dataframe into Neo4j using a batched approach.

    Each batch is passed as the ``$rows`` query parameter and unwound into the
    variable ``value``, so ``statement`` must refer to the current row as
    ``value``. Queries run through the module-level ``driver`` against
    ``NEO4J_DATABASE``. The update counters of every batch and the total
    elapsed time are printed.

    Parameters
    ----------
    statement : str
        Cypher fragment appended after ``UNWIND $rows AS value``.
    df : pandas.DataFrame
        Rows to import; each row is sent as a dict keyed by column name.
    batch_size : int, default 1000
        Maximum number of rows sent per query. Must be at least 1.

    Returns
    -------
    int
        The number of rows in ``df``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step makes range() empty, which would silently import
        # nothing while still reporting every row as processed.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total = len(df)
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    started = time.perf_counter()
    for start in range(0, total, batch_size):
        # iloc slicing clamps at the end of the frame, so the last batch may be shorter.
        batch = df.iloc[start : start + batch_size]
        result = driver.execute_query(
            "UNWIND $rows AS value " + statement,
            rows=batch.to_dict("records"),
            database_=NEO4J_DATABASE,
        )
        print(result.summary.counters)
    print(f"{total} rows in {time.perf_counter() - started} s.")
    return total

# Indexes and Constraints
Indexes in Neo4j are only used to find the starting points for graph queries, e.g. quickly finding two nodes to connect. Constraints exist to avoid duplicates; we create them mostly on the ids of the entity types.

We use some Types as markers with two underscores before and after to distinguish them from the actual entity types.

The default relationship type here is RELATED but we could also infer a real relationship-type from the description or the types of the start and end-nodes.

```
__Entity__
__Document__
__Chunk__
__Community__
__Covariate__

```

In [6]:
# create constraints, idempotent operation
#
# Every constraint must have its own name: `create constraint <name> if not exists`
# does nothing when a constraint with that name already exists, so reusing a name
# silently skips the later statement. (A database that already holds a constraint
# under a reused name keeps it; drop it manually if the skipped one is needed.)

statements = """
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint community_id if not exists for (c:__Community__) require c.community is unique;
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.name is unique;
create constraint covariate_title if not exists for (e:__Covariate__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;
""".split(";")

for statement in statements:
    # Splitting on ";" leaves a whitespace-only tail after the last statement; skip it.
    if statement.strip():
        print(statement)
        # Target the same database that batched_import writes to.
        driver.execute_query(statement, database_=NEO4J_DATABASE)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


# Import Process
### Importing the Documents
We load the parquet file for the documents, create a node for each document id, and add the title property. We don't need to store text_unit_ids because we can create the relationships instead, and the text content is also contained in the chunks.

In [11]:
# Documents table: only the id and title columns are loaded.
documents_path = f"{GRAPHRAG_FOLDER}/create_final_documents.parquet"
doc_df = pd.read_parquet(documents_path, columns=["id", "title"])
doc_df.head(2)

Unnamed: 0,id,title
0,a8e86e4eb56075ab3a7ef61716a89e63,book.txt


In [12]:
# Import documents
# For every row, MERGE finds or creates the __Document__ node with that id,
# then the row's `title` is copied onto the node.
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

batched_import(statement, doc_df)

{'_contains_updates': True, 'labels_added': 1, 'nodes_created': 1, 'properties_set': 2}
1 rows in 0.9639983177185059 s.


1

# Loading Text Units
We load the text units, create a node per id and set the text and number of tokens. Then we connect them to the documents that we created before.

In [14]:
# Text units (chunks): restrict the read to the columns listed here.
text_unit_columns = ["id", "text", "n_tokens", "document_ids"]
text_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_text_units.parquet", columns=text_unit_columns
)
text_df.head(5)

Unnamed: 0,id,text,n_tokens,document_ids
0,3a765e78bbe9953c33030edec77489ab,Project Gutenberg eBook of The Great Gatsby\n ...,1200,[a8e86e4eb56075ab3a7ef61716a89e63]
1,8185e9ebc4df48f58fc509947870d49d,"5, just a quarter of\na century after my fathe...",1200,[a8e86e4eb56075ab3a7ef61716a89e63]
2,2611fae4b169a75b2f59911eabd6b01c,dinner with the Tom\nBuchanans. Daisy was my ...,1200,[a8e86e4eb56075ab3a7ef61716a89e63]
3,229aa7c469dadb63b7280786e10d588c,"end of the divan, completely motionless, and ...",1200,[a8e86e4eb56075ab3a7ef61716a89e63]
4,3fafa1a846a8fa6c642a4591309a0329,"enderly, languidly, their hands set lightly on...",1200,[a8e86e4eb56075ab3a7ef61716a89e63]


In [15]:
# Import text units as __Chunk__ nodes.
# Per row: find or create the chunk by id and set its `text` and `n_tokens`.
# Then, for each entry of `document_ids`, look up the already imported
# __Document__ and merge a (chunk)-[:PART_OF]->(document) relationship.
# Document ids that match no node produce no relationship (MATCH yields nothing).
statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

{'_contains_updates': True, 'labels_added': 64, 'relationships_created': 64, 'nodes_created': 64, 'properties_set': 192}
64 rows in 0.5630002021789551 s.


64

# Loading Nodes
For the nodes we store id, name, description, embedding (if available), human readable id.

In [16]:
# Entities table: read exactly the columns the entity import statement refers to.
entity_columns = [
    "name",
    "type",
    "description",
    "human_readable_id",
    "id",
    "description_embedding",
    "text_unit_ids",
]
entity_path = f"{GRAPHRAG_FOLDER}/create_final_entities.parquet"
entity_df = pd.read_parquet(entity_path, columns=entity_columns)
entity_df.head(5)

Unnamed: 0,name,type,description,human_readable_id,id,description_embedding,text_unit_ids
0,"""PROJECT GUTENBERG""","""ORGANIZATION""",Project Gutenberg is an organization founded b...,0,b45241d70f0e43fca764df95b2b81f77,"[-0.02540149912238121, 0.02343343384563923, 0....","[3a765e78bbe9953c33030edec77489ab, 4d72b9e64b1..."
1,"""THE GREAT GATSBY""","""EVENT""","""The Great Gatsby"" is a novel authored by F. S...",1,4119fd06010c494caa07f439b333f4c5,"[-0.007354388944804668, 0.019090116024017334, ...","[3a765e78bbe9953c33030edec77489ab, 4fa7ef9cb03..."
2,"""F. SCOTT FITZGERALD""","""PERSON""",F. Scott Fitzgerald is the renowned author of ...,2,d3835bf3dda84ead99deadbeac5d0d7d,"[-0.0035111212637275457, 0.015590156428515911,...","[3a765e78bbe9953c33030edec77489ab, 4fa7ef9cb03..."
3,"""UNITED STATES""","""GEO""",The United States is highlighted as the primar...,3,077d2820ae1845bcbb1803379a3d1eae,"[-0.006668029818683863, 0.013017659075558186, ...","[3a765e78bbe9953c33030edec77489ab, df3c422bf52..."
4,"""ALEX CABAL""","""PERSON""","""Alex Cabal is credited with producing the eBo...",4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,"[-0.007399603258818388, 0.004413312766700983, ...",[3a765e78bbe9953c33030edec77489ab]


In [17]:
# Import entities as __Entity__ nodes.
# Per row:
#   1. find or create the entity by id; set human_readable_id, description and
#      name (with the double quotes stripped from the raw name);
#   2. store description_embedding through db.create.setNodeVectorProperty;
#   3. add an extra label derived from `type` (quotes stripped, converted with
#      apoc.text.upperCamelCase) unless `type` is null or empty;
#   4. for each id in text_unit_ids, match the __Chunk__ and merge
#      (chunk)-[:HAS_ENTITY]->(entity).
# The apoc.* calls mean the APOC plugin has to be available on the server.
entity_statement = """
MERGE (e:__Entity__ {id:value.id})
SET e += value {.human_readable_id, .description, name:replace(value.name,'"','')}
WITH e, value
CALL db.create.setNodeVectorProperty(e, "description_embedding", value.description_embedding)
CALL apoc.create.addLabels(e, case when coalesce(value.type,"") = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id:text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

batched_import(entity_statement, entity_df)

{'_contains_updates': True, 'labels_added': 341, 'relationships_created': 994, 'nodes_created': 341, 'properties_set': 1364}
341 rows in 5.973997354507446 s.


341

# Import Relationships
For the relationships we find the source and target node by name, using the base __Entity__ type. After creating the RELATED relationships, we set the description as attribute.

In [18]:
# Relationships table: load the endpoint names plus the properties stored on each relationship.
relationship_columns = [
    "source",
    "target",
    "id",
    "rank",
    "weight",
    "human_readable_id",
    "description",
    "text_unit_ids",
]
relationships_path = f"{GRAPHRAG_FOLDER}/create_final_relationships.parquet"
rel_df = pd.read_parquet(relationships_path, columns=relationship_columns)
rel_df.head(5)

Unnamed: 0,source,target,id,rank,weight,human_readable_id,description,text_unit_ids
0,"""PROJECT GUTENBERG""","""THE GREAT GATSBY""",2fb66f9a0de6406d83b61742a3b52cd6,17,1.0,0,"""Project Gutenberg has made 'The Great Gatsby'...",[3a765e78bbe9953c33030edec77489ab]
1,"""PROJECT GUTENBERG""","""PROJECT GUTENBERG AUSTRALIA""",b0e6cfd979ea48b997019b059999d3c2,10,1.0,1,"""Project Gutenberg Australia operates as a par...",[3a765e78bbe9953c33030edec77489ab]
2,"""PROJECT GUTENBERG""","""THE PROJECT GUTENBERG LITERARY ARCHIVE FOUNDA...",ef00ec3a324f4f5986141401002af3f6,10,1.0,2,"""Project Gutenberg's electronic works are owne...",[df3c422bf520d888d06d2ebc04c77453]
3,"""PROJECT GUTENBERG""","""UNITED STATES""",a542fd7aed7341468028928937ea2983,10,1.0,3,"""Project Gutenberg's activities, particularly ...",[df3c422bf520d888d06d2ebc04c77453]
4,"""PROJECT GUTENBERG""","""PROJECT GUTENBERG LITERARY ARCHIVE FOUNDATION""",1c5e296a5ac541c1b5cac4357537c22d,10,2.0,4,The Project Gutenberg Literary Archive Foundat...,"[4d72b9e64b16c757a763b3bd66422a1f, 903cdd5308f..."


In [19]:
# Import relationships between entities.
# Source and target are matched by entity name, with the double quotes stripped
# the same way the entity import stores names. Rows whose source or target
# matches no entity create nothing.
# NOTE: the inline Cypher comment says merging on id is unnecessary, but the
# statement does merge the RELATED relationship on `id`.
# The remaining row values (rank, weight, human_readable_id, description,
# text_unit_ids) are then set as relationship properties.
rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

{'_contains_updates': True, 'relationships_created': 307, 'properties_set': 1842}
307 rows in 0.6770002841949463 s.


307

# Importing Communities
For communities we import their id, title, level. We connect the __Community__ nodes to the start and end nodes of the relationships they refer to.

Connecting them to the chunks they originate from is optional, as the entities are already connected to the chunks.

In [29]:
# Communities table: id, level and title plus the id lists that tie a community to chunks and relationships.
community_columns = ["id", "level", "title", "text_unit_ids", "relationship_ids"]
communities_path = f"{GRAPHRAG_FOLDER}/create_final_communities.parquet"
community_df = pd.read_parquet(communities_path, columns=community_columns)

community_df.head(12)

Unnamed: 0,id,level,title,text_unit_ids,relationship_ids
0,3,0,Community 3,"[0234fe5533ca5615cc9ea8b3183ca34e,054adb02dc5a...","[423b72bbd56f4caa98f3328202c1c3c9, 5c7ef01f46a..."
1,6,0,Community 6,"[054adb02dc5a9c32c7ef0a93ffc89c8d,06b9fca7b967...","[a24e9df02e1b4b43bf6324b039e28285, ab3a5a67132..."
2,1,0,Community 1,"[0e8dde4636dc8ed86a68f7cddc60f81c,21e07badfe13...","[93e1d19f9bfa4c6b8962d56d10ea9483, 8046335ba70..."
3,5,0,Community 5,"[054adb02dc5a9c32c7ef0a93ffc89c8d,06b9fca7b967...","[fd7d94fbab084bc380480abeef6bfade, cfb915c95ca..."
4,4,0,Community 4,"[06b9fca7b9670a65a433359dfa44d92c,0e8dde4636dc...","[fd139ac75b0e4777ab67b7423eaaa37f, a701c349eb7..."
5,7,0,Community 7,"[054adb02dc5a9c32c7ef0a93ffc89c8d,06b9fca7b967...","[3cf0ab4cf14e47ddabd49d500a3dc488, a39b72f8921..."
6,2,0,Community 2,"[0234fe5533ca5615cc9ea8b3183ca34e,054adb02dc5a...","[dc08f6d7398b4b798a3bdccf508a2ad4, 1c7fd5af8d8..."
7,0,0,Community 0,"[2891c0d9362733d2b2f0d9621ea3e4f5,9f4af24426ba...","[e96d3475d43b42a781b297ae7e650afe, d0bfb473fdc..."
8,8,0,Community 8,"[4fa7ef9cb033a4220fe2ca8ea404c361,dda932ffa077...","[d7db38bb599c42cab7066f3fdd282282, efd87a59d01..."
9,17,1,Community 17,"[0234fe5533ca5615cc9ea8b3183ca34e,054adb02dc5a...","[423b72bbd56f4caa98f3328202c1c3c9, 5c7ef01f46a..."


In [30]:
# Row count of the communities table, for comparison with the import summary.
len(community_df)

37

In [21]:
# Import communities as __Community__ nodes.
# The node is found or created by its `community` property (taken from the
# row's `id`), and level and title are set on it.
# The /* ... */ section is a Cypher block comment: linking communities to
# their chunks via HAS_CHUNK is disabled.
# For each id in relationship_ids, the RELATED relationship with that id is
# matched and both of its endpoint entities are merged to the community with
# IN_COMMUNITY.
# "RETURn" runs as written because Cypher keywords are case-insensitive.
statement = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURn count(distinct c) as createdCommunities
"""

batched_import(statement, community_df)

{'_contains_updates': True, 'labels_added': 37, 'relationships_created': 750, 'nodes_created': 37, 'properties_set': 111}
37 rows in 0.5889999866485596 s.


37

# Importing Community Reports
For the community reports, we merge a node for each community and set the community, level, title, summary, rank, rank_explanation, and full_content properties. For the findings, we create Finding nodes in the context of their communities.

In [23]:
# Community reports table: report metadata, the findings list and the full report text.
community_report_columns = [
    "id",
    "community",
    "level",
    "title",
    "summary",
    "findings",
    "rank",
    "rank_explanation",
    "full_content",
]
community_reports_path = f"{GRAPHRAG_FOLDER}/create_final_community_reports.parquet"
community_report_df = pd.read_parquet(
    community_reports_path, columns=community_report_columns
)
community_report_df.head(5)

Unnamed: 0,id,community,level,title,summary,findings,rank,rank_explanation,full_content
0,6da7e08b-92ff-444c-b65d-ef874cbbf2ac,30,2,The Complex World of Mr. Gatsby and His Quest ...,This report delves into the intricate communit...,[{'explanation': 'Mr. Gatsby's claimed associa...,7.5,The impact severity rating is high due to the ...,# The Complex World of Mr. Gatsby and His Ques...
1,870f9fd9-b14e-4a33-b2d2-e24494711438,31,2,Chicago and Its Central Role in Character Dyna...,This report delves into the intricate relation...,[{'explanation': 'Chicago is depicted as a cen...,7.5,The impact severity rating is relatively high ...,# Chicago and Its Central Role in Character Dy...
2,8753ce48-e3d3-4a33-a773-daa0f39dac51,32,2,The Complex Web of Gatsby and His Circle,This report delves into the intricate relation...,[{'explanation': 'Jay Gatsby's life and action...,7.5,The impact severity rating is high due to the ...,# The Complex Web of Gatsby and His Circle\n\n...
3,a52a6c5a-9ca2-496a-8538-c37b85de2ec3,33,2,West Egg: A Tale of New Wealth and Social Aspi...,West Egg serves as a pivotal setting in the na...,[{'explanation': 'West Egg represents the emer...,8.5,The impact severity rating is high due to the ...,# West Egg: A Tale of New Wealth and Social As...
4,c576c7c1-22ec-4be9-b4a3-8f8bacd76543,34,2,Gatsby's Household and Associates,This report delves into the intricate relation...,[{'explanation': 'The Butler emerges as a cent...,7.5,The impact severity rating is relatively high ...,# Gatsby's Household and Associates\n\nThis re...


In [24]:
# Import community reports
# The report is stored on the __Community__ node itself: the node is found or
# created by `community`, then level, title, rank, rank_explanation,
# full_content and summary are set on it.
# Each entry of `findings` becomes a Finding node attached to the community via
# HAS_FINDING. The finding's position in the list is used as its `id`, and
# because the MERGE pattern starts from the bound community, that id is scoped
# per community. The finding map's entries are copied onto the node.
community_statement = """
MERGE (c:__Community__ {community:value.community})
SET c += value {.level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND range(0, size(value.findings)-1) AS finding_idx
WITH c, value, finding_idx, value.findings[finding_idx] as finding
MERGE (c)-[:HAS_FINDING]->(f:Finding {id:finding_idx})
SET f += finding
"""
batched_import(community_statement, community_report_df)

{'_contains_updates': True, 'labels_added': 174, 'relationships_created': 174, 'nodes_created': 174, 'properties_set': 744}
37 rows in 0.4279968738555908 s.


37

# Importing Covariates
Covariates are, for instance, claims about entities. We connect them to the chunks they originate from.

In [26]:
# cov_df = pd.read_parquet(
#     f"{GRAPHRAG_FOLDER}/create_final_covariates.parquet",
#     columns=["id", "text_unit_id"],
# )
# cov_df.head(2)
# Disabled: the covariates' subject ids do not match the entity ids.

In [27]:
# Import covariates
# cov_statement = """
# MERGE (c:__Covariate__ {id:value.id})
# SET c += apoc.map.clean(value, ["text_unit_id", "document_ids", "n_tokens"], [NULL, ""])
# WITH c, value
# MATCH (ch:__Chunk__ {id: value.text_unit_id})
# MERGE (ch)-[:HAS_COVARIATE]->(c)
# """
# batched_import(cov_statement, cov_df)

# Visualize your data

## Show a few __Entity__ nodes and their relationships (Entity Graph)
```
MATCH path = (:__Entity__)-[:RELATED]->(:__Entity__)
RETURN path LIMIT 200
```
## Show the Chunks and the Document (Lexical Graph)
```
MATCH (d:__Document__) WITH d LIMIT 1
MATCH path = (d)<-[:PART_OF]-(c:__Chunk__)
RETURN path LIMIT 100
```
## Show a Community and its Entities
```
MATCH (c:__Community__) WITH c LIMIT 1
MATCH path = (c)<-[:IN_COMMUNITY]-()-[:RELATED]-(:__Entity__)
RETURN path LIMIT 100
```
## Show everything
```
MATCH (d:__Document__) WITH d LIMIT 1
MATCH path = (d)<-[:PART_OF]-(:__Chunk__)-[:HAS_ENTITY]->()-[:RELATED]-()-[:IN_COMMUNITY]->()
RETURN path LIMIT 250
```