In [26]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): no versions are pinned; the recorded run of this cell resolved openai 1.47.1.
%pip install neo4j-graphrag fsspec langchain-text-splitters openai python-dotenv

Collecting openai
  Downloading openai-1.47.1-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.5.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.6 kB)
Collecting tqdm>4 (from openai)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Downloading openai-1.47.1-py3-none-any.whl (375 kB)
Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Using cached jiter-0.5.0-cp311-cp311-macosx_11_0_arm64.whl (299 kB)
Using cached tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, jiter, distro, openai
Successfully installed distro-1.9.0 jiter-0.5.0 openai-1.47.1 tqdm-4.66.5
Note: you may need to restart the kernel to use updated packages.


In [1]:
from dotenv import load_dotenv
import os

# Read settings from the local .env file (override=True, so its values win over
# variables already present in the process environment), then pick out the
# Neo4j connection details that the database-writing cell passes to the driver.
load_dotenv('.env', override=True)

NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.environ.get(key)  # None when the variable is not defined
    for key in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
)

In [2]:
from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader
loader = PdfLoader()
pdf_doc = await loader.run("pgpm-13-39.pdf")

In [3]:
from langchain_text_splitters import CharacterTextSplitter
from neo4j_graphrag.experimental.components.text_splitters.langchain import LangChainTextSplitterAdapter
splitter = LangChainTextSplitterAdapter(
    CharacterTextSplitter(chunk_size=500, chunk_overlap=100, separator=".")
)
split_text = await splitter.run(text=pdf_doc.text)

Created a chunk of size 563, which is longer than the specified 500
Created a chunk of size 646, which is longer than the specified 500
Created a chunk of size 732, which is longer than the specified 500


In [4]:
# Display the text of the first chunk as a quick sanity check of the split.
split_text.chunks[0].text

'REVIEW\nT owards Precision Medicine in Systemic Lupus\nErythematosus\nThis article was published in the following Dove Press journal:\nPharmacogenomics and Personalized Medicine\nElliott Lever1\nMarta R Alves2\nDavid A Isenberg1\n1Centre for Rheumatology, Division of\nMedicine, University College Hospital\nLondon, London, UK;2Internal Medicine,\nDepartment of Medicine, Centro\nHospitalar do Porto, Porto, PortugalAbstract: Systemic lupus erythematosus (SLE) is a remarkable condition characterised by\ndiversity amongst its clinical features and immunological abnormalities'

In [5]:
from neo4j_graphrag.experimental.components.schema import (
    SchemaBuilder,
    SchemaEntity,
    SchemaProperty,
    SchemaRelation,
)

schema_builder = SchemaBuilder()

# Every entity type is declared with the same two string properties.
node_properties = [
    SchemaProperty(name=prop_name, type="STRING")
    for prop_name in ("name", "details")
]

# Entity labels offered to the schema, grouped by domain.

# General-purpose labels.
basic_node_labels = [
    "Object",
    "Entity",
    "Group",
    "Person",
    "Organization",
    "Place",
]

# Labels for publications.
academic_node_labels = [
    "ArticleOrPaper",
    "PublicationOrJournal",
]

# Biomedical labels.
medical_node_labels = [
    "Anatomy",
    "BiologicalProcess",
    "Cell",
    "CellularComponent",
    "CellType",
    "Condition",
    "Disease",
    "Drug",
    "EffectOrPhenotype",
    "Exposure",
    "GeneOrProtein",
    "Molecule",
    "MolecularFunction",
    "Pathway",
]

# Full label set, in group order: basic, then academic, then medical.
node_labels = [*basic_node_labels, *academic_node_labels, *medical_node_labels]


# Property declared for relationships, and the single generic relationship type.
# NOTE(review): in the visible cells neither `rel_properties` nor
# `possible_schema` reaches the schema builder (both arguments are commented
# out in the cell that calls it).
rel_properties = [SchemaProperty(name="relType", type="STRING", description="The type of relationship")]
basic_rel_types = ["RELATES_TO"]

# Cartesian product for the possible schema: one (source, relation, target)
# triple per combination, iterating sources first, then targets, then relation
# types. A comprehension keeps the iteration variables out of the notebook
# namespace; a plain `for` loop would leave them behind as globals.
possible_schema = [
    (src_label, rel_type, dst_label)
    for src_label in node_labels
    for dst_label in node_labels
    for rel_type in basic_rel_types
]


In [6]:
schema = await schema_builder.run(
    entities=[SchemaEntity(label=i, properties=node_properties) for i in node_labels],
    relations=[] ,#[SchemaRelation(label=i, properties=rel_properties) for i in basic_rel_types],
    potential_schema=[] #possible_schema
)
schema

SchemaConfig(entities={'Object': {'label': 'Object', 'description': '', 'properties': [{'name': 'name', 'type': 'STRING', 'description': ''}, {'name': 'details', 'type': 'STRING', 'description': ''}]}, 'Entity': {'label': 'Entity', 'description': '', 'properties': [{'name': 'name', 'type': 'STRING', 'description': ''}, {'name': 'details', 'type': 'STRING', 'description': ''}]}, 'Group': {'label': 'Group', 'description': '', 'properties': [{'name': 'name', 'type': 'STRING', 'description': ''}, {'name': 'details', 'type': 'STRING', 'description': ''}]}, 'Person': {'label': 'Person', 'description': '', 'properties': [{'name': 'name', 'type': 'STRING', 'description': ''}, {'name': 'details', 'type': 'STRING', 'description': ''}]}, 'Organization': {'label': 'Organization', 'description': '', 'properties': [{'name': 'name', 'type': 'STRING', 'description': ''}, {'name': 'details', 'type': 'STRING', 'description': ''}]}, 'Place': {'label': 'Place', 'description': '', 'properties': [{'name': '

In [7]:
from neo4j_graphrag.experimental.components.entity_relation_extractor import (
    LLMEntityRelationExtractor, OnError,
)
from neo4j_graphrag.llm import OpenAILLM

# LLM-backed extractor used to turn the text chunks into a graph.
extractor = LLMEntityRelationExtractor(
    llm=OpenAILLM(
        model_name="gpt-4o",
        model_params={
            # Lowest sampling temperature, so repeated runs of the notebook
            # stay as close to each other as the API allows.
            "temperature": 0,
            # NOTE(review): a reply cut off at this limit would not be complete
            # JSON — confirm 1000 tokens is enough for the densest chunks.
            "max_tokens": 1000,
            # Ask the API to answer with a JSON object.
            "response_format": {"type": "json_object"},
        },
    ),
    # NOTE(review): presumably a chunk whose reply cannot be used is skipped
    # without raising — verify against the library documentation.
    on_error=OnError.IGNORE,
)


In [8]:
# Run the extractor over the split text, passing `schema` along; later cells
# read the result through `res_graph.nodes` and `res_graph.relationships`.
res_graph = await extractor.run(chunks=split_text, schema=schema)



In [9]:
# Report the graph size: node count on the first line, relationship count on the second.
print(len(res_graph.nodes), len(res_graph.relationships), sep="\n")

1367
1466


In [11]:
# Give every node both 'name' and 'details', using an empty string for any
# that are absent. This edits the nodes of `res_graph` in place; values that
# are already present are left alone.
for node in res_graph.nodes:
    missing = [prop for prop in ('name', 'details') if prop not in node.properties]
    for prop in missing:
        node.properties[prop] = ''


In [13]:
import neo4j
from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter
from neo4j_graphrag.experimental.components.types import Neo4jGraph

with neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as driver:
    writer = Neo4jWriter(driver)
    await writer.run(res_graph)

start_node_id='1727226074.522092:0:0' end_node_id='1727226074.522092:0' type='FROM_CHUNK' properties=None embedding_properties=None
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164985856'>]
start_node_id='1727226074.522092:0:1' end_node_id='1727226074.522092:0' type='FROM_CHUNK' properties=None embedding_properties=None
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164985857'>]
start_node_id='1727226074.522092:0:2' end_node_id='1727226074.522092:0' type='FROM_CHUNK' properties=None embedding_properties=None
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164985858'>]
start_node_id='1727226074.522092:0:3' end_node_id='1727226074.522092:0' type='FROM_CHUNK' properties=None embedding_properties=None
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164985859'>]
start_node_id='1727226074.522092:0:4' end_node_id='1727226074.522092:0' type='FROM_CHUNK' properties=None embedding_properties=None


IndexError: list index out of range