In [26]:
%pip install neo4j-graphrag fsspec langchain-text-splitters openai python-dotenv

Collecting openai
  Downloading openai-1.47.1-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.5.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.6 kB)
Collecting tqdm>4 (from openai)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Downloading openai-1.47.1-py3-none-any.whl (375 kB)
Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Using cached jiter-0.5.0-cp311-cp311-macosx_11_0_arm64.whl (299 kB)
Using cached tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, jiter, distro, openai
Successfully installed distro-1.9.0 jiter-0.5.0 openai-1.47.1 tqdm-4.66.5
Note: you may need to restart the kernel to use updated packages.


In [1]:
from dotenv import load_dotenv
import os

#load neo4j credentials

load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

In [2]:
from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader
loader = PdfLoader()
pdf_doc = await loader.run("pgpm-13-39.pdf")

In [3]:
from langchain_text_splitters import CharacterTextSplitter
from neo4j_graphrag.experimental.components.text_splitters.langchain import LangChainTextSplitterAdapter
splitter = LangChainTextSplitterAdapter(
    CharacterTextSplitter(chunk_size=500, chunk_overlap=100, separator=".")
)
split_text = await splitter.run(text=pdf_doc.text)

Created a chunk of size 563, which is longer than the specified 500
Created a chunk of size 646, which is longer than the specified 500
Created a chunk of size 732, which is longer than the specified 500


In [4]:
split_text.chunks[0].text

'REVIEW\nT owards Precision Medicine in Systemic Lupus\nErythematosus\nThis article was published in the following Dove Press journal:\nPharmacogenomics and Personalized Medicine\nElliott Lever1\nMarta R Alves2\nDavid A Isenberg1\n1Centre for Rheumatology, Division of\nMedicine, University College Hospital\nLondon, London, UK;2Internal Medicine,\nDepartment of Medicine, Centro\nHospitalar do Porto, Porto, PortugalAbstract: Systemic lupus erythematosus (SLE) is a remarkable condition characterised by\ndiversity amongst its clinical features and immunological abnormalities'

In [5]:
from neo4j_graphrag.experimental.components.entity_relation_extractor import (
    LLMEntityRelationExtractor, OnError,
)
from neo4j_graphrag.llm import OpenAILLM

extractor = LLMEntityRelationExtractor(
    llm=OpenAILLM(
        model_name="gpt-4o",
        model_params={
            "max_tokens": 1000,
            "response_format": {"type": "json_object"},
        }
    ),
    on_error=OnError.IGNORE,
)


In [6]:
res_graph = await extractor.run(chunks=split_text)

LLM response has improper format {'nodes': [{'id': '0', 'label': 'Antibody', 'properties': {'name': 'Anti-Sm antibodies'}}, {'id': '1', 'label': 'Protein', 'properties': {'name': 'ribosomal P protein'}}, {'id': '2', 'label': 'Condition', 'properties': {'name': 'NPSLE'}}, {'id': '3', 'label': 'Cell', 'properties': {'name': 'microglia'}}, {'id': '4', 'label': 'Type', 'properties': {'name': 'INF type 1 cytokine'}}, {'id': '5', 'label': 'AnimalModel', 'properties': {'name': 'mouse'}}], 'relationships': [{'type': 'CROSS_REACT', 'start_node_id': '0', 'end_node_id': '1'}, {'type': 'CONTRIBUTE_TO', 'start_node_id': 'microglia', 'end_node_id': 'NPSLE'}, {'type': 'PRODUCE', 'start_node_id': '3', 'end_node_id': '4'}, {'type': 'FOUND_IN', 'start_node_id': 'microglia', 'properties': {'aberrant': 'CNS'}}]} for chunk_index=53
LLM response has improper format {'nodes': [{'id': '0', 'label': 'Cytokine', 'properties': {'name': 'TNF'}}, {'id': '1', 'label': 'Cytokine', 'properties': {'name': 'IL1'}}, {'i

In [7]:
res_graph

Neo4jGraph(nodes=[Neo4jNode(id='1727219765.669492:0:0', label='Person', properties={'name': 'Elliott Lever', 'chunk_index': 0}, embedding_properties=None), Neo4jNode(id='1727219765.669492:0:1', label='Person', properties={'name': 'Marta R Alves', 'chunk_index': 0}, embedding_properties=None), Neo4jNode(id='1727219765.669492:0:2', label='Person', properties={'name': 'David A Isenberg', 'chunk_index': 0}, embedding_properties=None), Neo4jNode(id='1727219765.669492:0:3', label='Organization', properties={'name': 'University College Hospital London', 'chunk_index': 0}, embedding_properties=None), Neo4jNode(id='1727219765.669492:0:4', label='Organization', properties={'name': 'Centro Hospitalar do Porto', 'chunk_index': 0}, embedding_properties=None), Neo4jNode(id='1727219765.669492:0:5', label='Journal', properties={'name': 'Pharmacogenomics and Personalized Medicine', 'chunk_index': 0}, embedding_properties=None), Neo4jNode(id='1727219765.669492:0', label='Chunk', properties={'text': 'REV

In [8]:
dir(res_graph)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_validator__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_a

In [12]:
import neo4j
from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter
from neo4j_graphrag.experimental.components.types import Neo4jGraph

with neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as driver:
    writer = Neo4jWriter(driver)
    await writer.run(res_graph)

[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152923703630103636'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152923703630103637'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152923703630103638'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152921504606848084'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152921504606848085'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152921504606848086'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164986964'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164986965'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164986966'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164986967'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:1152927002164986968'>]
[<Record elementID(r)='5:b96ed560-4739-4282-bf4e-242d558e3b56:115

IndexError: list index out of range

In [14]:
i=0
for rel in res_graph.relationships:
    i+=1
    if rel.start_node_id == "1727219765.669492:25:5":
        print(i)

254
262


In [15]:
res_graph.relationships[254]

Neo4jRelationship(start_node_id='1727219765.669492:25:0', end_node_id='1727219765.669492:25:7', type='RECOGNIZES', properties={'sequence': 'unmethylated CpG'}, embedding_properties=None)