In [20]:
import os
from dotenv import load_dotenv

load_dotenv()

NEO4J_URI = os.getenv("LOCAL_NEO4J_URI")
NEO4J_USERNAME = os.getenv("LOCAL_NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("LOCAL_NEO4J_PASSWORD")

In [21]:
import nest_asyncio

nest_asyncio.apply()

In [22]:
from llama_index.graph_stores.neo4j import Neo4jPGStore

graph_store = Neo4jPGStore(
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    url=NEO4J_URI,
)



### Dataset

News article dataset fetched from Diffbot. Since the property graph index operates with documents, we have to wrap the text from the news as LlamaIndex documents.

In [24]:
import pandas as pd
from llama_index.core import Document

NEWS_DATASET = "data/news_articles.csv"

news = pd.read_csv(NEWS_DATASET)
documents = [
    Document(text=f"{row['title']}: {row['text']}") for i, row in news.iterrows()
]
news.head()

Unnamed: 0,title,date,text
0,Chevron: Best Of Breed,2031-04-06T01:36:32.000000000+00:00,JHVEPhoto Like many companies in the O&G secto...
1,FirstEnergy (NYSE:FE) Posts Earnings Results,2030-04-29T06:55:28.000000000+00:00,FirstEnergy (NYSE:FE – Get Rating) posted its ...
2,Dáil almost suspended after Sinn Féin TD put p...,2023-06-15T14:32:11.000000000+00:00,The Dáil was almost suspended on Thursday afte...
3,Epic’s latest tool can animate hyperrealistic ...,2023-06-15T14:00:00.000000000+00:00,"Today, Epic is releasing a new tool designed t..."
4,"EU to Ban Huawei, ZTE from Internal Commission...",2023-06-15T13:50:00.000000000+00:00,The European Commission is planning to ban equ...


### Define Default LLMs

In [25]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4o", temperature=0.0)
embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

### Graph construction

LlamaIndex provides multiple out-of-the-box graph constructors. We will use the SchemaLLMPathExtractor, which allows us to define the schema of the graph structure we want to extract from documents. 

We begin by defining the types of nodes and relationships we want the LLM to extract.

In [28]:
from typing import Literal

# best practice to use upper-case
entities = Literal["PERSON", "LOCATION", "ORGANIZATION", "PRODUCT", "EVENT"]
relations = Literal[
    "SUPPLIER_OF",
    "COMPETITOR",
    "PARTNERSHIP",
    "ACQUISITION",
    "WORKS_AT",
    "SUBSIDIARY",
    "BOARD_MEMBER",
    "CEO",
    "PROVIDES",
    "HAS_EVENT",
    "IN_LOCATION",
]

Next, we will specify the relationships associated with each node label.

In [29]:
# define which entities can have which relations
validation_schema = {
    "Person": ["WORKS_AT", "BOARD_MEMBER", "CEO", "HAS_EVENT"],
    "Organization": [
        "SUPPLIER_OF",
        "COMPETITOR",
        "PARTNERSHIP",
        "ACQUISITION",
        "WORKS_AT",
        "SUBSIDIARY",
        "BOARD_MEMBER",
        "CEO",
        "PROVIDES",
        "HAS_EVENT",
        "IN_LOCATION",
    ],
    "Product": ["PROVIDES"],
    "Event": ["HAS_EVENT", "IN_LOCATION"],
    "Location": ["HAPPENED_AT", "IN_LOCATION"],
}

A `PERSON` can have the following relationships:

* WORKS_AT
* BOARD_MEMBER
* CEO
* HAS_EVENT

The schema is quite specific except for the `EVENT` node label, which is slightly more ambiguous and allows the LLM to capture various types of information.

Now that we have defined the graph schema, we can input it into the `SchemaLLMPathExtractor` and use it to construct a graph.

In [30]:
from llama_index.core import PropertyGraphIndex
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor

kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
    # if false, allows for values outside of the schema
    # useful for using the schema as a suggestion
    strict=True,
)

NUMBER_OF_ARTICLES = 250

index = PropertyGraphIndex.from_documents(
    documents[:NUMBER_OF_ARTICLES],
    kg_extractors=[kg_extractor],
    llm=llm,
    embed_model=embed_model,
    property_graph_store=graph_store,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/250 [00:00<?, ?it/s]

Failed to write data to connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))
  """Create an agent that uses JSON to format its logic, build for Chat Models.
Extracting paths from text with schema:  70%|███████   | 175/250 [05:41<02:17,  1.84s/it]Retrying llama_index.llms.openai.base.OpenAI._achat in 0.9246500773252473 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-nt09lN7VTRSJX5krqgTaiyY7 on tokens per min (TPM): Limit 30000, Used 29294, Requested 773. Please try again in 134ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Extracting paths from text with schema:  87%|████████▋ | 217/250 [07:03<01:02,  1.90s/it]Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5600309345490879 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for 

The constructed graph contains both text chunks, which contain text and embeddings. If an entity was mentioned in the text chunk, there is a `MENTIONS` relationships between the text chunk and entity. Additionally, entities can have relationships to other entities.

### Entity deduplication

Entity deduplication or disambiguation is an important but often overlooked step in graph construction. Essentially, it is a cleaning step where you try to match multiple nodes that represent a single entity and merge them together into a single node for better graph structural integrity. We will use a combination of text embedding similarity and word distance to find potential duplicates. We start by defining the vector index on our entities in the graph.

In [32]:
graph_store.structured_query("""
CREATE VECTOR INDEX entity IF NOT EXISTS
FOR (m:`__Entity__`)
ON m.embedding
OPTIONS {indexConfig: {
 `vector.dimensions`: 1536,
 `vector.similarity_function`: 'cosine'
}}
"""
)

[]

In [33]:
# Just for inspection
similarity_threshold = 0.9
word_edit_distance = 5
data = graph_store.structured_query("""
MATCH (e:__Entity__)
CALL {
  WITH e
  CALL db.index.vector.queryNodes('entity', 10, e.embedding)
  YIELD node, score
  WITH node, score
  WHERE score > toFLoat($cutoff)
      AND (toLower(node.name) CONTAINS toLower(e.name) OR toLower(e.name) CONTAINS toLower(node.name)
           OR apoc.text.distance(toLower(node.name), toLower(e.name)) < $distance)
      AND labels(e) = labels(node)
  WITH node, score
  ORDER BY node.name
  RETURN collect(node) AS nodes
}
WITH distinct nodes
WHERE size(nodes) > 1
WITH collect([n in nodes | n.name]) AS results
UNWIND range(0, size(results)-1, 1) as index
WITH results, index, results[index] as result
WITH apoc.coll.sort(reduce(acc = result, index2 IN range(0, size(results)-1, 1) |
        CASE WHEN index <> index2 AND
            size(apoc.coll.intersection(acc, results[index2])) > 0
            THEN apoc.coll.union(acc, results[index2])
            ELSE acc
        END
)) as combinedResult
WITH distinct(combinedResult) as combinedResult
// extra filtering
WITH collect(combinedResult) as allCombinedResults
UNWIND range(0, size(allCombinedResults)-1, 1) as combinedResultIndex
WITH allCombinedResults[combinedResultIndex] as combinedResult, combinedResultIndex, allCombinedResults
WHERE NOT any(x IN range(0,size(allCombinedResults)-1,1)
    WHERE x <> combinedResultIndex
    AND apoc.coll.containsAll(allCombinedResults[x], combinedResult)
)
RETURN combinedResult
""", param_map={'cutoff': similarity_threshold, 'distance': word_edit_distance})
for row in data:
    print(row)

{'combinedResult': ['MetaHuman', 'MetaHuman Animator']}
{'combinedResult': ['ZTE', 'ZTE Corp.']}
{'combinedResult': ['American Fork', 'American Fork, Utah']}
{'combinedResult': ['XPeng', 'XPeng Inc', 'Xpeng']}
{'combinedResult': ['Bank of America', 'Bank of America Corp.']}
{'combinedResult': ['Hyatt', 'Hyatt Hotels']}
{'combinedResult': ['Star Ocean The Second Story', 'Star Ocean: The Second Story', 'Star Ocean: The Second Story R', 'Star Ocean: The Second Story R logos']}
{'combinedResult': ['Square Enix Support Center', 'Square Enix Support Center website', 'Square Enix support site', 'Square Enix support website']}
{'combinedResult': ['Star Ocean First Departure R', 'Star Ocean: First Departure', 'Star Ocean: First Departure R']}
{'combinedResult': ['Citigroup', 'Citigroup Inc.']}
{'combinedResult': ['Wells Fargo', 'Wells Fargo & Co.']}
{'combinedResult': ['JPMorgan', 'JPMorgan (JPM)', 'JPMorgan Asset Management', 'JPMorgan Chase & Co.']}
{'combinedResult': ['show ‘Vanshaj’', '‘Van

In [34]:
graph_store.structured_query("""
MATCH (e:__Entity__)
CALL {
  WITH e
  CALL db.index.vector.queryNodes('entity', 10, e.embedding)
  YIELD node, score
  WITH node, score
  WHERE score > toFLoat($cutoff)
      AND (toLower(node.name) CONTAINS toLower(e.name) OR toLower(e.name) CONTAINS toLower(node.name)
           OR apoc.text.distance(toLower(node.name), toLower(e.name)) < $distance)
      AND labels(e) = labels(node)
  WITH node, score
  ORDER BY node.name
  RETURN collect(node) AS nodes
}
WITH distinct nodes
WHERE size(nodes) > 1
WITH collect([n in nodes | n.name]) AS results
UNWIND range(0, size(results)-1, 1) as index
WITH results, index, results[index] as result
WITH apoc.coll.sort(reduce(acc = result, index2 IN range(0, size(results)-1, 1) |
        CASE WHEN index <> index2 AND
            size(apoc.coll.intersection(acc, results[index2])) > 0
            THEN apoc.coll.union(acc, results[index2])
            ELSE acc
        END
)) as combinedResult
WITH distinct(combinedResult) as combinedResult
// extra filtering
WITH collect(combinedResult) as allCombinedResults
UNWIND range(0, size(allCombinedResults)-1, 1) as combinedResultIndex
WITH allCombinedResults[combinedResultIndex] as combinedResult, combinedResultIndex, allCombinedResults
WHERE NOT any(x IN range(0,size(allCombinedResults)-1,1)
    WHERE x <> combinedResultIndex
    AND apoc.coll.containsAll(allCombinedResults[x], combinedResult)
)
CALL {
  WITH combinedResult
	UNWIND combinedResult AS name
	MATCH (e:__Entity__ {name:name})
	WITH e
	ORDER BY size(e.name) DESC // prefer longer names to remain after merging
	RETURN collect(e) AS nodes
}
CALL apoc.refactor.mergeNodes(nodes, {properties: {
    `.*`: 'discard'
}})
YIELD node
RETURN count(*)
""", param_map={'cutoff': similarity_threshold, 'distance': word_edit_distance})

[{'count(*)': 90}]

### Implementing a custom retriever
Great, we have constructed a knowledge graph based on the news dataset. Now, let's examine our retriever options. At the moment, there are four existing retrievers available:

LLMSynonymRetriever: takes the query, and tries to generate keywords and synonyms to retrieve nodes (and therefore the paths connected to those nodes).
VectorContextRetriever: retrieves nodes based on their vector similarity, and then fetches the paths connected to those nodes
TextToCypherRetriever: uses a graph store schema, your query, and a prompt template in order to generate and execute a cypher query
CypherTemplateRetriever: Rather than letting the LLM have free-range of generating any cypher statement, we can instead provide a cypher template and have the LLM fill in the parameters.
Additionally, implementing a custom retriever is straightforward, so that is exactly what we will do here. Our custom retriever will first identify entities in the input query and then execute the VectorContextRetriever for each identified entity separately. First, we will define the entity extraction model and prompt.

In [41]:
from pydantic import BaseModel
from typing import Optional, List


class Entities(BaseModel):
    """List of named entities in the text such as names of people, organizations, concepts, and locations"""
    names: Optional[List[str]]


prompt_template_entities = """
Extract all named entities such as names of people, organizations, concepts, and locations
from the following text:
{text}
"""

In [42]:
from typing import Any, Optional

from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.retrievers import CustomPGRetriever, VectorContextRetriever
from llama_index.core.vector_stores.types import VectorStore
from llama_index.program.openai import OpenAIPydanticProgram


class MyCustomRetriever(CustomPGRetriever):
    """Custom retriever with cohere reranking."""

    def init(
        self,
        ## vector context retriever params
        embed_model: Optional[BaseEmbedding] = None,
        vector_store: Optional[VectorStore] = None,
        similarity_top_k: int = 4,
        path_depth: int = 1,
        include_text: bool = True,
        **kwargs: Any,
    ) -> None:
        """Uses any kwargs passed in from class constructor."""
        self.entity_extraction = OpenAIPydanticProgram.from_defaults(
            output_cls=Entities, prompt_template_str=prompt_template_entities
        )
        self.vector_retriever = VectorContextRetriever(
            self.graph_store,
            include_text=self.include_text,
            embed_model=embed_model,
            similarity_top_k=similarity_top_k,
            path_depth=path_depth,
        )

    def custom_retrieve(self, query_str: str) -> str:
        """Define custom retriever with reranking.

        Could return `str`, `TextNode`, `NodeWithScore`, or a list of those.
        """
        entities = self.entity_extraction(text=query_str).names
        result_nodes = []
        if entities:
            print(f"Detected entities: {entities}")
            for entity in entities:
                result_nodes.extend(self.vector_retriever.retrieve(entity))
        else:
            result_nodes.extend(self.vector_retriever.retrieve(query_str))
        ## TMP: please change
        final_text = "\n\n".join(
            [n.get_content(metadata_mode="llm") for n in result_nodes]
        )
        return final_text

The MyCustomRetriever class has only two methods. You can use the init method to instantiate any functions or classes you will be using in the retriever. In this example, we instantiate the entity detection OpenAI program along with the vector context retriever.

The custom_retrieve method is called during retrieval. In our custom retriever implementation, we first identify any relevant entities in the text. If any entities are found, we iterate and execute the vector context retriever for each entity. On the other hand, if no entities are identified we pass the entire input to the vector context retriever.

As you can observe, you can easily customize the retriever for your use-case by incorporating existing retrievers or starting from scratch as you can easily execute Cypher statements by using the structured_query method of the graph store.
### Question-answering flow
Let's wrap it up by using the custom retriever to answer an example question. We need to pass the retriever to the RetrieverQueryEngine .

In [45]:
from llama_index.core.query_engine import RetrieverQueryEngine

custom_sub_retriever = MyCustomRetriever(
    index.property_graph_store,
    include_text=True,
    vector_store=index.vector_store,
    embed_model=embed_model
)

query_engine = RetrieverQueryEngine.from_args(
    index.as_retriever(sub_retrievers=[custom_sub_retriever]), llm=llm
)

### Try out some queries

In [46]:
response = query_engine.query("What do you know about Maliek Collins or Darragh O’Brien?")
print(str(response))

Detected entities: ['Maliek Collins', "Darragh O'Brien"]
Maliek Collins has signed a two-year deal with the Houston Texans, which includes a $23 million contract extension with $20 million guaranteed. This new deal gives him a raise from his previous contract, where he earned $17 million with $8.5 million guaranteed. Collins, who previously played for the Dallas Cowboys and the Las Vegas Raiders, had a career-high 37 tackles and 3.5 sacks over 15 games last NFL season. He is expected to be a key piece in the Texans' defensive line and fit well with the 4-3 alignment that DeMeco Ryans is expected to implement.

Darragh O’Brien is the Minister for Housing and works in the Government. He was involved in a heated debate in the Dáil regarding retained firefighters, which almost led to the suspension of the session. During the debate, Sinn Féin TD John Brady placed an on-call pager in front of O’Brien, an act that was described as "theatre" and "choreographed." O’Brien expressed confidence t

In [48]:
response = query_engine.query("I am looking for a quarterback named Justin, but I am not sure about his last name.")
print(str(response))

Detected entities: ['Justin']
You might be referring to Justin Herbert, who is a quarterback currently involved in NFL contract extension negotiations.


In [49]:
response = query_engine.query("Do you have any news related to any former Ducks' or Beavers' players?")
print(str(response))

Detected entities: ['Ducks', 'Beavers']
There is no news related to any former Ducks' or Beavers' players.


In [50]:
response = query_engine.query("To which university did Justin Herbert go?")
print(str(response))

Detected entities: ['Justin Herbert', 'university']
The provided information does not specify which university Justin Herbert attended.


In [51]:
response = query_engine.query("Where did Justin Herbert go to college?")
print(str(response))

Detected entities: ['Justin Herbert']
The information provided does not include details about where Justin Herbert went to college.


In [52]:
response = query_engine.query("Where was Justin Herbert born?")
print(str(response))

Detected entities: ['Justin Herbert']
The information provided does not include details about Justin Herbert's place of birth.


### Summary

We've explored the intricacies of customizing the property graph index within LlamaIndex, focusing on implementing entity deduplication and designing custom retrieval methods to enhance GraphRAG accuracy. The property graph index allows for a modular and flexible approach, utilizing various graph constructors and retrievers to tailor the implementation to your specific needs. Whether you're building your first knowledge graph or optimizing for a unique dataset, these customizable components offer a powerful toolkit. We invite you to test out the property graph index integration to see how they can elevate your knowledge graph projects.