##### LlamaIndex - Property Graph Index advanced example
Based on the LlamaIndex documentation example: [Property Graph Index — advanced example](https://docs.llamaindex.ai/en/stable/examples/property_graph/property_graph_advanced/)

##### Setup

In [5]:
### SETUP ###

# Make the OpenAI API key available through the OPENAI_API_KEY environment
# variable for the OpenAI-backed LLM and embedding model used further down.
import os
import getpass

# Only prompt when the key is not already set, so that re-running this cell
# (or running the whole notebook with the key exported in the shell) does not
# ask again. The labelled prompt makes clear which secret is being requested.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [1]:
### SETUP ###

# Fetch the example unstructured data: the Paul Graham essay text file from the
# run-llama/llama_index GitHub repository, saved under data/paul_graham/.
# That directory is what the document-loading cell below reads from.
# `mkdir -p` creates the directory (with parents) and is a no-op if it already exists;
# `wget ... -O <path>` writes the download to exactly that path.
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

--2024-05-31 09:09:16--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham/paul_graham_essay.txt’


2024-05-31 09:09:17 (6.03 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]



In [2]:
### SETUP ###

# Apply nest_asyncio before any LlamaIndex call is made.
# NOTE(review): presumably required because the notebook kernel already runs an
# asyncio event loop and the index construction / retrieval below use async code
# internally, which would otherwise fail with "event loop is already running" —
# confirm against the LlamaIndex documentation.
import nest_asyncio
nest_asyncio.apply()

In [3]:
### SETUP ###

# Load the contents of ./data/paul_graham/ (the essay downloaded above) into
# `documents`, which is the input the property graph index is built from.
from llama_index.core import SimpleDirectoryReader

essay_reader = SimpleDirectoryReader("./data/paul_graham/")
documents = essay_reader.load_data()

  from .autonotebook import tqdm as notebook_tqdm


##### Construction

In [10]:
from typing import Literal
#from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor

# Schema handed to the extractor: the allowed entity labels and relation labels.
# best practice to use upper-case
entities = Literal["PERSON", "PLACE", "ORGANIZATION"]
relations = Literal["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"]

# define which entities can have which relations
validation_schema = {
    "PERSON": ["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
    "PLACE": ["HAS", "PART_OF", "WORKED_AT"],
    "ORGANIZATION": ["HAS", "PART_OF", "WORKED_WITH"],
}

kg_extractor = SchemaLLMPathExtractor(
    # Alternatives:
    #   llm=Ollama(model="llama3", json_mode=True, request_timeout=3600),
    #   llm=OpenAI(model="gpt-4o", json_mode=True, timeout=3600),
    #
    # `request_timeout` is the Ollama keyword; the LlamaIndex OpenAI wrapper
    # names this parameter `timeout`, so the one-hour limit is passed under
    # that name here.
    # NOTE(review): `json_mode` also comes from the Ollama variant; it is kept
    # unchanged, but confirm whether the OpenAI wrapper gives it any effect.
    llm=OpenAI(model="gpt-3.5-turbo", json_mode=True, timeout=3600),
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
    # if false, allows for values outside of the schema
    # useful for using the schema as a suggestion
    strict=True,
)

In [7]:
import os
import getpass

from llama_index.graph_stores.neo4j import Neo4jPGStore

# Neo4j connection settings are taken from the environment so that no credential
# is stored in the notebook:
#   NEO4J_USERNAME - defaults to "neo4j"
#   NEO4J_URI      - defaults to "bolt://localhost:7687"
#   NEO4J_PASSWORD - no default; prompted for interactively when unset
graph_store = Neo4jPGStore(
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: "),
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
)

In [11]:
from llama_index.core import PropertyGraphIndex
#from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding model passed to the index. Alternatives that were tried:
#   HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
#   OpenAIEmbedding(model_name="text-embedding-3-large")
index_embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# Build the property graph index from the loaded documents, using the
# schema-guided extractor and the Neo4j-backed graph store defined above.
index = PropertyGraphIndex.from_documents(
    documents,
    kg_extractors=[kg_extractor],
    embed_model=index_embed_model,
    property_graph_store=graph_store,
)

##### Querying

In [13]:
# using lower-level API to construct retrievers

from llama_index.core.indices.property_graph import (
    LLMSynonymRetriever,
    VectorContextRetriever,
)

# Both retrievers read from the graph store that backs `index`.
# With include_text=False the retrieved nodes printed further down are
# "subject -> RELATION -> object" triples.
llm_synonym = LLMSynonymRetriever(
    index.property_graph_store,
    # Alternative: llm=Ollama(model="llama3", request_timeout=3600),
    # `request_timeout` is the Ollama keyword; the LlamaIndex OpenAI wrapper
    # names this parameter `timeout`, so the one-hour limit is passed as such.
    llm=OpenAI(model="gpt-3.5-turbo", timeout=3600),
    include_text=False,
)
vector_context = VectorContextRetriever(
    index.property_graph_store,
    # Alternative: embed_model=HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    # Kept identical to the embedding model name used when the index was built.
    embed_model=OpenAIEmbedding(model_name="text-embedding-3-small"),
    include_text=False,
)

In [14]:
# Expose both sub-retrievers through a single retriever obtained from the index.
sub_retrievers = [llm_synonym, vector_context]
retriever = index.as_retriever(sub_retrievers=sub_retrievers)

In [15]:
# Run one example question through the combined retriever.
question = "What happened at Interleaf?"
nodes = retriever.retrieve(question)

# Show the text of every retrieved node, one per line.
for retrieved in nodes:
    print(retrieved.text)

Paul Graham -> WORKED_AT -> Interleaf
Paul Graham -> WORKED_ON -> Interleaf
Interleaf -> WORKED_WITH -> Paul Graham
Interleaf -> PART_OF -> Microsoft
Interleaf -> HAS -> Microsoft
Paul Graham -> HAS -> Interleaf
Interleaf -> PART_OF -> Release Engineering
Viaweb -> WORKED_WITH -> Interleaf
Interleaf -> PART_OF -> RISD
Y Combinator -> PART_OF -> PGI
Y Combinator -> PART_OF -> Investment Firm
Y Combinator -> PART_OF -> Angel Firm
Y Combinator -> PART_OF -> Tech Company
