## Setting LLM Model

In [3]:
import nest_asyncio

# Apply the nest_asyncio patch once, before any other cell runs.
# NOTE(review): presumably needed so async code can run inside the notebook's
# already-running event loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

In [4]:
from dotenv import load_dotenv
import os

# Populate the process environment (presumably from a local .env file).
# Later cells read LITELLM_URL, OPENAI_API_MODEL, OPENAI_API_EMBEDDING,
# OLLAMA_LOCAL_URL and OLLAMA_LOCAL_MODEL through os.getenv().
load_dotenv()
# Optional debugging: uncomment to print the endpoints/models that were loaded.
#print("Open AI - ",os.getenv("LITELLM_URL"),os.getenv("OPENAI_API_MODEL"), os.getenv("OPENAI_API_EMBEDDING"))
#print("OLLAMA  - ",os.getenv("OLLAMA_URL"),os.getenv("OLLAMA_MODEL"))
#print("Local OLLAMA - ",os.getenv("OLLAMA_LOCAL_URL"),os.getenv("OLLAMA_LOCAL_MODEL"))

True

In [6]:
from llama_index.core import Settings

### RUN LLM AS OLLAMA 

In [None]:
# Install the Ollama LLM integration for LlamaIndex; skip if already installed.
!pipenv install llama-index-llms-ollama

In [4]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

# Connect to the local Ollama server named by the environment variables.
api_base = os.getenv("OLLAMA_LOCAL_URL")
model = os.getenv("OLLAMA_LOCAL_MODEL")
llm = Ollama(model=model, base_url=api_base, request_timeout=120.0)

# To use the remote Ollama server instead, replace the three lines above with:
# api_base = os.getenv("OLLAMA_URL")
# model = os.getenv("OLLAMA_MODEL")
# llm = Ollama(model=model, base_url=api_base, request_timeout=180.0)

# Register the model as the default LLM, mirroring the OpenAI cell, so the
# extractor cells that read `Settings.llm` use Ollama when this path is chosen.
# (Running the OpenAI cell afterwards still overrides this, as before.)
Settings.llm = llm

# Smoke test: one completion to confirm the server and model respond.
response = llm.complete("What is the capital of France?")
print(response)

The capital of France is **Paris**. 🇫🇷  



### RUN LLM AS OPEN AI 

In [None]:
# Install the OpenAI LLM integration for LlamaIndex; skip if already installed.
!pipenv install llama-index-llms-openai

In [7]:
from llama_index.llms.openai import OpenAI

# Endpoint and model name are supplied through the environment.
api_base = os.environ.get("LITELLM_URL")
model = os.environ.get("OPENAI_API_MODEL")

# Make this client the default LLM for everything that reads `Settings.llm`.
Settings.llm = OpenAI(model=model, api_base=api_base, temperature=0.3)

# Optional smoke test:
# resp = Settings.llm.complete("What is the capital of France?")
# print(resp)

## Setting Embedding Model

In [8]:
# Configure the default embedding model (OpenAI embedding client).
from llama_index.embeddings.openai import OpenAIEmbedding

# Endpoint and embedding model name are supplied through the environment.
api_base = os.environ.get("LITELLM_URL")
embedding_model = os.environ.get("OPENAI_API_EMBEDDING")

Settings.embed_model = OpenAIEmbedding(model_name=embedding_model, api_base=api_base)

# Optional smoke test:
# embed_text = Settings.embed_model.get_text_embedding("hello")
# print(f"{len(embed_text)}, {embed_text}")

In [10]:
from llama_index.core import Document

In [11]:
# Sample passage (three paragraphs) used as the only input document for every
# index built in this notebook.
text = ''' Dinosaurs are a group of reptiles that dominated the land for over 140 million years (more than 160 million years in some parts of the world). They evolved diverse shapes and sizes, from the fearsome giant Spinosaurus to the chicken-sized Microraptor, and were able to survive in a variety of ecosystems.

One of the reasons for dinosaurs' success is that they had straight back legs, perpendicular to their bodies. This allowed them to use less energy to move than other reptiles that had a sprawling stance like today's lizards and crocodiles.

With their legs positioned under their bodies rather than sticking out to the side, dinosaurs' weight was also better supported.'''

# Wrap the passage in a single Document; `documents` is reused by all
# PropertyGraphIndex.from_documents(...) calls below.
documents = [Document(text=text)]
print(documents)

[Document(id_='767b7a2f-4620-47be-82c0-811b3f87a38f', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text=" Dinosaurs are a group of reptiles that dominated the land for over 140 million years (more than 160 million years in some parts of the world). They evolved diverse shapes and sizes, from the fearsome giant Spinosaurus to the chicken-sized Microraptor, and were able to survive in a variety of ecosystems.\n\nOne of the reasons for dinosaurs' success is that they had straight back legs, perpendicular to their bodies. This allowed them to use less energy to move than other reptiles that had a sprawling stance like today's lizards and crocodiles.\n\nWith their legs positioned under their bodies rather than sticking out to the side, dinosaurs' weight was also better supported.", mimetype=None, path=None, url

In [12]:
import logging
import sys

#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from IPython.display import Markdown, display

In [13]:
from llama_index.core import PropertyGraphIndex

In [14]:
# Baseline index: no extractor or LLM arguments are passed, so the library
# defaults apply.
index = PropertyGraphIndex.from_documents(documents)

In [15]:
# Query the baseline index.
#   include_text=True   -> include the source chunk with matching paths
#   similarity_top_k=2  -> top k for vector kg node retrieval
query_engine = index.as_query_engine(include_text=True, similarity_top_k=2)
response = query_engine.query("is Dinosaurs a reptiles?")

print(response)

Yes, dinosaurs are a group of reptiles.


In [16]:
from llama_index.core.indices.property_graph import (
    SimpleLLMPathExtractor,
    SchemaLLMPathExtractor,
    DynamicLLMPathExtractor,
)

### SimpleLLMPathExtractor: 
This extractor creates a basic knowledge graph without any predefined schema. It may produce a larger number of diverse relationships but might lack consistency in entity and relation naming.

In [17]:

# Schema-free extractor: at most 20 paths per chunk, 4 workers.
kg_extractor = SimpleLLMPathExtractor(
    llm=Settings.llm,
    max_paths_per_chunk=20,
    num_workers=4,
)

# Build the graph with this extractor only; KG node embedding is disabled.
simple_index = PropertyGraphIndex.from_documents(
    documents,
    kg_extractors=[kg_extractor],
    llm=Settings.llm,
    embed_kg_nodes=False,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting paths from text: 100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.65s/it]


In [18]:
# Inspect the raw graph store produced by SimpleLLMPathExtractor.
# The result is a dict whose top-level 'nodes' entry is shown in the cell output.
simple_index.property_graph_store.to_dict()

{'nodes': {'ccbe7b88-1414-41a1-98b5-600547d44dcf': {'label': 'text_chunk',
   'embedding': None,
   'properties': {'_node_content': '{"id_": "ccbe7b88-1414-41a1-98b5-600547d44dcf", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "767b7a2f-4620-47be-82c0-811b3f87a38f", "node_type": "4", "metadata": {}, "hash": "da82e7fff1801bdab3edbff29b0826306aa4c4dd1d2633b924ae7fdcc9a763f7", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\\n", "text": "", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 675, "metadata_seperator": "\\n", "text_template": "{metadata_str}\\n\\n{content}", "class_name": "TextNode"}',
    '_node_type': 'TextNode',
    'document_id': '767b7a2f-4620-47be-82c0-811b3f87a38f',
    'doc_id': '767b7a2f-4620-47be-82c0-811b3f87a38f',
    'ref_doc_id': '767b7a2f-4620-47be-82c0-811b3f87a38f'},
   'text': "Dinosaurs are a group of 

In [None]:
# Install the yfiles graph widget package; skip if already installed.
# NOTE(review): presumably required by show_jupyter_graph() below — confirm.
!pipenv install yfiles_jupyter_graphs

In [16]:
# Export the SimpleLLMPathExtractor graph to an HTML file beside the notebook.
simple_index.property_graph_store.save_networkx_graph(name="./SimpleGraph.html")

In [17]:
# Render the SimpleLLMPathExtractor graph inline (displays a GraphWidget).
simple_index.property_graph_store.show_jupyter_graph()

GraphWidget(layout=Layout(height='760px', width='100%'))

In [19]:
# Query the graph built by SimpleLLMPathExtractor. This must target
# `simple_index`; the earlier baseline `index` was built without this
# extractor, so querying it would not exercise the graph from this section.
query_engine = simple_index.as_query_engine(
    include_text=True,  # include source chunk with matching paths
    similarity_top_k=2,  # top k for vector kg node retrieval
)
response = query_engine.query("is Dinosaurs a reptiles?")

print(response)

Yes, dinosaurs are a group of reptiles.


### SchemaLLMPathExtractor
With a predefined schema, this extractor produces a more structured graph. The entities and relations are limited to those specified in the schema, which can lead to a more consistent but potentially less comprehensive graph. Even if we set `strict` to `False`, the extracted knowledge graph does not reflect any attempt by the LLM to find new entities and types that fall outside the scope of the input schema.

In [20]:
# Schema-guided extractor.
#   strict=False             -> set to False to showcase why it's not going to
#                               be the same as DynamicLLMPathExtractor
#   possible_entities=None   -> use default entities (PERSON, ORGANIZATION... etc)
#   possible_relations=None  -> use default relationships
#   possible_*_props         -> set to `None` to skip property generation
kg_schema_extractor = SchemaLLMPathExtractor(
    llm=Settings.llm,
    max_triplets_per_chunk=20,
    strict=False,
    possible_entities=None,
    possible_relations=None,
    possible_relation_props=["extra_description"],
    possible_entity_props=["extra_description"],
    num_workers=4,
)

# Build the graph with the schema extractor only; KG node embedding is disabled.
schema_index = PropertyGraphIndex.from_documents(
    documents,
    kg_extractors=[kg_schema_extractor],
    llm=Settings.llm,
    embed_kg_nodes=False,
    show_progress=True,
)

# Export the resulting graph to an HTML file beside the notebook.
schema_index.property_graph_store.save_networkx_graph(name="./SchemaGraph.html")

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting paths from text with schema: 100%|█████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.62s/it]


In [28]:
# Inspect the raw graph store produced by SchemaLLMPathExtractor.
# The result is a dict whose top-level 'nodes' entry is shown in the cell output.
schema_index.property_graph_store.to_dict()

{'nodes': {'84fb98a8-f4cd-483d-a3c1-91922bce90e7': {'label': 'text_chunk',
   'embedding': None,
   'properties': {'_node_content': '{"id_": "84fb98a8-f4cd-483d-a3c1-91922bce90e7", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "3980c4c8-9760-49f7-8d56-b8b82d2e6bbc", "node_type": "4", "metadata": {}, "hash": "da82e7fff1801bdab3edbff29b0826306aa4c4dd1d2633b924ae7fdcc9a763f7", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\\n", "text": "", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 675, "metadata_seperator": "\\n", "text_template": "{metadata_str}\\n\\n{content}", "class_name": "TextNode"}',
    '_node_type': 'TextNode',
    'document_id': '3980c4c8-9760-49f7-8d56-b8b82d2e6bbc',
    'doc_id': '3980c4c8-9760-49f7-8d56-b8b82d2e6bbc',
    'ref_doc_id': '3980c4c8-9760-49f7-8d56-b8b82d2e6bbc'},
   'text': "Dinosaurs are a group of 

In [21]:
# Query the schema-guided graph.
#   include_text=True   -> include the source chunk with matching paths
#   similarity_top_k=2  -> top k for vector kg node retrieval
query_engine = schema_index.as_query_engine(include_text=True, similarity_top_k=2)
response = query_engine.query("is Dinosaurs a reptiles?")

print(response)

Yes, dinosaurs are classified as reptiles.


### DynamicLLMPathExtractor
This new extractor combines the flexibility of SimpleLLMPathExtractor with some initial guidance from a schema. It can expand beyond the initial entity and relation types, potentially producing a rich and diverse graph while maintaining some level of consistency.
Not giving it any entities or relations to start with in the input gives the LLM complete freedom to infer the schema on the fly as it best sees fit. This is going to vary based on the LLM and the temperature used.

In [22]:
# Dynamic extractor with no seed schema: up to 20 triplets per chunk, 4 workers.
kg_dynamic_extractor = DynamicLLMPathExtractor(
    llm=Settings.llm,
    max_triplets_per_chunk=20,
    num_workers=4,
    # Let the LLM infer entities and their labels (types) on the fly
    allowed_entity_types=None,
    # Let the LLM infer relationships on the fly
    allowed_relation_types=None,
    # LLM will generate any relation properties, set `None` to skip property generation (will be faster without)
    allowed_relation_props=[],
    # LLM will generate any entity properties, set `None` to skip property generation (will be faster without)
    allowed_entity_props=[],
)

In [23]:
# Build the graph with the dynamic extractor only; KG node embedding is disabled.
dynamic_index = PropertyGraphIndex.from_documents(
    documents,
    kg_extractors=[kg_dynamic_extractor],
    llm=Settings.llm,
    embed_kg_nodes=False,
    show_progress=True,
)

# Export the resulting graph to an HTML file beside the notebook.
dynamic_index.property_graph_store.save_networkx_graph(name="./DynamicGraph.html")

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting and inferring knowledge graph from text: 100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.46s/it]


In [24]:
# Query the dynamically inferred graph.
#   include_text=True   -> include the source chunk with matching paths
#   similarity_top_k=2  -> top k for vector kg node retrieval
query_engine = dynamic_index.as_query_engine(include_text=True, similarity_top_k=2)
response = query_engine.query("is Dinosaurs a reptiles?")

print(response)

Yes, dinosaurs are a group of reptiles.
