# CITATION: 
## [GraphRAG Implementation with LlamaIndex - V2](https://docs.llamaindex.ai/en/stable/examples/cookbooks/GraphRAG_v2/)


In [None]:
import pandas as pd
from llama_index.core import Document

# Notebook-wide settings: Neo4j database name, dataset location and sample size.
database = "arxivcs-demo"
json_path = "datasets/arxiv_cs_metadata.json"
nrows = 5

# The metadata file is read as JSON Lines; only the first `nrows` records are loaded.
papers = pd.read_json(
    json_path,
    lines=True,
    nrows=nrows,
)

papers.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,801.0341,Michael Chertkov,Michael Chertkov (Los Alamos),Exactness of Belief Propagation for Some Graph...,"12 pages, 1 figure, submitted to JSTAT",J. Stat. Mech. (2008) P10016,10.1088/1742-5468/2008/10/P10016,LANL LA-UR-07-8441,cond-mat.stat-mech cond-mat.other cs.AI cs.IT ...,http://arxiv.org/licenses/nonexclusive-distrib...,It is well known that an arbitrary graphical m...,"[{'version': 'v1', 'created': 'Wed, 2 Jan 2008...",2009-11-13,"[[Chertkov, Michael, , Los Alamos]]"
1,803.4355,Marko A. Rodriguez,Marko A. Rodriguez,Grammar-Based Random Walkers in Semantic Networks,First draft of manuscript originally written i...,"Rodriguez, M.A., ""Grammar-Based Random Walkers...",10.1016/j.knosys.2008.03.030,LA-UR-06-7791,cs.AI cs.DS,http://creativecommons.org/licenses/publicdomain/,Semantic networks qualify the meaning of an ed...,"[{'version': 'v1', 'created': 'Mon, 31 Mar 200...",2008-09-11,"[[Rodriguez, Marko A., ]]"
2,810.2434,Edward Rosten,"Edward Rosten, Reid Porter, Tom Drummond",Faster and better: a machine learning approach...,"35 pages, 11 figures","IEEE Trans. PAMI, 32 (2010), 105--119",10.1109/TPAMI.2008.275,07-3912,cs.CV cs.LG,http://arxiv.org/licenses/nonexclusive-distrib...,The repeatability and efficiency of a corner d...,"[{'version': 'v1', 'created': 'Tue, 14 Oct 200...",2010-07-09,"[[Rosten, Edward, ], [Porter, Reid, ], [Drummo..."
3,812.4446,Peter Turney,Peter D. Turney (National Research Council of ...,The Latent Relation Mapping Engine: Algorithm ...,related work available at http://purl.org/pete...,"Journal of Artificial Intelligence Research, (...",10.1613/jair.2693,NRC-50738,cs.CL cs.AI cs.LG,http://arxiv.org/licenses/nonexclusive-distrib...,Many AI researchers and cognitive scientists h...,"[{'version': 'v1', 'created': 'Tue, 23 Dec 200...",2020-08-20,"[[Turney, Peter D., , National Research Counci..."
4,901.3574,Christoph Benzmueller,Christoph Benzmueller,Automating Access Control Logics in Simple Typ...,ii + 20 pages,"SEKI Report SR-2008-01 (ISSN 1437-4447), Saarl...",10.1007/978-3-642-01244-0_34,SEKI Report SR-2008-01,cs.LO cs.AI,http://arxiv.org/licenses/nonexclusive-distrib...,Garg and Abadi recently proved that prominent ...,"[{'version': 'v1', 'created': 'Fri, 23 Jan 200...",2015-05-13,"[[Benzmueller, Christoph, ]]"


In [76]:
# One Document per paper, with text of the form "<title>: <abstract>".
documents = [
    Document(text=f"{title}: {abstract}")
    for title, abstract in zip(papers["title"], papers["abstract"])
]

In [None]:
from llama_index.llms.ollama import Ollama
# Ollama-served model shared by the extractor, the graph store and the query engine.
# NOTE(review): request_timeout is presumably in seconds — confirm against the Ollama client.
llm = Ollama(
    model="qwen2.5",
    request_timeout=20000,
)

In [None]:
import asyncio
import nest_asyncio

# Patch asyncio so nested event loops are allowed; GraphRAGExtractor.__call__ below
# uses asyncio.run(), which would otherwise clash with the loop the notebook kernel runs.
nest_asyncio.apply()

from typing import Any, List, Callable, Optional, Union, Dict
from IPython.display import Markdown, display

from llama_index.core.async_utils import run_jobs
from llama_index.core.indices.property_graph.utils import (
    default_parse_triplets_fn,
)
from llama_index.core.graph_stores.types import (
    EntityNode,
    KG_NODES_KEY,
    KG_RELATIONS_KEY,
    Relation,
)
from llama_index.core.llms.llm import LLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.schema import TransformComponent, BaseNode
from llama_index.core.bridge.pydantic import BaseModel, Field


class GraphRAGExtractor(TransformComponent):
    """Extract entities and relationships (with descriptions) from text nodes.

    For every node the LLM is prompted with ``extract_prompt``; its raw reply is
    handed to ``parse_fn``, which must return ``(entities, relationships)`` where
    each entity is ``(name, type, description)`` and each relationship is
    ``(source, target, relation, description)``. The results are attached to the
    node's metadata under ``KG_NODES_KEY`` / ``KG_RELATIONS_KEY``.

    Args:
        llm (LLM):
            The language model to use.
        extract_prompt (Union[str, PromptTemplate]):
            The prompt to use for extracting triples.
        parse_fn (callable):
            A function to parse the output of the language model.
        num_workers (int):
            The number of workers to use for parallel processing.
        max_paths_per_chunk (int):
            The maximum number of paths to extract per chunk.
    """

    llm: LLM
    extract_prompt: PromptTemplate
    parse_fn: Callable
    num_workers: int
    max_paths_per_chunk: int

    def __init__(
        self,
        llm: Optional[LLM] = llm,
        extract_prompt: Optional[Union[str, PromptTemplate]] = None,
        parse_fn: Callable = default_parse_triplets_fn,
        max_paths_per_chunk: int = 10,
        num_workers: int = 4,
    ) -> None:
        """Init params.

        The default for ``llm`` is the notebook-level ``llm`` object as it existed
        when this class was defined; ``Settings.llm`` is only used when ``None``
        (or another falsy value) is passed explicitly. A plain string prompt is
        wrapped in a ``PromptTemplate``.
        """
        from llama_index.core import Settings

        if isinstance(extract_prompt, str):
            extract_prompt = PromptTemplate(extract_prompt)

        super().__init__(
            llm=llm or Settings.llm,
            extract_prompt=extract_prompt,
            parse_fn=parse_fn,
            num_workers=num_workers,
            max_paths_per_chunk=max_paths_per_chunk,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the component name (intentionally kept as "GraphExtractor")."""
        return "GraphExtractor"

    def __call__(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract triples from nodes (synchronous wrapper around ``acall``)."""
        return asyncio.run(
            self.acall(nodes, show_progress=show_progress, **kwargs)
        )

    async def _aextract(self, node: BaseNode) -> BaseNode:
        """Extract entities and relationships from a single node.

        A ``ValueError`` raised while calling the LLM or parsing its reply is
        treated as "nothing extracted" for this node. Entities/relations already
        present in the node metadata are kept and extended.
        """
        assert hasattr(node, "text")

        text = node.get_content(metadata_mode="llm")
        try:
            llm_response = await self.llm.apredict(
                self.extract_prompt,
                text=text,
                max_knowledge_triplets=self.max_paths_per_chunk,
            )
            print(f"llm_response: {llm_response}")
            entities, entities_relationship = self.parse_fn(llm_response)
        except ValueError:
            entities = []
            entities_relationship = []

        existing_nodes = node.metadata.pop(KG_NODES_KEY, [])
        existing_relations = node.metadata.pop(KG_RELATIONS_KEY, [])

        # Snapshot of the node metadata without the KG keys popped above. Each
        # entity/relation gets its OWN properties dict built from this snapshot,
        # so descriptions can never leak between items through a shared dict.
        base_metadata = node.metadata.copy()

        for entity, entity_type, description in entities:
            existing_nodes.append(
                EntityNode(
                    name=entity,
                    label=entity_type,
                    properties={
                        **base_metadata,
                        "entity_description": description,
                    },
                )
            )

        for subj, obj, rel, description in entities_relationship:
            existing_relations.append(
                Relation(
                    label=rel,
                    source_id=subj,
                    target_id=obj,
                    properties={
                        **base_metadata,
                        "relationship_description": description,
                    },
                )
            )

        node.metadata[KG_NODES_KEY] = existing_nodes
        node.metadata[KG_RELATIONS_KEY] = existing_relations
        return node

    async def acall(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract triples from nodes async, at most ``num_workers`` at a time."""
        jobs = [self._aextract(node) for node in nodes]

        return await run_jobs(
            jobs,
            workers=self.num_workers,
            show_progress=show_progress,
            desc="Extracting paths from text",
        )

In [79]:
import re
import networkx as nx
from graspologic.partition import hierarchical_leiden
from collections import defaultdict

from llama_index.core.llms import ChatMessage
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore


class GraphRAGStore(Neo4jPropertyGraphStore):
    """Neo4j property graph store with community detection and summaries.

    ``build_communities`` converts the stored triplets to a NetworkX graph,
    clusters it with hierarchical Leiden, and asks the LLM for one summary per
    cluster. Results are stored on the instance as ``entity_info`` (entity name
    -> list of cluster ids) and ``community_summary`` (cluster id -> summary).
    """

    # Class-level defaults only. build_communities() rebinds both
    # `community_summary` and `entity_info` on the instance, so the shared
    # class-level dict below is never mutated in place.
    community_summary = {}
    entity_info = None
    max_cluster_size = 5
    llm = llm

    def generate_community_summary(self, text):
        """Generate summary for a given text using an LLM.

        ``text`` is the newline-joined list of relationship lines produced by
        ``_collect_community_info``; the system prompt describes that same
        ``entity1 -> entity2 -> relation -> relationship_description`` layout.
        """
        messages = [
            ChatMessage(
                role="system",
                content=(
                    "You are provided with a set of relationships from a knowledge graph, each represented as "
                    "entity1 -> entity2 -> relation -> relationship_description. "
                    "Your task is to create a summary of these relationships. The summary should include the names of the entities involved and a concise synthesis "
                    "of the relationship descriptions. The goal is to capture the most critical and relevant details that "
                    "highlight the nature and significance of each relationship. Ensure that the summary is coherent and "
                    "integrates the information in a way that emphasizes the key aspects of the relationships."
                ),
            ),
            ChatMessage(role="user", content=text),
        ]
        # Use the store's own LLM (overridable per instance) rather than the
        # notebook-level global.
        response = self.llm.chat(messages)
        clean_response = re.sub(r"^assistant:\s*", "", str(response)).strip()
        return clean_response

    def build_communities(self):
        """Builds communities from the graph and summarizes them."""
        nx_graph = self._create_nx_graph()
        community_hierarchical_clusters = hierarchical_leiden(
            nx_graph, max_cluster_size=self.max_cluster_size
        )
        self.entity_info, community_info = self._collect_community_info(
            nx_graph, community_hierarchical_clusters
        )
        self._summarize_communities(community_info)

    def _create_nx_graph(self):
        """Converts internal graph representation to NetworkX graph.

        Nodes are keyed by entity name; each edge carries the relation label
        and its description (a placeholder string when none was stored).
        """
        nx_graph = nx.Graph()
        for entity1, relation, entity2 in self.get_triplets():
            relationship_desc = relation.properties.get(
                "relationship_description", "relationship_description_dummy"
            )
            nx_graph.add_node(entity1.name)
            nx_graph.add_node(entity2.name)
            nx_graph.add_edge(
                relation.source_id,
                relation.target_id,
                relationship=relation.label,
                description=relationship_desc,
            )
        return nx_graph

    def _collect_community_info(self, nx_graph, clusters):
        """
        Collect information for each node based on their community,
        allowing entities to belong to multiple clusters.

        Returns:
            (entity_info, community_info): entity name -> list of cluster ids,
            and cluster id -> list of "a -> b -> relation -> description" lines.
        """
        entity_clusters = defaultdict(set)
        community_info = defaultdict(list)

        for item in clusters:
            node = item.node
            cluster_id = item.cluster

            entity_clusters[node].add(cluster_id)

            for neighbor in nx_graph.neighbors(node):
                edge_data = nx_graph.get_edge_data(node, neighbor)
                if edge_data:
                    detail = f"{node} -> {neighbor} -> {edge_data['relationship']} -> {edge_data['description']}"
                    community_info[cluster_id].append(detail)

        # Sets become lists for easier serialization if needed.
        entity_info = {name: list(ids) for name, ids in entity_clusters.items()}

        return entity_info, dict(community_info)

    def _summarize_communities(self, community_info):
        """Generate and store summaries for each community."""
        summaries = {}
        for community_id, details in community_info.items():
            details_text = (
                "\n".join(details) + "."
            )  # Ensure it ends with a period
            summaries[community_id] = self.generate_community_summary(
                details_text
            )
        # Rebind on the instance: summaries are not shared between stores and
        # a rebuild drops entries for communities that no longer exist.
        self.community_summary = summaries

    def get_community_summaries(self):
        """Returns the community summaries, building them if not already done."""
        if not self.community_summary:
            self.build_communities()
        return self.community_summary

In [80]:
from llama_index.core.query_engine import CustomQueryEngine
from llama_index.core.llms import LLM
from llama_index.core import PropertyGraphIndex

import re


class GraphRAGQueryEngine(CustomQueryEngine):
    """Answer a query from the summaries of the communities it touches.

    The query is first run through the index retriever; entity names are parsed
    out of the retrieved text, mapped to community ids via the graph store, and
    each matching community summary is asked to answer the query. The partial
    answers are then merged by a final LLM call.
    """

    graph_store: GraphRAGStore
    index: PropertyGraphIndex
    llm: LLM = llm
    similarity_top_k: int = 20

    def custom_query(self, query_str: str) -> str:
        """Process all community summaries to generate answers to a specific query."""
        # Fetch the summaries first: get_community_summaries() builds the
        # communities on demand, and that build is what populates
        # graph_store.entity_info. Reading entity_info before this call would
        # see None on a store whose communities were never built.
        community_summaries = self.graph_store.get_community_summaries()

        entities = self.get_entities(query_str, self.similarity_top_k)
        community_ids = set(
            self.retrieve_entity_communities(
                self.graph_store.entity_info, entities
            )
        )

        community_answers = [
            self.generate_answer_from_summary(community_summary, query_str)
            for community_id, community_summary in community_summaries.items()
            if community_id in community_ids
        ]

        final_answer = self.aggregate_answers(community_answers)
        return final_answer

    def get_entities(self, query_str, similarity_top_k):
        """Return the distinct entity names found in the retrieved nodes' text.

        Lines of the form ``subject -> relation -> object`` are matched.
        NOTE(review): the pattern only accepts word characters and whitespace in
        entity names, so names containing e.g. "-" or "(" will not match —
        confirm whether that is intended.
        """
        nodes_retrieved = self.index.as_retriever(
            similarity_top_k=similarity_top_k
        ).retrieve(query_str)

        entities = set()
        pattern = (
            r"^(\w+(?:\s+\w+)*)\s*->\s*([a-zA-Z\s]+?)\s*->\s*(\w+(?:\s+\w+)*)$"
        )

        for node in nodes_retrieved:
            matches = re.findall(
                pattern, node.text, re.MULTILINE | re.IGNORECASE
            )

            for subject, _relation, obj in matches:
                entities.add(subject)
                entities.add(obj)

        return list(entities)

    def retrieve_entity_communities(self, entity_info, entities):
        """
        Retrieve cluster information for given entities, allowing for multiple clusters per entity.

        Args:
        entity_info (dict): Dictionary mapping entities to their cluster IDs (list).
            ``None`` or an empty mapping yields an empty result.
        entities (list): List of entity names to retrieve information for.

        Returns:
        List of community or cluster IDs to which an entity belongs.
        """
        if not entity_info:
            return []

        community_ids = set()
        for entity in entities:
            if entity in entity_info:
                community_ids.update(entity_info[entity])

        return list(community_ids)

    def generate_answer_from_summary(self, community_summary, query):
        """Generate an answer from a community summary based on a given query using LLM."""
        prompt = (
            f"Given the community summary: {community_summary}, "
            f"how would you answer the following query? Query: {query}"
        )
        messages = [
            ChatMessage(role="system", content=prompt),
            ChatMessage(
                role="user",
                content="I need an answer based on the above information.",
            ),
        ]
        response = self.llm.chat(messages)
        cleaned_response = re.sub(r"^assistant:\s*", "", str(response)).strip()
        return cleaned_response

    def aggregate_answers(self, community_answers):
        """Aggregate individual community answers into a final, coherent response."""
        prompt = "Combine the following intermediate answers into a final, concise response."
        messages = [
            ChatMessage(role="system", content=prompt),
            ChatMessage(
                role="user",
                content=f"Intermediate answers: {community_answers}",
            ),
        ]
        final_response = self.llm.chat(messages)
        cleaned_final_response = re.sub(
            r"^assistant:\s*", "", str(final_response)
        ).strip()
        return cleaned_final_response

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Split the documents into chunks (nodes) that the extractor will process one by one.
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(documents)
len(nodes)

KeyboardInterrupt: 

### Build PropertyGraphIndex using `GraphRAGExtractor` and `GraphRAGStore`

In [83]:
# Prompt used by GraphRAGExtractor. The two placeholders are filled in by the
# extractor's LLM call: {text} with the chunk content and {max_knowledge_triplets}
# with the extractor's max_paths_per_chunk. The ("entity"$$$$...) and
# ("relationship"$$$$...) record formats requested here are what the parser
# function passed to the extractor has to recognise.
KG_TRIPLET_EXTRACT_TMPL = """
-Goal-
Given a text document, identify all entities and their entity types from the text and all relationships among the identified entities.
Given the text, extract up to {max_knowledge_triplets} entity-relation triplets.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: Type of the entity
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"$$$$<entity_name>$$$$<entity_type>$$$$<entity_description>)

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relation: relationship between source_entity and target_entity
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other

Format each relationship as ("relationship"$$$$<source_entity>$$$$<target_entity>$$$$<relation>$$$$<relationship_description>)

3. When finished, output.

-Real Data-
######################
text: {text}
######################
output:"""

In [84]:
# NOTE: these two module-level patterns expect every field to be wrapped in double quotes.
# `parse_fn` below does not read them; the only other reference is the commented-out
# parser further down in this cell.
entity_pattern = r'\("entity"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\)'
relationship_pattern = r'\("relationship"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\)'
from llama_index.core.indices.property_graph import DynamicLLMPathExtractor

# def parse_fn(response_str: str) -> Any:
#     entities = re.findall(entity_pattern, response_str)
#     relationships = re.findall(relationship_pattern, response_str)
#     print(f"response_str: {response_str}")
#     print(f"entities: {entities}")
#     print(f"relationships: {relationships}")
#     if entities == []:
#         entities = [("DummyE", "DummyE", "DummyE",)]
#     if relationships == []:
#         relationships = [("DummyR", "DummyR", "DummyR", "DummyR")]
#     return entities, relationships

def parse_fn(response_str: str) -> Any:
    """Parse entity and relationship records out of raw LLM output.

    Recognised records (fields separated by ``$$$$``):

    - ``("entity"$$$$<name>$$$$<type>$$$$<description>)``
    - ``("relationship"$$$$<source>$$$$<target>$$$$<relation>$$$$<description>)``

    Records may be surrounded by arbitrary text (list numbering, headings,
    code fences). Parentheses inside a field are preserved: a record ends at
    the first ``)`` that closes its line (optionally followed by ``.``, ``,``
    or ``;``) or that directly precedes the next record. If no such ``)``
    exists, the first ``)`` is used. Records without a closing ``)`` or with
    too few fields are skipped.

    Args:
        response_str: The text returned by the LLM.

    Returns:
        ``(entities, relationships)``: a list of 3-tuples and a list of
        4-tuples of whitespace-stripped strings, in order of appearance. Both
        lists are empty when nothing could be parsed; no placeholder records
        are fabricated, so nothing artificial ends up in the graph.
    """
    record_start = re.compile(r'\("(entity|relationship)"\$\$\$\$')
    # A ")" that ends its line / the segment, allowing one trailing , . or ;
    record_end = re.compile(r'\)(?=[.,;]?[ \t]*(?:\r?\n|\Z))')
    field_counts = {"entity": 3, "relationship": 4}

    parsed = {"entity": [], "relationship": []}
    starts = list(record_start.finditer(response_str))

    for position, start in enumerate(starts):
        # A record can never extend past the beginning of the next record.
        if position + 1 < len(starts):
            limit = starts[position + 1].start()
        else:
            limit = len(response_str)
        segment = response_str[start.end():limit]

        closing = record_end.search(segment)
        cut = closing.start() if closing else segment.find(")")
        if cut < 0:
            continue  # unterminated record, e.g. the reply was cut off

        kind = start.group(1)
        expected = field_counts[kind]
        # maxsplit keeps any extra "$$$$" inside the free-text last field.
        fields = segment[:cut].split("$$$$", expected - 1)
        if len(fields) != expected:
            continue

        parsed[kind].append(tuple(field.strip() for field in fields))

    entities = parsed["entity"]
    relationships = parsed["relationship"]

    print(f"Found entities: {entities}")
    print(f"Found relationships: {relationships}")

    return entities, relationships

# Extractor wired to the custom prompt template and the custom parser.
kg_extractor = GraphRAGExtractor(
    llm=llm,
    extract_prompt=KG_TRIPLET_EXTRACT_TMPL,
    parse_fn=parse_fn,
    max_paths_per_chunk=20,
    num_workers=4,
)
# max_triplets_per_chunk=20,
#         num_workers=4
# kg_extractor = DynamicLLMPathExtractor(
#             llm=llm,
#             max_triplets_per_chunk=20,
#             num_workers=4,
#             allowed_entity_types=None,
#             allowed_relation_types=None,
#             allowed_relation_props=["relationship_description"],
#             allowed_entity_props=[],
#             parse_fn=parse_fn,
#             extract_prompt=KG_TRIPLET_EXTRACT_TMPL,
# )

## Docker and Neo4j Setup

To launch Neo4j locally, first make sure Docker is installed. Then start the database with the following command.

```
docker run \
    -p 7474:7474 -p 7687:7687 \
    -v $PWD/data:/data -v $PWD/plugins:/plugins \
    --name neo4j-apoc \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4JLABS_PLUGINS=\[\"apoc\"\] \
    neo4j:latest
```
From here, you can open the db at http://localhost:7474/. On this page, you will be asked to sign in. Use the default username/password of neo4j and neo4j.

Once you log in for the first time, you will be asked to change the password. The connection cell below expects the new password to be `password` unless you change it there.

In [85]:
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

# Note: used to be `Neo4jPGStore`
import os

# Connection settings are read from the environment so that real credentials
# do not have to be written into the notebook. When a variable is unset, the
# value previously hard-coded here is used.
graph_store = GraphRAGStore(
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "password"),
    url=os.environ.get("NEO4J_URL", "bolt://localhost:7687"),
    database=database,
)

In [86]:
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# embed_model = HuggingFaceEmbedding("sentence-transformers/all-MiniLM-L6-v2")
# Embedding model handed to the PropertyGraphIndex below.
embed_model = HuggingFaceEmbedding(
    "avsolatorio/GIST-all-MiniLM-L6-v2",
)
# GIST-all-MiniLM-L6-v2 
# PropertyGraphIndex.from_documents(
#             documents,
#             property_graph_store=graph_store,
#             llm=self.llm,
#             embed_model=self.embed_model,
#             embed_kg_nodes=True,
#             kg_extractors=[self.kg_extractor],
#             show_progress=True
# )

# Build the index from the pre-split nodes, using the custom extractor and the
# Neo4j-backed GraphRAGStore as the property graph store.
index = PropertyGraphIndex(
    nodes=nodes,
    llm=llm,
    embed_model=embed_model,
    kg_extractors=[kg_extractor],
    property_graph_store=graph_store,
    show_progress=True,
)

Extracting paths from text:  20%|██        | 1/5 [00:44<02:57, 44.50s/it]

llm_response: ```plaintext
("entity"$$$$Latent Relation Mapping Engine$$$$Algorithm$$$$The Latent Relation Mapping Engine is an algorithm designed to automate the process of finding analogical mappings between lists of words using a large corpus of raw text. It combines ideas from Structure Mapping Theory and Latent Relational Analysis, aiming to remove the need for complex hand-coded representations.)

("entity"$$$$Structure Mapping Theory (SMT)$$$$Theory$$$$Structure Mapping Theory is a computational model that suggests analogies are based on structural correspondences between two domains. This theory has been implemented in the Structure Mapping Engine (SME).)

("entity"$$$$Structure Mapping Engine (SME)$$$$Engine$$$$The Structure Mapping Engine is an implementation of the Structure Mapping Theory, which requires complex hand-coded representations to find analogical mappings.)

("entity"$$$$Latent Relational Analysis (LRA)$$$$Analysis Technique$$$$Latent Relational Analysis is a sta

Extracting paths from text:  40%|████      | 2/5 [00:48<01:02, 20.97s/it]

llm_response: ```plaintext
("entity"$$$$Belief Propagation$$$$Algorithm$$$$An iterative algorithm used for inference on tree structures, converging to unique minimum of Bethe free energy. It may not converge or find the global minimum in loopy graphs.)

("entity"$$$$Bethe Free Energy Functional$$$$Functional$$$$A functional used in statistical inference on trees that has a unique minimum but can have multiple minima and non-convergent solutions in loopy graphs.)

("entity"$$$$Maximum-Likelihood Solution$$$$Solution$$$$The optimal solution for maximum likelihood estimation, often not guaranteed to be found by Belief Propagation in the zero-temperature limit of loopy graphs.)

("entity"$$$$Linear Programming (LP) Algorithm$$$$Algorithm$$$$An efficient algorithm used for solving optimization problems with constraints, particularly useful when the matrix of constraints is Totally-Uni-Modular (TUM).)

("entity"$$$$Totally-Uni-Modular (TUM) Matrix$$$$Matrix$$$$A special type of matrix that g

Extracting paths from text:  60%|██████    | 3/5 [01:31<01:01, 30.64s/it]

llm_response: ### Entities Identified:
1. ("entity"$$$$Corner Detector$$$$Type$$$$Determines how likely it is to be useful in real-world applications; involves repeatability and efficiency.")
2. ("entity"$$$$Repeatability$$$$Metric$$$$Measure of consistency across different viewing positions of the same scene.)
3. ("entity"$$$$Efficiency$$$$Metric$$$$Determine whether the detector can operate at frame rate with further processing.)
4. ("entity"$$$$Feature Detection$$$$Process$$$$The process of identifying corners in an image or video.)
5. ("entity"$$$$Live PAL Video$$$$Input$$$$Video input processed by the feature detector for real-time application.)
6. ("entity"$$$$Harris Detector$$$$Existing Technology$$$$A corner detection algorithm that operates slower than the new heuristic detector.")
7. ("entity"$$$$SIFT (Scale-Invariant Feature Transform)$$$$Existing Technology$$$$Another slow corner detection algorithm compared with the new heuristic detector.")
8. ("entity"$$$$Machine Learnin

Extracting paths from text:  80%|████████  | 4/5 [01:33<00:19, 19.31s/it]

llm_response: Based on the given text, let's extract and identify entities, their types, descriptions, relations, and detailed explanations.

### Step 1: Identifying Entities

1. **Garg**
   - entity_name: Garg
   - entity_type: Person
   - entity_description: A researcher or co-author involved in the work related to access control logics.

2. **Abadi**
   - entity_name: Abadi
   - entity_type: Person
   - entity_description: A researcher or co-author involved in the work related to access control logics.

3. **Automating Access Control Logics in Simple Type Theory with LEO-II** (This is a title, not an entity)
4. **LEO-II**
   - entity_name: LEO-II
   - entity_type: Software
   - entity_description: A higher-order theorem prover used for automated reasoning.

5. **Modal logic S4**
   - entity_name: Modal logic S4
   - entity_type: Logical System
   - entity_description: A modal logic system known for its sound and complete translations of access control logics.

6. **Simple Type Theor

Extracting paths from text: 100%|██████████| 5/5 [01:53<00:00, 22.79s/it]


llm_response: ### Step 1: Identify all entities

- Grammar-Based Random Walkers: $$$$Grammar-Based Random Walkers$$$$Random Walkers$$$$A model used in semantic networks to rank vertices based on a modified random walker constrained by a user-defined grammar.
- Semantic Networks: $$$$Semantic Networks$$$$Networks$$$$Collections of interconnected concepts or nodes where edges qualify the relationships between these nodes. The text mentions that vertices can be "central" and relationships are context-dependent.
- Vertices: $$$$Vertices$$$$Nodes$$$$The basic elements in semantic networks, representing concepts or entities. Their centrality is subjective and depends on the relationship type.
- Context-Based Rankings: $$$$Context-Based Rankings$$$$Rankings$$$$Metrics based on user-defined contexts that rank vertices in a semantic network. The text highlights their importance over direct vertex rankings.
- Eigenvector Centrality: $$$$Eigenvector Centrality$$$$Metric$$$$A measure of the influe

Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
Generating embeddings: 100%|██████████| 5/5 [00:01<00:00,  3.73it/s]


In [87]:
# Inspect the (entity, relation, entity) triplets currently held by the graph store.
index.property_graph_store.get_triplets()

# for triplet in index.property_graph_store.get_triplets():
#     print(triplet.re)

[[EntityNode(label='Random Walkers', embedding=None, properties={'id': 'Grammar-Based Random Walkers', 'entity_description': 'A model used in semantic networks to rank vertices based on a modified random walker constrained by a user-defined grammar.', 'triplet_source_id': 'a65ebbac-9058-452f-b607-33bffe930c8a'}, name='Grammar-Based Random Walkers'),
  Relation(label='Based On', source_id='Grammar-Based Random Walkers', target_id='Random Walkers (Markov Chain Analysis)', properties={'triplet_source_id': 'a65ebbac-9058-452f-b607-33bffe930c8a', 'relationship_description': 'The text explicitly states that the random walker model is a modified version of the random walker model used in Markov chain analysis.'}),
  EntityNode(label='Model', embedding=None, properties={'id': 'Random Walkers (Markov Chain Analysis)', 'entity_description': 'A probabilistic model where the probability of moving from one vertex to another is defined by transition probabilities, modified here by a user-defined gra

In [88]:
# Properties of the first entity of one sample triplet. Index 10 is an arbitrary
# pick and requires at least 11 triplets to be present.
index.property_graph_store.get_triplets()[10][0].properties

{'id': 'Latent Relation Mapping Engine',
 'entity_description': 'The Latent Relation Mapping Engine is an algorithm designed to automate the process of finding analogical mappings between lists of words using a large corpus of raw text. It combines ideas from Structure Mapping Theory and Latent Relational Analysis, aiming to remove the need for complex hand-coded representations.',
 'triplet_source_id': '2d30093d-f4a3-4e8f-930f-40eaecaaa726'}

In [89]:
# Properties of the relation of the same sample triplet (again assumes index 10 exists).
index.property_graph_store.get_triplets()[10][1].properties

{'triplet_source_id': '2d30093d-f4a3-4e8f-930f-40eaecaaa726',
 'relationship_description': 'LRME integrates LRA to automatically discover semantic relations among words without the need for hand-coded representations, much like how SMT is integrated with LRA in LRME.'}

### Build communities

This creates the communities and generates a summary for each one.

In [90]:
# Cluster the stored graph into communities and have the LLM summarize each of them.
index.property_graph_store.build_communities()

In [None]:
# Show the community-id -> summary mapping (built on demand if it is still empty).
index.property_graph_store.get_community_summaries()

{0: 'The relationships provided describe how "Grammar-Based Random Walkers" and "Random Walkers (Markov Chain Analysis)" are interconnected, specifically through their dependency on each other. \n\nKey points:\n- Both "Grammar-Based Random Walkers" and "Random Walkers (Markov Chain Analysis)" are based on a common foundational concept: the random walker model.\n- However, "Grammar-Based Random Walkers" is derived as a modified version of the traditional "Random Walker Model," which originates from Markov chain analysis.\n\nThis summary highlights that both concepts share a fundamental similarity but differ in their specific applications or modifications.',
 1: 'In semantic networks, vertices are key components that represent interconnected concepts. These vertices can be analyzed using measures like eigenvector centrality and PageRank to evaluate their influence in the network. Both eigenvector centrality and PageRank rank the significance of vertices, with eigenvector centrality empha

In [None]:
import networkx as nx
from pyvis.network import Network

# Export the NetworkX view of the stored graph as an interactive HTML page.
community_graph = graph_store._create_nx_graph()

net = Network(directed=True, select_menu=True, filter_menu=True)
net.show_buttons()
net.from_nx(community_graph)
net.write_html("community_graph.html")

### Create QueryEngine

In [92]:
# Query engine over the index and its community-aware graph store.
query_engine = GraphRAGQueryEngine(
    index=index,
    graph_store=index.property_graph_store,
    llm=llm,
    similarity_top_k=10,
)

### Querying

In [93]:
# Ask a corpus-level question and render the answer as Markdown.
response = query_engine.query("What are the main topics discussed in the papers?")
display(Markdown(str(response.response)))

The main topics discussed across the papers are diverse, covering several key areas:

1. **Vertices and Concepts**: The use of vertices as fundamental components representing interconnected concepts within semantic networks.
2. **Network Analysis Techniques**:
   - **Eigenvector Centrality**: Evaluates vertex significance based on their influence from neighboring vertices.
   - **PageRank**: Assesses the relative importance or significance of vertices, particularly in web-based contexts.

3. **Repeatability and Efficiency in Corner Detectors**: 
   - Repeatability ensures consistent corner detection across different conditions.
   - Efficiency concerns computational resources and frame rate requirements for practical utility.

4. **Theoretical vs. Practical Models**:
   - Connections between theoretical models like Structure Mapping Theory (SMT) and Latent Relational Analysis (LRA).
   - Automated semantic relation mapping using tools such as the Latent Relation Mapping Engine (LRME) and Structure Mapping Engine (SME).

5. **Automated Semantic Relation Mapping**: 
   - LRME, inspired by SMT and LRA, aims to automate semantic relation discovery more efficiently.
   - Comparison with SME’s hand-coded representations for theoretical models.

These topics collectively explore the evaluation of network structure through analytical tools, practical aspects of computer vision techniques, and advancements in automated semantic analysis.

In [94]:
# Ask which papers overlap the most and render the answer as Markdown.
response = query_engine.query("Which papers have the most in common?")
display(Markdown(str(response.response)))

Based on the provided summaries, to identify papers or entities that have the most in common:

1. For semantic networks and measures like eigenvector centrality (EVC) and PageRank:
   - Common focus is on ranking vertices based on their influence using these centrality measures.
   - Relevant papers would compare EVC and PageRank in different types of networks, particularly semantic networks, and analyze the impact of these measures in complex information systems.

2. For corner detectors:
   - The most common papers will discuss both repeatability (consistency across viewing positions) and efficiency (operational speed and resource management), providing a comprehensive understanding of detector performance.

3. For Structure Mapping Theory (SMT) and Structure Mapping Engine (SME):
   - Both share a foundational relationship where SMT is the theoretical basis for SME, an automated implementation that integrates additional capabilities but retains the core principles of SMT.

In summary, the common threads are comparing measures in networks, balancing performance metrics in algorithms, and linking theoretical frameworks with practical implementations.

In [None]:

# Render the question text itself as Markdown.
user_query = "What are the main news in energy sector?"
display(Markdown(user_query))

NameError: name 'Markdown' is not defined

In [None]:
# Run the question through the query engine, then show the question and its answer.
user_query = "What are the main news in energy sector?"
response = query_engine.query(user_query)

display(Markdown(user_query))
display(Markdown(str(response.response)))

NameError: name 'query_engine' is not defined