In [19]:
import os
import re

import pandas as pd
from llama_index.core import Document
from llama_index.core import PropertyGraphIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from llama_index.llms.ollama import Ollama

from GraphRAGExtractor import GraphRAGExtractor
from GraphRAGStore import GraphRAGStore
from RobustGraphRAGExtractor import RobustGraphRAGExtractor


In [2]:
# Input file and how many records to sample from it for this run.
json_path = "datasets/arxiv_cs_metadata.json"
nrows = 5

# The file is read as JSON-lines (one record per line), limited to `nrows`.
papers = pd.read_json(json_path, lines=True, nrows=nrows)

# One Document per paper; its text is "<title>: <abstract>".
documents = [
    Document(text=f"{row['title']}: {row['abstract']}")
    for row in papers.to_dict(orient="records")
]

# Split the documents into nodes and show how many were produced.
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(documents)
len(nodes)


5

In [3]:
# Prompt template for entity/relationship extraction. It has two format
# slots: {max_knowledge_triplets} and {text}.
#
# The "Format each ..." lines spell out the exact record layout the model must
# emit: every field double-quoted and separated by "$$$$". The parsing regexes
# used with this prompt only match quoted fields, so the placeholders here
# must stay quoted and in this order.
KG_TRIPLET_EXTRACT_TMPL = """
-Goal-
Given a text document, identify all entities and their entity types from the text and all relationships among the identified entities.
Given the text, extract up to {max_knowledge_triplets} entity-relation triplets.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: Type of the entity
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"$$$$"<entity_name>"$$$$"<entity_type>"$$$$"<entity_description>")

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relation: relationship between source_entity and target_entity
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other

Format each relationship as ("relationship"$$$$"<source_entity>"$$$$"<target_entity>"$$$$"<relation>"$$$$"<relationship_description>")

3. When finished, output.

-Real Data-
######################
text: {text}
######################
output:"""

In [4]:


# Record layouts recognised in the LLM response. Each field must be wrapped in
# double quotes and fields are separated by a literal "$$$$". No DOTALL flag is
# used, so a single field cannot span a line break.
#   entity:       ("entity"$$$$"name"$$$$"type"$$$$"description")
#   relationship: ("relationship"$$$$"source"$$$$"target"$$$$"relation"$$$$"description")
entity_pattern = r'\("entity"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\)'
relationship_pattern = r'\("relationship"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\)'


def parse_fn(response_str: str):
    """Extract entity and relationship records from raw LLM output.

    Args:
        response_str: Text to scan for records.

    Returns:
        A 2-tuple ``(entities, relationships)``. ``entities`` is a list of
        3-tuples of strings and ``relationships`` a list of 4-tuples of
        strings, one tuple per regex match in order of appearance. Either
        list is empty when nothing matches.
    """
    return (
        re.findall(entity_pattern, response_str),
        re.findall(relationship_pattern, response_str),
    )


# Extractor built from the prompt template and response parser defined in this
# notebook.
# NOTE(review): max_paths_per_chunk presumably fills the prompt's
# {max_knowledge_triplets} slot — confirm in GraphRAGExtractor.
kg_extractor = GraphRAGExtractor(
    parse_fn=parse_fn,
    extract_prompt=KG_TRIPLET_EXTRACT_TMPL,
    max_paths_per_chunk=2,
)

In [26]:
# Neo4j connection settings are read from the environment so real credentials
# never need to be saved in the notebook. The fallbacks are the original
# local-development values, so an unconfigured environment behaves as before.
graph_store = GraphRAGStore(
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "password"),
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
)

embed_model = HuggingFaceEmbedding("sentence-transformers/all-MiniLM-L6-v2")

# Alternative extractor with retry settings, kept for reference. To use it,
# uncomment and pass it in `kg_extractors` below instead of `kg_extractor`.
# robust_extractor = RobustGraphRAGExtractor(
#     extract_prompt=KG_TRIPLET_EXTRACT_TMPL,
#     max_paths_per_chunk=2,
#     parse_fn=parse_fn,
#     max_retries=3,  # Optional: customize retry settings
#     base_delay=1,
#     max_delay=60
# )

# Build the property graph index from the chunked nodes.
# NOTE(review): the last recorded run of this cell ended in ReadTimeout; no
# `llm` is passed here or to the extractor in the visible cells — confirm
# which LLM (and which request timeout) the extractor actually uses.
index = PropertyGraphIndex(
    nodes=nodes,
    kg_extractors=[kg_extractor],
    property_graph_store=graph_store,
    embed_model=embed_model,
    show_progress=True,
    use_async=False,
)



[A[A

ReadTimeout: 

In [15]:
# Connectivity check: send one short prompt to the "mistral" model through
# Ollama and display the raw completion object.
# NOTE(review): `llm` is not referenced by any other visible cell — confirm it
# is actually handed to the extractor/index (directly or via global settings).
llm = Ollama(
    model="mistral",
    request_timeout=300.0,  # increased timeout
)
response = llm.complete("Hello, are you working?")
response

CompletionResponse(text=" Yes, I'm here to help! How can I assist you today?", additional_kwargs={'tool_calls': []}, raw={'model': 'mistral', 'created_at': '2024-11-29T15:57:45.902986Z', 'message': {'role': 'assistant', 'content': " Yes, I'm here to help! How can I assist you today?"}, 'done_reason': 'stop', 'done': True, 'total_duration': 679190959, 'load_duration': 16734584, 'prompt_eval_count': 11, 'prompt_eval_duration': 228792000, 'eval_count': 17, 'eval_duration': 429117000, 'usage': {'prompt_tokens': 11, 'completion_tokens': 17, 'total_tokens': 28}}, logprobs=None, delta=None)