## Data Processing

### Load Data

In [1]:
import os
import re
import json

In [2]:
DATA_DIR = "./GraphRAG/data"

# Whole-word, case-insensitive match for "paris"; compiled once and reused for every line.
paris_pattern = re.compile(r'\bparis\b', re.IGNORECASE)
raw_sentences_path = os.path.join(DATA_DIR, "raw", "wikisent2.txt")

# Keep only the sentences that mention Paris, with surrounding whitespace stripped.
with open(raw_sentences_path, 'r', encoding='utf-8') as file:
    filtered_lines = [line.strip() for line in file if paris_pattern.search(line)]
    


In [3]:
# Number of sentences that matched the whole-word, case-insensitive "paris" filter.
len(filtered_lines)

9476

### Process Data

In [None]:
from rag_kg.utils.schema import *
from rag_kg.utils.client import get_chat_chain
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Prompt for extracting entity pairs and their relationship from one sentence.
# `{sentence}` is the text to analyse; `{format_instructions}` is presumably filled in
# from the output parser by get_chat_chain — confirm against that helper.
# NOTE: the name `template` is reassigned by several later cells.
template = """
You are an NER model with capability to identify relationships between entities in a sentence. Given the following sentence,
identify all the entity pairs and the relationship between them.

Here is the sentence separated in backticks (```):
```
{sentence}
```

Please provide the output in the following format:
{format_instructions}
"""


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_openai.chat_models.azure import AzureChatOpenAI
* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import time

In [6]:
# How many filtered sentences are sent to the model, and the pause between two calls.
SENTENCE_LIMIT = 30
PAUSE_SECONDS = 3

relationship_parser = PydanticOutputParser(pydantic_object=Relationships)
chain = get_chat_chain(
	template=template,
	parser=relationship_parser,
	input_variables=["sentence"]
)

# Collect the relationship dicts of all processed sentences into one flat list.
results = []
for idx, line in enumerate(filtered_lines[:SENTENCE_LIMIT]):
	result = chain.invoke({"sentence": line})
	dumped = result.model_dump()
	results.extend(dumped['relationships'])
	print(idx, dumped)
	time.sleep(PAUSE_SECONDS)

0 {'relationships': [{'entity1': {'name': 'Johann Stamitz', 'type': 'person'}, 'entity2': {'name': 'No. 3', 'type': 'book'}, 'relationship': 'written_by'}, {'entity1': {'name': 'Paris', 'type': 'city'}, 'entity2': {'name': 'France', 'type': 'country'}, 'relationship': 'located_in'}, {'entity1': {'name': 'No. 3', 'type': 'book'}, 'entity2': {'name': 'Paris', 'type': 'city'}, 'relationship': 'located_in'}, {'entity1': {'name': 'No. 3', 'type': 'book'}, 'entity2': {'name': '1754', 'type': 'date'}, 'relationship': 'dated_on'}, {'entity1': {'name': 'No. 3', 'type': 'book'}, 'entity2': {'name': '1755', 'type': 'date'}, 'relationship': 'dated_on'}]}
1 {'relationships': [{'entity1': {'name': 'Brussels', 'type': 'city'}, 'entity2': {'name': '146 countries', 'type': 'country'}, 'relationship': 'associated_with'}, {'entity1': {'name': 'Paris', 'type': 'city'}, 'entity2': {'name': '146 countries', 'type': 'country'}, 'relationship': 'associated_with'}]}
2 {'relationships': [{'entity1': {'name': '1

In [7]:
# Create the output directory first: opening a file for writing does not create missing
# parent directories, so the dump would otherwise fail when "processed" does not exist yet.
processed_dir = os.path.join(DATA_DIR, "processed")
os.makedirs(processed_dir, exist_ok=True)

# Persist the extracted relationships so the graph-building cells can reload them from disk.
with open(os.path.join(processed_dir, "wikisent2_paris_relationships.json"), 'w', encoding='utf-8') as json_file:
	json.dump(results, json_file, indent=4)

## Construct Knowledge Graph

In [8]:
# !pip install neo4j

In [9]:
from neo4j import GraphDatabase

# Resolve the connection settings first so that an unset variable is reported by name
# here, rather than being handed to the driver as None.
neo4j_env = {name: os.getenv(name) for name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")}
missing_env = [name for name, value in neo4j_env.items() if value is None]
if missing_env:
	raise RuntimeError("Missing Neo4j environment variables: " + ", ".join(missing_env))

driver = GraphDatabase.driver(
	neo4j_env["NEO4J_URI"],
	auth=(neo4j_env["NEO4J_USERNAME"], neo4j_env["NEO4J_PASSWORD"]),
)
# The session is left open on purpose: the following cells reuse it for every query.
session = driver.session()

In [10]:
# Reload the relationship list that was written to the processed-data folder.
relationships_path = os.path.join(DATA_DIR, "processed", "wikisent2_paris_relationships.json")
with open(relationships_path, 'r', encoding='utf-8') as json_file:
	relationships = json.load(json_file)

In [11]:
# Upsert both endpoint entities (keyed by name + type) and a RELATED_TO edge between them
# for every relationship in the list; MERGE keeps re-runs from creating duplicates.
query = """
UNWIND $relationships AS relationship
MERGE (entity1:Entity {name: relationship.entity1.name, type: relationship.entity1.type})
MERGE (entity2:Entity {name: relationship.entity2.name, type: relationship.entity2.type})
MERGE (entity1)-[:RELATED_TO {relationship: relationship.relationship}]->(entity2)
"""

result = session.run(query, relationships=relationships)
# Result content is fetched lazily, so drain it here: any failure of the write is then
# raised in this cell instead of surfacing when a later cell runs the next query.
write_summary = result.consume()

In [12]:
# Read back every RELATED_TO edge together with the name and type of both endpoints.
# The result is only bound to `result` here; this cell does not iterate or display it.
query = """
MATCH (e1:Entity)-[r:RELATED_TO]->(e2:Entity)
RETURN e1.name AS Entity1, e1.type AS Type1, 
       e2.name AS Entity2, e2.type AS Type2,
       r.relationship AS Relationship
"""

result = session.run(query)

## Query Data

In [18]:
class EntityExtractor():
	"""Extract the entities mentioned in a sentence through a chat chain.

	The prompt, the output parser (built for the `Entities` schema) and the list of
	input variables are fixed at construction time. The chain itself is created on the
	first call to `extract` and reused afterwards.
	"""

	def __init__(self, client: str = "google", model: str = "gemini-2.0-flash"):
		"""Store the configuration used to build the extraction chain.

		Args:
			client: Name of the chat client. Kept on the instance only.
			model: Name of the chat model. Kept on the instance only.
		"""
		# NOTE(review): client and model were previously accepted and dropped. They are
		# stored now, but still not forwarded to get_chat_chain because its signature is
		# not visible from this notebook — confirm whether it accepts them.
		self.client = client
		self.model = model
		self.template = """
You are an NER model with the capability to identify different entities in a sentence. Given the following sentence, identify all the entities in it.

Here is the sentence separated in backticks (```):
```
{sentence}
```

Please provide the output in the following format:
{format_instructions}
"""
		self.parser = PydanticOutputParser(pydantic_object=Entities)
		self.input_variables = ["sentence"]
		# Built lazily by extract(); None means "not created yet".
		self._chain = None

	def extract(self, sentence: str):
		"""Return the `entities` field of the parsed model output for `sentence`.

		Args:
			sentence: Text to run the entity-extraction prompt on.

		Returns:
			Whatever `model_dump()["entities"]` yields for the chain result.
		"""
		# Create the chain once and reuse it: nothing it is built from changes per call.
		if self._chain is None:
			self._chain = get_chat_chain(
				template=self.template,
				parser=self.parser,
				input_variables=self.input_variables
			)
		parsed = self._chain.invoke({"sentence": sentence})
		return parsed.model_dump()["entities"]

In [19]:
# Example question used to exercise the entity extractor with its default settings.
sentence = "What are some sport events that held and some books written in cities of France?"
entities_extractor = EntityExtractor()
entities = entities_extractor.extract(sentence)

In [20]:
# Show the entities returned by the extractor for the example sentence.
entities

[{'name': 'sport events', 'type': 'event'},
 {'name': 'books', 'type': 'book'},
 {'name': 'France', 'type': 'country'}]

In [21]:
# Find everything within one to three hops of the nodes that correspond to the extracted
# entities. Each start node is compared against one entity at a time, so it must carry the
# name AND the type of the same entity; two independent IN lists would also accept a name
# taken from one entity combined with the type of another.
# NOTE(review): every row pairs a relationship taken from the path with the path's two end
# nodes rather than with that relationship's own endpoints — confirm this is intended for
# paths longer than one hop.
query = """
MATCH path = (startNode)-[*1..3]-(connectedNode)
WHERE any(entity IN $entities WHERE startNode.name = entity.name AND startNode.type = entity.type)
UNWIND relationships(path) AS rel
RETURN startNode AS entity1, connectedNode AS entity2, rel
"""

# The extracted entities are passed as they are and read by the query as maps.
result = session.run(query, entities=entities)

In [22]:
# Render every returned row as "<entity2> - <rel> -> <entity1>" and keep the first
# occurrence of each distinct string. Dict keys preserve insertion order and give
# constant-time membership checks, unlike scanning a growing list for every row.
closest_entities = list(dict.fromkeys(
	f"{dict(record['entity2'])} - {dict(record['rel'])} -> {dict(record['entity1'])}"
	for record in result
))

In [23]:
# Prompt for generating `{n}` queries similar to `{query}`, using the graph neighbourhood
# passed as `{closest_entities}`.
# NOTE: this reassigns the shared `template` name used by the earlier extraction cell.
template = """
You are a similar query generator system. Your task is to use the entities and their relationships in the closest
entities list to generate {n} similar queries to the given query.

Here is the query separated in backticks (```):
```
{query}
```

Here are the closest entities related to the entities in the query separated in backticks (```):
```
{closest_entities}
```

Please provide the output in the following format:
{format_instructions}
"""

In [24]:
# Build the similar-query chain, then ask it for five variations of the example question.
queries_parser = PydanticOutputParser(pydantic_object=Queries)
chain = get_chat_chain(
    template=template,
    parser=queries_parser,
    input_variables=["n", "query", "closest_entities"]
)

similar_query_inputs = {
    "n": 5,
    "query": "What are some sport events that held and some books written in cities of France?",
    "closest_entities": closest_entities,
}
generated = chain.invoke(similar_query_inputs)
queries = generated.model_dump()['queries']

In [25]:
# Show the similar queries produced by the chain.
queries

['What are some music events that held and some books written in cities of France?',
 'What are some political events that held and some books written in cities of France?',
 'What are some sport events that held and some poems written in cities of France?',
 'What are some sport events that held and some articles written in cities of France?',
 'What are some sport events that held and some books written in Paris?']

In [26]:
# Prompt for classifying a query as "Single Hop" or "Multiple Hop".
# NOTE: `template` is reassigned again by a later cell, so any function that looks the
# name up at call time will see the most recent assignment, not necessarily this one.
template = """
You are a query analyzer system. Your task is to analyze the given queries and suggest
its complexity.

These are the complexity types:
- Single Hop: Queries that can be answered only by checking all the nodes connected to the main entity (node).
- Multiple Hop: Queries that require multiple hops to get sequential answers to for the final answer.

Here is the query separated in backticks (```):
```
{query}
```

Please provide the output in the following format:
{format_instructions}
"""

In [27]:
from langchain.output_parsers import EnumOutputParser

In [28]:
# Prompt used only by analyze_query_complexity. It is kept under its own name because the
# shared `template` global is reassigned to the sub-query prompt by a later cell before
# this function is first called, so reading `template` at call time sends the wrong prompt.
COMPLEXITY_TEMPLATE = """
You are a query analyzer system. Your task is to analyze the given queries and suggest
its complexity.

These are the complexity types:
- Single Hop: Queries that can be answered only by checking all the nodes connected to the main entity (node).
- Multiple Hop: Queries that require multiple hops to get sequential answers to for the final answer.

Here is the query separated in backticks (```):
```
{query}
```

Please provide the output in the following format:
{format_instructions}
"""


def analyze_query_complexity(query):
	"""Classify `query` with the complexity prompt and return the parsed result.

	Args:
		query: The natural-language query to classify.

	Returns:
		The chain output parsed by an `EnumOutputParser` over `ComplexityTypes`.
	"""
	parser = EnumOutputParser(enum=ComplexityTypes)
	chain = get_chat_chain(
		template=COMPLEXITY_TEMPLATE,
		parser=parser,
		input_variables=["query"]
	)
	return chain.invoke({
		"query": query
	})
    

In [29]:
# Prompt for splitting a query into sub-queries that can be executed in sequence.
# NOTE: this reassigns the shared `template` name; functions defined in earlier cells that
# look `template` up at call time will use this prompt from here on.
template = """
You are a query generator system. Your task is to break the query into multiple queries
that can be executed in sequence to get the final answer.

Here is the query separated in backticks (```):
```
{query}
```

Please provide the output in the following format:
{format_instructions}
"""

In [30]:
def get_sequential_queries_with_dependency(query):
	"""Run the decomposition chain on `query` and return its parsed output.

	The prompt is whatever the module-level `template` holds when this function is
	called; the output is parsed against the `SubQueries` schema.
	"""
	subquery_parser = PydanticOutputParser(pydantic_object=SubQueries)
	decomposition_chain = get_chat_chain(
		template=template,
		parser=subquery_parser,
		input_variables=["query"]
	)
	payload = {"query": query}
	return decomposition_chain.invoke(payload)

In [33]:
def query_planning(query):
	"""Decide how `query` should be executed, based on its classified complexity.

	Args:
		query: The natural-language query to plan.

	Returns:
		The string "Execute the query directly." for a single-hop query. For a
		multiple-hop query the decomposition, each sub-query, its dependencies and its
		extracted entities are printed and None is returned. Any other classification
		also returns None.
	"""
	complexity = analyze_query_complexity(query)
	if complexity == ComplexityTypes.SINGLE_HOP:
		return "Execute the query directly."
	elif complexity == ComplexityTypes.MULTIPLE_HOP:
		sequential_queries = get_sequential_queries_with_dependency(query)
		print(sequential_queries)
		# One extractor serves every sub-query: its constructor takes no per-query input,
		# so creating a new instance on each iteration was redundant work.
		extractor = EntityExtractor()
		for subquery in sequential_queries.yield_subquery_idx_to_execute():
			print("Query:", subquery)
			print("Dependencies:", subquery.dependencies)
			print("Extracted Entities:", extractor.extract(subquery.query))
	return None
		

In [34]:
# Example question run through the planner; for a multiple-hop query the planner prints
# the decomposition and the entities extracted from each sub-query.
query = "What are some sport or music events that held in cities of France or Europe?"
query_planning(query)

subqueries=[SubQuery(query='What are some cities in France?', dependencies=[]), SubQuery(query='What are some cities in Europe?', dependencies=[]), SubQuery(query='What are some sport events held in the cities from the previous queries?', dependencies=[0, 1]), SubQuery(query='What are some music events held in the cities from the previous queries?', dependencies=[0, 1])]
Query: query='What are some cities in France?' dependencies=[]
Dependencies: []
Extracted Entities: [{'name': 'France', 'type': 'country'}]
Query: query='What are some cities in Europe?' dependencies=[]
Dependencies: []
Extracted Entities: [{'name': 'Europe', 'type': 'country'}]
Query: query='What are some sport events held in the cities from the previous queries?' dependencies=[0, 1]
Dependencies: [0, 1]
Extracted Entities: [{'name': 'sport events', 'type': 'event'}, {'name': 'cities', 'type': 'city'}]
Query: query='What are some music events held in the cities from the previous queries?' dependencies=[0, 1]
Dependenc