In [None]:
import os
# Placeholder only. `setdefault` keeps a key that is already exported in the
# environment, so running this cell never overwrites a real credential with
# the dummy value; replace the placeholder only if no key is configured.
os.environ.setdefault("OPENAI_API_KEY", "YOUR_API_KEY")

In [None]:
import numpy as np
import json
from collections import defaultdict
from scipy.sparse import csr_matrix
from pymilvus import MilvusClient
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from tqdm import tqdm

In [3]:
# Client for the Milvus server reachable on localhost.
milvus_client = MilvusClient("http://localhost:19530")

# Chat model with temperature 0; used below for reranking and for answering.
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)

# Embedding model used for the stored texts and for the query.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

## Data Preparation

In [4]:
# Tiny hand-annotated corpus: each item pairs a passage with the
# [subject, predicate, object] triplets extracted from it. Entity strings are
# matched verbatim when the graph is built, so the same entity must be spelled
# (and capitalised) identically in every triplet.
nano_dataset = [
    {
        "passage": "Jakob Bernoulli (1654–1705): Jakob was one of the earliest members of the Bernoulli family to gain prominence in mathematics. He made significant contributions to calculus, particularly in the development of the theory of probability. He is known for the Bernoulli numbers and the Bernoulli theorem, a precursor to the law of large numbers. He was the older brother of Johann Bernoulli, another influential mathematician, and the two had a complex relationship that involved both collaboration and rivalry.",
        "triplets": [
            ["Jakob Bernoulli", "made significant contributions to", "calculus"],
            [
                "Jakob Bernoulli",
                "made significant contributions to",
                "the theory of probability",
            ],
            ["Jakob Bernoulli", "is known for", "the Bernoulli numbers"],
            ["Jakob Bernoulli", "is known for", "the Bernoulli theorem"],
            ["The Bernoulli theorem", "is a precursor to", "the law of large numbers"],
            ["Jakob Bernoulli", "was the older brother of", "Johann Bernoulli"],
        ],
    },
    {
        "passage": "Johann Bernoulli (1667–1748): Johann, Jakob’s younger brother, was also a major figure in the development of calculus. He worked on infinitesimal calculus and was instrumental in spreading the ideas of Leibniz across Europe. Johann also contributed to the calculus of variations and was known for his work on the brachistochrone problem, which is the curve of fastest descent between two points.",
        "triplets": [
            [
                "Johann Bernoulli",
                "was a major figure of",
                "the development of calculus",
            ],
            ["Johann Bernoulli", "was", "Jakob's younger brother"],
            ["Johann Bernoulli", "worked on", "infinitesimal calculus"],
            ["Johann Bernoulli", "was instrumental in spreading", "Leibniz's ideas"],
            ["Johann Bernoulli", "contributed to", "the calculus of variations"],
            ["Johann Bernoulli", "was known for", "the brachistochrone problem"],
        ],
    },
    {
        "passage": "Daniel Bernoulli (1700–1782): The son of Johann Bernoulli, Daniel made major contributions to fluid dynamics, probability, and statistics. He is most famous for Bernoulli’s principle, which describes the behavior of fluid flow and is fundamental to the understanding of aerodynamics.",
        "triplets": [
            ["Daniel Bernoulli", "was the son of", "Johann Bernoulli"],
            ["Daniel Bernoulli", "made major contributions to", "fluid dynamics"],
            ["Daniel Bernoulli", "made major contributions to", "probability"],
            ["Daniel Bernoulli", "made major contributions to", "statistics"],
            ["Daniel Bernoulli", "is most famous for", "Bernoulli’s principle"],
            [
                "Bernoulli’s principle",
                "is fundamental to",
                "the understanding of aerodynamics",
            ],
        ],
    },
    {
        "passage": "Leonhard Euler (1707–1783) was one of the greatest mathematicians of all time, and his relationship with the Bernoulli family was significant. Euler was born in Basel and was a student of Johann Bernoulli, who recognized his exceptional talent and mentored him in mathematics. Johann Bernoulli’s influence on Euler was profound, and Euler later expanded upon many of the ideas and methods he learned from the Bernoullis.",
        "triplets": [
            [
                "Leonhard Euler",
                "had a significant relationship with",
                "the Bernoulli family",
            ],
            # Subject capitalised like the other triplets so it maps to the
            # same entity node instead of creating a duplicate.
            ["Leonhard Euler", "was born in", "Basel"],
            ["Leonhard Euler", "was a student of", "Johann Bernoulli"],
            ["Johann Bernoulli's influence", "was profound on", "Euler"],
        ],
    },
]

In [6]:
# Graph bookkeeping derived from the annotated triplets. Ids are list positions:
#   entities / relations / passages : id -> text
#   entityid_2_relationids          : entity id   -> ids of relations it appears in
#   relationid_2_passageids         : relation id -> ids of passages stating it
entityid_2_relationids = defaultdict(list)
relationid_2_passageids = defaultdict(list)

entities = []
relations = []
passages = []

# Reverse indexes (text -> id) so membership tests and id lookups are O(1)
# instead of repeated `x in list` / `list.index(x)` scans.
entity_2_id = {}
relation_2_id = {}

for passage_id, dataset_info in enumerate(nano_dataset):
    passages.append(dataset_info["passage"])
    for triplet in dataset_info["triplets"]:
        # Register subject then object, in that order, the first time each is seen.
        endpoint_ids = []
        for entity in (triplet[0], triplet[2]):
            if entity not in entity_2_id:
                entity_2_id[entity] = len(entities)
                entities.append(entity)
            endpoint_ids.append(entity_2_id[entity])

        relation = " ".join(triplet)
        if relation not in relation_2_id:
            relation_2_id[relation] = len(relations)
            relations.append(relation)
            # Link both endpoints to the relation only when it is first created.
            for entity_id in endpoint_ids:
                entityid_2_relationids[entity_id].append(relation_2_id[relation])
        relationid_2_passageids[relation_2_id[relation]].append(passage_id)

In [7]:
# create milvus collection
# Embed a throwaway string once to learn the vector length produced by the
# embedding model; this value is passed as `dimension` when collections are created.
embedding_dim = len(embedding_model.embed_query("foo"))

def create_milvus_collection(collection_name: str):
    """(Re)create a Milvus collection sized for the embedding vectors.

    If a collection with this name already exists it is dropped first, so the
    collection is always created fresh.

    Args:
        collection_name: Name of the collection to create.
    """
    already_exists = milvus_client.has_collection(collection_name=collection_name)
    if already_exists:
        milvus_client.drop_collection(collection_name=collection_name)

    milvus_client.create_collection(
        collection_name=collection_name,
        dimension=embedding_dim,
        consistency_level="Strong",
    )

entity_col_name = "entity_collection"
relation_col_name = "relation_collection"
passage_col_name = "passage_collection"  # use as naive RAG retrieval comparison

# Create the three collections, in the same order as they are declared.
for col_name in (entity_col_name, relation_col_name, passage_col_name):
    create_milvus_collection(col_name)

In [8]:
# insert data into milvus collection
def milvus_insert(
    collection_name: str,
    text_list: list[str],
):
    """Embed texts and insert them into a Milvus collection.

    Every row is stored as ``{"id", "text", "vector"}`` where ``id`` is the
    position of the text in ``text_list``. Texts are processed in batches of
    512; an empty ``text_list`` inserts nothing.

    Args:
        collection_name: Target collection.
        text_list: Texts to embed and store, in id order.
    """
    batch_size = 512
    for row_id in tqdm(range(0, len(text_list), batch_size), desc="Inserting"):
        batch_texts = text_list[row_id : row_id + batch_size]
        # Embed the whole batch with a single call instead of one
        # embedding request per text.
        batch_embeddings = embedding_model.embed_documents(batch_texts)
        # Row ids continue from the batch offset so they match list positions.
        batch_data = [
            {"id": row_id + offset, "text": text, "vector": vector}
            for offset, (text, vector) in enumerate(
                zip(batch_texts, batch_embeddings)
            )
        ]
        milvus_client.insert(
            collection_name=collection_name,
            data=batch_data,
        )

# Fill each collection with its texts; row ids equal the list positions.
for col_name, col_texts in (
    (entity_col_name, entities),
    (relation_col_name, relations),
    (passage_col_name, passages),
):
    milvus_insert(col_name, col_texts)

Inserting: 100%|██████████| 1/1 [00:16<00:00, 16.87s/it]
Inserting: 100%|██████████| 1/1 [00:13<00:00, 13.37s/it]
Inserting: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]


## Similarity Retrieval

In [10]:
query = "What contribution did the son of Euler's teacher make?"

# Entities mentioned in the query, listed by hand here.
query_ner_list = ["Euler"]
top_k = 3

# Embed each query entity and the full question.
query_ner_embeddings = []
for query_ner in query_ner_list:
    query_ner_embeddings.append(embedding_model.embed_query(query_ner))
query_embedding = embedding_model.embed_query(query)

# Entity search: one query vector per entry in query_ner_list.
entity_search_res = milvus_client.search(
    collection_name=entity_col_name,
    data=query_ner_embeddings,
    limit=top_k,
    output_fields=["id"],  # metadata fields to return
)

# Relation search: a single query vector, so keep only its result list.
relation_search_res = milvus_client.search(
    collection_name=relation_col_name,
    data=[query_embedding],
    limit=top_k,
    output_fields=["id"],
)[0]

In [11]:
# constructing a relational graph through adjacency matrix
# entity_relation_adj[e, r] == 1 when entity e takes part in relation r.
# The matrix is built directly in sparse form, so no dense
# (num_entities x num_relations) array is allocated. Relation ids are
# de-duplicated per entity because csr_matrix sums repeated (row, col) pairs
# and the entries must stay 0/1.
adj_rows, adj_cols = [], []
for entity_id in range(len(entities)):
    for relation_id in sorted(set(entityid_2_relationids.get(entity_id, ()))):
        adj_rows.append(entity_id)
        adj_cols.append(relation_id)

entity_relation_adj = csr_matrix(
    (np.ones(len(adj_rows)), (adj_rows, adj_cols)),
    shape=(len(entities), len(relations)),
)

# Number of hops to expand; 1 means direct neighbours only.
target_degree = 1

# Entities sharing a relation / relations sharing an entity.
entity_adj_1_degree = entity_relation_adj @ entity_relation_adj.T
relation_adj_1_degree = entity_relation_adj.T @ entity_relation_adj

# Multi-hop adjacency by repeated matrix products. `@` is used throughout:
# it is a matrix product for every container type, whereas `*` is only a
# matrix product for scipy sparse *matrix* classes.
entity_adj_target_degree = entity_adj_1_degree
for _ in range(target_degree - 1):
    entity_adj_target_degree = entity_adj_target_degree @ entity_adj_1_degree
relation_adj_target_degree = relation_adj_1_degree
for _ in range(target_degree - 1):
    relation_adj_target_degree = relation_adj_target_degree @ relation_adj_1_degree

# Relations reachable from each entity after the entity-side expansion.
entity_relation_adj_target_degree = entity_adj_target_degree @ entity_relation_adj

In [12]:
# You can set the similarity threshold here to guarantee the quality of the retrieved ones.
# entity_sim_filter_thresh = ...
# relation_sim_filter_thresh = ...

# --- Expansion seeded by the relations retrieved for the whole query ---
filtered_hit_relation_ids = [
    relation_res["entity"]["id"]
    for relation_res in relation_search_res
    # if relation_res['distance'] > relation_sim_filter_thresh
]
expanded_relations_from_relation = set()
for hit_relation_id in filtered_hit_relation_ids:
    # Column indices of the non-zero entries are the neighbouring relation ids.
    neighbor_relation_ids = relation_adj_target_degree[hit_relation_id].nonzero()[1]
    expanded_relations_from_relation.update(neighbor_relation_ids.tolist())

# --- Expansion seeded by the entities retrieved for each query entity ---
filtered_hit_entity_ids = [
    one_entity_res["entity"]["id"]
    for one_entity_search_res in entity_search_res
    for one_entity_res in one_entity_search_res
    # if one_entity_res['distance'] > entity_sim_filter_thresh
]
expanded_relations_from_entity = set()
for filtered_hit_entity_id in filtered_hit_entity_ids:
    reachable_relation_ids = entity_relation_adj_target_degree[
        filtered_hit_entity_id
    ].nonzero()[1]
    expanded_relations_from_entity.update(reachable_relation_ids.tolist())

# Merge the expanded relations from the relation and entity retrieval ways.
# The ids are sorted because set iteration order is an implementation detail,
# and this order is the order in which candidates are listed in the rerank prompt.
relation_candidate_ids = sorted(
    expanded_relations_from_relation | expanded_relations_from_entity
)

relation_candidate_texts = [
    relations[relation_id] for relation_id in relation_candidate_ids
]

## LLM Reranking

In [None]:
# employ a one-shot prompt and Chain-of-Thought approach to rerank the relations
# One-shot example, user turn: the task instructions followed by a sample
# question and a numbered list of relationship descriptions.
query_prompt_one_shot_input = """I will provide you with a list of relationship descriptions. Your task is to select 3 relationships that may be useful to answer the given question. Please return a JSON object containing your thought process and a list of the selected relationships in order of their relevance.

Question:
When was the mother of the leader of the Third Crusade born?

Relationship descriptions:
[1] Eleanor was born in 1122.
[2] Eleanor married King Louis VII of France.
[3] Eleanor was the Duchess of Aquitaine.
[4] Eleanor participated in the Second Crusade.
[5] Eleanor had eight children.
[6] Eleanor was married to Henry II of England.
[7] Eleanor was the mother of Richard the Lionheart.
[8] Richard the Lionheart was the King of England.
[9] Henry II was the father of Richard the Lionheart.
[10] Henry II was the King of England.
[11] Richard the Lionheart led the Third Crusade.

"""
# One-shot example, assistant turn: a JSON object with the keys
# "thought_process" and "useful_relationships"; each selected relationship
# keeps its "[id]" prefix.
query_prompt_one_shot_output = """{"thought_process": "To answer the question about the birth of the mother of the leader of the Third Crusade, I first need to identify who led the Third Crusade and then determine who his mother was. After identifying his mother, I can look for the relationship that mentions her birth.", "useful_relationships": ["[11] Richard the Lionheart led the Third Crusade", "[7] Eleanor was the mother of Richard the Lionheart", "[1] Eleanor was born in 1122"]}"""

# Template for the real request; the placeholders are {question} and
# {relation_des_str}.
query_prompt_template = """Question:
{question}

Relationship descriptions:
{relation_des_str}

"""

def _extract_relation_id(line):
    """Return the integer inside the first ``[...]`` of ``line``, or ``None``.

    ``None`` is returned when ``line`` is not a string, contains no bracket
    pair, or the bracket content is not a plain non-negative integer.
    """
    if not isinstance(line, str):
        return None
    start = line.find("[")
    end = line.find("]", start + 1)
    if start == -1 or end == -1:
        return None
    token = line[start + 1 : end].strip()
    if token.isascii() and token.isdigit():
        return int(token)
    return None


def rerank_relations(
    query: str,
    relation_candidate_texts: list[str],
    relation_candidate_ids: list[int],
) -> list[int]:
    """Ask the LLM to pick the candidate relations most useful for ``query``.

    The candidates are rendered as ``[id] text`` lines and sent after a
    one-shot example. The model is requested to answer with a JSON object
    whose ``useful_relationships`` list repeats the chosen lines. The prompt
    input and the raw model reply are printed.

    Args:
        query: The user question.
        relation_candidate_texts: Relation descriptions, aligned with
            ``relation_candidate_ids``.
        relation_candidate_ids: Ids shown to the model in front of each text.

    Returns:
        The selected relation ids in the order the model returned them.
        Lines without a parseable ``[id]``, ids that were not among the
        candidates, and repeated ids are skipped. The list is empty when the
        reply has no ``useful_relationships`` list.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    relation_des_str = "\n".join(
        f"[{relation_id}] {relation_text}"
        for relation_id, relation_text in zip(
            relation_candidate_ids, relation_candidate_texts
        )
    ).strip()
    rerank_prompts = ChatPromptTemplate.from_messages(
        [
            # The example turns are passed as message objects rather than
            # templates, so the braces in the JSON example are sent literally.
            HumanMessage(query_prompt_one_shot_input),
            AIMessage(query_prompt_one_shot_output),
            HumanMessagePromptTemplate.from_template(query_prompt_template),
        ]
    )
    rerank_chain = (
        rerank_prompts
        | llm.bind(response_format={"type": "json_object"})
        | StrOutputParser()
    )
    print(f"question: {query}")
    print(relation_des_str)
    rerank_res_str = rerank_chain.invoke(
        {"question": query, "relation_des_str": relation_des_str}
    )
    print(rerank_res_str)
    rerank_res = json.loads(rerank_res_str)

    rerank_relation_lines = (
        rerank_res.get("useful_relationships", [])
        if isinstance(rerank_res, dict)
        else []
    )
    if not isinstance(rerank_relation_lines, list):
        rerank_relation_lines = []

    # Compare on the string form so candidate ids given as ints or as
    # numeric strings are both recognised.
    valid_ids = {str(candidate_id) for candidate_id in relation_candidate_ids}
    rerank_relation_ids = []
    for line in rerank_relation_lines:
        id_ = _extract_relation_id(line)
        if id_ is None or str(id_) not in valid_ids or id_ in rerank_relation_ids:
            # Unparseable line, id the model made up, or a repeated pick.
            continue
        rerank_relation_ids.append(id_)
    return rerank_relation_ids

# Rerank the merged candidates for the current query.
rerank_relation_ids = rerank_relations(
    query=query,
    relation_candidate_texts=relation_candidate_texts,
    relation_candidate_ids=relation_candidate_ids,
)

What contribution did the son of Euler's teacher make?
[5] Jakob Bernoulli was the older brother of Johann Bernoulli
[6] Johann Bernoulli was a major figure of the development of calculus
[7] Johann Bernoulli was Jakob's younger brother
[8] Johann Bernoulli worked on infinitesimal calculus
[9] Johann Bernoulli was instrumental in spreading Leibniz's ideas
[10] Johann Bernoulli contributed to the calculus of variations
[11] Johann Bernoulli was known for the brachistochrone problem
[12] Daniel Bernoulli was the son of Johann Bernoulli
[18] Leonhard Euler had a significant relationship with the Bernoulli family
[19] leonhard Euler was born in Basel
[20] Leonhard Euler was a student of Johann Bernoulli
[21] Johann Bernoulli's influence was profound on Euler
{"thought_process": "To determine the contribution of the son of Euler's teacher, I first need to identify Euler's teacher and then find his son. From the relationships, Johann Bernoulli was Euler's teacher. Therefore, I need to find t

## Get Final Results

In [29]:
final_top_k = 2

# Walk the reranked relations in order and collect the passages they map to,
# keeping only the first occurrence of each passage.
final_passages = []
final_passage_ids = []
for relation_id in rerank_relation_ids:
    for passage_id in relationid_2_passageids[relation_id]:
        if passage_id in final_passage_ids:
            continue
        final_passages.append(passages[passage_id])
        final_passage_ids.append(passage_id)

# Keep the first final_top_k passages.
passages_graph_rag = final_passages[:final_top_k]

# Baseline: search the passage collection directly with the query embedding.
naive_rag_hits = milvus_client.search(
    collection_name=passage_col_name,
    data=[query_embedding],
    limit=final_top_k,
    output_fields=["text"],
)[0]
passages_naive_rag = [hit["entity"]["text"] for hit in naive_rag_hits]

retrieval_report = (
    f"Passages retrieved from naive RAG: \n{passages_naive_rag}\n\n"
    f"Passages retrieved from graph RAG: \n{passages_graph_rag}\n\n"
)
print(retrieval_report)

# Single human-turn prompt with {question} and {context} placeholders.
rag_prompt_text = """Use the following pieces of retrieved context to answer the question. If there is not enough information in the retrieved context to answer the question, just say that you don't know.
Question: {question}
Context: {context}
Answer:"""

prompt = ChatPromptTemplate.from_messages([("human", rag_prompt_text)])

# prompt -> chat model -> plain string answer
rag_chain = prompt | llm | StrOutputParser()

# Same question and chain for both runs; only the retrieved context differs.
naive_rag_inputs = {"question": query, "context": "\n".join(passages_naive_rag)}
answer_from_naive_rag = rag_chain.invoke(naive_rag_inputs)

graph_rag_inputs = {"question": query, "context": "\n".join(passages_graph_rag)}
answer_from_graph_rag = rag_chain.invoke(graph_rag_inputs)


print(
    f"Answer from naive RAG: {answer_from_naive_rag}\n\nAnswer from our method: {answer_from_graph_rag}"
)

Passages retrieved from naive RAG: 
['Leonhard Euler (1707–1783) was one of the greatest mathematicians of all time, and his relationship with the Bernoulli family was significant. Euler was born in Basel and was a student of Johann Bernoulli, who recognized his exceptional talent and mentored him in mathematics. Johann Bernoulli’s influence on Euler was profound, and Euler later expanded upon many of the ideas and methods he learned from the Bernoullis.', 'Johann Bernoulli (1667–1748): Johann, Jakob’s younger brother, was also a major figure in the development of calculus. He worked on infinitesimal calculus and was instrumental in spreading the ideas of Leibniz across Europe. Johann also contributed to the calculus of variations and was known for his work on the brachistochrone problem, which is the curve of fastest descent between two points.']

Passages retrieved from graph RAG: 
['Leonhard Euler (1707–1783) was one of the greatest mathematicians of all time, and his relationship w

- Graph RAG enables multi-hop reasoning by leveraging a relational graph of entities and their connections.
- Unlike the naive RAG approach, Graph RAG explicitly traverses known relationships (e.g., "Euler → taught_by → Johann Bernoulli → father_of → Daniel Bernoulli → contributed_to → fluid dynamics") to resolve complex queries.