## Extracting Context Topics and Contexts

In [None]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict

np.random.seed(42)
with open('../../data/taji_1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)

In [2]:
# Extract topics from context to new column, meanwhile, extracting the context themselves into a list
all_context = []

def extract_topics(context_list):
    topics = []
    for context_item in context_list:
        # element 0 is the topic, element 1 are the contexts
        topics.append(context_item[0])  
        all_context.append(('\n'.join(context_item[1])))  # Collect all contexts
    return topics

df['topics'] = df['context'].apply(extract_topics)

# Get all unique topics and contexts
all_topics = set()

for topics in df['topics']:
    all_topics.update(topics)
    
all_context = list(set(all_context))  # Remove duplicates from contexts
print(f"Total unique contexts: {len(all_context)}")
print(f"Total unique topics: {len(all_topics)}")

Total unique contexts: 470
Total unique topics: 388


In [None]:
## save all_context to a json file
with open('all_context.json', 'w', encoding='utf-8') as f:
    json.dump(all_context, f, ensure_ascii=False, indent=4)

## Splitting data into Train/Test

Also ensuring the fact that all contexts appear at least once in our train set.

In [6]:
topic_to_samples = defaultdict(list)
for idx, topics in enumerate(df['topics']):
    for topic in topics:
        topic_to_samples[topic].append(idx)

In [7]:
topic_to_samples

defaultdict(list,
            {'۱۳ دلیل برای اینکه ': [0, 1, 2, 256, 257, 258],
             'هانا بیکر ': [0, 1, 2, 510, 511],
             'سامورایی ': [3, 4],
             'سایتو دوسان ': [3, 4],
             'سوپرتاکس ': [5, 6, 246, 247, 248],
             'نینتندو ': [5, 6, 236, 237, 242],
             'مالک سیدیبه ': [7, 8, 9],
             'شیر طلایی ': [7, 8, 9, 241],
             'مسابقه آواز یوروویژن ۲۰۱۹ ': [10],
             'مسابقه آواز یوروویژن ۲۰۱۸ ': [10],
             'نامه فرهنگستان ': [11, 12, 13, 14, 15, 436],
             'سردبیر ': [11, 12, 13, 14, 15],
             'خاطرات بریجت جونز (فیلم) ': [16, 17, 18, 19],
             'جما جونز ': [16, 17, 18, 19],
             'لامپ نئون ': [20, 21, 22, 23],
             'جیوه ': [20, 21, 22, 23, 225, 226, 227, 228, 433, 434, 455],
             'زابیواکا (برنامه تلویزیونی) ': [24, 25, 26, 27, 28, 29, 30, 31],
             'جام جهانی فوتبال ۲۰۱۸ ': [24, 25, 26, 27, 28, 29, 30, 31],
             'استفانی مک\u200cمن ': [32, 3

In [8]:
# one sample per topic for training (to guarantee coverage)
guaranteed_train_indices = set()
for topic, sample_indices in topic_to_samples.items():
    selected_idx = np.random.choice(sample_indices)
    guaranteed_train_indices.add(selected_idx)

print(f"Guaranteed training samples: {len(guaranteed_train_indices)}")

Guaranteed training samples: 280


In [9]:
remaining_indices = set(range(len(df))) - guaranteed_train_indices
remaining_indices = list(remaining_indices)

additional_train_needed = 400 - len(guaranteed_train_indices)

additional_train_indices = np.random.choice(
    remaining_indices, 
    size=additional_train_needed, 
    replace=False
)
train_indices = list(guaranteed_train_indices) + list(additional_train_indices)
all_indices = set(range(len(df)))
test_indices = list(all_indices - set(train_indices))

# Create train and test splits
train_df = df.iloc[train_indices].reset_index(drop=True)
test_df = df.iloc[test_indices].reset_index(drop=True)

# Remove the helper 'topics' column
train_df = train_df.drop('topics', axis=1)
test_df = test_df.drop('topics', axis=1)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Train size: 400
Test size: 152


In [None]:
# Verify topic coverage in training 
train_topics = set()
for _, row in train_df.iterrows():
    for context_item in row['context']:
        if len(context_item) > 0:
            train_topics.add(context_item[0])

print(f"Topics in training set: {len(train_topics)}")

## Save the splits
# train_df.to_json('train_data.json', orient='records', force_ascii=False, indent=2)
# test_df.to_json('test_data.json', orient='records', force_ascii=False, indent=2)

Topics in training set: 388


# Extracting Knowledge Graph Triplets

In [1]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List
import dotenv
import json
import os

In [2]:
dotenv.load_dotenv()

MODEL_NAME = "gpt-4o-mini"

llm = ChatOpenAI(
    model=MODEL_NAME,
    temperature=0,
    api_key=os.getenv("METIS_API_KEY"),
    base_url="https://api.metisai.ir/openai/v1",
)

In [None]:
# load all_context from json file
with open('all_context.json', 'r', encoding='utf-8') as f:
    all_context = json.load(f)

# print 3 first contexts
print("First 3 contexts:")
for context in all_context[:3]:
    print(context)
    print("-" * 10)

First 3 contexts:
آفریقای جنوبی با نام رسمی جمهوری آفریقای جنوبی  کشوری در جنوب آفریقا و در سواحل دو اقیانوس اطلس و هند است.
این کشور با جمعیتی برابر ۵۹٫۶ میلیون تن چهارمین کشور پر جمعیت در آفریقای زیرصحرا می‌باشد.
----------
مانجاری فاندیس مدل و هنرپیشه اهل هند است.
مانجاری در سال ۲۰۰۸ با بازی در سریال اگه بدونی یا نه شناخته شد.
آقای تقلب فیلم دیگریست که او در آن نقش آفرینی کرده‌است.
----------
لوک اسکای‌واکر شخصیتی تخیلی و قهرمان داستان در سه‌گانهٔ اصلی فرنچایز جنگ ستارگان است که توسط جرج لوکاس پدید آمده‌است.
لوک را مارک همیل به تصویر کشیده‌است و نخستین بار در جنگ ستارگان ظاهر شد و در فیلم‌های امپراتوری ضربه متقابل می‌زند و بازگشت جدای بازگشت.
مارک همیل سه دهه بعد نقش لوک را در سه‌گانهٔ دنبالهٔ این فرنچایز متشکل از فیلم‌های نیرو برمی‌خیزد ، آخرین جدای و خیزش اسکای‌واکر ایفا کرد.
همیل همچنین به‌صورت بدل و دیجیتالی جوان‌شده در قسمت ۱۶ مجموعهٔ مندلورین در نقش لوک حضور داشت.
----------


In [None]:
class NamedEntities(BaseModel):
    """A structured list of named entities."""
    named_entities: List[str] = Field(
        ...,
        description="A list of named entities extracted from the provided passage."
    )

class KnowledgeTriple(BaseModel):
    """Represents a single Subject-Predicate-Object relationship."""
    subject: str = Field(..., description="The subject of the relationship.")
    predicate: str = Field(..., description="The predicate or verb phrase describing the relationship.")
    object: str = Field(..., description="The object of the relationship.")

class Triples(BaseModel):
    """A structured list of knowledge triples."""
    triples: List[KnowledgeTriple] = Field(
        ...,
        description="A list of knowledge triples (subject, predicate, object) extracted from the passage."
    )

In [11]:
ner_system_prompt = """
Instruction:
Your task is to extract named entities from the given paragraph in the user's message.
Respond with a JSON list of entities.

One-Shot Demonstration:
If the user provides the paragraph:
"Radio City is India's first private FM radio station and was started on 3 July 2001. It plays Hindi, English and regional songs. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features."

Your output should be a JSON object with a list of the extracted entities:
{{"named_entities": ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]}}
"""

# System Prompt for Open Information Extraction (OpenIE)
# This example is taken from Figure 9 of the HippoRAG paper.
openie_system_prompt = """
Instruction:
Your task is to construct an RDF (Resource Description Framework) graph from the given passages and named entity lists.
Respond with a JSON list of triples, with each triple representing a relationship in the RDF graph.
Pay attention to the following requirements:
- Each triple should contain at least one, but preferably two, of the named entities in the list for each passage.
- Clearly resolve pronouns to their specific names to maintain clarity.

One-Shot Demonstration:
If the user provides the paragraph and entity list:
Paragraph:
"Radio City is India's first private FM radio station and was started on 3 July 2001. It plays Hindi, English and regional songs. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features."
Named Entity List:
["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]

Your output should be a JSON object containing a list of the extracted triples:
{{"triples": [
    ["Radio City", "located in", "India"],
    ["Radio City", "is", "private FM radio station"],
    ["Radio City", "started on", "3 July 2001"],
    ["Radio City", "plays songs in", "Hindi"],
    ["Radio City", "plays songs in", "English"],
    ["Radio City", "forayed into", "New Media"],
    ["Radio City", "launched", "PlanetRadiocity.com"],
    ["PlanetRadiocity.com", "launched in", "May 2008"],
    ["PlanetRadiocity.com", "is", "music portal"],
    ["PlanetRadiocity.com", "offers", "news"],
    ["PlanetRadiocity.com", "offers", "videos"],
    ["PlanetRadiocity.com", "offers", "songs"]
]}}
"""

In [12]:
ner_prompt = ChatPromptTemplate.from_messages([
    ("system", ner_system_prompt),
    ("human", "{passage}")
])

openie_prompt = ChatPromptTemplate.from_messages([
    ("system", openie_system_prompt),
    ("human", "Paragraph:\n{passage}\n\nNamed Entity List:\n{entities}")
])

In [13]:
structured_ner_llm = llm.with_structured_output(NamedEntities)
structured_openie_llm = llm.with_structured_output(Triples)

ner_chain = ner_prompt | structured_ner_llm
openie_chain = openie_prompt | structured_openie_llm

In [None]:
def extract_knowledge_structured(passage: str) -> (List[str], List[dict]):
    """
    Implements the two-step HippoRAG extraction process using a system/user prompt structure.
    """
    try:
        ner_response = ner_chain.invoke({"passage": passage})
        entities = ner_response.named_entities
        print(f"  -> Extracted Entities: {entities}")
    except Exception as e:
        print(f"  -> Error during structured NER extraction: {e}")
        return [], []

    if not entities:
        print("  -> No entities found, skipping triple extraction.")
        return [], []

    try:
        entities_str = json.dumps(entities, ensure_ascii=False)
        openie_response = openie_chain.invoke({"passage": passage, "entities": entities_str})
        triples = [triple.model_dump() for triple in openie_response.triples]
        print(f"  -> Extracted Triples: {triples}")
        return entities, triples
    except Exception as e:
        print(f"  -> Error during structured OpenIE extraction: {e}")
        return entities, []

In [23]:
## Test 5 contexts
contexts_to_process = all_context[:5]
knowledge_graph_data = []

print("Starting Knowledge Graph Extraction (HippoRAG Paper Examples)")
for i, context_passage in enumerate(contexts_to_process):
    print(f"\ncontext {i+1}/5:")
    print(f"'{context_passage}'")

    extracted_entities, extracted_triples = extract_knowledge_structured(context_passage)

    if extracted_triples:
        knowledge_graph_data.append({
            "id": i,
            "passage": context_passage,
            "entities": extracted_entities,
            "triples": extracted_triples
        })
    print("-" * 20)

Starting Knowledge Graph Extraction (HippoRAG Paper Examples)

context 1/5:
'آفریقای جنوبی با نام رسمی جمهوری آفریقای جنوبی  کشوری در جنوب آفریقا و در سواحل دو اقیانوس اطلس و هند است.
این کشور با جمعیتی برابر ۵۹٫۶ میلیون تن چهارمین کشور پر جمعیت در آفریقای زیرصحرا می‌باشد.'


--------------------

context 2/5:
'مانجاری فاندیس مدل و هنرپیشه اهل هند است.
مانجاری در سال ۲۰۰۸ با بازی در سریال اگه بدونی یا نه شناخته شد.
آقای تقلب فیلم دیگریست که او در آن نقش آفرینی کرده‌است.'
--------------------

context 3/5:
'لوک اسکای‌واکر شخصیتی تخیلی و قهرمان داستان در سه‌گانهٔ اصلی فرنچایز جنگ ستارگان است که توسط جرج لوکاس پدید آمده‌است.
لوک را مارک همیل به تصویر کشیده‌است و نخستین بار در جنگ ستارگان ظاهر شد و در فیلم‌های امپراتوری ضربه متقابل می‌زند و بازگشت جدای بازگشت.
مارک همیل سه دهه بعد نقش لوک را در سه‌گانهٔ دنبالهٔ این فرنچایز متشکل از فیلم‌های نیرو برمی‌خیزد ، آخرین جدای و خیزش اسکای‌واکر ایفا کرد.
همیل همچنین به‌صورت بدل و دیجیتالی جوان‌شده در قسمت ۱۶ مجموعهٔ مندلورین در نقش لوک حضور داشت.'
--------------------

context 4/5:
'مارشال‌های آمریکایی فیلمی محصول سال ۱۹۹۸ آمریکا به کارگردانی استوارت برد است.
در این فیلم بازیگرانی همچون تامی لی جونز، رابرت داونی جونیور، وسلی اسنایپس، جو پانتولیانو، ایرن ژاکوب، کیت نلیگان، دانیال روبوک، لاتانیا ریچاردسون، پاتریک مالاهید 

In [None]:
# Now do the same for all of the contexts
from tqdm import tqdm

def extract_knowledge_structured(passage: str) -> (List[str], List[dict]):
    """
    Implements the two-step HippoRAG extraction process using a system/user prompt structure.
    """
    try:
        ner_response = ner_chain.invoke({"passage": passage})
        entities = ner_response.named_entities
        # print(f"  -> Extracted Entities: {entities}")
    except Exception as e:
        print(f"  -> Error during structured NER extraction: {e}")
        return [], []

    if not entities:
        print("  -> No entities found, skipping triple extraction.")
        return [], []

    try:
        entities_str = json.dumps(entities, ensure_ascii=False)
        openie_response = openie_chain.invoke({"passage": passage, "entities": entities_str})
        triples = [triple.model_dump() for triple in openie_response.triples]
        # print(f"  -> Extracted Triples: {triples}")
        return entities, triples
    except Exception as e:
        print(f"  -> Error during structured OpenIE extraction: {e}")
        return entities, []
    
    

knowledge_graph_data = []

print("Starting Knowledge Graph Extraction (HippoRAG Paper Examples)")

i = 0
for context_passage in tqdm(all_context):
    extracted_entities, extracted_triples = extract_knowledge_structured(context_passage)

    if extracted_triples:
        knowledge_graph_data.append({
            "id": i,
            "passage": context_passage,
            "entities": extracted_entities,
            "triples": extracted_triples
        })
        
    i += 1
    print("-" * 20)

Starting Knowledge Graph Extraction (HippoRAG Paper Examples)


  0%|          | 1/470 [00:06<50:50,  6.51s/it]

--------------------


  0%|          | 2/470 [00:11<44:09,  5.66s/it]

--------------------


  1%|          | 3/470 [00:21<1:00:23,  7.76s/it]

--------------------


  1%|          | 4/470 [00:32<1:10:41,  9.10s/it]

--------------------


  1%|          | 5/470 [00:38<59:39,  7.70s/it]  

--------------------


  1%|▏         | 6/470 [00:43<53:05,  6.87s/it]

--------------------


  1%|▏         | 7/470 [00:49<51:05,  6.62s/it]

--------------------


  2%|▏         | 8/470 [01:00<1:01:11,  7.95s/it]

--------------------


  2%|▏         | 9/470 [01:15<1:18:42, 10.24s/it]

--------------------


  2%|▏         | 10/470 [01:24<1:16:09,  9.93s/it]

--------------------


  2%|▏         | 11/470 [01:34<1:15:47,  9.91s/it]

--------------------


  3%|▎         | 12/470 [01:43<1:14:02,  9.70s/it]

--------------------


  3%|▎         | 13/470 [01:50<1:06:04,  8.68s/it]

--------------------


  3%|▎         | 14/470 [01:58<1:04:33,  8.49s/it]

--------------------


  3%|▎         | 15/470 [02:04<58:04,  7.66s/it]  

--------------------


  3%|▎         | 16/470 [02:13<1:00:51,  8.04s/it]

--------------------


  4%|▎         | 17/470 [02:20<58:35,  7.76s/it]  

--------------------


  4%|▍         | 18/470 [02:26<56:15,  7.47s/it]

--------------------


  4%|▍         | 19/470 [02:30<47:54,  6.37s/it]

--------------------


  4%|▍         | 20/470 [02:42<59:58,  8.00s/it]

--------------------


  4%|▍         | 21/470 [02:48<56:17,  7.52s/it]

--------------------


  5%|▍         | 22/470 [02:55<54:56,  7.36s/it]

--------------------


  5%|▍         | 23/470 [03:09<1:09:23,  9.31s/it]

--------------------


  5%|▌         | 24/470 [03:19<1:09:47,  9.39s/it]

--------------------


  5%|▌         | 25/470 [03:26<1:05:47,  8.87s/it]

--------------------


  6%|▌         | 26/470 [03:34<1:02:14,  8.41s/it]

--------------------


  6%|▌         | 27/470 [03:42<1:02:05,  8.41s/it]

--------------------


  6%|▌         | 28/470 [03:53<1:07:23,  9.15s/it]

--------------------


  6%|▌         | 29/470 [04:01<1:04:22,  8.76s/it]

--------------------


  6%|▋         | 30/470 [04:10<1:04:16,  8.77s/it]

--------------------


  7%|▋         | 31/470 [04:21<1:08:42,  9.39s/it]

--------------------


  7%|▋         | 32/470 [04:27<1:01:29,  8.42s/it]

--------------------


  7%|▋         | 33/470 [04:34<58:08,  7.98s/it]  

--------------------


  7%|▋         | 34/470 [04:42<59:08,  8.14s/it]

--------------------


  7%|▋         | 35/470 [04:50<58:24,  8.06s/it]

--------------------


  8%|▊         | 36/470 [04:57<55:51,  7.72s/it]

--------------------


  8%|▊         | 37/470 [05:10<1:06:55,  9.27s/it]

--------------------


  8%|▊         | 38/470 [05:16<1:00:34,  8.41s/it]

--------------------


  8%|▊         | 39/470 [05:23<56:44,  7.90s/it]  

--------------------


  9%|▊         | 40/470 [05:29<53:00,  7.40s/it]

--------------------


  9%|▊         | 41/470 [05:36<51:25,  7.19s/it]

--------------------


  9%|▉         | 42/470 [05:42<48:22,  6.78s/it]

--------------------


  9%|▉         | 43/470 [05:49<49:21,  6.93s/it]

--------------------


  9%|▉         | 44/470 [05:53<41:59,  5.91s/it]

--------------------


 10%|▉         | 45/470 [06:10<1:06:57,  9.45s/it]

--------------------


 10%|▉         | 46/470 [06:21<1:09:27,  9.83s/it]

--------------------


 10%|█         | 47/470 [06:38<1:24:40, 12.01s/it]

--------------------


 10%|█         | 48/470 [06:44<1:12:16, 10.28s/it]

--------------------


 10%|█         | 49/470 [06:49<1:00:40,  8.65s/it]

--------------------


 11%|█         | 50/470 [07:04<1:13:09, 10.45s/it]

--------------------


 11%|█         | 51/470 [07:10<1:03:04,  9.03s/it]

--------------------


 11%|█         | 52/470 [07:16<56:48,  8.15s/it]  

--------------------


 11%|█▏        | 53/470 [07:25<58:01,  8.35s/it]

--------------------


 11%|█▏        | 54/470 [07:38<1:09:15,  9.99s/it]

--------------------


 12%|█▏        | 55/470 [07:46<1:04:50,  9.37s/it]

--------------------


 12%|█▏        | 56/470 [08:00<1:13:44, 10.69s/it]

--------------------


 12%|█▏        | 57/470 [08:09<1:10:45, 10.28s/it]

--------------------


 12%|█▏        | 58/470 [08:16<1:02:39,  9.13s/it]

--------------------


 13%|█▎        | 59/470 [08:28<1:08:15,  9.96s/it]

--------------------


 13%|█▎        | 60/470 [08:35<1:03:09,  9.24s/it]

--------------------


 13%|█▎        | 61/470 [08:46<1:06:38,  9.78s/it]

--------------------


 13%|█▎        | 62/470 [08:52<57:58,  8.52s/it]  

--------------------


 13%|█▎        | 63/470 [08:59<54:08,  7.98s/it]

--------------------


 14%|█▎        | 64/470 [09:11<1:03:35,  9.40s/it]

--------------------


 14%|█▍        | 65/470 [09:22<1:06:25,  9.84s/it]

--------------------


 14%|█▍        | 66/470 [09:30<1:03:07,  9.37s/it]

--------------------


 14%|█▍        | 67/470 [09:39<1:01:31,  9.16s/it]

--------------------


 14%|█▍        | 68/470 [09:51<1:06:18,  9.90s/it]

--------------------


 15%|█▍        | 69/470 [10:00<1:04:27,  9.64s/it]

--------------------


 15%|█▍        | 70/470 [10:07<58:46,  8.82s/it]  

--------------------


 15%|█▌        | 71/470 [10:14<55:22,  8.33s/it]

--------------------


 15%|█▌        | 72/470 [10:22<55:44,  8.40s/it]

--------------------


 16%|█▌        | 73/470 [10:30<54:45,  8.28s/it]

--------------------


 16%|█▌        | 74/470 [10:38<53:44,  8.14s/it]

--------------------


 16%|█▌        | 75/470 [10:50<1:00:32,  9.20s/it]

--------------------


 16%|█▌        | 76/470 [11:02<1:06:02, 10.06s/it]

--------------------


 16%|█▋        | 77/470 [11:08<57:14,  8.74s/it]  

--------------------


 17%|█▋        | 78/470 [11:16<56:56,  8.71s/it]

--------------------


 17%|█▋        | 79/470 [11:24<54:12,  8.32s/it]

--------------------


 17%|█▋        | 80/470 [11:33<56:58,  8.77s/it]

--------------------


 17%|█▋        | 81/470 [11:39<51:03,  7.87s/it]

--------------------


 17%|█▋        | 82/470 [11:48<53:03,  8.20s/it]

--------------------


 18%|█▊        | 83/470 [11:55<50:35,  7.84s/it]

--------------------


 18%|█▊        | 84/470 [12:06<55:09,  8.57s/it]

--------------------


 18%|█▊        | 85/470 [12:19<1:04:10, 10.00s/it]

--------------------


 18%|█▊        | 86/470 [12:24<55:35,  8.69s/it]  

--------------------


 19%|█▊        | 87/470 [12:33<55:01,  8.62s/it]

--------------------


 19%|█▊        | 88/470 [12:37<46:40,  7.33s/it]

--------------------


 19%|█▉        | 89/470 [12:44<46:15,  7.29s/it]

--------------------


 19%|█▉        | 90/470 [12:47<37:41,  5.95s/it]

  -> Error during structured NER extraction: 'NoneType' object is not iterable
--------------------


 19%|█▉        | 91/470 [12:55<40:28,  6.41s/it]

--------------------


 20%|█▉        | 92/470 [13:00<38:44,  6.15s/it]

--------------------


 20%|█▉        | 93/470 [13:13<50:21,  8.01s/it]

--------------------


 20%|██        | 94/470 [13:21<50:34,  8.07s/it]

--------------------


 20%|██        | 95/470 [13:39<1:09:31, 11.12s/it]

--------------------


 20%|██        | 96/470 [13:44<58:33,  9.40s/it]  

--------------------


 21%|██        | 97/470 [13:51<52:20,  8.42s/it]

--------------------


 21%|██        | 98/470 [13:58<50:47,  8.19s/it]

--------------------


 21%|██        | 99/470 [15:37<3:39:14, 35.46s/it]

  -> Error during structured OpenIE extraction: 'NoneType' object is not iterable
--------------------


 21%|██▏       | 100/470 [15:42<2:41:03, 26.12s/it]

--------------------


 21%|██▏       | 101/470 [15:51<2:10:28, 21.22s/it]

--------------------


 22%|██▏       | 102/470 [15:59<1:44:56, 17.11s/it]

--------------------


 22%|██▏       | 103/470 [16:21<1:54:12, 18.67s/it]

--------------------


 22%|██▏       | 104/470 [16:42<1:57:10, 19.21s/it]

--------------------


 22%|██▏       | 105/470 [16:54<1:43:14, 16.97s/it]

--------------------


 23%|██▎       | 106/470 [17:08<1:38:12, 16.19s/it]

--------------------


 23%|██▎       | 107/470 [17:21<1:31:42, 15.16s/it]

--------------------


 23%|██▎       | 108/470 [17:29<1:19:09, 13.12s/it]

--------------------


 23%|██▎       | 109/470 [17:40<1:15:47, 12.60s/it]

--------------------


 23%|██▎       | 110/470 [17:56<1:21:30, 13.58s/it]

--------------------


 24%|██▎       | 111/470 [18:00<1:02:58, 10.53s/it]

--------------------


 24%|██▍       | 112/470 [18:17<1:15:36, 12.67s/it]

--------------------


 24%|██▍       | 113/470 [18:28<1:11:34, 12.03s/it]

--------------------


 24%|██▍       | 114/470 [18:35<1:02:25, 10.52s/it]

--------------------


 24%|██▍       | 115/470 [18:41<54:20,  9.18s/it]  

--------------------


 25%|██▍       | 116/470 [18:48<50:41,  8.59s/it]

--------------------


 25%|██▍       | 117/470 [18:55<47:57,  8.15s/it]

--------------------


 25%|██▌       | 118/470 [19:00<41:50,  7.13s/it]

--------------------


 25%|██▌       | 119/470 [19:09<45:34,  7.79s/it]

--------------------


 26%|██▌       | 120/470 [19:15<41:40,  7.14s/it]

--------------------


 26%|██▌       | 121/470 [19:22<41:59,  7.22s/it]

--------------------


 26%|██▌       | 122/470 [19:34<49:06,  8.47s/it]

--------------------


 26%|██▌       | 123/470 [19:46<54:53,  9.49s/it]

--------------------


 26%|██▋       | 124/470 [19:59<1:01:36, 10.68s/it]

--------------------


 27%|██▋       | 125/470 [20:07<56:54,  9.90s/it]  

--------------------


 27%|██▋       | 126/470 [20:18<58:18, 10.17s/it]

--------------------


 27%|██▋       | 127/470 [20:26<55:03,  9.63s/it]

--------------------


 27%|██▋       | 128/470 [20:39<59:29, 10.44s/it]

--------------------


 27%|██▋       | 129/470 [20:44<50:11,  8.83s/it]

--------------------


 28%|██▊       | 130/470 [20:54<52:25,  9.25s/it]

--------------------


 28%|██▊       | 131/470 [21:03<51:38,  9.14s/it]

--------------------


 28%|██▊       | 132/470 [21:14<54:47,  9.73s/it]

--------------------


 28%|██▊       | 133/470 [21:23<53:38,  9.55s/it]

--------------------


 29%|██▊       | 134/470 [21:32<52:20,  9.35s/it]

--------------------


 29%|██▊       | 135/470 [21:40<49:18,  8.83s/it]

--------------------


 29%|██▉       | 136/470 [21:47<46:37,  8.38s/it]

--------------------


 29%|██▉       | 137/470 [21:54<44:28,  8.01s/it]

--------------------


 29%|██▉       | 138/470 [22:08<54:04,  9.77s/it]

--------------------


 30%|██▉       | 139/470 [22:13<45:50,  8.31s/it]

--------------------


 30%|██▉       | 140/470 [22:19<42:30,  7.73s/it]

--------------------


 30%|███       | 141/470 [22:25<39:10,  7.14s/it]

--------------------


 30%|███       | 142/470 [22:31<37:17,  6.82s/it]

--------------------


 30%|███       | 143/470 [22:39<39:38,  7.27s/it]

--------------------


 31%|███       | 144/470 [22:51<45:58,  8.46s/it]

--------------------


 31%|███       | 145/470 [23:05<55:09, 10.18s/it]

--------------------


 31%|███       | 146/470 [23:14<54:07, 10.02s/it]

--------------------


 31%|███▏      | 147/470 [23:21<48:47,  9.06s/it]

--------------------


 31%|███▏      | 148/470 [23:30<48:24,  9.02s/it]

--------------------


 32%|███▏      | 149/470 [23:43<54:25, 10.17s/it]

--------------------


 32%|███▏      | 150/470 [23:55<56:47, 10.65s/it]

--------------------


 32%|███▏      | 151/470 [24:05<55:02, 10.35s/it]

--------------------


 32%|███▏      | 152/470 [24:14<53:06, 10.02s/it]

--------------------


 33%|███▎      | 153/470 [24:26<57:01, 10.79s/it]

--------------------


 33%|███▎      | 154/470 [24:35<52:43, 10.01s/it]

--------------------


 33%|███▎      | 155/470 [24:44<52:17,  9.96s/it]

--------------------


 33%|███▎      | 156/470 [24:49<42:57,  8.21s/it]

--------------------


 33%|███▎      | 157/470 [24:54<38:48,  7.44s/it]

--------------------


 34%|███▎      | 158/470 [25:03<40:52,  7.86s/it]

--------------------


 34%|███▍      | 159/470 [25:08<36:41,  7.08s/it]

--------------------


 34%|███▍      | 160/470 [25:15<36:13,  7.01s/it]

--------------------


 34%|███▍      | 161/470 [25:30<48:00,  9.32s/it]

--------------------


 34%|███▍      | 162/470 [25:38<45:49,  8.93s/it]

--------------------


 35%|███▍      | 163/470 [25:57<1:01:57, 12.11s/it]

--------------------


 35%|███▍      | 164/470 [26:03<51:47, 10.15s/it]  

--------------------


 35%|███▌      | 165/470 [26:08<43:40,  8.59s/it]

--------------------


 35%|███▌      | 166/470 [26:22<52:22, 10.34s/it]

--------------------


 36%|███▌      | 167/470 [26:37<58:45, 11.63s/it]

--------------------


 36%|███▌      | 168/470 [26:55<1:08:07, 13.53s/it]

--------------------


 36%|███▌      | 169/470 [27:05<1:02:36, 12.48s/it]

--------------------


 36%|███▌      | 170/470 [27:13<55:07, 11.02s/it]  

--------------------


 36%|███▋      | 171/470 [27:27<59:33, 11.95s/it]

--------------------


 37%|███▋      | 172/470 [27:33<50:38, 10.20s/it]

--------------------


 37%|███▋      | 173/470 [27:43<51:05, 10.32s/it]

--------------------


 37%|███▋      | 174/470 [27:52<48:04,  9.75s/it]

--------------------


 37%|███▋      | 175/470 [28:00<45:01,  9.16s/it]

--------------------


 37%|███▋      | 176/470 [28:11<48:03,  9.81s/it]

--------------------


 38%|███▊      | 177/470 [28:18<44:24,  9.09s/it]

--------------------


 38%|███▊      | 178/470 [28:25<41:15,  8.48s/it]

--------------------


 38%|███▊      | 179/470 [28:38<46:58,  9.69s/it]

--------------------


 38%|███▊      | 180/470 [28:46<44:53,  9.29s/it]

--------------------


 39%|███▊      | 181/470 [28:53<41:22,  8.59s/it]

--------------------


 39%|███▊      | 182/470 [29:01<39:23,  8.21s/it]

--------------------


 39%|███▉      | 183/470 [29:10<41:08,  8.60s/it]

--------------------


 39%|███▉      | 184/470 [29:26<51:02, 10.71s/it]

--------------------


 39%|███▉      | 185/470 [29:30<41:57,  8.84s/it]

--------------------


 40%|███▉      | 186/470 [29:42<46:18,  9.78s/it]

--------------------


 40%|███▉      | 187/470 [29:48<41:05,  8.71s/it]

--------------------


 40%|████      | 188/470 [30:07<55:16, 11.76s/it]

--------------------


 40%|████      | 189/470 [30:13<46:50, 10.00s/it]

--------------------


 40%|████      | 190/470 [30:18<39:43,  8.51s/it]

--------------------


 41%|████      | 191/470 [30:28<41:23,  8.90s/it]

--------------------


 41%|████      | 192/470 [30:35<37:58,  8.20s/it]

--------------------


 41%|████      | 193/470 [30:41<36:05,  7.82s/it]

--------------------


 41%|████▏     | 194/470 [30:48<33:43,  7.33s/it]

--------------------


 41%|████▏     | 195/470 [30:54<32:02,  6.99s/it]

--------------------


 42%|████▏     | 196/470 [31:04<36:52,  8.07s/it]

--------------------


 42%|████▏     | 197/470 [31:11<34:51,  7.66s/it]

--------------------


 42%|████▏     | 198/470 [31:19<34:51,  7.69s/it]

--------------------


 42%|████▏     | 199/470 [31:26<33:54,  7.51s/it]

--------------------


 43%|████▎     | 200/470 [31:35<35:12,  7.82s/it]

--------------------


 43%|████▎     | 201/470 [31:43<35:24,  7.90s/it]

--------------------


 43%|████▎     | 202/470 [31:47<30:40,  6.87s/it]

--------------------


 43%|████▎     | 203/470 [32:01<40:31,  9.11s/it]

--------------------


 43%|████▎     | 204/470 [32:09<38:27,  8.68s/it]

--------------------


 44%|████▎     | 205/470 [32:18<38:34,  8.73s/it]

--------------------


 44%|████▍     | 206/470 [32:33<46:16, 10.52s/it]

--------------------


 44%|████▍     | 207/470 [32:38<39:10,  8.94s/it]

--------------------


 44%|████▍     | 208/470 [32:45<36:07,  8.27s/it]

--------------------


 44%|████▍     | 209/470 [32:50<32:17,  7.43s/it]

--------------------


 45%|████▍     | 210/470 [32:54<27:31,  6.35s/it]

--------------------


 45%|████▍     | 211/470 [32:58<24:19,  5.63s/it]

--------------------


 45%|████▌     | 212/470 [33:04<24:45,  5.76s/it]

--------------------


 45%|████▌     | 213/470 [33:09<23:57,  5.59s/it]

--------------------


 46%|████▌     | 214/470 [33:19<28:59,  6.79s/it]

--------------------


 46%|████▌     | 215/470 [33:31<36:03,  8.49s/it]

--------------------


 46%|████▌     | 216/470 [33:38<34:28,  8.15s/it]

--------------------


 46%|████▌     | 217/470 [33:47<34:22,  8.15s/it]

--------------------


 46%|████▋     | 218/470 [33:57<37:11,  8.85s/it]

--------------------


 47%|████▋     | 219/470 [34:07<37:49,  9.04s/it]

--------------------


 47%|████▋     | 220/470 [34:15<36:50,  8.84s/it]

--------------------


 47%|████▋     | 221/470 [34:18<29:20,  7.07s/it]

--------------------


 47%|████▋     | 222/470 [34:28<32:57,  7.97s/it]

--------------------


 47%|████▋     | 223/470 [34:35<31:25,  7.63s/it]

--------------------


 48%|████▊     | 224/470 [34:48<37:34,  9.17s/it]

--------------------


 48%|████▊     | 225/470 [35:00<41:17, 10.11s/it]

--------------------


 48%|████▊     | 226/470 [35:08<38:54,  9.57s/it]

--------------------


 48%|████▊     | 227/470 [35:15<35:10,  8.69s/it]

--------------------


 49%|████▊     | 228/470 [35:27<38:51,  9.64s/it]

--------------------


 49%|████▊     | 229/470 [35:33<34:59,  8.71s/it]

--------------------


 49%|████▉     | 230/470 [35:43<36:27,  9.12s/it]

--------------------


 49%|████▉     | 231/470 [35:48<31:20,  7.87s/it]

--------------------


 49%|████▉     | 232/470 [35:58<33:45,  8.51s/it]

--------------------


 50%|████▉     | 233/470 [36:05<31:17,  7.92s/it]

--------------------


 50%|████▉     | 234/470 [36:11<28:45,  7.31s/it]

--------------------


 50%|█████     | 235/470 [36:14<24:10,  6.17s/it]

--------------------


 50%|█████     | 236/470 [36:18<20:50,  5.35s/it]

--------------------


 50%|█████     | 237/470 [36:24<22:20,  5.76s/it]

--------------------


 51%|█████     | 238/470 [36:35<28:15,  7.31s/it]

--------------------


 51%|█████     | 239/470 [36:44<30:14,  7.86s/it]

--------------------


 51%|█████     | 240/470 [36:50<27:28,  7.17s/it]

--------------------


 51%|█████▏    | 241/470 [36:59<29:19,  7.69s/it]

--------------------


 51%|█████▏    | 242/470 [37:07<29:11,  7.68s/it]

--------------------


 52%|█████▏    | 243/470 [37:11<24:58,  6.60s/it]

--------------------


 52%|█████▏    | 244/470 [37:19<26:23,  7.00s/it]

--------------------


 52%|█████▏    | 245/470 [37:31<32:06,  8.56s/it]

--------------------


 52%|█████▏    | 246/470 [37:36<28:15,  7.57s/it]

--------------------


 53%|█████▎    | 247/470 [37:45<29:55,  8.05s/it]

--------------------


 53%|█████▎    | 248/470 [37:50<26:17,  7.11s/it]

--------------------


 53%|█████▎    | 249/470 [37:59<27:49,  7.55s/it]

--------------------


 53%|█████▎    | 250/470 [38:11<33:11,  9.05s/it]

--------------------


 53%|█████▎    | 251/470 [38:18<30:16,  8.30s/it]

--------------------


 54%|█████▎    | 252/470 [38:29<33:21,  9.18s/it]

--------------------


 54%|█████▍    | 253/470 [38:37<31:49,  8.80s/it]

--------------------


 54%|█████▍    | 254/470 [38:47<33:27,  9.30s/it]

--------------------


 54%|█████▍    | 255/470 [38:54<30:21,  8.47s/it]

--------------------


 54%|█████▍    | 256/470 [39:01<28:29,  7.99s/it]

--------------------


 55%|█████▍    | 257/470 [39:15<34:45,  9.79s/it]

--------------------


 55%|█████▍    | 258/470 [39:23<32:22,  9.16s/it]

--------------------


 55%|█████▌    | 259/470 [39:35<35:33, 10.11s/it]

--------------------


 55%|█████▌    | 260/470 [39:41<30:50,  8.81s/it]

--------------------


 56%|█████▌    | 261/470 [39:49<29:53,  8.58s/it]

--------------------


 56%|█████▌    | 262/470 [39:58<30:42,  8.86s/it]

--------------------


 56%|█████▌    | 263/470 [40:11<34:33, 10.02s/it]

--------------------


 56%|█████▌    | 264/470 [40:16<29:11,  8.50s/it]

--------------------


 56%|█████▋    | 265/470 [40:27<31:57,  9.35s/it]

--------------------


 57%|█████▋    | 266/470 [40:35<30:39,  9.02s/it]

--------------------


 57%|█████▋    | 267/470 [40:41<26:41,  7.89s/it]

--------------------


 57%|█████▋    | 268/470 [40:49<27:29,  8.17s/it]

--------------------


 57%|█████▋    | 269/470 [40:59<29:08,  8.70s/it]

--------------------


 57%|█████▋    | 270/470 [41:07<27:57,  8.39s/it]

--------------------


 58%|█████▊    | 271/470 [41:14<25:52,  7.80s/it]

--------------------


 58%|█████▊    | 272/470 [41:22<25:55,  7.86s/it]

--------------------


 58%|█████▊    | 273/470 [41:27<23:10,  7.06s/it]

--------------------


 58%|█████▊    | 274/470 [41:33<22:05,  6.76s/it]

--------------------


 59%|█████▊    | 275/470 [41:48<30:06,  9.27s/it]

--------------------


 59%|█████▊    | 276/470 [41:56<29:10,  9.02s/it]

--------------------


 59%|█████▉    | 277/470 [42:03<27:02,  8.41s/it]

--------------------


 59%|█████▉    | 278/470 [42:11<26:02,  8.14s/it]

--------------------


 59%|█████▉    | 279/470 [42:20<27:05,  8.51s/it]

--------------------


 60%|█████▉    | 280/470 [42:28<26:22,  8.33s/it]

--------------------


 60%|█████▉    | 281/470 [42:39<28:47,  9.14s/it]

--------------------


 60%|██████    | 282/470 [42:49<29:30,  9.42s/it]

--------------------


 60%|██████    | 283/470 [43:00<31:03,  9.97s/it]

--------------------


 60%|██████    | 284/470 [43:23<42:16, 13.64s/it]

--------------------


 61%|██████    | 285/470 [43:32<37:40, 12.22s/it]

--------------------


 61%|██████    | 286/470 [43:41<35:16, 11.50s/it]

--------------------


 61%|██████    | 287/470 [43:48<30:46, 10.09s/it]

--------------------


 61%|██████▏   | 288/470 [43:54<26:17,  8.67s/it]

--------------------


 61%|██████▏   | 289/470 [44:03<26:43,  8.86s/it]

--------------------


 62%|██████▏   | 290/470 [44:08<23:08,  7.72s/it]

--------------------


 62%|██████▏   | 291/470 [44:15<22:12,  7.45s/it]

--------------------


 62%|██████▏   | 292/470 [44:21<21:26,  7.23s/it]

--------------------


 62%|██████▏   | 293/470 [44:30<22:33,  7.65s/it]

--------------------


 63%|██████▎   | 294/470 [44:37<21:27,  7.31s/it]

--------------------


 63%|██████▎   | 295/470 [44:44<21:28,  7.36s/it]

--------------------


 63%|██████▎   | 296/470 [44:58<27:15,  9.40s/it]

--------------------


 63%|██████▎   | 297/470 [45:10<29:04, 10.08s/it]

--------------------


 63%|██████▎   | 298/470 [45:28<35:45, 12.47s/it]

--------------------


 64%|██████▎   | 299/470 [45:33<29:37, 10.39s/it]

--------------------


 64%|██████▍   | 300/470 [45:47<31:58, 11.29s/it]

--------------------


 64%|██████▍   | 301/470 [45:56<30:03, 10.67s/it]

--------------------


 64%|██████▍   | 302/470 [46:04<27:25,  9.79s/it]

--------------------


 64%|██████▍   | 303/470 [46:15<28:22, 10.20s/it]

--------------------


 65%|██████▍   | 304/470 [46:22<25:22,  9.17s/it]

--------------------


 65%|██████▍   | 305/470 [46:25<20:13,  7.36s/it]

--------------------


 65%|██████▌   | 306/470 [46:32<19:55,  7.29s/it]

--------------------


 65%|██████▌   | 307/470 [46:37<17:46,  6.54s/it]

--------------------


 66%|██████▌   | 308/470 [46:44<18:29,  6.85s/it]

--------------------


 66%|██████▌   | 309/470 [46:52<18:37,  6.94s/it]

--------------------


 66%|██████▌   | 310/470 [46:59<18:57,  7.11s/it]

--------------------


 66%|██████▌   | 311/470 [47:03<16:25,  6.20s/it]

--------------------


 66%|██████▋   | 312/470 [47:10<16:29,  6.26s/it]

--------------------


 67%|██████▋   | 313/470 [47:21<20:24,  7.80s/it]

--------------------


 67%|██████▋   | 314/470 [47:37<26:52, 10.34s/it]

--------------------


 67%|██████▋   | 315/470 [47:43<23:28,  9.08s/it]

--------------------


 67%|██████▋   | 316/470 [47:52<22:39,  8.83s/it]

--------------------


 67%|██████▋   | 317/470 [47:58<20:54,  8.20s/it]

--------------------


 68%|██████▊   | 318/470 [48:05<19:47,  7.82s/it]

--------------------


 68%|██████▊   | 319/470 [48:14<20:39,  8.21s/it]

--------------------


 68%|██████▊   | 320/470 [48:33<28:05, 11.24s/it]

--------------------


 68%|██████▊   | 321/470 [48:48<31:08, 12.54s/it]

--------------------


 69%|██████▊   | 322/470 [48:57<27:47, 11.26s/it]

--------------------


 69%|██████▊   | 323/470 [49:04<24:56, 10.18s/it]

--------------------


 69%|██████▉   | 324/470 [49:10<21:38,  8.90s/it]

--------------------


 69%|██████▉   | 325/470 [49:16<19:34,  8.10s/it]

--------------------


 69%|██████▉   | 326/470 [49:26<20:27,  8.52s/it]

--------------------


 70%|██████▉   | 327/470 [49:35<20:36,  8.65s/it]

--------------------


 70%|██████▉   | 328/470 [49:39<17:17,  7.31s/it]

--------------------


 70%|███████   | 329/470 [49:47<17:36,  7.49s/it]

--------------------


 70%|███████   | 330/470 [49:55<17:43,  7.59s/it]

--------------------


 70%|███████   | 331/470 [50:03<18:19,  7.91s/it]

--------------------


 71%|███████   | 332/470 [50:18<22:54,  9.96s/it]

--------------------


 71%|███████   | 333/470 [50:24<20:00,  8.76s/it]

--------------------


 71%|███████   | 334/470 [50:37<22:37,  9.98s/it]

--------------------


 71%|███████▏  | 335/470 [50:46<22:07,  9.84s/it]

--------------------


 71%|███████▏  | 336/470 [50:54<20:21,  9.12s/it]

--------------------


 72%|███████▏  | 337/470 [51:05<21:24,  9.66s/it]

--------------------


 72%|███████▏  | 338/470 [51:26<28:36, 13.01s/it]

--------------------


 72%|███████▏  | 339/470 [51:32<23:55, 10.96s/it]

--------------------


 72%|███████▏  | 340/470 [51:39<21:22,  9.87s/it]

--------------------


 73%|███████▎  | 341/470 [51:45<18:25,  8.57s/it]

--------------------


 73%|███████▎  | 342/470 [51:51<17:07,  8.03s/it]

--------------------


 73%|███████▎  | 343/470 [51:57<15:12,  7.18s/it]

--------------------


 73%|███████▎  | 344/470 [52:08<18:01,  8.59s/it]

--------------------


 73%|███████▎  | 345/470 [52:16<17:03,  8.18s/it]

--------------------


 74%|███████▎  | 346/470 [52:24<16:45,  8.11s/it]

--------------------


 74%|███████▍  | 347/470 [52:29<14:54,  7.27s/it]

--------------------


 74%|███████▍  | 348/470 [52:39<16:34,  8.15s/it]

--------------------


 74%|███████▍  | 349/470 [52:45<15:19,  7.60s/it]

--------------------


 74%|███████▍  | 350/470 [52:58<17:54,  8.95s/it]

--------------------


 75%|███████▍  | 351/470 [53:03<15:53,  8.01s/it]

--------------------


 75%|███████▍  | 352/470 [53:09<14:38,  7.44s/it]

--------------------


 75%|███████▌  | 353/470 [53:16<14:10,  7.27s/it]

--------------------


 75%|███████▌  | 354/470 [53:22<13:05,  6.77s/it]

--------------------


 76%|███████▌  | 355/470 [53:33<15:34,  8.13s/it]

--------------------


 76%|███████▌  | 356/470 [53:41<15:20,  8.08s/it]

--------------------


 76%|███████▌  | 357/470 [53:49<14:56,  7.94s/it]

--------------------


 76%|███████▌  | 358/470 [53:55<13:49,  7.40s/it]

--------------------


 76%|███████▋  | 359/470 [54:03<13:54,  7.52s/it]

--------------------


 77%|███████▋  | 360/470 [54:09<13:06,  7.15s/it]

--------------------


 77%|███████▋  | 361/470 [54:18<13:53,  7.65s/it]

--------------------


 77%|███████▋  | 362/470 [54:28<15:15,  8.48s/it]

--------------------


 77%|███████▋  | 363/470 [54:39<16:25,  9.21s/it]

--------------------


 77%|███████▋  | 364/470 [54:48<15:54,  9.01s/it]

--------------------


 78%|███████▊  | 365/470 [54:58<16:21,  9.34s/it]

--------------------


 78%|███████▊  | 366/470 [55:09<16:59,  9.80s/it]

--------------------


 78%|███████▊  | 367/470 [55:14<14:36,  8.51s/it]

--------------------


 78%|███████▊  | 368/470 [55:21<13:32,  7.97s/it]

--------------------


 79%|███████▊  | 369/470 [55:25<11:34,  6.88s/it]

--------------------


 79%|███████▊  | 370/470 [55:35<12:58,  7.79s/it]

--------------------


 79%|███████▊  | 370/470 [55:37<15:02,  9.02s/it]


KeyboardInterrupt: 

In [27]:

def extract_knowledge_structured(passage: str) -> (List[str], List[dict]):
    """
    Implements the two-step HippoRAG extraction process using a system/user prompt structure.
    """
    try:
        ner_response = ner_chain.invoke({"passage": passage})
        entities = ner_response.named_entities
        # print(f"  -> Extracted Entities: {entities}")
    except Exception as e:
        print(f"  -> Error during structured NER extraction: {e}")
        return [], []

    if not entities:
        print("  -> No entities found, skipping triple extraction.")
        return [], []

    try:
        entities_str = json.dumps(entities, ensure_ascii=False)
        openie_response = openie_chain.invoke({"passage": passage, "entities": entities_str})
        triples = [triple.model_dump() for triple in openie_response.triples]
        # print(f"  -> Extracted Triples: {triples}")
        return entities, triples
    except Exception as e:
        print(f"  -> Error during structured OpenIE extraction: {e}")
        return entities, []
    
    

# knowledge_graph_data = []

print("Starting Knowledge Graph Extraction (HippoRAG Paper Examples)")

i = 0
for context_passage in tqdm(all_context[len(knowledge_graph_data):]):
    extracted_entities, extracted_triples = extract_knowledge_structured(context_passage)

    if extracted_triples:
        knowledge_graph_data.append({
            "id": i,
            "passage": context_passage,
            "entities": extracted_entities,
            "triples": extracted_triples
        })
        
    i += 1
    # print("-" * 20)

Starting Knowledge Graph Extraction (HippoRAG Paper Examples)


100%|██████████| 102/102 [14:39<00:00,  8.62s/it]


In [34]:
# save the knowledge graph data to a json file
with open('knowledge_graph_triples_structured.json', 'w', encoding='utf-8') as f:
    json.dump(knowledge_graph_data, f, ensure_ascii=False, indent=4)

# Check prepared data

In [None]:
import pandas as pd
import igraph as ig
import json
from collections import defaultdict

In [12]:
with open('knowledge_graph_triples_structured.json', 'r', encoding='utf-8') as f:
    knowledge_graph_data = json.load(f)
df_kg = pd.DataFrame(knowledge_graph_data).drop(columns=['id'])
df_kg.head()

Unnamed: 0,passage,entities,triples
0,آفریقای جنوبی با نام رسمی جمهوری آفریقای جنوبی...,"[آفریقای جنوبی, جمهوری آفریقای جنوبی, جنوب آفر...","[{'subject': 'آفریقای جنوبی', 'predicate': 'ha..."
1,مانجاری فاندیس مدل و هنرپیشه اهل هند است.\nمان...,"[مانجاری فاندیس, هند, ۲۰۰۸, سریال اگه بدونی یا...","[{'subject': 'مانجاری فاندیس', 'predicate': 'i..."
2,لوک اسکای‌واکر شخصیتی تخیلی و قهرمان داستان در...,"[لوک اسکای‌واکر, جرج لوکاس, مارک همیل, جنگ ستا...","[{'subject': 'لوک اسکای‌واکر', 'predicate': 'i..."
3,مارشال‌های آمریکایی فیلمی محصول سال ۱۹۹۸ آمریک...,"[مارشال‌های آمریکایی, ۱۹۹۸, آمریکا, استوارت بر...","[{'subject': 'مارشال‌های آمریکایی', 'predicate..."
4,کیران دسای (Kiran Desai) (زاده ۳ سپتامبر ۱۹۷۱)...,"[کیران دسای, Kiran Desai, ۳ سپتامبر ۱۹۷۱, هند,...","[{'subject': 'کیران دسای', 'predicate': 'is', ..."


In [15]:
all_named_entities = set()
all_triples = []
for entity_list in df_kg['entities']:
    all_named_entities.update(entity_list)
    
for triples in df_kg['triples']:
    for triple in triples:
        all_triples.append((triple['subject'], triple['predicate'], triple['object']))
    
print(f"Total unique named entities: {len(all_named_entities)}")
print(f"Total unique triples: {len(all_triples)}")

Total unique named entities: 3589
Total unique triples: 5694


In [16]:
all_named_entities

{'۱۸ ژانویه ۲۰۲۲',
 'جان کارمک',
 'دهه ۵۰ میلادی',
 'تیم ملی فوتبال فرانسه',
 'روسیه تزاری',
 'پنگوئن',
 '۲۲ اکتبر ۱۷۲۱',
 'اد هلمز',
 'پویانمایی آمریکایی',
 'آیفون ۶اس',
 'Yoshitoshi ABe',
 'بهترین زمان',
 '۱۴۰۰ نفر',
 'سیم های فلزی',
 'علامه',
 'نیو سیکس',
 'ناصر تقوایی',
 '۳ اوت ۱۹۷۱',
 'دوران طلایی اسلامی',
 'سیارات',
 'عربی',
 '۱ بهمن ۱۳۴۱',
 'جرمی ایروین',
 'فلسطین',
 'کامران میرزا',
 'هندوآریایی',
 'هیدروکراکینگ',
 'برادران سوپر اسمش',
 'ژاپنی',
 'کاترین دنو',
 'قهوه',
 'جنوب غربی آسیا',
 'اینفینیتی وارد',
 'فرشته سیاه',
 'برادران وارنر پیکچرز',
 'شصت و سومین دوره مسابقه آواز یوروویژن',
 'محمدعلی نجفی',
 'مارشال\u200cهای آمریکایی',
 'آلبانی',
 'Zosui',
 'زنای با محارم',
 'هواپیمایی ایران ایرتور',
 'دیو گرین',
 'جمهوری خلق چین',
 'جینا فیشر',
 'اتحاد جماهیر شوروی سوسیالیستی',
 'الحَمْد لله',
 'جنگ فالکلند',
 'پدرو آلوارز کابرال',
 '۱۴ مارس ۱۹۳۳',
 'کیران دسای',
 'المپیک لندن',
 'پلیس فراری',
 '۲۰۰۵',
 'آفریقای شمالی',
 'جانوران',
 'جشنواره کن',
 'جسیکا لنگ',
 'قارمون',
 'تار',
 '