In [1]:
import kapipe
from kapipe import utils
from kapipe.pipelines import TripleExtractionPipeline

  from .autonotebook import tqdm as notebook_tqdm


### Example case

In [2]:
# Example passage: a title plus a body text assembled from individual sentences.
passage = {
    "title": "Alice's Life.",
    # The sentences are joined with single spaces into one body string.
    "text": " ".join([
        "Alice was born in Tokyo, but now she lives in New York.",
        "She works at a technology company in Manhattan.",
        "Her old friend Bob still lives in Tokyo and works as a teacher.",
        "Alice and Bob studied together at the University of Tokyo.",
        "Charlie, who was once Bob's roommate, decided to move to Paris after living in London.",
        "Diana, who once shared a house with Charlie in London, now happily lives in Sydney and works at a hospital.",
        "Although Alice and Diana live far apart, they remain close friends through frequent video calls.",
    ]),
}


### Define your favorite entity types and relation labels

In [3]:
# Entity types handed to the NER component: each entry pairs a type name with
# a short natural-language definition.
# NOTE(review): the relation labels in this notebook describe the objects of
# "works_at" and "studied_in" as an organization/workplace and an educational
# institution, but only Person and Location exist as entity types here, so such
# objects can only be typed as Location (or be missed). Consider adding an
# organization-like type -- TODO confirm against the intended schema.
entity_types = [
    dict(entity_type=type_name, definition=definition)
    for type_name, definition in (
        ("Person", "A human."),
        ("Location", "A place."),
    )
]

In [4]:
# Relation labels handed to the DocRE component, written as a compact
# (label, definition) table and expanded into the list-of-dicts form.
relation_labels = [
    {"relation_label": label, "definition": definition}
    for label, definition in (
        ("lives_in", "The subject person lives in the object location now."),
        ("born_in", "The subject person was born in the object location."),
        ("works_at", "The subject person works at the object organization or workplace."),
        ("studied_in", "The subject person studied at the object educational institution."),
        ("friends_with", "The subject person has a friendship relationship with the object person."),
        ("roommate_of", "The subject person once shared a house or apartment with the object person."),
    )
]

### Load Triple Extraction Pipeline for user-defined entity types and relation labels
- NER: GPT-4o-mini with zero-shot NER prompting
- ED-Retrieval: Dummy (assigning the lowercased mention string as the entity ID)
- ED-Reranking: No reranking (selecting the top-1 candidate)
- DocRE: GPT-4o-mini with zero-shot DocRE prompting

In [5]:
# Build the pipeline; each key of `component_kwargs` configures one stage
# (the stages are described in the list above this cell).
pipe = TripleExtractionPipeline(
    component_kwargs={
        # NER stage, restricted to the user-defined entity types.
        "ner": {
            "identifier": "gpt4omini_any",
            "entity_types": entity_types,
        },
        # ED-Retrieval stage: the dummy retriever.
        "ed_retrieval": {
            "identifier": "dummy_entity_retriever"
        },
        # ED-Reranking stage: no actual reranking.
        "ed_reranking": {
            "identifier": "identical_entity_reranker"
        },
        # DocRE stage, restricted to the user-defined relation labels.
        "docre": {
            "identifier": "gpt4omini_any",
            "relation_labels": relation_labels
        }
    }
)

### Convert the passage to a Document object

In [6]:
# Wrap the raw passage in a document keyed "d1". With do_tokenize=True the
# resulting "sentences" appear as space-separated tokens, with the title as the
# first sentence (see the printed document in the next cell).
document = pipe.chunker.convert_passage_to_document(passage=passage, doc_key="d1", do_tokenize=True)

In [7]:
# Inspect the converted document (doc_key, source passage, tokenized sentences).
print(utils.pretty_format_dict(document))

{
    "doc_key": "d1",
    "source_passage": {
        "title": "Alice's Life.",
        "text": "Alice was born in Tokyo, but now she lives in New York. She works at a technology company in Manhattan. Her old friend Bob still lives in Tokyo and works as a teacher. Alice and Bob studied together at the University of Tokyo. Charlie, who was once Bob's roommate, decided to move to Paris after living in London. Diana, who once shared a house with Charlie in London, now happily lives in Sydney and works at a hospital. Although Alice and Diana live far apart, they remain close friends through frequent video calls."
    },
    "sentences": [
        "Alice 's Life .",
        "Alice was born in Tokyo , but now she lives in New York .",
        "She works at a technology company in Manhattan .",
        "Her old friend Bob still lives in Tokyo and works as a teacher .",
        "Alice and Bob studied together at the University of Tokyo .",
        "Charlie , who was once Bob 's roommate , dec

### Apply the Triple Extraction Pipeline

In [8]:
# Run the pipeline. `document` is rebound to the returned result; the cells
# below read its "mentions", "entities", "relations" and the NER/DocRE prompt
# and generated-text fields.
# NOTE(review): re-running this cell feeds an already-processed document back
# into the pipeline -- confirm that is safe, or re-run the conversion cell first.
document = pipe.extract(document)

### Show the extraction results, prompts, and LLM responses

In [9]:
def markup_mentions(document):
    """Render the document text with every mention wrapped as ``{mention:Type}``.

    The strings in ``document["sentences"]`` are joined with single spaces and
    split back into tokens. Each entry of ``document["mentions"]`` supplies a
    ``"span"`` (first and last token index, both inclusive) and an
    ``"entity_type"``. The marked-up text is returned as a single string;
    ``document`` itself is not modified.
    """
    def span_width_then_start(mention):
        first, last = mention["span"]
        return (last - first, first)

    # Shorter spans are decorated first so that, for nested mentions, the
    # enclosing mention's braces end up on the outside.
    ordered_mentions = sorted(document["mentions"], key=span_width_then_start)

    # Concatenate the sentences and tokenize the text
    tokens = " ".join(document["sentences"]).split(" ")

    for mention in ordered_mentions:
        first, last = mention["span"]
        label = mention["entity_type"]
        # Write the opening brace before the closing part: for a one-token
        # mention both edits land on the same token.
        tokens[first] = "{" + tokens[first]
        tokens[last] = "{}:{}}}".format(tokens[last], label)

    return " ".join(tokens)

def get_triples(document):
    """Collect the extracted relations as ``(subject, relation, object)`` triples.

    Each entry of ``document["relations"]`` refers to its subject and object by
    index (``"arg1"`` / ``"arg2"``) into ``document["entities"]`` and carries a
    ``"relation"`` label. The subject and object are reported by their
    upper-cased ``"entity_id"``.

    Only the "entities" and "relations" keys are read; the document's mentions
    are not needed and are no longer required to be present.

    Returns:
        A list of ``(subject_id, relation, object_id)`` tuples, in the order of
        ``document["relations"]``.
    """
    entities = document["entities"]

    def display_id(entity_idx):
        # Entity IDs are upper-cased purely for display.
        return entities[entity_idx]["entity_id"].upper()

    return [
        (display_id(entry["arg1"]), entry["relation"], display_id(entry["arg2"]))
        for entry in document["relations"]
    ]

In [10]:
# Mentions are shown inline as {mention text:EntityType}.
print(markup_mentions(document))

{Alice:Person} 's Life . {Alice:Person} was born in {Tokyo:Location} , but now she lives in {New York:Location} . She works at a technology company in {Manhattan:Location} . Her old friend {Bob:Person} still lives in {Tokyo:Location} and works as a teacher . {Alice:Person} and {Bob:Person} studied together at the {University of {Tokyo:Location}:Location} . {Charlie:Person} , who was once {Bob:Person} 's roommate , decided to move to {Paris:Location} after living in {London:Location} . {Diana:Person} , who once shared a house with {Charlie:Person} in {London:Location} , now happily lives in {Sydney:Location} and works at a hospital . Although {Alice:Person} and {Diana:Person} live far apart , they remain close friends through frequent video calls .


In [11]:
# get_triples returns tuples; judging by the output below, pretty_format_dict
# renders each one as a bracketed list.
print(utils.pretty_format_dict(get_triples(document)))

[
    [
        "ALICE",
        "born_in",
        "TOKYO"
    ],
    [
        "ALICE",
        "lives_in",
        "NEW YORK"
    ],
    [
        "ALICE",
        "works_at",
        "MANHATTAN"
    ],
    [
        "ALICE",
        "studied_in",
        "UNIVERSITY OF TOKYO"
    ],
    [
        "ALICE",
        "friends_with",
        "DIANA"
    ],
    [
        "BOB",
        "lives_in",
        "TOKYO"
    ],
    [
        "BOB",
        "works_at",
        "UNIVERSITY OF TOKYO"
    ],
    [
        "CHARLIE",
        "roommate_of",
        "BOB"
    ],
    [
        "DIANA",
        "roommate_of",
        "CHARLIE"
    ],
    [
        "DIANA",
        "works_at",
        "LONDON"
    ],
    [
        "DIANA",
        "lives_in",
        "SYDNEY"
    ]
]


In [12]:
# NER prompt, a separator line, then the text generated for it (the LLM
# response, per the section heading).
print(document["ner_prompt"])
print("===================")
print(document["ner_generated_text"])

### Task

You are given a piece of text.

Your task is to:

1. **Identify all entity mentions** in the text — short spans that refer to specific concepts or instances.
2. **Assign an appropriate entity type** to each mention, selecting from the predefined set below.

### Guidelines

- Each entity mention must be an **exact span** from the original text — do **not paraphrase, shorten, or extend** it.
- Assign **only one type** per mention.
- Choose from the following predefined entity types:

- Person: A human.
- Location: A place.

- If no entities are present, output nothing.

### Output Format

For each identified entity mention, output a line in this format:

- Mention Text | Entity Type

Use bullet points (`-`) for each line. Example:

- Aspirin | Chemical  
- breast cancer | Disease  
- light therapy | Treatment

### Test Case

Now, identify all entity mentions in the following text and assign them appropriate types.

Text: Alice 's Life . Alice was born in Tokyo , but now she liv

In [13]:
# DocRE prompt, a separator line, then the text generated for it (the LLM
# response, per the section heading).
print(document["docre_prompt"])
print("===================")
print(document["docre_generated_text"])

### Task

You are given a piece of text and a list of **detected entities**, each mapped to a concept in the original knowledge base.

Your task is to:

1. **Identify relationships** between these entities based on the **context of the text**.
2. For each entity pair that holds a relation, assign the most appropriate **relationship label** from the predefined set below.

### Guidelines

- Use the **full document context** to decide whether a relationship exists between a pair of entities.
- You must select the relationship label from the following predefined set:

- lives_in: The subject person lives in the object location now.
- born_in: The subject person was born in the object location.
- works_at: The subject person works at the object organization or workplace.
- studied_in: The subject person studied at the object educational institution.
- friends_with: The subject person has a friendship relationship with the object person.
- roommate_of: The subject person once shared a house 

### Apply Graph Constructor

In [14]:
from kapipe.graph_construction import GraphConstructor

constructor = GraphConstructor()

# Temporarily save the triple extraction results (documents) to disk:
# the constructor is given file paths rather than in-memory documents.
utils.write_json("./temp_documents.json", [document])

# Build the graph from the saved documents only; no additional triples and no
# entity dictionary are supplied.
graph = constructor.construct_knowledge_graph(
    path_documents_list=["./temp_documents.json"],
    path_additional_triples=None,
    path_entity_dict=None
)


Processing ./temp_documents.json: 100%|█████████████████████████████████████████| 1/1 [00:00<00:00, 2884.67it/s]


### Apply Community Clustering

In [15]:
from kapipe.community_clustering import NeighborhoodAggregation

clusterer = NeighborhoodAggregation()

# Cluster the graph into communities; the result is printed in the next cell
# (records with community_id, nodes, level and parent/child community ids).
communities = clusterer.cluster_communities(graph=graph)

In [16]:
# Inspect the community records, starting with the "ROOT" entry.
print(utils.pretty_format_dict(communities))

[
    {
        "community_id": "ROOT",
        "nodes": null,
        "level": -1,
        "parent_community_id": null,
        "child_community_ids": [
            "Community(alice)",
            "Community(tokyo)",
            "Community(new york)",
            "Community(manhattan)",
            "Community(university of tokyo)",
            "Community(diana)",
            "Community(bob)",
            "Community(charlie)",
            "Community(london)",
            "Community(sydney)"
        ]
    },
    {
        "community_id": "Community(alice)",
        "nodes": [
            "alice",
            "university of tokyo",
            "diana",
            "manhattan",
            "new york",
            "tokyo"
        ],
        "level": 0,
        "parent_community_id": "ROOT",
        "child_community_ids": []
    },
    {
        "community_id": "Community(tokyo)",
        "nodes": [
            "tokyo",
            "bob",
            "alice"
        ],
        "level": 0,
 

### Apply Report Generation

In [17]:
from kapipe.report_generation import TemplateBasedReportGenerator

generator = TemplateBasedReportGenerator()

# Write the community reports to ./temp_reports.jsonl; the chunking step below
# reads this file back one JSON object per line.
generator.generate_community_reports(
    graph=graph,
    communities=communities,
    path_output="./temp_reports.jsonl"
)

### Apply Chunking

In [18]:
from kapipe.chunking import Chunker
import json

chunker = Chunker(model_name="en_core_web_sm")

# Read the community reports one JSON line at a time, split each report into
# chunked passages, and write one chunked passage per output line.
# NOTE: this loop rebinds the notebook-level name `passage`.
with open("./temp_reports_chunked.jsonl", "w") as fout, open("./temp_reports.jsonl") as fin:
    for line in fin:
        # Load the passage
        passage = json.loads(line.strip())
        print("FROM:")
        print(passage)
        # Split the passage into chunked passages
        chunked_passages = chunker.split_passage_to_chunked_passages(
            passage=passage,
            window_size=100
        )
        # Save the chunked passages
        print("TO:")
        for chunked_passage in chunked_passages:
            # Show the chunk itself, not the unsplit input passage.
            print(chunked_passage)
            json_str = json.dumps(chunked_passage)
            fout.write(json_str + "\n")

FROM:
{'title': 'The primary entities in this community are: alice, tokyo, new york', 'text': 'This community contains the following entities:\n- alice | Person | NO DESCRIPTION.\n- university of tokyo | Location | NO DESCRIPTION.\n- diana | Person | NO DESCRIPTION.\n- manhattan | Location | NO DESCRIPTION.\n- new york | Location | NO DESCRIPTION.\n- tokyo | Location | NO DESCRIPTION.\nThe relationships between the entities are as follows:\n- alice | born_in | tokyo\n- alice | lives_in | new york\n- alice | works_at | manhattan\n- alice | studied_in | university of tokyo\n- alice | friends_with | diana', 'community_id': 'Community(alice)', 'nodes': ['alice', 'university of tokyo', 'diana', 'manhattan', 'new york', 'tokyo'], 'level': 0, 'parent_community_id': 'ROOT', 'child_community_ids': []}
TO:
{'title': 'The primary entities in this community are: alice, tokyo, new york', 'text': 'This community contains the following entities:\n- alice | Person | NO DESCRIPTION.\n- university of to

### Apply Passage Retrieval (Indexing)

In [19]:
from kapipe.passage_retrieval import Contriever
import os

index_root = "./temp_index_root"
index_name = "temp_index_name"

# Create the directories that store the indexing result. They are derived from
# index_root / index_name so the layout stays in sync if those settings change.
utils.mkdir(os.path.join(index_root, "contriever"))
utils.mkdir(os.path.join(index_root, "contriever", "indexes", index_name))

# Load the chunked passages (one JSON object per line); the file is closed as
# soon as it has been read.
passages = []
with open("./temp_reports_chunked.jsonl") as fin:
    for line in fin:
        passage = json.loads(line.strip())
        passages.append(passage)
print(f"Number of passages: {len(passages)}")

# Initialize the retriever
# NOTE(review): gpu_id=0 presumably selects the first GPU -- confirm the
# behavior on CPU-only machines.
retriever = Contriever(
    max_passage_length=512,
    pooling_method="average",
    normalize=False,
    gpu_id=0,
    metric="inner-product"
)

# Build index
retriever.make_index(
    passages=passages,
    index_root=index_root,
    index_name=index_name
)

Number of passages: 10


### Example questions

In [20]:
# Example questions; keys are numbered "q1", "q2", ... in listing order.
questions = [
    {"question_key": f"q{number}", "question": text}
    for number, text in enumerate(
        [
            "Which city does Alice currently live in, and where was she born?",
            "Who once shared a house with Charlie in London, and where does that person work now?",
        ],
        start=1,
    )
]


### Apply Passage Retrieval (Search)

In [21]:
# Retrieve top-2 chunked passages for each question (query)
retrieved_passages = retriever.search(
    queries=[q["question"] for q in questions],
    top_k=2
)

# Convert the retrieved passages to Contexts format (required by QA component)
contexts = []
for q, p in zip(questions, retrieved_passages):
    c = {
        "question_key": q["question_key"],
        "contexts": p
    }
    contexts.append(c)

### Apply Question Answering (with retrieved contexts)

In [22]:
from kapipe.qa import QA

answerer = QA(identifier="gpt4o_with_context")

# Answer each question with its own retrieved contexts. Results are appended
# one by one, so answers obtained before a failing call remain available.
answers = []
for question, question_contexts in zip(questions, contexts):
    answers.append(
        answerer.answer(question=question, contexts_for_question=question_contexts)
    )

### Show the QA results, prompts, and LLM response

In [23]:
def show_qa_results(answers):
    """Print the key, question and answer of every QA result.

    Each element of ``answers`` is indexed by ``"question_key"``,
    ``"question"`` and ``"output_answer"``; one labelled line is printed per
    field, in that order.
    """
    labelled_fields = (
        ("Question Key", "question_key"),
        ("Question", "question"),
        ("Answer", "output_answer"),
    )
    for result in answers:
        for label, field in labelled_fields:
            print(f"{label}: {result[field]}")

In [24]:
# Print key, question and answer for every QA result.
show_qa_results(answers)

Question Key: q1
Question: Which city does Alice currently live in, and where was she born?
Answer: Alice currently lives in New York and was born in Tokyo.
Question Key: q2
Question: Who once shared a house with Charlie in London, and where does that person work now?
Answer: Diana once shared a house with Charlie in London, and she now works in London.


In [25]:
# For every answer: a banner line, the QA prompt, a separator, then the text
# generated for that prompt.
for a in answers:
    print("+++++++++++++++++++++++++++++++++++++++")
    print(a["qa_prompt"])
    print("================")
    print(a["qa_generated_text"])

+++++++++++++++++++++++++++++++++++++++
### Task

You are given a question and a set of **context passages**, each possibly from a different document.

Your task is to:

1. **Answer the question** based on the information in the provided context passages.
2. Provide a **confidence score** between 0.0 and 1.0, indicating how certain you are about your answer.

### Guidelines

- Base your answer **only on the given context passages** — do not use outside knowledge.
- Provide a **single-sentence answer** (no line breaks).
- The score should reflect your confidence in the correctness of your answer:
  - **1.0** = absolutely certain  
  - **0.0** = no idea  

### Output Format

Provide the output in the following format:

Answer: [your answer]  
Score: [score between 0.0 and 1.0]

Example:

Answer: Aspirin is used to treat pain, fever, and inflammation.  
Score: 0.95

### Context

[1] The primary entities in this community are: alice, tokyo, new york : This community contains the following 