In [9]:
%pip install -q langchain-core langchain-community langchain-text-splitters pydantic

Note: you may need to restart the kernel to use updated packages.


# Knowledge Graph Extraction

Using LangChain to create graph components:

- **Node**: Entities in our graph
- **Relationship**: Connections between nodes
- **GraphDocument**: Stores the graph structure

Docs: [LangChain GraphDocument](https://python.langchain.com/api_reference/community/graphs/langchain_community.graphs.graph_document.GraphDocument.html)



In [10]:
# Import things here

from langchain_text_splitters import TokenTextSplitter
from typing import List, Optional

# We intentionally don't use these langchain types because they are not compatible with the new openai api and we want to be able to use it to generate our graph 
# from langchain.pydantic_v1 import BaseModel, Field
# from langchain_community.graphs.graph_document import (
#     Node as BaseNode,
#     Relationship as BaseRelationship,
#     GraphDocument,
# )
from openai_interacter import OpenAIChatInterface
from pydantic import BaseModel, Field

In [11]:
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=200):
    """
    Splits text into chunks of a fixed token length with a specified overlap.
    
    Args:
        text (str): The text to split into chunks.
        chunk_size (int): The size of each chunk in tokens.
        chunk_overlap (int): The number of tokens to overlap between chunks.
    
    Returns:
        list: A list of text chunks.
    """
    # Initialize the text splitter with the specified chunk size and overlap
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    
    # Split the text into chunks
    chunks = text_splitter.split_text(text)
    
    return chunks



In [12]:
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str 
  value: str 

class Node(BaseModel):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseModel):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )
    
        
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [13]:
def create_extraction_chat(extraction_text: str, allowed_nodes: Optional[List[str]] = None, allowed_relationships: Optional[List[str]] = None):
    extraction_chat = OpenAIChatInterface(initial_messages=[{
        "role": "developer", 
        "content": 
        f"""
            # Knowledge Graph Instructions for GPT-4
            ## 1. Overview
            You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
            - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
            - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
            ## 2. Labeling Nodes
            - **Consistency**: Ensure you use basic or elementary types for node labels.
            - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
            - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
            {'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
            {'- **Allowed Relationship Types**:' + ", ".join(allowed_relationships) if allowed_relationships else ""}
            ## 3. Handling Numerical Data and Dates
            - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
            - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
            - **Property Format**: Properties must be in a key-value format.
            - **Quotation Marks**: Never use escaped single or double quotes within property values.
            - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
            ## 4. Coreference Resolution
            - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
            If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
            always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
            Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
            ## 5. Strict Compliance
            Adhere to the rules strictly. Non-compliance will result in termination.
        """
        
    }])

    extraction_chat.add_message(role="user", content=f"""

Extract the nodes and relationships in the following data: 
                                {extraction_text}

""")

    extraction_chat.enable_structured_output(KnowledgeGraph)
    return extraction_chat

In [14]:
extraction_chat = create_extraction_chat(
    extraction_text="""
    Albert Einstein was a German-born theoretical physicist who developed the theory of relativity. 
    He was born in Ulm, Germany in 1879. Einstein is best known for his mass-energy equivalence formula E = mc².
    In 1921, he received the Nobel Prize in Physics for his discovery of the law of the photoelectric effect.
    Einstein married Mileva Marić in 1903, and they had two sons together before divorcing in 1919.
    He later married his cousin Elsa Löwenthal in 1919.
    """
)

In [15]:
data = extraction_chat.parse_structured_output()
print(data)

nodes=[Node(properties=[Property(key='birthPlace', value='Ulm, Germany'), Property(key='birthDate', value='1879'), Property(key='field', value='theoretical physics')]), Node(properties=[Property(key='name', value='Mileva Marić')]), Node(properties=[Property(key='name', value='Nobel Prize in Physics'), Property(key='year', value='1921')]), Node(properties=[Property(key='relation', value='photoelectric effect')]), Node(properties=[Property(key='name', value='Elsa Löwenthal')])] rels=[Relationship(properties=[Property(key='relationship', value='developed')]), Relationship(properties=[Property(key='relationshipType', value='mass-energy equivalence formula')]), Relationship(properties=[Property(key='yearMarried', value='1903'), Property(key='numberOfChildren', value='2')]), Relationship(properties=[Property(key='yearDivorced', value='1919')]), Relationship(properties=[Property(key='yearMarried', value='1919')])]


In [16]:
data

KnowledgeGraph(nodes=[Node(properties=[Property(key='birthPlace', value='Ulm, Germany'), Property(key='birthDate', value='1879'), Property(key='field', value='theoretical physics')]), Node(properties=[Property(key='name', value='Mileva Marić')]), Node(properties=[Property(key='name', value='Nobel Prize in Physics'), Property(key='year', value='1921')]), Node(properties=[Property(key='relation', value='photoelectric effect')]), Node(properties=[Property(key='name', value='Elsa Löwenthal')])], rels=[Relationship(properties=[Property(key='relationship', value='developed')]), Relationship(properties=[Property(key='relationshipType', value='mass-energy equivalence formula')]), Relationship(properties=[Property(key='yearMarried', value='1903'), Property(key='numberOfChildren', value='2')]), Relationship(properties=[Property(key='yearDivorced', value='1919')]), Relationship(properties=[Property(key='yearMarried', value='1919')])])