# NER to Grow KG

## Setup

In [2]:
from dotenv import load_dotenv
import os

# Read the Neo4j connection settings from the local .env file. override=True
# is passed so values in .env are meant to win over variables already set in
# the process environment. Each setting is None when the variable is absent.
load_dotenv('.env', override=True)
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(setting_name)
    for setting_name in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
)

In [3]:
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel
from typing import Union
from langchain.load.serializable import Serializable

### Classes for Extracting Subgraph from Document

In [9]:
class Property(BaseModel):
    """A single property consisting of key and value"""
    # Both fields are required (Field(...)) and are typed as strings.
    # NOTE(review): the docstring and the Field descriptions presumably end up
    # in the schema shown to the LLM -- confirm before rewording them.
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")

class ExtractedNode(Serializable):
    """Represents a node in a graph with associated properties.

    Attributes:
        entityId (Union[str, int]): A unique identifier for the node.
        type (str): The type or label of the node, default is "Node".
        properties (list): Additional properties and metadata associated with the node.
    """

    # NOTE(review): this model is nested inside ExtractedSubGraph, which is
    # passed to create_structured_output_chain; the docstring above and the
    # Field descriptions presumably become part of the schema shown to the
    # LLM, so treat them as prompt text -- confirm before rewording.

    # Required: there is no default value.
    entityId: Union[str, int]
    type: str = "Node"
    # Optional list of key/value pairs; defaults to None when omitted.
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")


class ExtractedRelationship(Serializable):
    """Represents a directed relationship between two nodes in a graph.

    Attributes:
        source (ExtractedNode): The source node of the relationship.
        target (ExtractedNode): The target node of the relationship.
        type (str): The type of the relationship.
        properties (list): Additional properties associated with the relationship.
    """

    # NOTE(review): like ExtractedNode, this model is nested inside
    # ExtractedSubGraph; its docstring and Field descriptions presumably feed
    # the schema shown to the LLM -- confirm before rewording.

    # Endpoints are full ExtractedNode objects rather than bare identifiers.
    source: ExtractedNode
    target: ExtractedNode
    # Required: unlike the node type there is no default.
    type: str
    # Optional list of key/value pairs; defaults to None when omitted.
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class ExtractedSubGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    # This is the output schema handed to create_structured_output_chain in
    # get_extraction_chain. Both lists are required (Field(...)).
    # NOTE(review): the docstring reads like an instruction because it is
    # presumably surfaced to the LLM as the schema description -- confirm
    # before rewording.
    nodes: List[ExtractedNode] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[ExtractedRelationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

### Classes and methods for SubGraph Grown from Node in Another Graph
These classes and helper functions normalize an extracted subgraph and prepare it for ingestion into Neo4j.

In [10]:
class Node(Serializable):
    """Represents a cleaned node that is ready to be written to the graph.

    Attributes:
        entityId (Union[str, int]): A unique identifier for the node.
        type (str): The type or label of the node, default is "Node".
        properties (dict): Additional properties and metadata associated with the node.
    """

    # Required: there is no default value.
    entityId: Union[str, int]
    type: str = "Node"
    # default_factory gives every instance its own empty dict.
    properties: dict = Field(default_factory=dict)


class Relationship(Serializable):
    """Represents a directed relationship between two nodes in a graph.

    Attributes:
        source (Node): The source node of the relationship.
        target (Node): The target node of the relationship.
        type (str): The type of the relationship.
        properties (dict): Additional properties associated with the relationship.
    """

    # Endpoints are full Node objects rather than bare identifiers.
    source: Node
    target: Node
    # Required: there is no default relationship type.
    type: str
    # default_factory gives every instance its own empty dict.
    properties: dict = Field(default_factory=dict)


def format_property_key(s: str):
    """Convert a property key to camelCase.

    The key is split on whitespace. The first word starts with a lower-case
    letter and every following word is capitalized, then the words are joined
    without separators ("date of birth" -> "dateOfBirth").

    A first word that already contains internal capitals keeps them, so a key
    that is already camelCase ("birthDate") is returned unchanged instead of
    being flattened to "birthdate". A first word written entirely in upper
    case ("ID") is lower-cased as a whole.

    Args:
        s: The raw property key.

    Returns:
        The camelCase key, or ``s`` unchanged when it is empty or contains
        only whitespace.
    """
    words = s.split()
    if not words:
        return s
    head, *tail = words
    if head.isupper():
        # Acronym-style first word: "ID" -> "id", not "iD".
        head = head.lower()
    else:
        # Touch only the first character so existing camelCase survives.
        head = head[0].lower() + head[1:]
    return head + "".join(word.capitalize() for word in tail)

def format_properties(props):
    """Turn a list of key/value property objects into a plain dictionary.

    Each item's ``key`` is normalized with ``format_property_key`` and its
    ``value`` is stored as is. A falsy ``props`` (``None`` or an empty list)
    yields an empty dictionary. If two keys normalize to the same name, the
    later item wins.
    """
    if not props:
        return {}
    return {format_property_key(item.key): item.value for item in props}

def _normalize_label(label: str) -> str:
    """Capitalize a node label without flattening PascalCase labels."""
    if label.islower() or label.isupper():
        # Single-case input ("person", "PERSON") is normalized to "Person".
        return label.capitalize()
    # Mixed-case input such as "RiskFactor" keeps its internal capitals;
    # str.capitalize() would turn it into "Riskfactor".
    return label[:1].upper() + label[1:]


def clean_node(node: ExtractedNode):
    """Convert an extracted node into a ``Node`` ready for ingestion.

    Args:
        node: The node as returned by the extraction chain.

    Returns:
        A ``Node`` whose ``entityId`` is title-cased, whose ``type`` starts
        with an upper-case letter, and whose properties are a dict with
        camelCase keys.
    """
    return Node(
        # entityId is declared Union[str, int]; go through str() so an
        # integer id does not fail on the missing .title() method.
        entityId=str(node.entityId).title(),
        type=_normalize_label(node.type),
        properties=format_properties(node.properties),
    )

def clean_relationship(rel: ExtractedRelationship):
    """Convert an extracted relationship into a ``Relationship`` for ingestion.

    Both endpoints are cleaned with ``clean_node`` and the property list is
    turned into a dict with ``format_properties``. The relationship type is
    passed through unchanged.
    """
    cleaned_source = clean_node(rel.source)
    cleaned_target = clean_node(rel.target)
    cleaned_properties = format_properties(rel.properties)
    return Relationship(
        source=cleaned_source,
        target=cleaned_target,
        type=rel.type,
        properties=cleaned_properties,
    )


class GrownSubGraph:
    """Cleaned subgraph together with the key of the node it was grown from.

    Attributes:
        nodes: Cleaned nodes of the extracted subgraph.
        rels: Cleaned relationships of the extracted subgraph.
        sourceNodeKeyValue: Value identifying the source node.
        sourceNodeKeyLabel: Name of the property holding that value.
        sourceNodeLabel: Label of the source node.
    """
    nodes: List[Node]
    rels: List[Relationship]
    sourceNodeKeyValue: Any
    sourceNodeKeyLabel: str
    sourceNodeLabel: str

    def __init__(self, kg: ExtractedSubGraph, source_node_key_value, source_node_key_label: str = 'documentId', source_node_label: str = 'Document'):
        """Clean every node and relationship of ``kg`` and record the source node key."""
        self.nodes = list(map(clean_node, kg.nodes))
        self.rels = list(map(clean_relationship, kg.rels))
        self.sourceNodeKeyValue = source_node_key_value
        self.sourceNodeKeyLabel = source_node_key_label
        self.sourceNodeLabel = source_node_label

    def __repr__(self) -> str:
        """Return a multi-section summary: nodes, relationships, source node key and label."""
        sections = (
            f'nodes: {self.nodes} ',
            f' relationships: {self.rels} ',
            f' sourceNodeKey: ({self.sourceNodeKeyLabel}: {self.sourceNodeKeyValue}) \n sourceNodeLabel: {self.sourceNodeLabel}',
        )
        # Sections are separated by a blank line.
        return '\n\n'.join(sections)

    def __str__(self) -> str:
        """Use the same text as ``__repr__``."""
        return repr(self)


### Named Entity Recognition (NER)

In [4]:

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Chat model used by get_extraction_chain; temperature is pinned to 0.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo-16k",
)

def get_extraction_chain(
        allowed_nodes: Optional[List[str]] = None,
        allowed_rels: Optional[List[str]] = None
):
    """Build the chain that extracts an ``ExtractedSubGraph`` from text.

    Args:
        allowed_nodes: Node labels to list in the system prompt. When empty or
            ``None`` the corresponding prompt line is left blank.
        allowed_rels: Relationship types to list in the system prompt. When
            empty or ``None`` the corresponding prompt line is left blank.

    Returns:
        The result of ``create_structured_output_chain`` for
        ``ExtractedSubGraph``, the module-level ``llm`` and the prompt built
        here. The prompt keeps an ``{input}`` placeholder for the text to
        analyse.
    """
    # The two optional rule lines are computed up front so the prompt body
    # below only interpolates plain names.
    node_label_rule = (
        '- **Allowed Node Labels:**' + ", ".join(allowed_nodes)
        if allowed_nodes
        else ""
    )
    rel_type_rule = (
        '- **Allowed Relationship Types**:' + ", ".join(allowed_rels)
        if allowed_rels
        else ""
    )
    system_instructions = f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{node_label_rule}
{rel_type_rule}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """
    messages = [
        ("system", system_instructions),
        # Not an f-string: {input} stays a template placeholder.
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return create_structured_output_chain(ExtractedSubGraph, llm, prompt, verbose=False)

  warn_deprecated(


In [5]:
from langchain.graphs import Neo4jGraph

# Neo4j handle built from the connection settings loaded from the environment.
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)


## Sample NER Pipeline

In [6]:
# Fetch the text of every Chunk node along with its form id and source, then
# wrap each row in a langchain Document that carries both as metadata.
raw_documents = graph.query(
    'MATCH(n:Chunk) RETURN n.formId AS formId, n.source as source, n.text AS text'
)
documents = [
    Document(
        page_content=row['text'],
        metadata={"source": row['source'], 'formId': row['formId']},
    )
    for row in raw_documents
]
documents

[Document(page_content='Metals recycling was our original business, and it has been one of our core businesses for over a century. In the present era of conservation of natural resources and ecological concerns, we are committed to sound ecological and business conduct. Certain governmental regulations regarding environmental concerns, however well-intentioned, may expose us and our industry to potentially significant risks. We believe that recycled materials are commodities that are diverted by recyclers, such as us, from the solid waste streams because of their inherent value. They are identified, purchased, sorted, processed and sold in accordance with carefully established industry specifications.\nWe incurred environmental expenses of approximately $49.3 million, $44.2 million and $49.8 million for 2023, 2022 and 2021, respectively. The expenses included the cost of disposal, environmental personnel at various divisions, permit and license fees, accruals and payments for studies, 

In [7]:
# Show the metadata (source and formId) attached to the first document.
documents[0].metadata

{'source': 'https://www.sec.gov/Archives/edgar/data/22444/000002244423000126/0000022444-23-000126-index.htm',
 'formId': '0000022444-23-000126'}

In [12]:
from tqdm import tqdm

# TODO: add relationship type restrictions (the second argument is still None).
# Alternative, more domain-specific label set kept for reference. The original
# comment was missing the comma between 'Solution' and 'RiskFactor':
#   ['Customer', 'Employee', 'Equipment', 'Goal', 'Group', 'Industry', 'Location', 'Metric',
#    'Organization', 'Product', 'Service', 'Solution', 'RiskFactor', 'Technology']
extract_chain = get_extraction_chain(
    ['Person', 'Place', 'Thing', 'Group', 'Organization', 'Industry', 'Product', 'Service', 'Solution', 'RiskFactor'],
    None,
)
# Only the first six documents are processed in this sample run.
doc_sample = documents[0:6]
sub_graphs = []
for doc in tqdm(doc_sample, total=len(doc_sample)):
    extracted_sub_graph = extract_chain.run(doc.page_content)
    # Each grown subgraph is keyed by the formId of the chunk it came from.
    sub_graphs.append(GrownSubGraph(extracted_sub_graph, doc.metadata['formId']))

100%|██████████| 6/6 [04:03<00:00, 40.53s/it]


In [23]:
# Display the second grown subgraph to inspect the extraction result.
sub_graphs[1]

nodes: [Node(entityId='40,000', type='Thing', properties={'number': '40,000'}), Node(entityId='Retail', type='Industry'), Node(entityId='Technology', type='Industry'), Node(entityId='Energy', type='Industry'), Node(entityId='Education', type='Industry'), Node(entityId='Hospitality', type='Industry'), Node(entityId='Public Utilities', type='Industry'), Node(entityId='Others', type='Industry'), Node(entityId='Design, Engineering, And Construction-Oriented Firms', type='Group'), Node(entityId='North America', type='Place'), Node(entityId='Global Solutions', type='Group'), Node(entityId='2.5%', type='Thing', properties={'percentage': '2.5%'}), Node(entityId='U.S.', type='Place'), Node(entityId='Digital Printing Provider', type='Thing'), Node(entityId='Architectural, Engineering And Construction Industry (Aec)', type='Industry'), Node(entityId='Revenue', type='Thing'), Node(entityId='Number Of Customers', type='Thing'), Node(entityId='Number Of Service Centers', type='Thing'), Node(entityId

In [24]:
def _quote_cypher_identifier(name) -> str:
    """Wrap a label or property name in backticks, doubling embedded backticks."""
    return "`" + str(name).replace("`", "``") + "`"


def ingest_grown_subgraph(subgraph:GrownSubGraph, graph:Neo4jGraph):
    """Write a grown subgraph to Neo4j and link it to its source node.

    Three queries are issued, in order:

    1. Every node is merged on its label and ``entityId`` with
       ``apoc.merge.node``; the node properties are passed as the
       create-time properties.
    2. A ``MENTIONS`` relationship is merged from the source node (matched by
       ``sourceNodeLabel`` / ``sourceNodeKeyLabel`` / ``sourceNodeKeyValue``)
       to each merged node, with creation and last-update timestamps.
    3. Every relationship is merged with ``apoc.merge.relationship`` after
       merging its endpoints. The relationship type is upper-cased and spaces
       are replaced by underscores.

    Args:
        subgraph: The cleaned subgraph and the key of its source node.
        graph: The Neo4j connection to write to.

    Returns:
        None.
    """
    # Build the rows explicitly (as is done for relationships below) instead
    # of exposing each model's __dict__ to the query.
    node_rows = [
        {"entityId": node.entityId, "type": node.type, "properties": node.properties}
        for node in subgraph.nodes
    ]
    # Ingest nodes
    node_res = graph.query(
        """
        UNWIND $data AS row
        CALL apoc.merge.node([row.type], {entityId: row.entityId},
        row.properties, {}) YIELD node
        RETURN elementId(node) AS elementId
        """,
        {"data": node_rows}
    )
    element_ids = [record['elementId'] for record in node_res]

    # Labels and property names cannot be passed as query parameters, so they
    # are interpolated into the query text. Backtick-quote them so names with
    # spaces or special characters stay valid and cannot alter the query.
    source_label = _quote_cypher_identifier(subgraph.sourceNodeLabel)
    source_key = _quote_cypher_identifier(subgraph.sourceNodeKeyLabel)
    # Link back to Source Node
    graph.query(
        f"""
        MATCH (n) WHERE elementId(n) IN $elementIds
        MATCH (s:{source_label} {{{source_key}: $sourceNodeKey}})
        MERGE (s)-[r:MENTIONS]->(n)
        ON CREATE SET r.creationTime=datetime(), r.lastUpdateDate=datetime()
        ON MATCH SET r.lastUpdateDate=datetime()
        RETURN distinct 'done' AS result
        """,
        {"elementIds": element_ids, "sourceNodeKey": subgraph.sourceNodeKeyValue}
    )

    rel_rows = [
        {
            "source": rel.source.entityId,
            "source_label": rel.source.type,
            "target": rel.target.entityId,
            "target_label": rel.target.type,
            "type": rel.type.replace(" ", "_").upper(),
            "properties": rel.properties,
        }
        for rel in subgraph.rels
    ]
    # Ingest Relationships
    graph.query(
        """
        UNWIND $data AS row
        CALL apoc.merge.node([row.source_label], {entityId: row.source},
        {}, {}) YIELD node as source
        CALL apoc.merge.node([row.target_label], {entityId: row.target},
        {}, {}) YIELD node as target
        CALL apoc.merge.relationship(source, row.type, {}, row.properties, target) YIELD rel
        RETURN distinct 'done'
        """,
        {"data": rel_rows}
    )

In [26]:
# Write each grown subgraph to Neo4j, with a progress bar.
for sub_graph in tqdm(sub_graphs, total=len(sub_graphs)):
    ingest_grown_subgraph(sub_graph, graph)

  0%|          | 0/6 [00:00<?, ?it/s]Failed to read from defunct connection ResolvedIPv4Address(('54.245.8.251', 7687)) (ResolvedIPv4Address(('54.245.8.251', 7687)))
100%|██████████| 6/6 [00:05<00:00,  1.10it/s]
