In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
# OPENAI_API_KEY in .env file

from langchain.graphs import Neo4jGraph

url = "bolt://localhost:7687"
username = "neo4j"
password = "athena_password"
graph = Neo4jGraph(url=url, username=username, password=password)

In [2]:
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.chains.openai_functions.extraction import (
    create_extraction_chain_pydantic,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Check out the new 11/06/2023 models!
gpt3_turbo = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)
gpt4_turbo = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

gpt3 = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
gpt4 = ChatOpenAI(model="gpt-4", temperature=0)

# New GPT4-Turbo "works", but now creates duplicate nodes in the relationships.

# New GPT3-Turbo seems to throw an error as well. It throws a ValidationError.

# I had to lock the openai dependency to < 1.0.0 to successfully instantiate the ChatOpenAI variables

In [42]:
from typing import List, Dict, Any, Optional

from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.pydantic_v1 import Field, BaseModel
from langchain.schema import Document


class Property(BaseModel):
    """A single property consisting of key and value"""

    key: str = Field(..., description="key")
    value: str = Field(..., description="value")


class Node(BaseNode):
    properties: Optional[List[Property]] = Field(None, description="List of node properties")


class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(None, description="List of relationship properties")


class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""

    nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")

In [37]:
def format_property_key(s: str) -> str:
    words = s.split()

    if not words:
        return s

    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]

    return "".join([first_word] + capitalized_words)


def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""

    properties = {}

    if not props:
        return properties

    for p in props:
        properties[format_property_key(p.key)] = p.value

    return properties


def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""

    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()

    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""

    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}

    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [44]:
def get_extraction_chain(
    llm,
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None,
    verbose: bool = False
):
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                f"""# Knowledge Graph Instructions for GPT
## 1. Overview
You are a top-tier algorithm designed for extracting information from markdown notes in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """,
            ),
            (
                "human",
                "Use the given format to extract information from the following input: {input}",
            ),
            ("human", "Tip: Make sure to answer in the correct format"),
        ]
    )
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=verbose)

In [39]:
def extract_and_store_graph(
    llm,
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None,
) -> None:
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(llm, nodes, rels)
    data = extract_chain.run(document.page_content)

    # Construct a graph document
    graph_document = GraphDocument(
        nodes=[map_to_base_node(node) for node in data.nodes],
        relationships=[map_to_base_relationship(rel) for rel in data.rels],
        source=document,
    )

    # Store information into a graph
    graph.add_graph_documents([graph_document])

In [8]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import TokenTextSplitter


loader = DirectoryLoader('../../../../', glob="**/*.md")
raw_docs = loader.load()
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)

docs = text_splitter.split_documents(raw_docs)

still need to deal with cutting off embedding id at top of docs

In [9]:
docs

[Document(page_content='embed_id: asdfjkl;\n\nHeinrich Schliemann\ndiscovers Troy in the 1870s - it’s now known to be a real place! Archaeology is having a revolution. Period of romanticism and philhellenism - Greece gaining independence from Ottoman empire.\n\nfound several layers of different rebuildings of Troy\nTroy - very strategic location at mouth of water trade route\n\nvery destructive digging and some looting - dug a giant hole\n\n“The Treasure of Priam” - lots of wealth - gold & copper items\n\nbro liked to lie, was brilliant linguist but generally questionable & problematic\n\nFueled by success of Troy, Schliemann finds Mycenae\n- again digs straight down, into burial sites “Grave Circle A”\n- finds again lots of gold - remember no gold found in Greece, had to have come from somewhere else via war / trade\n- “palace of Agamemnon”\n\nStill scholarship being conducted on these findings with new information, techniques, and technology. DNA analysis\n\nMinoans 1700 - 1500 BCE\n

In [10]:
# Delete the graph
graph.query("MATCH (n) DETACH DELETE n")

[]

In [11]:
# want to generate embeddings for documents and store them in the graph database as well
import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [66]:
# experiment with one doc
chain = get_extraction_chain(gpt4)
data = chain.run(docs[0].page_content)

In [87]:
# Delete the graph
graph.query("MATCH (n) DETACH DELETE n")

[]

In [65]:
# add to neo4j database
graph.add_graph_documents([graph_document])

In [68]:
base_nodes = [map_to_base_node(node) for node in data.nodes]
base_relationships = [map_to_base_relationship(rel) for rel in data.rels]

In [59]:
base_nodes

[Node(id='Heinrichschliemann', type='Person', properties={'discovery': 'Troy', 'discoveryperiod': '1870s', 'occupation': 'archaeologist', 'reputation': 'brilliant linguist, generally questionable and problematic', 'name': 'Heinrichschliemann'}),
 Node(id='Troy', type='Place', properties={'location': 'strategic location at mouth of water trade route', 'discovery': '1870s', 'excavationmethod': 'very destructive digging and some looting', 'name': 'Troy'}),
 Node(id='Treasureofpriam', type='Artifact', properties={'composition': 'gold & copper items', 'name': 'Treasureofpriam'}),
 Node(id='Mycenae', type='Place', properties={'discoveryby': 'HeinrichSchliemann', 'excavationmethod': 'digs straight down into burial sites', 'notablefind': 'Grave Circle A', 'associatedlegend': 'palace of Agamemnon', 'name': 'Mycenae'}),
 Node(id='Minoans', type='Civilization', properties={'timeperiod': '1700 - 1500 BCE', 'origin': 'Crete', 'influence': 'on Mycenaean Greeks', 'economicsystem': 'redistributed econ

In [60]:
base_relationships

[Relationship(source=Node(id='Heinrichschliemann', properties={'name': 'Heinrichschliemann'}), target=Node(id='Troy', properties={'name': 'Troy'}), type='discovered'),
 Relationship(source=Node(id='Heinrichschliemann', properties={'name': 'Heinrichschliemann'}), target=Node(id='Mycenae', properties={'name': 'Mycenae'}), type='discovered'),
 Relationship(source=Node(id='Minoans', properties={'name': 'Minoans'}), target=Node(id='Crete', properties={'name': 'Crete'}), type='originatedFrom'),
 Relationship(source=Node(id='Minoans', properties={'name': 'Minoans'}), target=Node(id='Knossos', properties={'name': 'Knossos'}), type='built'),
 Relationship(source=Node(id='Mycenae', properties={'name': 'Mycenae'}), target=Node(id='Minoans', properties={'name': 'Minoans'}), type='influencedBy')]

In [21]:
doc0_embedding = embeddings_model.embed_documents([docs[0].page_content])[0]

In [39]:
doc0_embedding

[0.0007876761282042037,
 -0.01594940299677683,
 0.013811306955887646,
 -0.020424799401723554,
 -0.013798027226470002,
 0.024554909803294674,
 -0.019893595323855668,
 -0.017250853732611253,
 0.005521202803434515,
 -0.04416962242747612,
 0.019216309565780534,
 0.03731708833286419,
 0.0029681031110497295,
 -0.007270856281477994,
 -0.020013115682582364,
 0.022124651333313625,
 0.025484516939563474,
 0.018685105487912648,
 -0.0031706245726045973,
 0.0005631593872055561,
 -0.030995760178765413,
 0.003165644557657652,
 -0.0015413219602453973,
 -0.012410255921114308,
 0.010398320569321957,
 0.009999917510921044,
 0.03532507490350489,
 -0.03444858854755193,
 0.005670604183165516,
 -0.012941459998982193,
 -0.004993318890751699,
 -0.016254847017931603,
 -0.028074137750492046,
 -0.005813365232526378,
 -0.025736840180402405,
 -0.0006511400393193587,
 0.027941336731025074,
 -0.01423627040444648,
 -0.0003799354544662701,
 0.01778205781047914,
 -0.02316050003021411,
 0.009335912413586186,
 -0.00311584

In [22]:
docs[0].metadata

{'source': '../../../../09-02-23 - Lecture 2 - The Bronze Age.md'}

In [88]:
file_node = BaseNode(id="file_bronze_age", type="ObsidianNote", properties={"name":"file_bronze_age", "source": '../../../../09-02-23 - Lecture 2 - The Bronze Age.md'})
chunk_node = BaseNode(id="file_bronze_age_chunk_0", type="ObsidianNoteChunk", properties={"name":"chunk_bronze_age_0", "estimate_token_chunk_size": 512, "embedding":doc0_embedding})

In [89]:
file_to_chunk_relationship = BaseRelationship(source=file_node, target=chunk_node, type="contains_note_chunk")

In [90]:
chunk_relationships = []
for node in base_nodes:
    rel = BaseRelationship(source=chunk_node, target=node, type="references_node")
    chunk_relationships.append(rel)

In [91]:
chunk_relationships

[Relationship(source=Node(id='file_bronze_age_chunk_0', type='ObsidianNoteChunk', properties={'name': 'chunk_bronze_age_0', 'estimate_token_chunk_size': 512, 'embedding': [-0.006657697985245407, -0.009600887448858167, 0.010940545664503025, -0.01302446010007967, -0.01618415925331683, 0.02300424032068782, -0.021015050707863237, -0.026766109391051274, -0.0005505794175122232, -0.04841716030988084, 0.018619903156894047, 0.03864712418579039, 0.003981763186355774, -0.0064445703936824, -0.022638878735151236, 0.027767471187554182, 0.020108411871467172, 0.016089436293209986, -0.0051319754817869484, 0.0003748760937446049, -0.030500916088987938, 0.005463507239144814, 0.0015646266428825267, -0.00691142099809383, 0.00400544439204379, 0.010284248208555299, 0.03677972176814467, -0.032070615646131895, 0.0004406326796582123, -0.016265351958532882, -0.008315356772034732, -0.01749675416521228, -0.031745852275848564, -0.004347124771892357, -0.016116500528282004, -0.00017306039026902963, 0.02152926186100547

In [85]:
graph_document = GraphDocument(
    nodes = base_nodes + [file_node, chunk_node],
    relationships = base_relationships + chunk_relationships + [file_to_chunk_relationship],
    source = docs[0]
)

In [86]:
# add to neo4j database
graph.add_graph_documents([graph_document])