# Create knowledge graph from pdf with LLM. (neo4j version)
![title](neo4jdogs.png)

In [2]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Load env variables and connect to neo4j database
Please run _docker-compose up_ first on the directory to start the database.    

In [3]:
import os
from langchain.graphs import Neo4jGraph
from dotenv import load_dotenv

load_dotenv()

graph = Neo4jGraph(
    url=os.getenv("NEO4J_URL"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD")
)

In [4]:
from typing import List, Dict, Any, Optional
from langchain.schema import Document
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import OutputParserException
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
llm = AzureChatOpenAI(
    model=os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature=0, 
    max_tokens=4000, 
    response_format={ "type": "json_object" },
    verbose=True)



                    response_format was transferred to model_kwargs.
                    Please confirm that response_format is what you intended.


# Magic
The prompt, the function call and the chain.

In [5]:

from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers.ernie_functions import JsonOutputFunctionsParser
from langchain_core.runnables import  RunnablePassthrough

# The prompt is from langchain examples

system_prompt = """
# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
*Double check* that the JSON structure is correct.

"""


# JSON Schema used in this was generated from python dictionary (langchain.graphs.graph_document) with gpt ui.

functions = [
    {
        "name": "knowledge_graph",
        "description": "A knowledge graph output",
        "parameters": {
            "type": "object",
            "properties": {
                "nodes": {
                    "description": "List of nodes in a graph with associated properties",
                    "type": "array",
                    "items": {
                        "type": "object",
                        "description": "Represents a node in a graph with associated properties",
                        "properties": {
                            "id": {
                                "type": "string"
                            },
                            "type": {
                                "type": "string"
                            },
                            "properties": {
                                "type": "array",
                                "description": "Additional properties and metadata associated with the node.",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "key": {
                                            "type": "string"
                                        },
                                        "value": {
                                            "type": "string"
                                        }
                                    },
                                    "required": [
                                        "key",
                                        "value"
                                    ]
                                }
                            }
                        },
                        "required": [
                            "id",
                            "type"
                        ]
                    }
                },
                "rels": {
                    "description": "List of directed relationships between nodes in a graph.",
                    "type": "array",
                    "items": {
                        "type": "object",
                        "description": "Represents a directed relationship between two nodes in a graph.",
                        "properties": {
                            "source": {
                                "type": "object",
                                "properties": {
                                    "id": {
                                        "type": "string"
                                    }
                                },
                                "required": [
                                    "id"
                                ]
                            },
                            "target": {
                                "type": "object",
                                "properties": {
                                    "id": {
                                        "type": "string"
                                    }
                                },
                                "required": [
                                    "id"
                                ]
                            },
                            "type": {
                                "type": "string"
                            },
                            "properties": {
                                "type": "array",
                                "description": "Additional properties and metadata associated with the relationship.",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "key": {
                                            "type": "string"
                                        },
                                        "value": {
                                            "type": "string"
                                        }
                                    },
                                    "required": [
                                        "key",
                                        "value"
                                    ]
                                }
                            }
                        },
                        "required": [
                            "source",
                            "target",
                            "type"
                        ]
                    }
                },
            },
            "required": ["nodes", "rels"],
        },
    }
]

# Added some more precise instructions to have more control over the output. Likely you would have existing schemas or terminology that you would like to reuse.

prompt = ChatPromptTemplate.from_messages([
                ("system",system_prompt),
                ("human", 
                 """In this particular case we are interested in dogs. We want to extract information about dog breeds and their characteristics.
                    Characterics should be nodes and relationships should be between dog breeds and their characteristics. 
                    Ignore other entities than dogs, like people and addresses.
                    - **Allowed Node Labels:** Breed, BreedingGroup, Characteristic
                    {user_input}"""),
            ])

chain = (
    {"user_input": RunnablePassthrough()}
    | prompt
    | llm.bind(function_call={"name": "knowledge_graph"}, functions=functions)
    | JsonOutputFunctionsParser()
)



In [6]:
from  langchain.graphs.graph_document import Node, Relationship
from model import map_to_base_node, map_to_base_relationship



def extract_and_store_graph(
    document: Document,
    prompt: str) -> Optional[GraphDocument]:
    # Extract graph data using OpenAI functions
    try: 
        try:
            data = chain.invoke(document.page_content)        
        except OutputParserException as e:
            print("output exception")
            print(e)
        # Construct a graph document
        nodes = []
        rels = []
        try:
            nodes= list(map(map_to_base_node, data["nodes"]))
            rels= map_to_base_relationship(data["rels"], nodes)
        except Exception as e:
            print(e)
        

        graph_document = GraphDocument(
            nodes = nodes,
            relationships = rels,
            source = document
        )
        # Store information into a graph
        return graph_document
    except Exception as e:
        print(e)
        print("Failed to extract graph data from document")
        

# Download test documents

In [7]:
from tqdm import tqdm
import urllib.request

local_folder = "./data/"
os.makedirs(local_folder,exist_ok=True)

doc_names = []

documents = [
"https://www.marinhumane.org/wp-content/uploads/2017/06/Dog-Breed-Characteristics-Behavior.pdf" 
]
for doc in tqdm(documents):
    print("Downloading", doc)
    doc_names.append(doc.split("/")[-1])
    if os.path.isfile(local_folder + doc.split("/")[-1]):
        continue
    urllib.request.urlretrieve(doc, local_folder + doc.split("/")[-1])
    

100%|██████████| 1/1 [00:00<00:00, 6786.90it/s]

Downloading https://www.marinhumane.org/wp-content/uploads/2017/06/Dog-Breed-Characteristics-Behavior.pdf





## PDF to Txt
Read and chuck the docs

In [8]:
import time

from pdf import parse_pdf

# Currently this utility only uses pypdf. For more serious stuff you should use Azure Document Intelligence or similar service.

docs_pages_map = dict()
for doc in doc_names:
    print("Processing ",doc)
    start_time = time.time()
    
    doc_map = parse_pdf(file=local_folder+doc)
    docs_pages_map[doc]= doc_map
    
    # Capture the end time and Calculate the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time

    print(f"Processed {len(doc_map)} pages in {elapsed_time:.6f} seconds\n")
    
print(docs_pages_map)

Processing  Dog-Breed-Characteristics-Behavior.pdf
Processed 7 pages in 0.159831 seconds

{'Dog-Breed-Characteristics-Behavior.pdf': [(0, 0, ' \n  \n Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 1 of 7 \nDog Breed Characteristics & Behavior  \n \nWhy is it important to know about the characteristics and behavior of different breeds?  \nAll dogs are individuals and have their own personalities. At the same time, different breeds tend to also \nhave certain characteristics that help define that particular breed. This information can be helpful to you \nwhen you are choosing a  dog or trying to understand his  behavior.  \n \nThe AKC (American Kennel Club) places dog breeds within seven different groups. In order to ac count for \nthe different behaviors within a particular group, some groups can be further subdivided into families.  \n \nHerding group:  \

## Load documents to Neo4J.
Parse the docs with LLM to extract the graph.
This will take some time. Later we store the graph into pickle,so that you don't need to do this all the time.

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
#the pdf used was not actual book, so just combining all pages with cleanup into one document and chunking it
# Controlling the chunk-size allows you to finetune how much time is spent in the entity extraction.
# Bigger chunks are better, but lead to more errors, which leads to more time spent in fixing the errors.
# At the moment this is unoptimized and likely could be improved :/
full_doc = ""
graph_docs = []
for doc_name,doc_map in docs_pages_map.items():    
    for page in tqdm(doc_map):
        page_num = page[0] + 1
        content = page[2].strip()
        content = os.linesep.join(
            [
                line.strip() for line in content.splitlines()
                if line and line.strip()
            ]
        )
        full_doc += content + "\n"
if len(full_doc) > 0:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=5000,
        chunk_overlap=150,
    )
    texts = text_splitter.create_documents([full_doc])
    for text in texts:
        graph_docs.append(extract_and_store_graph(text, system_prompt))
        
graph.add_graph_documents(graph_docs)


100%|██████████| 7/7 [00:00<00:00, 21683.99it/s]


output exception
Could not parse function call data: Expecting ',' delimiter: line 204 column 8 (char 3293)
cannot access local variable 'data' where it is not associated with a value
output exception
Could not parse function call data: Expecting ':' delimiter: line 159 column 8 (char 3383)
cannot access local variable 'data' where it is not associated with a value
Azure has not provided the response due to a content filter being triggered
Failed to extract graph data from document
Azure has not provided the response due to a content filter being triggered
Failed to extract graph data from document
output exception
Could not parse function call data: Unterminated string starting at: line 202 column 7 (char 3669)
cannot access local variable 'data' where it is not associated with a value
output exception
Could not parse function call data: Expecting ':' delimiter: line 201 column 8 (char 3346)
cannot access local variable 'data' where it is not associated with a value
output exception
C

AttributeError: 'NoneType' object has no attribute 'nodes'

In [None]:
#Let's pickle the graph so we don't have to redo this all the time
import pickle


with open('./data/graph_docs.pkl','wb') as f:
    pickle.dump(graph_docs, f)

In [None]:
# Only execute this cell if you need to load the graph from pickle (like after kernel restart)
import pickle
with open('./data/graph_docs.pkl','rb') as f:
    graph_docs = pickle.load(f)
    graph.add_graph_documents(graph_docs)
    

In [23]:
from langchain.chains import GraphCypherQAChain

graph.refresh_schema()
print(graph.structured_schema)

# Need new llm, cypher chain does not work with json output
c_llm = AzureChatOpenAI(
    model=os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature=0, 
    max_tokens=1500,
    verbose=True)

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=c_llm,
    qa_llm=c_llm,
    validate_cypher=True,
    verbose=True
)

{'node_props': {'Breedinggroup': [{'property': 'description', 'type': 'STRING'}, {'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}], 'Characteristic': [{'property': 'associatedwith', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'id', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}], 'Breed': [{'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'temperament', 'type': 'STRING'}, {'property': 'favoriteactivity', 'type': 'STRING'}, {'property': 'disposition', 'type': 'STRING'}, {'property': 'purpose', 'type': 'STRING'}]}, 'rel_props': {}, 'relationships': [{'start': 'Breedinggroup', 'type': 'HASCHARACTERISTIC', 'end': 'Characteristic'}, {'start': 'Breedinggroup', 'type': 'INCLUDESBREED', 'end': 'Breed'}, {'start': 'Breed', 'type': 'BELONGSTO', 'end': 'Breedinggroup'}, {'start': 'Breed', 'type': 'INCLUDESBREED', 'end': 'Breed'}, {'start

In [25]:
cypher_chain.run("Which breeds are loyal?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (b:Breed)-[:HASCHARACTERISTIC]->(c:Characteristic {name: "loyal"})
RETURN b.name AS BreedName
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I don't know the answer."

In [10]:
graph.query("MATCH (n) DETACH DELETE n")

[]