# Create knowledge graph from pdf with LLM. (neo4j version)
![title](neo4jdogs.png)

In [2]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Load env variables and connect to neo4j database
Please run _docker-compose up_ in this directory first to start the database.

In [3]:
import os
from langchain.graphs import Neo4jGraph
from dotenv import load_dotenv

# load_dotenv() pulls variables from a .env file (when one is found) into the
# process environment, so the NEO4J_* lookups below can resolve.
load_dotenv()

# Connection shared by the later cells that insert graph documents and run queries.
neo4j_url = os.getenv("NEO4J_URL")
neo4j_username = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")
graph = Neo4jGraph(url=neo4j_url, username=neo4j_username, password=neo4j_password)

In [4]:
from langchain.schema import Document
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import OutputParserException
# Chat model that drives the extraction agent. The deployment name comes from
# the environment; temperature 0 asks for the least random sampling.
extraction_deployment = os.getenv("OPENAI_DEPLOYMENT_NAME")
llm = AzureChatOpenAI(model=extraction_deployment, temperature=0, max_tokens=4000, verbose=True)



  warn_deprecated(


In [5]:
# Simplified Pydantic model of the graph.
# The Langchain KnowledgeGraph model is too complicated as OpenAI functions schema


from typing import List, Dict, Optional, Union
from langchain.pydantic_v1 import Field, BaseModel

# Explicit key/value pair. Node and Relationship carry lists of these instead of
# a free-form mapping.
class Property(BaseModel):
  """A single property consisting of key and value"""
  # Both fields are required (`...` means "no default") and are typed as strings.
  # NOTE(review): the docstring and the Field descriptions presumably end up in
  # the function schema shown to the model, so treat them as prompt text — confirm.
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Node shape the model is asked to emit.
class Node(BaseModel):
    "Represents a node in a graph with associated properties"
    # Required identifier. The annotation accepts str or int.
    id: Union[str, int]
    # Node label; the literal "Node" is used when no type is supplied.
    type: Optional[str] = "Node"
    # Optional key/value properties; the default is None, not an empty list.
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Directed edge between two nodes, referenced by their ids.
class Relationship(BaseModel):
    "Represents a directed relationship between two nodes in a graph."
    # Both endpoints are required and use the same str-or-int id type as Node.id.
    source: Union[str, int] = Field(..., description="Id of source node")
    target: Union[str, int] = Field(..., description="Id of target node")
    # NOTE(review): annotated Optional but declared with `...`, so a value must be
    # supplied; presumably None is still accepted under pydantic v1 — confirm.
    type: Optional[str] =  Field(..., description="Type of relationship")
    # Optional key/value properties; the default is None, not an empty list.
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )
    
# Top-level payload of the tool call: the argument type of the `knowledge_graph`
# tool. The class docstring is kept to one line because pydantic exposes it as
# the schema description.
class KnowledgeGraph(BaseModel):
    """Knowledge graph consisting of nodes and relationships"""
    # Both lists are required. The attribute is named `rels` (not
    # `relationships`); extract_and_store_graph reads it under that name.
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

# Magic
The prompt, the function call and the chain.

In [6]:

from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_core.tracers import ConsoleCallbackHandler
from langchain.tools import tool

# The prompt is from langchain examples.
# It is passed unchanged as the "system" message of the ChatPromptTemplate in
# this cell, so everything between the triple quotes (markdown headings,
# bullets, blank lines) is text sent to the model. Edit it as prompt text, not
# as a comment.

system_prompt = """
# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
*Double check* that the JSON structure is correct.

"""

# Module-level buffer: the knowledge_graph tool appends each payload it
# receives here, and the page-loading loop reads and then clears it.
function_responses = list()

# Define the function call as a langchain tool (langchain converts it to an
# OpenAI function). The argument schema is defined by the KnowledgeGraph
# pydantic model.
# The docstring is kept to one line on purpose: the @tool decorator uses it as
# the tool description, so it is part of what the model sees.
@tool
def knowledge_graph(object: KnowledgeGraph) -> bool:
    """A Tool to convert text to knowledge graph"""
    # `object` shadows the builtin, but the parameter name is part of the tool's
    # argument schema, so it is left unchanged.
    # The tool does no conversion itself: it only records the validated payload
    # in the module-level buffer for the page-loading loop to pick up.
    function_responses.append(object)
    # The agent only needs an acknowledgement; True is passed back to it as the
    # tool result.
    return True


# Added some more precise instructions to have more control over the output. 
# Likely you would have existing schemas or terminology that you would like to reuse.

# The agent gets exactly one tool. A single list is shared by the agent and the
# executor so the two cannot drift apart.
agent_tools = [knowledge_graph]

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder("chat_history", optional=True),
    # The leading spaces inside this literal are part of the text sent to the
    # model and are kept unchanged; only the "Characteristics" typo is fixed.
    ("human",
     """In this particular case we are interested in dogs. We want to extract information about dog breeds and their characteristics.
                    Characteristics should be nodes and relationships should be between dog breeds and their characteristics. 
                    Ignore other entities than dogs, like people and addresses.
                    - **Allowed Node Labels:** Breed, BreedingGroup, Characteristic
                    {input}"""),
    # Filled by the agent with its intermediate tool calls and results.
    MessagesPlaceholder("agent_scratchpad"),
])

function_agent = create_openai_functions_agent(llm, agent_tools, prompt)
chain = AgentExecutor(agent=function_agent, tools=agent_tools, verbose=True, callbacks=[ConsoleCallbackHandler()])


In [7]:
from model import map_to_base_node, map_to_base_relationship
from langchain_community.graphs.graph_document import GraphDocument

def extract_and_store_graph(data: KnowledgeGraph, document: Document) -> Optional[GraphDocument]:
    """Build a GraphDocument from one KnowledgeGraph payload and its source.

    Despite the name, nothing is written to the database here; the function
    only maps the payload and returns the result.

    Args:
        data: Payload whose ``nodes`` and ``rels`` are mapped with
            ``map_to_base_node`` / ``map_to_base_relationship``.
        document: Stored as ``source`` on the returned GraphDocument.

    Returns:
        The assembled GraphDocument, or None when no nodes were mapped.
    """
    base_nodes, base_rels = [], []
    try:
        base_nodes = [map_to_base_node(raw_node) for raw_node in data.nodes]
        base_rels = map_to_base_relationship(data.rels, base_nodes)
    except Exception as exc:
        # Best effort: report the failure and continue with whatever was mapped
        # before it (nodes may be set while relationships stay empty).
        print("parsing exception")
        print(exc)

    if not base_nodes:
        return None

    return GraphDocument(nodes=base_nodes, relationships=base_rels, source=document)

        

# Download test documents

In [8]:
from tqdm import tqdm
import urllib.request

# Trailing slash is intentional: the parsing cell builds paths as
# `local_folder + doc`.
local_folder = "./data/"
os.makedirs(local_folder, exist_ok=True)

# File names (not URLs) of the documents available locally; read by the
# parsing cell.
doc_names = []

documents = [
    "https://www.marinhumane.org/wp-content/uploads/2017/06/Dog-Breed-Characteristics-Behavior.pdf"
]
for doc in tqdm(documents):
    file_name = doc.split("/")[-1]
    local_path = os.path.join(local_folder, file_name)
    doc_names.append(file_name)

    if os.path.isfile(local_path):
        # Say so explicitly, instead of announcing a download that never happens.
        print("Already downloaded", doc)
        continue

    print("Downloading", doc)
    # Fetch into a temporary name and rename on success, so an interrupted
    # download cannot leave a truncated file that later runs treat as complete.
    partial_path = local_path + ".part"
    urllib.request.urlretrieve(doc, partial_path)
    os.replace(partial_path, local_path)
    

100%|██████████| 1/1 [00:00<00:00, 7943.76it/s]

Downloading https://www.marinhumane.org/wp-content/uploads/2017/06/Dog-Breed-Characteristics-Behavior.pdf





## PDF to Txt
Read the docs and chunk them (one chunk per page).

In [9]:
import time

from pdf import parse_pdf

# Currently this utility only uses pypdf. For more serious stuff you should use Azure Document Intelligence or similar service.

# Maps each downloaded file name to whatever parse_pdf returns for it.
# NOTE(review): judging by how it is used (len() is reported as the page count
# and the loading loop reads page[2] as text), this looks like one entry per
# page — confirm against pdf.parse_pdf.
docs_pages_map = {}
for doc in doc_names:
    print("Processing ",doc)
    start_time = time.time()

    doc_map = parse_pdf(file=f"{local_folder}{doc}")
    docs_pages_map[doc] = doc_map

    # Elapsed time covers the parse and the dictionary store.
    elapsed_time = time.time() - start_time
    print(f"Processed {len(doc_map)} pages in {elapsed_time:.6f} seconds\n")

print(docs_pages_map)

Processing  Dog-Breed-Characteristics-Behavior.pdf
Processed 7 pages in 0.149654 seconds

{'Dog-Breed-Characteristics-Behavior.pdf': [(0, 0, ' \n  \n Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 1 of 7 \nDog Breed Characteristics & Behavior  \n \nWhy is it important to know about the characteristics and behavior of different breeds?  \nAll dogs are individuals and have their own personalities. At the same time, different breeds tend to also \nhave certain characteristics that help define that particular breed. This information can be helpful to you \nwhen you are choosing a  dog or trying to understand his  behavior.  \n \nThe AKC (American Kennel Club) places dog breeds within seven different groups. In order to ac count for \nthe different behaviors within a particular group, some groups can be further subdivided into families.  \n \nHerding group:  \

## Load documents to Neo4J.
Parse the docs with LLM to extract the graph.
This will take some time. Afterwards we store the graph in a pickle file, so that you don't need to repeat this step every time.

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

full_doc = ""  # not used in this cell; kept because it is a notebook-level name
graph_docs = []

for doc_name, doc_map in docs_pages_map.items():
    for page in tqdm(doc_map, desc=doc_name):
        text = page[2].strip()

        # The tool appends to the global `function_responses` as a side effect
        # of chain.invoke. Start each page with an empty buffer so leftovers
        # from an earlier failed or interrupted run are never attributed to it.
        function_responses.clear()
        try:
            chain.invoke({"input": text})
        except OutputParserException as e:
            # Best effort: report and carry on. Payloads the tool captured
            # before the failure are still processed below, against this page.
            print("output exception")
            print(e)

        for captured_graph in function_responses:
            graph_doc = extract_and_store_graph(captured_graph, Document(page_content=text))
            if graph_doc:
                graph_docs.append(graph_doc)

        # Always drain the buffer, including after an exception; previously a
        # failed page left its payloads to be stored with the next page's text.
        function_responses.clear()

# Single write to Neo4j once every page has been processed.
graph.add_graph_documents(graph_docs)
         


  0%|          | 0/7 [00:00<?, ?it/s]

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 1 of 7 \nDog Breed Characteristics & Behavior  \n \nWhy is it important to know about the characteristics and behavior of different breeds?  \nAll dogs are individuals and have their own personalities. At the same time, different breeds tend to also \nhave certain characteristics that help define that particular breed. This information can be helpful to you \nwhen you are choosing a  dog or trying to understand his  behavior.  \n \nThe AKC (American Kennel Club) places dog breeds within seven different groups. In order to ac count for \nthe different behaviors within a particular group, some groups can be further subdivided into families.  \n \nHerding group:  \nBreeds in this group were bred to herd sh

 14%|█▍        | 1/7 [01:28<08:52, 88.79s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the following structure:

- **BreedingGroup Node:**
  - ID: HerdingGroup
  - Properties: description: "Breeds in this group were bred to herd sheep and cattle."

- **Characteristic Nodes:**
  - IDs: Alert, Smart, Independent, Confident, Trainable, Loyal, Affectionate

- **Breed Nodes:**
  - IDs: Collies, Sheepdogs, CattleDogs, Corgis, Shepherds, BelgianMalinois, BelgianTervuren

- **Relationships:**
  - Breeds Collies, Sheepdogs, CattleDogs, Corgis, Shepherds, BelgianMalinois, and BelgianTervuren belong to the HerdingGroup.
  - The HerdingGroup has the characteristics Alert, Smart, Independent, Confident, Trainable, Loyal, and Affectionate.

This graph represents the herding group of dog breeds and their associated characteristics as described in the provided text.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [88.78s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been success

 29%|██▊       | 2/7 [03:18<08:24, 100.96s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the provided information about dog breeds and their characteristics. The graph includes nodes for the Hound group, Sighthounds, Scent Hounds, and various characteristics such as Independent, Sweet, Intelligent, and Affectionate. It also includes nodes for specific breeds like Afghan, Saluki, Ibizan, and others. Relationships have been established between the breeding groups and their characteristics, as well as between the breeding groups and the breeds they include.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [109.46s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with the provided information about dog breeds and their characteristics. The graph includes nodes for the Hound group, Sighthounds, Scent Hounds, and various characteristics such as Independent, Sweet, Intelligent, and Affectionate. It also includes nodes for specific breeds like Afghan,

 43%|████▎     | 3/7 [05:42<08:02, 120.68s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the provided information about dog breeds and their characteristics. Here is the structured representation:

### Nodes:
1. **Breeding Groups:**
   - NonSportingGroup: Fastest growing group with varied sizes, shapes, colors, coat types. Often dogs who no longer perform their original duties.
   - SportingGroup: Bred to assist hunters, can be therapy, assistance, search/rescue dogs. Athletic, active, strong, need plenty of exercise.

2. **Characteristics:**
   - Playful
   - Smart
   - Curious
   - Friendly
   - Loyal
   - Eager
   - Happy
   - Trainable
   - Bright
   - Confident
   - Lovable

3. **Breeds:**
   - American Eskimo
   - Boston Terrier
   - Bulldogs
   - Shar Pei
   - Chow Chow
   - Dalmation
   - Keeshond
   - Poodle
   - Shiba Inu
   - Lhasa Apso
   - Tibetan Terrier
   - Flat Coated
   - Curly Coated
   - Labrador
   - Golden
   - Chesapeake Bay (Note: Not as friendly and outgoing, also used as guard dog

 57%|█████▋    | 4/7 [06:59<05:10, 103.62s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with breeds and their characteristics as nodes, and the relationships between them have been established. If you need further information or another operation, feel free to ask.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [77.45s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with breeds and their characteristics as nodes, and the relationships between them have been established. If you need further information or another operation, feel free to ask."
}

[1m> Finished chain.[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 5 of 7 \nToy Group : \nDogs in  this group are miniature versions of dogs from many other gro

 71%|███████▏  | 5/7 [09:46<04:12, 126.47s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the provided information about dog breeds and their characteristics, organized into the Toy and Working breeding groups. Each breed has been associated with its respective group and characteristics.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [166.98s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with the provided information about dog breeds and their characteristics, organized into the Toy and Working breeding groups. Each breed has been associated with its respective group and characteristics."
}

[1m> Finished chain.[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 6 of 7 \n \nProtection Breeds : Most pro

 86%|████████▌ | 6/7 [12:56<02:28, 148.11s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with breeds and their characteristics as nodes, and the relationships between them have been established. If you need further information or another query, feel free to ask![0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [190.12s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with breeds and their characteristics as nodes, and the relationships between them have been established. If you need further information or another query, feel free to ask!"
}

[1m> Finished chain.[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 7 of 7 \nTraining Resources : \nMarin Humane ’s Behavior & Training  conducts various training class

100%|██████████| 7/7 [13:30<00:00, 115.82s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the characteristics related to dog behavior and training as nodes. There are no relationships specified in the provided text between dog breeds and these characteristics, so no relationships have been included in the graph. If you have specific information about dog breeds and how they relate to these characteristics, please provide that, and I can update the knowledge graph accordingly.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [33.73s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with the characteristics related to dog behavior and training as nodes. There are no relationships specified in the provided text between dog breeds and these characteristics, so no relationships have been included in the graph. If you have specific information about dog breeds and how they relate to these characteristics, please provide that, and I can update the know




In [11]:
#Let's pickle the graph so we don't have to redo this all the time
import pickle


# Persist the extracted graph documents so the slow LLM pass does not have to
# be repeated after a kernel restart.
with open('./data/graph_docs.pkl', mode='wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(graph_docs)

In [10]:
# Only execute this cell if you need to load the graph from pickle (like after kernel restart)
import pickle
# Caution: unpickling executes code embedded in the file. Only load a pickle
# that this notebook wrote itself.
with open('./data/graph_docs.pkl', mode='rb') as pickle_file:
    graph_docs = pickle.Unpickler(pickle_file).load()

# Push the restored documents into Neo4j; the file is no longer needed here.
graph.add_graph_documents(graph_docs)
    

In [13]:
from langchain.chains import GraphCypherQAChain

# Refresh the cached schema after loading and show what the Cypher chain will
# work with.
graph.refresh_schema()
print(graph.structured_schema)

# Need new llm, cypher chain does not work with json output
cypher_llm_options = {
    "model": os.getenv("OPENAI_DEPLOYMENT_NAME"),
    "temperature": 0,
    "max_tokens": 1500,
    "verbose": True,
}
c_llm = AzureChatOpenAI(**cypher_llm_options)

# The same model instance writes the Cypher and phrases the final answer.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph, cypher_llm=c_llm, qa_llm=c_llm, validate_cypher=True, verbose=True
)

{'node_props': {'Breedinggroup': [{'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}], 'Characteristic': [{'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}], 'Breed': [{'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}]}, 'rel_props': {}, 'relationships': [{'start': 'Breedinggroup', 'type': 'HASCHARACTERISTIC', 'end': 'Characteristic'}, {'start': 'Breedinggroup', 'type': 'INCLUDESBREED', 'end': 'Breed'}, {'start': 'Breed', 'type': 'BELONGSTO', 'end': 'Breedinggroup'}, {'start': 'Breed', 'type': 'HASCHARACTERISTIC', 'end': 'Characteristic'}]}


In [16]:
from IPython.display import Markdown, display
# Ask in natural language; the chain's "result" entry is rendered as markdown
# so a table in the answer displays as a table.
question = "Make table of different breeding groups and their characteristics"
response = cypher_chain.invoke(question)
answer_markdown = Markdown(response["result"])
display(answer_markdown)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (bg:Breedinggroup)-[:HASCHARACTERISTIC]->(c:Characteristic)
RETURN bg.name AS BreedingGroupName, collect(c.name) AS Characteristics
[0m
Full Context:
[32;1m[1;3m[{'BreedingGroupName': 'HerdingGroup', 'Characteristics': ['Affectionate', 'Loyal', 'Trainable', 'Confident', 'Independent', 'Smart', 'Alert']}, {'BreedingGroupName': 'HoundGroup', 'Characteristics': ['Affectionate', 'Intelligent', 'Sweet', 'Independent']}, {'BreedingGroupName': 'NonSportingGroup', 'Characteristics': ['Eager', 'Friendly', 'Curious', 'Loyal', 'Smart', 'Playful']}, {'BreedingGroupName': 'SportingGroup', 'Characteristics': ['Trainable', 'Confident', 'Friendly', 'Eager', 'Bright', 'Lovable', 'Happy']}, {'BreedingGroupName': 'ToyGroup', 'Characteristics': ['Alert', 'Intelligent', 'Sensitive', 'Sassy', 'Fearless', 'Affectionate']}, {'BreedingGroupName': 'WorkingGroup', 'Characteristics': ['Calm', 'Confident', 'Smart', 

Certainly! Here's a table of different breeding groups and their characteristics:

| Breeding Group    | Characteristics                                      |
|-------------------|------------------------------------------------------|
| Herding Group     | Affectionate, Loyal, Trainable, Confident, Independent, Smart, Alert |
| Hound Group       | Affectionate, Intelligent, Sweet, Independent        |
| Non-Sporting Group| Eager, Friendly, Curious, Loyal, Smart, Playful      |
| Sporting Group    | Trainable, Confident, Friendly, Eager, Bright, Lovable, Happy |
| Toy Group         | Alert, Intelligent, Sensitive, Sassy, Fearless, Affectionate |
| Working Group     | Calm, Confident, Smart, Strong, Loyal, Protective, Courageous |

Each breeding group is associated with a unique set of characteristics that are commonly found within the breeds that make up the group.

In [11]:
# Cleanup: the query matches every node and DETACH DELETEs it, which removes
# all nodes together with their relationships from the connected Neo4j
# database. Run this only when you want to start over with an empty graph.
graph.query("MATCH (n) DETACH DELETE n")

[]