# Create a knowledge graph from a PDF with an LLM (Neo4j version)
![title](neo4jdogs.png)

In [1]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Load env variables and connect to neo4j database
Please run _docker-compose up_ in this directory first to start the database.

In [2]:
import os
from langchain.graphs import Neo4jGraph
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment.
load_dotenv()

# Handle to the Neo4j database used by the rest of the notebook.
# Connection settings are read from the NEO4J_URL, NEO4J_USERNAME and
# NEO4J_PASSWORD environment variables; os.getenv yields None for any that is unset.
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URL"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD")
)

In [3]:
from langchain.schema import Document
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import OutputParserException
# Chat model used for the graph extraction step.
# The deployment name is read from the OPENAI_DEPLOYMENT_NAME environment variable;
# max_tokens=4000 caps the length of each completion.
llm = AzureChatOpenAI(
    model=os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature=0, 
    max_tokens=4000,
    verbose=True)



  warn_deprecated(


In [4]:
# Simplified Pydantic model of the graph.
# The LangChain KnowledgeGraph model is too complicated to use as an OpenAI functions schema.


from typing import List, Dict, Optional, Union
from langchain.pydantic_v1 import Field, BaseModel

class Property(BaseModel):
    """A single property consisting of key and value"""

    # Both fields are declared with `...` as their default, i.e. no fallback value.
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")

class Node(BaseModel):
    """Represents a node in a graph with associated properties"""

    # Identifier of the node; typed as either a string or an integer.
    id: Union[str, int]
    # Node label; defaults to "Node" when none is supplied.
    type: Optional[str] = "Node"
    # Key/value properties attached to the node; None when there are none.
    properties: Optional[List[Property]] = Field(
        default=None,
        description="List of node properties",
    )

class Relationship(BaseModel):
    """Represents a directed relationship between two nodes in a graph."""

    # Endpoints are referenced by node id rather than by Node object.
    source: Union[str, int] = Field(
        ...,
        description="Id of source node",
    )
    target: Union[str, int] = Field(
        ...,
        description="Id of target node",
    )
    # NOTE(review): Optional is combined with a required (`...`) default here;
    # confirm whether None is really meant to be accepted.
    type: Optional[str] = Field(
        ...,
        description="Type of relationship",
    )
    # Key/value properties attached to the relationship; None when there are none.
    properties: Optional[List[Property]] = Field(
        default=None,
        description="List of relationship properties",
    )
    
class KnowledgeGraph(BaseModel):
    """Knowledge graph consisting of nodes and relationships"""

    # Top-level payload the model is asked to produce: every node first, then the
    # relationships, which refer to nodes by id (see Relationship.source / target).
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

# Magic
The prompt, the function call and the chain.

In [15]:

from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_core.tracers import ConsoleCallbackHandler
from langchain.tools import tool

# System prompt for the extraction agent (taken from the LangChain examples).
# It covers node labelling, keeping numbers and dates as properties, coreference
# resolution, and asks the model to double-check the JSON structure.

system_prompt = """
# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
*Double check* that the JSON structure is correct.

"""

# Collects every KnowledgeGraph the model passes to the tool below. The loading loop
# further down reads this list after each chain invocation and then clears it.
function_responses = []

# Define the function call as a LangChain tool (it is converted to an OpenAI function).
# The argument schema is defined by the pydantic KnowledgeGraph model.
# The docstring doubles as the tool description shown to the model, so it is kept to
# one short line; the parameter name `object` is part of the schema and must not change.
@tool
def knowledge_graph(object: KnowledgeGraph) -> bool:
    """A Tool to convert text to knowledge graph"""
    # The tool does no conversion itself: it only captures the structured arguments
    # the model produced and acknowledges the call.
    function_responses.append(object)
    return True


# Added some more precise instructions to have more control over the output.
# Likely you would have existing schemas or terminology that you would like to reuse.

# Message layout: the system instructions, an optional chat history, the human message
# (domain-specific instructions followed by the page text substituted for {input}),
# and finally the agent scratchpad placeholder.
prompt = ChatPromptTemplate.from_messages([
                ("system",system_prompt),
                MessagesPlaceholder("chat_history", optional=True),
                ("human", 
                 """In this particular case we are interested in dogs. We want to extract information about dog breeds and their characteristics.
                    Characterics should be nodes and relationships should be between dog breeds and their characteristics. 
                    Ignore other entities than dogs, like people and addresses.
                    - **Allowed Node Labels:** Breed, BreedingGroup, Characteristic
                    {input}"""),
                MessagesPlaceholder("agent_scratchpad"),
            ])

# The agent is given knowledge_graph as its only tool; the executor wraps the agent
# with the same tool list. Both verbose=True and a ConsoleCallbackHandler are set.
function_agent = create_openai_functions_agent(llm, [knowledge_graph], prompt)
chain = AgentExecutor(agent=function_agent, tools=[knowledge_graph], verbose=True, callbacks=[ConsoleCallbackHandler()])


In [6]:
from model import map_to_base_node, map_to_base_relationship
from langchain_community.graphs.graph_document import GraphDocument

def extract_and_store_graph(data: KnowledgeGraph, document: Document) -> Optional[GraphDocument]:
    """Convert one extracted KnowledgeGraph into a GraphDocument tied to its source.

    Despite the name, nothing is written to the database here; the caller
    collects the returned documents.

    Args:
        data: Graph produced by the model, exposing `nodes` and `rels`.
        document: Document to record as the source of the graph.

    Returns:
        A GraphDocument with the mapped nodes and relationships, or None when
        there are no mapped nodes (which includes a failed node mapping).
    """
    base_nodes = []
    base_rels = []
    try:
        base_nodes = [map_to_base_node(node) for node in data.nodes]
        base_rels = map_to_base_relationship(data.rels, base_nodes)
    except Exception as err:
        # Best effort: report the problem and carry on with whatever was mapped
        # before the failure (possibly nodes without relationships).
        print("parsing exception")
        print(err)

    if not base_nodes:
        return None

    return GraphDocument(
        nodes=base_nodes,
        relationships=base_rels,
        source=document,
    )

        

# Download test documents

In [7]:
from tqdm import tqdm
import urllib.request

# Folder the PDFs are cached in. The trailing slash is kept because later cells
# build paths by plain string concatenation (local_folder + name).
local_folder = "./data/"
os.makedirs(local_folder, exist_ok=True)

# File names (not URLs) of the documents, in download order; used by the parsing cell.
doc_names = []

documents = [
    "https://www.marinhumane.org/wp-content/uploads/2017/06/Dog-Breed-Characteristics-Behavior.pdf",
]
for doc in tqdm(documents):
    # The last URL path segment is used as the local file name.
    file_name = doc.split("/")[-1]
    doc_names.append(file_name)

    target_path = os.path.join(local_folder, file_name)
    if os.path.isfile(target_path):
        # Already cached by an earlier run; nothing to fetch.
        continue

    print("Downloading", doc)
    # Fetch into a temporary name and rename only on success, so an interrupted
    # download cannot leave a truncated PDF that later runs would treat as complete.
    partial_path = target_path + ".part"
    urllib.request.urlretrieve(doc, partial_path)
    os.replace(partial_path, target_path)
    

100%|██████████| 1/1 [00:00<00:00, 2796.20it/s]

Downloading https://www.marinhumane.org/wp-content/uploads/2017/06/Dog-Breed-Characteristics-Behavior.pdf





## PDF to Txt
Read the docs and chunk them page by page.

In [8]:
import time

from pdf import parse_pdf

# Currently this utility only uses pypdf. For more serious stuff you should use Azure Document Intelligence or similar service.

# Maps each document file name to whatever parse_pdf returns for it
# (one entry per page, judging by the "Processed N pages" message below).
docs_pages_map = dict()
for doc in doc_names:
    print("Processing ",doc)
    # perf_counter is a monotonic clock meant for measuring intervals; unlike
    # time.time() it is unaffected by system clock adjustments.
    start_time = time.perf_counter()

    doc_map = parse_pdf(file=local_folder+doc)
    docs_pages_map[doc]= doc_map

    # Capture the end time and calculate the elapsed time
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time

    print(f"Processed {len(doc_map)} pages in {elapsed_time:.6f} seconds\n")

print(docs_pages_map)

Processing  Dog-Breed-Characteristics-Behavior.pdf
Processed 7 pages in 0.153604 seconds

{'Dog-Breed-Characteristics-Behavior.pdf': [(0, 0, ' \n  \n Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 1 of 7 \nDog Breed Characteristics & Behavior  \n \nWhy is it important to know about the characteristics and behavior of different breeds?  \nAll dogs are individuals and have their own personalities. At the same time, different breeds tend to also \nhave certain characteristics that help define that particular breed. This information can be helpful to you \nwhen you are choosing a  dog or trying to understand his  behavior.  \n \nThe AKC (American Kennel Club) places dog breeds within seven different groups. In order to ac count for \nthe different behaviors within a particular group, some groups can be further subdivided into families.  \n \nHerding group:  \

## Load documents to Neo4J.
Parse the docs with LLM to extract the graph.
This will take some time. Later we store the graph in a pickle file, so that you don't need to repeat this every time.

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

full_doc = ""  # NOTE(review): not read anywhere in the visible cells
graph_docs = []

# Start from a clean slate: an earlier or interrupted run of the chain may have left
# graphs in the global list, and they would otherwise be attributed to the first page.
function_responses.clear()

for doc_name,doc_map in docs_pages_map.items():
    for page in tqdm(doc_map):
        try:
            # The page text is the third element of each page entry.
            text = page[2].strip()
            # Invoking the chain fills the global function_responses list as a side
            # effect of the knowledge_graph tool being called. (Not optimal I know)
            data = chain.invoke(
                {
                    "input": text,
                }
            )
            for graph_doc in function_responses:
                tmp = extract_and_store_graph(graph_doc, Document(page_content=text))
                if tmp:
                    graph_docs.append(tmp)
        except OutputParserException as e:
            print("output exception")
            print(e)
        finally:
            # Always reset, including after a failure, so graphs captured for this
            # page can never be paired with the next page's text.
            function_responses.clear()


graph.add_graph_documents(graph_docs)
         


  0%|          | 0/7 [00:00<?, ?it/s]

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 1 of 7 \nDog Breed Characteristics & Behavior  \n \nWhy is it important to know about the characteristics and behavior of different breeds?  \nAll dogs are individuals and have their own personalities. At the same time, different breeds tend to also \nhave certain characteristics that help define that particular breed. This information can be helpful to you \nwhen you are choosing a  dog or trying to understand his  behavior.  \n \nThe AKC (American Kennel Club) places dog breeds within seven different groups. In order to ac count for \nthe different behaviors within a particular group, some groups can be further subdivided into families.  \n \nHerding group:  \nBreeds in this group were bred to herd sh

 14%|█▍        | 1/7 [01:49<10:59, 109.97s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the following structure:

### Nodes:
1. **BreedingGroup**: HerdingGroup
   - *description*: Breeds in this group were bred to herd sheep and cattle.
2. **Characteristic**: Alert
3. **Characteristic**: Smart
4. **Characteristic**: Independent
5. **Characteristic**: Confident
6. **Characteristic**: Trainable
7. **Characteristic**: Loyal
8. **Characteristic**: Affectionate
9. **Breed**: Collies
10. **Breed**: Sheepdogs
11. **Breed**: CattleDogs
12. **Breed**: Corgis
13. **Breed**: Shepherds
14. **Breed**: BelgianMalinois
15. **Breed**: BelgianTervuren

### Relationships:
- **Collies** belongsTo **HerdingGroup**
- **Sheepdogs** belongsTo **HerdingGroup**
- **CattleDogs** belongsTo **HerdingGroup**
- **Corgis** belongsTo **HerdingGroup**
- **Shepherds** belongsTo **HerdingGroup**
- **BelgianMalinois** belongsTo **HerdingGroup**
- **BelgianTervuren** belongsTo **HerdingGroup**
- **HerdingGroup** hasCharacteristic **Alert**
-

 29%|██▊       | 2/7 [04:18<11:02, 132.51s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the following structure:

### Nodes:
1. **BreedingGroup**: HoundGroup
   - *description*: Originally bred to hunt, strong prey drive, want to chase down small animals.
2. **BreedingGroup**: Sighthounds
   - *description*: Bred to find and chase down larger animals such as deer and elk, long and lean, excellent vision, run for a long stretch but are basically sprinters, used to hunt in many parts of the world, may not be interested in obedience work and can be hard to train.
3. **BreedingGroup**: ScentHounds
   - *description*: Usually smaller than sighthounds, bred to chase smaller prey to the ground, incredible sense of smell, long ears, ears can scrape the ground to stir up more smells, forever in search of a scent and will run away if a scent beckons.
4. **Characteristic**: Independent
5. **Characteristic**: Sweet
6. **Characteristic**: Intelligent
7. **Characteristic**: Affectionate
8. **Breed**: Afghan
9. **Breed*

 43%|████▎     | 3/7 [06:48<09:23, 140.77s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the provided information about dog breeds and their characteristics. The graph includes nodes for two breeding groups (Non-Sporting Group and Sporting Group), their descriptions, and various characteristics associated with them. Additionally, it contains nodes for specific breeds within these groups and a note about the Chesapeake Bay breed's temperament. The relationships in the graph connect the breeding groups to their respective characteristics.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [150.58s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with the provided information about dog breeds and their characteristics. The graph includes nodes for two breeding groups (Non-Sporting Group and Sporting Group), their descriptions, and various characteristics associated with them. Additionally, it contains nodes for specific breeds within these groups a

 57%|█████▋    | 4/7 [09:56<07:57, 159.20s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with breeds and their characteristics as nodes, and the relationships between them have been established. If you need further information or another query, feel free to ask![0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [187.46s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with breeds and their characteristics as nodes, and the relationships between them have been established. If you need further information or another query, feel free to ask!"
}

[1m> Finished chain.[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 5 of 7 \nToy Group : \nDogs in  this group are miniature versions of dogs from many other groups. Ma

 71%|███████▏  | 5/7 [13:02<05:38, 169.10s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with the provided information about dog breeds and their characteristics, structured into nodes and relationships.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [186.64s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with the provided information about dog breeds and their characteristics, structured into nodes and relationships."
}

[1m> Finished chain.[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 6 of 7 \n \nProtection Breeds : Most protection dogs were originally bred to be \nherding dogs or flock guards. They are now used for a variety of \npurposes including police work a nd guarding homes and property.  \n•

 86%|████████▌ | 6/7 [16:10<02:55, 175.42s/it]

[32;1m[1;3mThe knowledge graph has been successfully created with breeds and their characteristics as nodes, and the relationships between them have been established according to the provided text.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [187.67s] Exiting Chain run with output:
[0m{
  "output": "The knowledge graph has been successfully created with breeds and their characteristics as nodes, and the relationships between them have been established according to the provided text."
}

[1m> Finished chain.[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 7 of 7 \nTraining Resources : \nMarin Humane ’s Behavior & Training  conducts various training classes. Sign up on our Behavior & Training  \npage  for the next available cla

100%|██████████| 7/7 [16:23<00:00, 140.46s/it]

[32;1m[1;3mThe provided text does not contain specific information about dog breeds or their characteristics. It mainly discusses services offered by Marin Humane's Behavior & Training, such as training classes, private training, consultations, and the importance of licensing and microchipping dogs. Since there are no details on individual dog breeds or their characteristics, no knowledge graph can be generated from this text. If you have text that includes specific information about dog breeds and their characteristics, please provide it, and I can assist in creating a knowledge graph based on that information.[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [12.55s] Exiting Chain run with output:
[0m{
  "output": "The provided text does not contain specific information about dog breeds or their characteristics. It mainly discusses services offered by Marin Humane's Behavior & Training, such as training classes, private training, consultations, and the importance of li




In [10]:
#Let's pickle the graph so we don't have to redo this all the time
import pickle


# Serialize the graph_docs list to ./data/graph_docs.pkl (binary mode, overwriting any existing file).
with open('./data/graph_docs.pkl','wb') as f:
    pickle.dump(graph_docs, f)

In [11]:
# Only execute this cell if you need to load the graph from pickle (like after kernel restart)
import pickle
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only load a
# pickle that this notebook wrote itself, never one obtained from an untrusted source.
with open('./data/graph_docs.pkl','rb') as f:
    graph_docs = pickle.load(f)

# The file is no longer needed once the list is in memory, so the database write
# happens after the handle has been closed.
graph.add_graph_documents(graph_docs)
    

In [13]:
from langchain.chains import GraphCypherQAChain

# Refresh the graph's schema information and print it for inspection.
graph.refresh_schema()
print(graph.structured_schema)

# Need new llm, cypher chain does not work with json output
# This instance uses the same deployment as the extraction model but a smaller max_tokens.
c_llm = AzureChatOpenAI(
    model=os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature=0, 
    max_tokens=1500,
    verbose=True)

# Question-answering chain over the graph. The same model is passed as both the
# Cypher-generating LLM and the answering LLM; validate_cypher is switched on.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=c_llm,
    qa_llm=c_llm,
    validate_cypher=True,
    verbose=True
)

{'node_props': {'Node': [{'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}]}, 'rel_props': {}, 'relationships': [{'start': 'Node', 'type': 'HASCHARACTERISTIC', 'end': 'Node'}, {'start': 'Node', 'type': 'BELONGSTO', 'end': 'Node'}, {'start': 'Node', 'type': 'INCLUDESBREED', 'end': 'Node'}]}


In [14]:
# Ask a natural-language question; with verbose=True the chain prints the generated
# Cypher and the query result ("Full Context") before returning its answer.
cypher_chain.invoke("Which breeds are loyal?")

  warn_deprecated(




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (loyaltyTrait:Node {name: "loyal"})-[:HASCHARACTERISTIC]-(breed:Node)-[:INCLUDESBREED]-(species:Node)
RETURN breed.id, breed.name
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I don't know the answer."

In [19]:
# WARNING: destructive clean-up. This Cypher statement deletes every node in the
# database together with all of its relationships. Run it only to reset the demo.
graph.query("MATCH (n) DETACH DELETE n")

[]