In [47]:
%pip install -r requirements.txt

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f6bf867b350>


UnboundLocalError: cannot access local variable 'child' where it is not associated with a value

# Load env variables and connect to neo4j database
Please run _docker-compose up_ first on the directory to start the database.    

In [1]:
import os
from langchain.graphs import Neo4jGraph
from dotenv import load_dotenv

load_dotenv()

graph = Neo4jGraph(
    url=os.getenv("NEO4J_URL"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD")
)

ValueError: Could not connect to Neo4j database. Please ensure that the url is correct

In [None]:
# This is based on https://github.com/tomasonjo/blogs/blob/master/llm/openaifunction_constructing_graph.ipynb
from typing import List, Dict, Any, Optional
from langchain.schema import Document
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import OutputParserException
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
llm = AzureChatOpenAI(
    model=os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature=0, 
    max_tokens=4000, 
    response_format={ "type": "json_object" },
    verbose=True)



                    response_format was transferred to model_kwargs.
                    Please confirm that response_format is what you intended.


In [None]:

from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers.ernie_functions import JsonOutputFunctionsParser
from langchain_core.runnables import  RunnablePassthrough

system_prompt = """
# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
*Double check* that the JSON structure is correct.

"""


functions = [
    {
        "name": "knowledge_graph",
        "description": "A knowledge graph output",
        "parameters": {
            "type": "object",
            "properties": {
                "nodes": {
                    "description": "List of nodes in a graph with associated properties",
                    "type": "array",
                    "items": {
                        "type": "object",
                        "description": "Represents a node in a graph with associated properties",
                        "properties": {
                            "id": {
                                "type": "string"
                            },
                            "type": {
                                "type": "string"
                            },
                            "properties": {
                                "type": "array",
                                "description": "Additional properties and metadata associated with the node.",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "key": {
                                            "type": "string"
                                        },
                                        "value": {
                                            "type": "string"
                                        }
                                    },
                                    "required": [
                                        "key",
                                        "value"
                                    ]
                                }
                            }
                        },
                        "required": [
                            "id",
                            "type"
                        ]
                    }
                },
                "rels": {
                    "description": "List of directed relationships between nodes in a graph.",
                    "type": "array",
                    "items": {
                        "type": "object",
                        "description": "Represents a directed relationship between two nodes in a graph.",
                        "properties": {
                            "source": {
                                "type": "object",
                                "properties": {
                                    "id": {
                                        "type": "string"
                                    }
                                },
                                "required": [
                                    "id"
                                ]
                            },
                            "target": {
                                "type": "object",
                                "properties": {
                                    "id": {
                                        "type": "string"
                                    }
                                },
                                "required": [
                                    "id"
                                ]
                            },
                            "type": {
                                "type": "string"
                            },
                            "properties": {
                                "type": "array",
                                "description": "Additional properties and metadata associated with the relationship.",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "key": {
                                            "type": "string"
                                        },
                                        "value": {
                                            "type": "string"
                                        }
                                    },
                                    "required": [
                                        "key",
                                        "value"
                                    ]
                                }
                            }
                        },
                        "required": [
                            "source",
                            "target",
                            "type"
                        ]
                    }
                },
            },
            "required": ["nodes", "rels"],
        },
    }
]

prompt = ChatPromptTemplate.from_messages([
                ("system",system_prompt),
                ("human", 
                 """In this particular case we are interested in dogs. We want to extract information about dog breeds and their characteristics.
                    Characterics should be nodes and relationships should be between dog breeds and their characteristics. 
                    Ignore other entities than dogs, like people and addresses.
                    - **Allowed Node Labels:** Breed, BreedingGroup, Characteristic
                    {user_input}"""),
            ])

chain = (
    {"user_input": RunnablePassthrough()}
    | prompt
    | llm.bind(function_call={"name": "knowledge_graph"}, functions=functions)
    | JsonOutputFunctionsParser()
)



In [None]:
from  langchain.graphs.graph_document import Node, Relationship
from model import map_to_base_node, map_to_base_relationship



def extract_and_store_graph(
    document: Document,
    prompt: str) -> Optional[GraphDocument]:
    # Extract graph data using OpenAI functions
    try: 
        try:
            data = chain.invoke(document.page_content)        
        except OutputParserException as e:
            print("output exception")
            print(e)
        # Construct a graph document
        nodes = []
        rels = []
        try:
            nodes= list(map(map_to_base_node, data["nodes"]))
            rels= map_to_base_relationship(data["rels"], nodes)
        except Exception as e:
            print(e)
        

        graph_document = GraphDocument(
            nodes = nodes,
            relationships = rels,
            source = document
        )
        # Store information into a graph
        return graph_document
    except Exception as e:
        print(e)
        print("Failed to extract graph data from document")
        

# Download test documents

In [None]:
from tqdm import tqdm
import urllib.request

local_folder = "./data/"
os.makedirs(local_folder,exist_ok=True)

doc_names = []

documents = [
"https://www.marinhumane.org/wp-content/uploads/2017/06/Dog-Breed-Characteristics-Behavior.pdf" 
]
for doc in tqdm(documents):
    print("Downloading", doc)
    doc_names.append(doc.split("/")[-1])
    if os.path.isfile(local_folder + doc.split("/")[-1]):
        continue
    urllib.request.urlretrieve(doc, local_folder + doc.split("/")[-1])
    

100%|██████████| 1/1 [00:00<00:00, 1917.83it/s]

Downloading https://www.marinhumane.org/wp-content/uploads/2017/06/Dog-Breed-Characteristics-Behavior.pdf





## PDF to Txt
Read and chuck the docs

In [None]:
import time

from pdf import parse_pdf


docs_pages_map = dict()
for doc in doc_names:
    print("Processing ",doc)
    start_time = time.time()
    
    doc_map = parse_pdf(file=local_folder+doc)
    docs_pages_map[doc]= doc_map
    
    # Capture the end time and Calculate the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time

    print(f"Processed {len(doc_map)} pages in {elapsed_time:.6f} seconds\n")
    
print(docs_pages_map)

Processing  Dog-Breed-Characteristics-Behavior.pdf
Processed 7 pages in 0.135352 seconds

{'Dog-Breed-Characteristics-Behavior.pdf': [(0, 0, ' \n  \n Behavior & Training  \n 415.506.6 280 \n Available B&T Services  \n \n \n171 Bel Marin Keys Blvd., Novato, CA  94949    Dog Breed Characteristics & Behavior  \nLike us at :   Page 1 of 7 \nDog Breed Characteristics & Behavior  \n \nWhy is it important to know about the characteristics and behavior of different breeds?  \nAll dogs are individuals and have their own personalities. At the same time, different breeds tend to also \nhave certain characteristics that help define that particular breed. This information can be helpful to you \nwhen you are choosing a  dog or trying to understand his  behavior.  \n \nThe AKC (American Kennel Club) places dog breeds within seven different groups. In order to ac count for \nthe different behaviors within a particular group, some groups can be further subdivided into families.  \n \nHerding group:  \

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
#the pdf used was not actual book, so just combining all pages with cleanup into one document and chunking it
full_doc = ""
graph_docs = []
for doc_name,doc_map in docs_pages_map.items():    
    for page in tqdm(doc_map):
        page_num = page[0] + 1
        content = page[2].strip()
        content = os.linesep.join(
            [
                line.strip() for line in content.splitlines()
                if line and line.strip()
            ]
        )
        full_doc += content + "\n"
if len(full_doc) > 0:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
    )
    texts = text_splitter.create_documents([full_doc])
    for text in texts:
        graph_docs.append(extract_and_store_graph(text, system_prompt))
        
graph.add_graph_documents(graph_docs)


100%|██████████| 7/7 [00:00<00:00, 19959.30it/s]


KeyboardInterrupt: 

In [None]:
#Let's pickle the graph so we don't have to redo this all the time
import pickle


with open('./data/graph_docs.pkl','wb') as f:
    pickle.dump(graph_docs, f)

In [None]:
import pickle
with open('./data/graph_docs.pkl','rb') as f:
    graph_docs = pickle.load(f)

In [None]:
from langchain.chains import GraphCypherQAChain

graph.refresh_schema()
print(graph.structured_schema)

# Need new llm, cypher chain does not work with json output
c_llm = AzureChatOpenAI(
    model=os.getenv("OPENAI_DEPLOYMENT_NAME"), 
    temperature=0, 
    max_tokens=1500,
    verbose=True)

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=c_llm,
    qa_llm=c_llm,
    validate_cypher=True,
    verbose=True
)

{'node_props': {'Node': [{'property': 'id', 'type': 'STRING'}], 'Characteristic': [{'property': 'associatedwith', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'id', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}], 'Breedinggroup': [{'property': 'name', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'id', 'type': 'STRING'}], 'Breed': [{'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'additionalinfo', 'type': 'STRING'}, {'property': 'temperament', 'type': 'STRING'}, {'property': 'favoriteactivity', 'type': 'STRING'}, {'property': 'purpose', 'type': 'STRING'}, {'property': 'size', 'type': 'STRING'}]}, 'rel_props': {}, 'relationships': [{'start': 'Node', 'type': 'PRONETO', 'end': 'Characteristic'}, {'start': 'Breedinggroup', 'type': 'HASCHARACTERISTIC', 'end': 'Characteristic'}, {'start': 'Breedinggroup', 'type': 'INCLUDESBREE

In [None]:
cypher_chain.run("Which groups are trainable?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (b:Breedinggroup)-[:HASCHARACTERISTIC]->(c:Characteristic {name: "Trainable"})
RETURN b.id, b.name, b.description;
[0m
Full Context:
[32;1m[1;3m[{'b.id': 'Retrievers', 'b.name': 'Retrievers', 'b.description': None}, {'b.id': 'HerdingGroup', 'b.name': 'HerdingGroup', 'b.description': 'Breeds in this group were bred to herd sheep and cattle'}][0m

[1m> Finished chain.[0m


'The Herding Group, which includes breeds that were bred to herd sheep and cattle, is known for being trainable. These breeds often have a strong work ethic and intelligence that makes them responsive to training. The Retrievers group is also typically very trainable, as these dogs have been bred to work closely with humans and are usually eager to please.'

In [None]:
graph.query("MATCH (n) DETACH DELETE n")

[]

In [56]:
import os
from gremlin_python.driver import client, serializer
import nest_asyncio
nest_asyncio.apply()
client = client.Client(
    url=os.environ["GREMLIN_URI"],
    traversal_source="g",
    username="/dbs/rag/colls/kg",
    password=os.environ["GREMLIN_PASSWORD"],
    message_serializer=serializer.GraphSONSerializersV2d0(),
)

def build_node_update_query(label_value, id_value, properties):
    base_query = f"g.V().hasLabel('{label_value}').has('id','{id_value}').fold().coalesce(unfold(),addV('{label_value}').property('id','{id_value}').property('type', '{label_value}')"
    for key, value in properties.items():
        base_query += f".property('{key}', '{value}')"
    
    return base_query + ")"

def build_source_to_target_query(type, source, target, properties):
    source_query = f".hasLabel('{source['label']}').has('id','{source['id']}')"
    target_query = f".hasLabel('{target['label']}').has('id','{target['id']}')"
    source = f"g.V()" + source_query	
    target = f"g.V()" + target_query
    
    base_query = f"""
    {source} 
    .as('source')  
    .outE('{type}').inV(){target_query}
    .fold()  
    .coalesce(  
        __.unfold(),  
        __.addE('{type}').from('source').to(  
            {target}
        )  
    )
    """
    for key, value in properties.items():
        base_query += f".property('{key}', '{value}')"
    

    return base_query

for document in graph_docs:
            # Import nodes
            for el in document.nodes:
                #print (el)
                query = build_node_update_query(el.type, el.id, el.properties)
                #print(query)
                #client.submit(query)
                
            for el in document.relationships:
                print (el)
                # Find or create the source vertex
                source = client.submit(build_node_update_query(el.source.type, el.source.id, el.source.properties)).all().result()[0]                                
                # Find or create the target vertex
                target = client.submit(build_node_update_query(el.target.type, el.target.id, el.target.properties)).all().result()[0] 
                #target = self.g.V().hasLabel(el.target.type).has('id', el.target.id).fold().coalesce(self.g.unfold(), self.g.addV(el.target.type).property('id', el.target.id).property('pk', 'partitionKeyValue')).next()
                # Find or create the edge
                
                edge = client.submit(build_source_to_target_query(el.type, source, target, el.properties))
                print(edge)
                # Update the properties of the edge
                #for key, value in el.properties.items():

                    #edge.property(key, value)'id', el.target.id).next())
                #edge = source.addEdge(el.type.replace(" ", "_").upper(), target)
                #for key, value in el.properties.items():
                    #edge.property(key, value)

#rs = client.submit(
#    message=(
#   ),
#    bindings={
##        "label_value": "Breed",
 #       "id_value": "Breed"}
 #   )
#print(rs.all().result())


source=Node(id='HerdingGroup', type='Breedinggroup', properties={'description': 'Breeds in this group were bred to herd sheep and cattle', 'name': 'HerdingGroup'}) target=Node(id='StalkingAndStaring', type='Characteristic', properties={'associatedwith': 'herding behavior', 'name': 'StalkingAndStaring'}) type='hasCharacteristic'
g.V().hasLabel('Breedinggroup').has('id','HerdingGroup')
        .outE('hasCharacteristic')
        .fold()
        .coalesce(
            g.where(
                g.unfold().inV().hasLabel('Characteristic').has('id','StalkingAndStaring')
            ),
            g.V().hasLabel('Breedinggroup').has('id','HerdingGroup')
            .addE('hasCharacteristic')
            .to(g.V().hasLabel('Characteristic').has('id','StalkingAndStaring'))
        )
        .where(g.inV().hasLabel('Characteristic').has('id','StalkingAndStaring'))"
        .fold()
        .coalesce(
            g.unfold(),
            g.V().hasLabel('Breedinggroup').has('id','HerdingGroup')
      

KeyboardInterrupt: 

In [None]:
import ssl
from httpx import HTTPTransport
from GremlinGraph import GremlinGraph
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

g = traversal().with_remote(
  DriverRemoteConnection(url=os.environ["GREMLIN_URI"], username="/dbs/rag/colls/kg", password=os.environ["GREMLIN_PASSWORD"],
                         transport_factory=lambda: HTTPTransport(read_timeout=60,
                                                                    write_timeout=20,
                                                                    heartbeat=10,
                                                                    call_from_event_loop=True,
                                                                    max_content_length=100*1024*1024,
                                                                    ssl_options=ssl.create_default_context(ssl.Purpose.CLIENT_AUTH))))
g = traversal().with_remote(DriverRemoteConnection(url=os.environ["GREMLIN_URI"],username="/dbs/rag/colls/kg", password=os.environ["GREMLIN_PASSWORD"]))
print(g.V().has_label('breed').next())
for document in graph_docs:
            # Import nodes
            for el in document.nodes:
                node =  g.V().has_label(el.type).has('id', el.id).tryNext().orElseGet(lambda: self.g.addV(el.type).property('id', el.id).next())
                print(node)
                    
                    


RuntimeError: Connection was closed by server.