In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import TokenTextSplitter

loader = TextLoader("sem_and_relation_data.txt")
documents = loader.load()

text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=2)
documents = text_splitter.split_documents(documents)

documents


[Document(metadata={'source': 'sem_and_relation_data.txt'}, page_content='\nIn the heart of Silicon Valley, the Stanford Research Institute (SRI) is pioneering groundbreaking research in artificial intelligence. \nDr. John Smith, a renowned computer scientist, leads a team focused on developing advanced natural language processing algorithms. \nThe team recently published a paper titled "Innovations in AI: Beyond the Basics" under patent ID US 2023045678A1, \nwhich outlines their novel approach to machine learning.\n\nStanford University collaborates closely with Google'),
 Document(metadata={'source': 'sem_and_relation_data.txt'}, page_content=' with Google Inc. on this project, aiming to improve search engine capabilities. \nThe research includes semantic techniques for understanding context in search queries and relational methods to enhance information retrieval. \nFor instance, their work explores how deep learning models can predict user intent more accurately by analyzing search

In [2]:
import warnings 
warnings.filterwarnings("ignore")

import configparser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI

config = configparser.ConfigParser()
config.read('config.ini')

embeddings_model = config.get('model', 'embeddings_model')
google_api_key = config.get('api_keys', 'google_api_key')
groq_api_key = config.get('api_keys', 'groq_api_key')
open_ai_key = config.get('api_keys', 'open_api_key')

## OpenAI - GPT Models | Closed Source API
# llm =  ChatOpenAI(temperature=0, api_key=open_ai_key, model="gpt-4o-mini")

## Gemini | Closed Source API
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro", temperature=0, 
#                              google_api_key=google_api_key,
#                             convert_system_message_to_human=True
# )

## llama 3 | Private Local llm
# llm = ChatOpenAI(temperature=0, api_key='sk-', model="llama3", base_url="http://3.94.151.199:11434/v1")
# llm = ChatOpenAI(temperature=0, api_key='sk-', model="llama3", base_url="http://localhost:8080/v1")

## Private LLM | 3rd party Hosted 
llm = ChatOpenAI(temperature=0, model= "llama3-70b-8192",  # "llama3-8b-8192",  #"llama3-70b-8192"
        base_url="https://api.groq.com/openai/v1",
        api_key=groq_api_key
    )


In [3]:
from langchain_community.graphs import Neo4jGraph


url = "bolt://localhost:7687"
username ="neo4j"
password = "password"
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

print(graph)

<langchain_community.graphs.neo4j_graph.Neo4jGraph object at 0x12fd66a50>


In [5]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain_core.pydantic_v1 import BaseModel, Field


In [18]:
from typing import Optional, List
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field


class Relationship(BaseModel):
    source: str = Field(description="The source node of the relationship")
    target: str = Field(description="The target node of the relationship")
    type: str = Field(description="The type of relationship between the source and target nodes")

class Node(BaseModel):
    id: str = Field(description="The unique identifier of the node")
    type: str = Field(description="The type of the node")
    properties: Dict[str, str] = Field(description="Additional properties of the node")

class KnowledgeGraph(BaseModel):
    nodes: List[Node] = Field(description="List of nodes in the knowledge graph")
    relationships: List[Relationship] = Field(description="List of relationships in the knowledge graph")

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    # Assuming BaseNode takes id, type, and properties as arguments
    return BaseNode(id=node.id, type=node.type, properties=node.properties)

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    # Assuming BaseRelationship expects source and target to be dictionaries
    return BaseRelationship(
        source={"id": rel.source},
        target={"id": rel.target},
        type=rel.type
    )

In [19]:
def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
):
    prompt = PromptTemplate(
        template="""# Knowledge Graph Instructions
        ## 1. Overview
        You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
        - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
        - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
        ## 2. Labeling Nodes
        - **Consistency**: Ensure you use basic or elementary types for node labels.
        - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
        - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
        ## 3. Coreference Resolution
        - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
        If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
        always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
        Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
        ## 4. Strict Compliance
        Adhere to the rules strictly. Non-compliance will result in termination.

        Use the given format to extract information from the following input: {input}

        {format_instructions}
        """,
        input_variables=["input"],
        partial_variables={"format_instructions": PydanticOutputParser(pydantic_object=KnowledgeGraph).get_format_instructions()}
    )

    return prompt


def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None
) -> KnowledgeGraph:
    if not isinstance(document, Document):
        raise TypeError(f"Expected document to be an instance of Document, got {type(document)}")
    
    # Extract graph data using the prompt and JSON parser
    prompt = get_extraction_chain(nodes, rels)
    parser = PydanticOutputParser(pydantic_object=KnowledgeGraph)
    
    # Assuming you have defined 'llm' somewhere in your code
    chain = prompt | llm | parser
    
    data = chain.invoke({"input": document.page_content})
    print(data)
    print(f"nodes: {nodes}, rels: {rels}")
    
    # Construct a graph document
    graph_document = GraphDocument(
        nodes=[map_to_base_node(node) for node in data.nodes],
        relationships=[map_to_base_relationship(rel) for rel in data.relationships],
        source=document
    )
    print(graph_document)
    
    graph.add_graph_documents([graph_document], True)
    
    return data


In [20]:
from tqdm import tqdm

# Iterate over chunks and call extract_and_store_graph
for i, d in tqdm(enumerate(documents), total=len(documents)):
    print(f"Processing chunk {i}: {d}")
    data = extract_and_store_graph(d)
    print("Graph stored successfully.")
    


  0%|          | 0/10 [00:00<?, ?it/s]

Processing chunk 0: page_content='
In the heart of Silicon Valley, the Stanford Research Institute (SRI) is pioneering groundbreaking research in artificial intelligence. 
Dr. John Smith, a renowned computer scientist, leads a team focused on developing advanced natural language processing algorithms. 
The team recently published a paper titled "Innovations in AI: Beyond the Basics" under patent ID US 2023045678A1, 
which outlines their novel approach to machine learning.

Stanford University collaborates closely with Google' metadata={'source': 'sem_and_relation_data.txt'}
nodes=[Node(id='Stanford Research Institute', type='organization', properties={'location': 'Silicon Valley'}), Node(id='Dr. John Smith', type='person', properties={'title': 'computer scientist'}), Node(id='Stanford University', type='university', properties={}), Node(id='Google', type='company', properties={}), Node(id='Innovations in AI: Beyond the Basics', type='paper', properties={'patentId': 'US 2023045678A1'})]

 10%|█         | 1/10 [00:01<00:16,  1.88s/it]

Graph stored successfully.
Processing chunk 1: page_content=' with Google Inc. on this project, aiming to improve search engine capabilities. 
The research includes semantic techniques for understanding context in search queries and relational methods to enhance information retrieval. 
For instance, their work explores how deep learning models can predict user intent more accurately by analyzing search patterns and user behavior.

The Department of Computer Science at Stanford University also hosts annual conferences where experts discuss advancements in the field. 
In these conferences, terms like "artificial intelligence", "machine learning' metadata={'source': 'sem_and_relation_data.txt'}


 20%|██        | 2/10 [00:04<00:17,  2.19s/it]

nodes=[Node(id='Google Inc.', type='organization', properties={'name': 'Google Inc.'}), Node(id='Department of Computer Science at Stanford University', type='organization', properties={'name': 'Department of Computer Science at Stanford University'}), Node(id='search engine', type='concept', properties={'description': 'a system that retrieves and ranks online content'}), Node(id='semantic techniques', type='concept', properties={'description': 'methods for understanding context in search queries'}), Node(id='relational methods', type='concept', properties={'description': 'approaches to enhance information retrieval'}), Node(id='deep learning models', type='concept', properties={'description': 'algorithms that predict user intent by analyzing search patterns and user behavior'}), Node(id='artificial intelligence', type='concept', properties={'description': 'a field of study focused on creating intelligent machines'}), Node(id='machine learning', type='concept', properties={'description

 30%|███       | 3/10 [00:06<00:15,  2.20s/it]

nodes=[Node(id='Machine Learning', type='concept', properties={'description': 'Frequently discussed concept'}), Node(id='Data Science', type='concept', properties={'description': 'Frequently discussed concept'}), Node(id='Stanford Research Institute', type='organization', properties={'description': 'Cornerstone for innovations in artificial intelligence and machine learning'}), Node(id='Dr. John Smith', type='person', properties={'description': 'Distinguished figure in the field of computer science', 'title': 'Leader of Stanford Research Institute'}), Node(id='Artificial Intelligence', type='concept', properties={'description': 'Area of innovation at Stanford Research Institute'}), Node(id='Natural Language Processing', type='concept', properties={'description': 'Area of development at Stanford Research Institute'}), Node(id='Silicon Valley', type='location', properties={'description': 'Vibrant ecosystem'})] relationships=[Relationship(source='Stanford Research Institute', target='Arti

 40%|████      | 4/10 [00:08<00:11,  1.95s/it]

nodes=[Node(id='Stanford University', type='organization', properties={'name': 'Stanford University'}), Node(id='Google Inc.', type='organization', properties={'name': 'Google Inc.'}), Node(id='Microsoft', type='organization', properties={'name': 'Microsoft'}), Node(id='Innovations in AI: Beyond the Basics', type='researchPaper', properties={'title': 'Innovations in AI: Beyond the Basics', 'patentId': 'US 2023045678A1'})] relationships=[Relationship(source='Stanford University', target='Google Inc.', type='collaboration'), Relationship(source='Stanford University', target='Microsoft', type='collaboration'), Relationship(source='Stanford University', target='Innovations in AI: Beyond the Basics', type='author')]
nodes: None, rels: None
nodes=[Node(id='Stanford University', type='organization', properties={'name': 'Stanford University'}), Node(id='Google Inc.', type='organization', properties={'name': 'Google Inc.'}), Node(id='Microsoft', type='organization', properties={'name': 'Microso

 50%|█████     | 5/10 [00:11<00:11,  2.33s/it]

nodes=[Node(id='Department of Computer Science at Stanford University', type='organization', properties={'name': 'Department of Computer Science at Stanford University'}), Node(id='Stanford University', type='university', properties={'name': 'Stanford University'}), Node(id='data science', type='field of study', properties={'name': 'data science'}), Node(id='computational linguistics', type='field of study', properties={'name': 'computational linguistics'}), Node(id='robotics', type='field of study', properties={'name': 'robotics'}), Node(id='artificial neural networks', type='concept', properties={'name': 'artificial neural networks'}), Node(id='reinforcement learning', type='concept', properties={'name': 'reinforcement learning'}), Node(id='big data analytics', type='concept', properties={'name': 'big data analytics'}), Node(id='ethical AI', type='concept', properties={'name': 'ethical AI'}), Node(id='data privacy', type='concept', properties={'name': 'data privacy'})] relationships=

 60%|██████    | 6/10 [00:22<00:22,  5.58s/it]

nodes=[Node(id='MIT Media Lab', type='institution', properties={'name': 'MIT Media Lab'}), Node(id='Harvard University', type='institution', properties={'name': 'Harvard University'}), Node(id='Stanford Research Institute', type='institution', properties={'name': 'Stanford Research Institute'}), Node(id='Silicon Valley', type='region', properties={'name': 'Silicon Valley', 'location': 'Northern California'}), Node(id='quantum computing', type='technology', properties={'name': 'quantum computing'}), Node(id='blockchain technology', type='technology', properties={'name': 'blockchain technology'}), Node(id='augmented reality', type='technology', properties={'name': 'augmented reality'}), Node(id='virtual reality', type='technology', properties={'name': 'virtual reality'}), Node(id='IoT', type='technology', properties={'name': 'IoT', 'description': 'Internet of Things'})] relationships=[Relationship(source='MIT Media Lab', target='quantum computing', type='focus'), Relationship(source='Har

 70%|███████   | 7/10 [00:38<00:26,  8.68s/it]

nodes=[Node(id='Silicon Valley', type='region', properties={'description': 'a region in Northern California'}), Node(id='San Francisco Bay Area', type='region', properties={'description': 'a region in California'}), Node(id='Santa Clara Valley', type='region', properties={'description': 'a valley in California'}), Node(id='Sunnyvale', type='city', properties={'description': 'a city in California'}), Node(id='Mountain View', type='city', properties={'description': 'a city in California'}), Node(id='Palo Alto', type='city', properties={'description': 'a city in California'}), Node(id='Menlo Park', type='city', properties={'description': 'a city in California'}), Node(id='California', type='state', properties={'description': 'a state in the United States'}), Node(id='Northern California', type='region', properties={'description': 'a region in California'})] relationships=[Relationship(source='Silicon Valley', target='Santa Clara Valley', type='locatedIn'), Relationship(source='Silicon Val

 80%|████████  | 8/10 [00:53<00:21, 10.82s/it]

nodes=[Node(id='San Jose', type='city', properties={'description': 'Largest city in Silicon Valley'}), Node(id='Silicon Valley', type='region', properties={'description': 'Birthplace of Silicon Valley'}), Node(id='California', type='state', properties={'description': 'Third-largest in the United States'}), Node(id='United States', type='country', properties={'description': '13th-most populous'}), Node(id='Santa Clara', type='city', properties={'description': 'Major city in Silicon Valley'}), Node(id='Redwood City', type='city', properties={'description': 'Major city in Silicon Valley'}), Node(id='Cupertino', type='city', properties={'description': 'Major city in Silicon Valley'}), Node(id='San Jose Metropolitan Area', type='metropolitan area', properties={'description': 'Third-highest GDP per capita in the world'}), Node(id='Zürich', type='city', properties={'description': 'Highest GDP per capita in the world'}), Node(id='Oslo', type='city', properties={'description': 'Second-highest G

 90%|█████████ | 9/10 [01:07<00:11, 11.96s/it]

nodes=[Node(id='Silicon Valley', type='region', properties={'description': 'A region in the United States'}), Node(id='United States', type='country', properties={'description': 'A country in North America'})] relationships=[Relationship(source='Silicon Valley', target='United States', type='locatedIn')]
nodes: None, rels: None
nodes=[Node(id='Silicon Valley', type='region', properties={'description': 'A region in the United States'}), Node(id='United States', type='country', properties={'description': 'A country in North America'})] relationships=[Relationship(source=Node(id='Silicon Valley'), target=Node(id='United States'), type='locatedIn')] source=Document(metadata={'source': 'sem_and_relation_data.txt'}, page_content=" it also had the highest percentage of homes valued at $1 million or more in the United States.[6]\n\nSilicon Valley is home to many of the world's largest high-tech corporations, including the headquarters of more than 30 businesses in the Fortune 1000, and thousan

100%|██████████| 10/10 [01:18<00:00,  7.82s/it]

nodes=[Node(id='Silicon Valley', type='region', properties={'description': 'A region in California, USA'}), Node(id='integrated circuit', type='technology', properties={'description': 'A type of electronic circuit'}), Node(id='microprocessor', type='technology', properties={'description': 'A type of central processing unit'}), Node(id='microcomputer', type='technology', properties={'description': 'A type of small computer'}), Node(id='information technology workers', type='profession', properties={'description': 'People working in the information technology industry'})] relationships=[Relationship(source='Silicon Valley', target='integrated circuit', type='developed'), Relationship(source='Silicon Valley', target='microprocessor', type='developed'), Relationship(source='Silicon Valley', target='microcomputer', type='developed'), Relationship(source='Silicon Valley', target='information technology workers', type='employs')]
nodes: None, rels: None
nodes=[Node(id='Silicon Valley', type='




In [21]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

graph.refresh_schema()
print(graph.schema)


cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=llm,
    qa_llm=llm,
    validate_cypher=True, # Validate relationship directions
    # return_intermediate_steps=True,
    verbose=True
)


Node properties:
Document {text: STRING, source: STRING, id: STRING}
Organization {location: STRING, name: STRING, id: STRING, field: STRING, project: STRING, event: STRING, description: STRING}
Person {id: STRING, name: STRING, profession: STRING, field: STRING}
Node {id: STRING}
Department {name: STRING, id: STRING, affiliation: STRING}
Field {id: STRING, name: STRING}
Concept {id: STRING, name: STRING, description: STRING}
Location {id: STRING, name: STRING, region: STRING, geographicalarea: STRING, description: STRING}
Publication {name: STRING, id: STRING, patentid: STRING}
Paper {name: STRING, id: STRING, patentid: STRING, title: STRING}
Event {location: STRING, name: STRING, id: STRING, frequency: STRING}
Topic {name: STRING, id: STRING}
Funding {name: STRING, id: STRING}
Project {name: STRING, id: STRING}
Institution {id: STRING, name: STRING, focus: STRING, partnerships: STRING}
Region {location: STRING, name: STRING, description: STRING, id: STRING}
City {islargestcityof: STR

In [24]:
cypher_chain.invoke({"query": "Stanford Research Institute"})




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (o:Organization {name: "Stanford Research Institute"}) RETURN o;[0m
Full Context:
[32;1m[1;3m[{'o': {'field': 'research in artificial intelligence', 'name': 'Stanford Research Institute', 'location': 'Silicon Valley', 'id': 'Stanford Research Institute'}}][0m

[1m> Finished chain.[0m


{'query': 'Stanford Research Institute',
 'result': 'Stanford Research Institute is a research institute located in Silicon Valley, focused on research in artificial intelligence.'}

## 2nd Approach

In [25]:
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Neo4jVector
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)


# The Neo4jVector Module will connect to Neo4j and create a vector index if needed.

db = Neo4jVector.from_documents(
    documents, embeddings, url=url, username=username, password=password
)

query = "what is this standford all talking about ?"
docs_with_score = db.similarity_search_with_score(query, k=2)
docs_with_score

  warn_deprecated(


[(Document(metadata={'source': 'sem_and_relation_data.txt'}, page_content='machine learning", and "data science" are frequently discussed. \nThe goal is to integrate cutting-edge technologies to solve real-world problems, with collaborations extending to various tech companies and academic institutions\n\nIn the vibrant ecosystem of Silicon Valley, Stanford Research Institute (SRI) has become a cornerstone for innovations in artificial intelligence and machine learning. Under the leadership of Dr. John Smith, a distinguished figure in the field of computer science, the institute is advancing the development of sophisticated natural language processing'),
  0.5843997001647949),
 (Document(metadata={'source': 'sem_and_relation_data.txt'}, page_content='\nIn the heart of Silicon Valley, the Stanford Research Institute (SRI) is pioneering groundbreaking research in artificial intelligence. \nDr. John Smith, a renowned computer scientist, leads a team focused on developing advanced natural 

In [27]:
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI

graph = Neo4jGraph(url=url, username=username, password=password)


chain = GraphCypherQAChain.from_llm(
    llm, graph=graph, verbose=True
)

chain.run("Who is john smith")

  warn_deprecated(




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: "John Smith"}) RETURN p;[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I don't know the answer."