In [22]:
%pip install wikipedia

Note: you may need to restart the kernel to use updated packages.


In [23]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access environment variables
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


In [24]:
# Define Neo4j credentials
NEO4J_URI = "neo4j+s://8e6af5fb.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "6Q7v8t1FcCKs6UkuOeUk83v2GVKqw8xZPvMk6g0hYCg"

# Import necessary libraries
from langchain_community.graphs import Neo4jGraph

# Create Neo4j graph connection
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)

# Verify connection
print("Neo4j graph connection established successfully")



Neo4j graph connection established successfully


In [25]:


# # Import necessary libraries for document loading and splitting
# from langchain_community.document_loaders import WikipediaLoader
# from langchain_text_splitters import TokenTextSplitter

# # Read the wikipedia article
# raw_documents = WikipediaLoader(query="Elizabeth I").load()
# # Define chunking strategy
# text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
# # Split the first 3 documents into smaller chunks for easier processing
# documents = text_splitter.split_documents(raw_documents[:3]) 

In [None]:
# Import necessary libraries for JSON serialization
import json
import os
from langchain_core.documents import Document

# Define the file path for saving/loading documents
json_file_path = "elizabeth_documents.json"

# Option 1: Save documents to JSON
def save_documents_to_json(docs, file_path):
    """Save LangChain documents to a JSON file."""
    # Convert Document objects to dictionaries
    docs_dict = [{"page_content": doc.page_content, "metadata": doc.metadata} for doc in docs]
    
    # Write to JSON file
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(docs_dict, f, ensure_ascii=False, indent=2)
    
    print(f"Documents saved to {file_path}")

# Option 2: Load documents from JSON
def load_documents_from_json(file_path):
    """Load LangChain documents from a JSON file."""
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.")
        return []
    
    with open(file_path, "r", encoding="utf-8") as f:
        docs_dict = json.load(f)
    
    # Convert dictionaries back to Document objects
    docs = [Document(page_content=doc["page_content"], metadata=doc["metadata"]) for doc in docs_dict]
    print(f"Loaded {len(docs)} documents from {file_path}")
    return docs

# Save the current documents to JSON
# save_documents_to_json(documents, json_file_path)

# To load documents in the future, uncomment the line below and comment out the Wikipedia loading code
documents = load_documents_from_json(json_file_path)


Loaded 6 documents from elizabeth_documents.json


In [27]:
documents

[Document(metadata={'title': 'Elizabeth I', 'summary': 'Elizabeth I (7 September 1533 – 24 March 1603) was Queen of England and Ireland from 17 November 1558 until her death in 1603. She was the last and longest reigning monarch of the House of Tudor. Her eventful reign, and its effect on history and culture, gave name to the Elizabethan era.\nElizabeth was the only surviving child of Henry VIII and his second wife, Anne Boleyn. When Elizabeth was two years old, her parents\' marriage was annulled, her mother was executed, and Elizabeth was declared illegitimate. Henry restored her to the line of succession when she was 10, via the Third Succession Act 1543. After Henry\'s death in 1547, Elizabeth\'s younger half-brother Edward VI ruled until his own death in 1553, bequeathing the crown to a Protestant cousin, Lady Jane Grey, and ignoring the claims of his two half-sisters, the Catholic Mary and the younger Elizabeth, in spite of statutes to the contrary. Edward\'s will was set aside w

In [28]:
# Import necessary libraries for LLM and graph transformation
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer


llm = ChatOpenAI(model="gpt-4o", temperature=1)
llm_transformer = LLMGraphTransformer(llm)

#extract graph data
graph_documents = llm_transformer.convert_to_graph_documents(documents)

#store to neo4j
graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)


In [29]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings

vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)

In [30]:
# Extract entities from text
from pydantic import BaseModel, Field
from typing import List
from langchain_openai import ChatOpenAI
from openai_interacter import OpenAIChatInterface


class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

# Define the messages to send to the model
messages = [
    {
        "role": "system",
        "content": "You are extracting organization and person entities from the text."
    },
    {
        "role": "user",
        "content": "Extract the entities from the following text: {question}"
    }
]

# Function to process text and extract entities
def extract_entities(text):
    formatted_messages = [
        {"role": "system", "content": messages[0]["content"]},
        {"role": "user", "content": messages[1]["content"].replace("{question}", text)}
    ]

    print("Input messages: ", formatted_messages)

    # Create a new GPT-4o model instance
    entity_llm = OpenAIChatInterface(model_name="gpt-4o", temperature=1, initial_messages=formatted_messages)
    entity_llm.enable_structured_output(Entities)
    return entity_llm.parse_structured_output()

In [31]:
#entities = extract_entities(text = "Where was Amelia Earhart born?")
#entities

In [32]:
import re

graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

def remove_lucene_chars(text: str) -> str:
    """
    Remove special characters that have meaning in Lucene syntax.
    
    This function removes or escapes characters that would be interpreted
    as special operators in Lucene full-text search syntax.
    """
    # Remove or escape Lucene special characters
    return re.sub(r'[+\-&|!(){}[\]^"~*?:\\]', ' ', text)

def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines 
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspelings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()

In [42]:
def structured_retriever(question: str) -> str:

    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    print("\n[Structured Retriever] Processing question:", question)
    
    result = ""
    # Extract entities from question
    print("\n[Structured Retriever] Extracting entities...")
    entities = extract_entities(text=question)
    print("[Structured Retriever] Found entities:", entities)
    
    # Process each entity
    for entity in entities.names:
        
        # Generate full text query
        query = generate_full_text_query(entity)
        
        # Execute graph query
        print("[Structured Retriever] Executing graph query...")
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": query},
        )
        
        # Process results
        relationships = [el['output'] for el in response]
        print(f"[Structured Retriever] Found {len(relationships)} relationships")
        if len(relationships) == 0:
            print(f"[Structured Retriever] Warning: No relationships found for entity '{entity}'")
            
        result += "n".join(relationships)
    
    print("\n[Structured Retriever] Retrieval complete")
    return result

In [34]:
# structured_retriever(question = "Who is Elizabeth I?")

In [46]:



from augmented_llm.llm import AugmentedLLM, LLMProvider
import json

# Create an AugmentedLLM instance with Anthropic provider
llm = AugmentedLLM(
    system_prompt="You are a helpful assistnat that uses the data they can get from a graph database to answer the user's question.",
    provider=LLMProvider.ANTHROPIC,
    model_name="claude-3-7-sonnet-20250219",
    temperature=1, 
    max_tokens=64000, 
    use_react=True,
    debug_tools=True,
    debug_tokens=True,
    debug_messages=True
)





In [47]:
# Add the structured retriever as a tool
llm.add_tool(
    name="query_knowledge_graph",
    description="Retrieves information about from a knowledge graph",
    input_schema={
            "question": {
                "type": "string",
                "description": "A simple question indicating what information you want to get from the database. Try to mention the specific entities (places, people) etc you want information about.", 
                "required": True
            }
        },
    handler=structured_retriever
)



In [50]:
prompt = "Who was queen elizabeth I born to?"

# Print the response while filtering out tool/debug messages
for chunk in llm.generate(prompt):
    if isinstance(chunk, str):
        # Skip tool-related outputs and debug messages
        if any(marker in chunk for marker in [
            "[Tool Result]",
            "[Tool Use Started]", 
            "[Tool Input]",
            "[Stream Started]",
            "[Message Complete]",
            "[Stop Reason]",
            "[Debug]",
            "[Continuing conversation"
        ]):
            continue
        print(chunk, end="", flush=True)

[Debug] Stream Started
<thinking>
The user is asking about Elizabeth I's parents - specifically, who was she born to. This is a straightforward question about her parentage, which should be available in the knowledge graph. Based on historical records, Elizabeth I was the daughter of King Henry VIII of England and his second wife, Anne Boleyn. Let me query the knowledge graph to confirm this information.
</thinking>
[Debug] Tool Use Started: query_knowledge_graph
[Debug] Tool Input: {
  "question": "Who were the parents of Queen Elizabeth I?"
}
[Debug] Stop Reason: tool_use
[Debug] Message Complete

[Debug] Executing tool: query_knowledge_graph
[Debug] Tool input: {
  "question": "Who were the parents of Queen Elizabeth I?"
}

[Structured Retriever] Processing question: Who were the parents of Queen Elizabeth I?

[Structured Retriever] Extracting entities...
Input messages:  [{'role': 'system', 'content': 'You are extracting organization and person entities from the text.'}, {'role': '



[Structured Retriever] Found 0 relationships

[Structured Retriever] Retrieval complete
[Debug] Tool result: 

[Debug] Stream Started
<thinking>
The query didn't return any results. Let me try a broader query about Elizabeth I to see if parent relationships are included in that data.
</thinking>
[Debug] Tool Use Started: query_knowledge_graph
[Debug] Tool Input: {
  "question": "Who was Elizabeth I?"
}
[Debug] Stop Reason: tool_use
[Debug] Message Complete

[Debug] Executing tool: query_knowledge_graph
[Debug] Tool input: {
  "question": "Who was Elizabeth I?"
}

[Structured Retriever] Processing question: Who was Elizabeth I?

[Structured Retriever] Extracting entities...
Input messages:  [{'role': 'system', 'content': 'You are extracting organization and person entities from the text.'}, {'role': 'user', 'content': 'Extract the entities from the following text: Who was Elizabeth I?'}]




[Structured Retriever] Found entities: names=['Elizabeth I']

[Structured Retriever] Processing entity: Elizabeth I
[Structured Retriever] Generated query: Elizabeth~2 AND I~2
[Structured Retriever] Executing graph query...
[Structured Retriever] Found 50 relationships

[Structured Retriever] Retrieval complete
[Debug] Tool result: Elizabeth I - QUEEN_OF -> EnglandnElizabeth I - QUEEN_OF -> IrelandnElizabeth I - AFFILIATION -> House Of TudornElizabeth I - NAMESAKE -> Elizabethan EranElizabeth I - CHILD_OF -> Henry ViiinElizabeth I - CHILD_OF -> Anne BoleynnElizabeth I - SIBLING -> Edward VinElizabeth I - SIBLING -> Mary InElizabeth I - ADVISER -> William CecilnElizabeth I - FOUNDER -> Church Of EnglandnElizabeth I - SUCCEEDED_BY -> James Vi Of ScotlandnElizabeth I - SUPPORTED_BY -> Sir Francis WalsinghamnElizabeth I - FOREIGN_AFFAIRS -> FrancenElizabeth I - FOREIGN_AFFAIRS -> SpainnElizabeth I - NAMED_AFTER -> Elizabethan EranElizabeth I - SUCCESSION -> James Vi Of ScotlandnElizabeth I