In [8]:
# RESET TO BLANK
# NOTE(review): presumably a reminder to clear these inputs before sharing the notebook — confirm.

# URL of the video handed to YoutubeLoader further down.
youtube_video = "https://www.youtube.com/watch?v=_zlO2T4lffo"
# Free-text Wikipedia query; an empty string skips the extra Wikipedia lookup.
wikipedia_query = "Neo4j"
# Upper bound on the number of Wikipedia lookups made while scanning loaded documents.
wiki_limit = 3

In [3]:
# !pip install langchain neo4j langchain-openai wikipedia youtube-transcript-api

In [4]:
# %pip install --user "google-cloud-aiplatform>=1.25.0" --upgrade
# %pip install --user "google-cloud-aiplatform[pipelines]>=1.25.0"
# %pip install --user "langchain>=0.0.216"
# %pip install --user neo4j
# %pip install --user pydantic
# %pip install --user gradio
# %pip install --user IProgress
# %pip install --user tqdm
# %pip install --user langchain_openai
# %pip install --user youtube-transcript-api
# %pip install --user wikipedia

In [2]:
from langchain.graphs import Neo4jGraph

# Connection settings for the Neo4j instance that stores the extracted graph.
# Credentials are read from the environment (NEO4J_URI / NEO4J_USERNAME /
# NEO4J_PASSWORD) so that real secrets never have to be typed into the notebook;
# the literals are placeholders that only apply when a variable is unset.
import os

url = os.environ.get("NEO4J_URI", "neo4j+s://xxxxx.production-orch-0009.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "xxxxxx")

# `graph` is the module-level handle used when graph documents are stored.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
)

In [6]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

# Key/value pair used to carry one node or relationship property.
# NOTE(review): the docstring and Field descriptions are presumably surfaced in
# the schema handed to the LLM by the structured-output chain — confirm before
# rewording them.
class Property(BaseModel):
  """A single property consisting of key and value"""
  # Property name; normalised by format_property_key when converted to a dict.
  key: str = Field(..., description="key")
  # Property value, declared as a string.
  value: str = Field(..., description="value")

# Extraction-side node: extends the base graph node and declares `properties`
# as an optional list of Property pairs. map_to_base_node converts instances
# back into BaseNode with a plain dict of properties.
# NOTE(review): no docstring is added on purpose — it would presumably alter
# the schema description sent to the LLM; confirm before adding one.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extraction-side relationship: extends the base graph relationship and declares
# `properties` as an optional list of Property pairs. map_to_base_relationship
# converts instances back into BaseRelationship with a plain dict of properties.
# NOTE(review): no docstring is added on purpose — it would presumably alter
# the schema description sent to the LLM; confirm before adding one.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Top-level structured output requested from the extraction chain: the nodes and
# relationships found in one chunk of text. extract_and_store_graph reads
# `.nodes` and `.rels` from an instance of this model.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    # Required list of extracted nodes.
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    # Required list of extracted relationships between those nodes.
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [7]:
def format_property_key(s: str) -> str:
    """Normalise a property key to camelCase.

    The key is split on whitespace; the first word starts lower-case and every
    following word starts upper-case, e.g. ``"birth date" -> "birthDate"``.

    Capitalisation *inside* a word is preserved so that keys which are already
    camelCase (the format the extraction prompt asks the model for, e.g.
    ``birthDate``) are not flattened to ``birthdate``. Words written entirely
    in upper case carry no camelCase information and are lower-cased first
    (``"DATE OF BIRTH" -> "dateOfBirth"``).

    Args:
        s: Raw property key as produced by the extraction model.

    Returns:
        The camelCase key, or ``s`` unchanged when it contains no words
        (empty or whitespace-only input).
    """
    words = s.split()
    if not words:
        return s

    def _flatten_all_caps(word: str) -> str:
        # "DATE" -> "date"; mixed-case words such as "birthDate" are kept as-is.
        return word.lower() if word.isupper() else word

    head, *tail = (_flatten_all_caps(word) for word in words)
    # str.split() never yields empty strings, so indexing [0] is safe.
    first_word = head[0].lower() + head[1:]
    following_words = [word[0].upper() + word[1:] for word in tail]
    return "".join([first_word] + following_words)

def props_to_dict(props) -> dict:
    """Turn a collection of key/value property objects into a plain dict.

    Each item's ``key`` is passed through ``format_property_key`` and mapped to
    the item's ``value``. A falsy ``props`` (``None`` or an empty collection)
    yields an empty dict; when two items normalise to the same key, the later
    one wins.
    """
    return {
        format_property_key(prop.key): prop.value
        for prop in (props or [])
    }

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted ``Node`` into the base graph ``BaseNode``.

    The property list is flattened into a dict, and the title-cased id is
    stored both as the node id and as a ``name`` property (the latter helps
    Cypher statement generation). The node type is capitalised.

    NOTE(review): ``str.title()`` upper-cases any letter that follows a
    non-letter, so an id such as "Neo4j" becomes "Neo4J" — confirm this is
    acceptable for the ids being extracted.
    """
    # props_to_dict already returns {} for a missing/empty property list.
    node_properties = props_to_dict(node.properties)
    display_name = node.id.title()
    # A model-supplied "name" property, if any, is overwritten here.
    node_properties["name"] = display_name
    return BaseNode(
        id=display_name,
        type=node.type.capitalize(),
        properties=node_properties,
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted ``Relationship`` into the base ``BaseRelationship``.

    Both endpoints are converted with ``map_to_base_node``, the relationship
    type is passed through unchanged, and the property list is flattened into
    a dict (empty when the relationship has no properties).
    """
    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        # props_to_dict already returns {} for a missing/empty property list.
        properties=props_to_dict(rel.properties),
    )

In [8]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Keep a real key that is already exported in the environment; the placeholder
# is only installed when OPENAI_API_KEY is unset. Never commit a real key here.
os.environ.setdefault("OPENAI_API_KEY", "sk-xxxxx")

# Module-level chat model; get_extraction_chain reads this global when it
# builds the structured-output chain.
llm = ChatOpenAI(model="gpt-4", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None,
):
    """Build the structured-output chain that extracts a ``KnowledgeGraph``.

    Args:
        allowed_nodes: Optional node labels to list in the system prompt; when
            empty or ``None`` the corresponding prompt line is left blank.
        allowed_rels: Optional relationship types to list in the system prompt;
            when empty or ``None`` the corresponding prompt line is left blank.

    Returns:
        The chain produced by ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and the prompt
        assembled here.
    """
    # Optional restriction lines; each stays an empty line when no list is given.
    node_label_rule = ""
    if allowed_nodes:
        node_label_rule = '- **Allowed Node Labels:**' + ", ".join(allowed_nodes)
    rel_type_rule = ""
    if allowed_rels:
        rel_type_rule = '- **Allowed Relationship Types**:' + ", ".join(allowed_rels)

    system_message = f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{node_label_rule}
{rel_type_rule}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """

    # `{input}` is left for the prompt template to fill in at invocation time.
    messages = [
        ("system", system_message),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [9]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None,
) -> None:
    """Extract a knowledge graph from one document and write it to Neo4j.

    Args:
        document: The chunk whose ``page_content`` is sent to the extraction chain.
        nodes: Optional allowed node labels forwarded to ``get_extraction_chain``.
        rels: Optional allowed relationship types forwarded to ``get_extraction_chain``.

    The result is stored through the module-level ``graph``; nothing is returned.
    """
    # Run the extraction chain; the structured result sits under the 'function' key.
    chain = get_extraction_chain(nodes, rels)
    chain_output = chain.invoke(document.page_content)
    knowledge_graph = chain_output['function']

    # Convert the extracted entities into the base graph types.
    base_nodes = [map_to_base_node(extracted) for extracted in knowledge_graph.nodes]
    base_relationships = [
        map_to_base_relationship(extracted) for extracted in knowledge_graph.rels
    ]

    # Bundle everything with its source document and persist it.
    graph_document = GraphDocument(
        nodes=base_nodes,
        relationships=base_relationships,
        source=document,
    )
    graph.add_graph_documents([graph_document])

In [9]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import YoutubeLoader

# Load the configured YouTube video as the initial list of raw documents
# (no extra video info is requested). Chunking is done in a later cell.
raw_documents = YoutubeLoader.from_youtube_url(
    youtube_video, add_video_info=False
).load()


In [10]:
# Show the 'source' metadata entry of every loaded document.
for doc in raw_documents:
    print(doc.metadata['source'])

_zlO2T4lffo


In [11]:
# Query Wikipedia for every already-loaded document whose source is a Wikipedia
# article URL, stopping after `wiki_limit` lookups.
#
# The loop walks a snapshot of `raw_documents` and collects results separately:
# appending to the list being iterated would make the loop also visit the
# freshly added articles and trigger further lookups for them.
wiki_prefix = "https://en.wikipedia.org/wiki/"
wiki_documents = []
i = 0
for doc in list(raw_documents):
    wiki_source = doc.metadata['source']
    print(wiki_source)
    if wiki_prefix not in wiki_source:
        continue
    # Turn the article URL into a readable query: drop the prefix, underscores -> spaces.
    wiki_source = wiki_source.replace(wiki_prefix, "").replace("_", " ")
    print("Querying Wikipedia for ", wiki_source)
    wiki_documents += WikipediaLoader(query=wiki_source, load_all_available_meta=False).load()
    i = i + 1
    if i >= wiki_limit:
        break

# Add the collected articles once, after the scan has finished.
raw_documents += wiki_documents

_zlO2T4lffo


In [22]:
# Optionally add the Wikipedia results for the configured free-text query.
# A truthiness check (rather than `> ""`) also tolerates None, and the strip()
# guard skips whitespace-only queries.
if wikipedia_query and wikipedia_query.strip():
    raw_documents += WikipediaLoader(query=wikipedia_query, load_all_available_meta=False).load()

In [27]:
# Chunk the text: token-based splitter with 2048-token chunks and a 24-token overlap.
text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)

# Only the first three raw documents are split and passed on for extraction.
documents = text_splitter.split_documents(raw_documents[:3])

In [16]:
from tqdm import tqdm

# Extract and store a graph for every chunk, with a progress bar.
# tqdm takes the total from len(documents), so no index or explicit total is needed.
for d in tqdm(documents):
    extract_and_store_graph(d)

100%|██████████| 4/4 [02:50<00:00, 42.56s/it]


In [17]:
print("graph loaded")

graph loaded


### Generate Cypher

In [3]:
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph
from langchain.llms import VertexAI
from langchain.prompts.prompt import PromptTemplate

# Few-shot prompt that asks the LLM to translate a question into Cypher.
# `{schema}` and `{question}` are the only template variables; everything else,
# including the sample question/answer pairs, is sent verbatim.
# NOTE(review): the two "last round of funding" samples order by date DESC but
# carry no LIMIT 1, and the second aliases f.amount as `dateCol` — confirm that
# is intended before relying on them as examples.
CYPHER_GENERATION_TEMPLATE = """You are an expert Neo4j Cypher translator who understands the question in english and convert to Cypher strictly based on the Neo4j Schema provided and following the instructions below:
1. Generate Cypher query compatible ONLY for Neo4j Version 5
2. Do not use EXISTS, SIZE keywords in the cypher. Use alias when using the WITH keyword
3. Use only Nodes and relationships mentioned in the schema
4. Always enclose the Cypher output inside 3 backticks
5. Always do a case-insensitive and fuzzy search for any properties related search. Eg: to search for a Company name use `toLower(c.name) contains 'neo4j'`
6. Candidate node is synonymous to Person
7. Always use aliases to refer the node in the query
8. Cypher is NOT SQL. So, do not mix and match the syntaxes
Schema:
{schema}
Samples:
Question: Who is neo4j funded by?
Answer: MATCH (s:Software)-[:FUNDEDBY]-(c:Company) WHERE tolower(s.name) = 'neo4j' RETURN c.name

Question: how much funding has neo4j secured?
Answer: MATCH (s:Software)-[f:FUNDEDBY]-(c:Company) WHERE toLower(s.name) = 'neo4j' WITH f, apoc.text.regexGroups(f.amount, '\\$([0-9.]+)([A-Za-z]*)') AS matches WITH f, matches[0][1] AS amountStr, matches[0][2] AS scale WITH f, toFloat(amountStr) * CASE scale WHEN 'K' THEN 1000 WHEN 'M' THEN 1000000 WHEN 'B' THEN 1000000000 ELSE 1 END AS amount RETURN sum(amount)

Question: When was Neo4j's last round of funding?
Answer: MATCH (s:Software)-[f:FUNDEDBY]-(c:Company) WHERE toLower(s.name) = 'neo4j' WITH f, apoc.date.parse(f.date, 's', 'MMMM yyyy') AS parsedDate RETURN parsedDate AS dateCol ORDER BY parsedDate DESC

Question: What is Cypher?
Answer: MATCH (l:Language) WHERE toLower(l.name) = 'cypher' RETURN l.description

Question: How much was the last round of funding for neo4j?
Answer: MATCH (s:Software)-[f:FUNDEDBY]-(c:Company) WHERE toLower(s.name) = 'neo4j' WITH f, apoc.date.parse(f.date, 's', 'MMMM yyyy') AS parsedDate RETURN f.amount AS dateCol ORDER BY parsedDate DESC

Question: {question}
Answer:
"""
# Prompt object handed to GraphCypherQAChain as its Cypher-generation prompt.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

# Re-open the graph connection for querying, reusing the connection settings
# (url / username / password) defined with the first connection cell.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
)

# Question-answering chain: the LLM writes Cypher from the few-shot prompt, the
# query runs against `graph`, and intermediate steps (generated query and raw
# context) are returned alongside the final answer.
chain = GraphCypherQAChain.from_llm(
    VertexAI(
        model_name='code-bison',
        max_output_tokens=2048,
        temperature=0,
        top_p=0.95,
        # top_k is a token count, not a probability; 0.40 was not a valid value.
        top_k=40,
    ),
    graph=graph,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
    return_intermediate_steps=True,
)

  warn_deprecated(


In [4]:
# Ask a sample question. `invoke` replaces the direct `chain(...)` call, which
# is the deprecated calling style; the returned dict still carries 'result'
# and 'intermediate_steps'.
r = chain.invoke({"query": """How much was the last round of funding for neo4j?"""})

  warn_deprecated(




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (s:Software)-[f:FUNDEDBY]-(c:Company)
WHERE toLower(s.name) = 'neo4j'
WITH f, apoc.text.regexGroups(f.amount, '\$([0-9.]+)([A-Za-z]*)') AS matches
WITH f, matches[0][1] AS amountStr, matches[0][2] AS scale
WITH f, toFloat(amountStr) * CASE scale WHEN 'K' THEN 1000 WHEN 'M' THEN 1000000 WHEN 'B' THEN 1000000000 ELSE 1 END AS amount
RETURN amount
ORDER BY f.date DESC
LIMIT 1
[0m
Full Context:
[32;1m[1;3m[{'amount': 80000000.0}][0m

[1m> Finished chain.[0m


In [21]:
# Show the intermediate steps (generated query and raw context) and the final answer.
print(f"Intermediate steps: {r['intermediate_steps']}")
print(f"Final answer: {r['result']}")

Intermediate steps: [{'query': "cypher\nMATCH (s:Software)-[f:FUNDEDBY]-(c:Company)\nWHERE toLower(s.name) = 'neo4j'\nWITH f, apoc.text.regexGroups(f.amount, '\\$([0-9.]+)([A-Za-z]*)') AS matches\nWITH f, matches[0][1] AS amountStr, matches[0][2] AS scale\nWITH f, toFloat(amountStr) * CASE scale WHEN 'K' THEN 1000 WHEN 'M' THEN 1000000 WHEN 'B' THEN 1000000000 ELSE 1 END AS amount\nRETURN sum(amount)\n"}, {'context': [{'sum(amount)': 441000000.0}]}]
Final answer:  Neo4j has secured $441,000,000 in funding.


In [5]:
import gradio as gr
from langchain.memory import ConversationBufferMemory

# NOTE(review): `memory` is created but not used by anything in this cell.
memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)

# NOTE(review): this rebinds the module-level `llm`, which get_extraction_chain
# reads when it builds the extraction chain, and the new value is not passed to
# anything in this cell — confirm the rebinding is intended.
llm = VertexAI(model_name='code-bison',
            max_output_tokens=2048,
            temperature=0,
            top_p=0.95,
            # top_k is a token count, not a probability; 0.40 was not a valid value.
            top_k=40)

# The chat UI answers questions with the Cypher QA chain built earlier.
agent_chain = chain
def chat_response(input_text):
    """Answer one chat message with the graph QA chain.

    Args:
        input_text: The question typed into the chat box.

    Returns:
        The chain's final natural-language answer (its ``result`` output).
    """
    # `invoke` replaces the deprecated `run`; it returns a dict of outputs, so
    # the answer is picked out explicitly.
    response = agent_chain.invoke({"query": input_text})
    return response["result"]

# Minimal text-in / text-out UI wired to chat_response.
interface = gr.Interface(fn = chat_response, inputs = "text", outputs = "text", 
                         description = "YouTube Chatbot")

# NOTE(review): share=True publishes a public URL (see the "Running on public
# URL" line in the output), and every request runs LLM-generated Cypher against
# the connected database — confirm that exposure is intended.
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://da966a36ce21d36e12.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




  warn_deprecated(




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Concept) WHERE toLower(c.name) = 'rag' RETURN c.description
[0m
Full Context:
[32;1m[1;3m[{'c.description': 'Retrieval Augmented Generation, a big deal in building LLM backed applications in 2023'}][0m

[1m> Finished chain.[0m


ERROR:    Exception in ASGI application
  + Exception Group Traceback (most recent call last):
  |   File "/opt/conda/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 412, in run_asgi
  |     result = await app(  # type: ignore[func-returns-value]
  |   File "/opt/conda/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 69, in __call__
  |     return await self.app(scope, receive, send)
  |   File "/opt/conda/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in __call__
  |     await super().__call__(scope, receive, send)
  |   File "/opt/conda/lib/python3.10/site-packages/starlette/applications.py", line 116, in __call__
  |     await self.middleware_stack(scope, receive, send)
  |   File "/opt/conda/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
  |     raise exc
  |   File "/opt/conda/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
  |     await s