In [164]:
from langchain_community.vectorstores.neo4j_vector import Neo4jVector
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from getpass import getpass
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, HumanMessagePromptTemplate, PromptTemplate
from langchain import hub
from langchain_openai import ChatOpenAI
from graphdatascience import GraphDataScience
import pandas as pd

In [2]:
from langchain_google_vertexai import VertexAIEmbeddings
from google.auth.credentials import Credentials

In [125]:
openai_api_key = getpass()

 ········


In [5]:
neo4j_password = getpass()

 ········


In [128]:
gcloud_token = getpass()

 ········


In [52]:
neo4j_uri = "neo4j+s://bc9f751a.databases.neo4j.io"
neo4j_user = "neo4j"
gds = GraphDataScience(neo4j_uri, auth=(neo4j_user, neo4j_password))

In [102]:
def query_docs_and_questions(query):
    result = gds.run_cypher(
    """ 
    //Encode query
    WITH genai.vector.encode($query, "VertexAI", { token: $token, projectId: $project }) AS queryVector
    
    //Find the closest generated question and its context 
    CALL db.index.vector.queryNodes("generated_question_embeddings", 1, queryVector) YIELD node, score
    MATCH (node)-[:GENERATED_FROM_CONTEXT]->(d:Document)
    WITH queryVector, score, d
    ORDER BY gds.similarity.cosine(queryVector, d.embedding)
    WITH queryVector, score as closestQuestionScore, collect(d) AS questionContext

    //Find the seven closest documents to the query and sort by score
    CALL db.index.vector.queryNodes("document-embeddings", $maxTotalDocs, queryVector) YIELD node, score
    WITH questionContext, closestQuestionScore, node, score
    ORDER BY score DESC
    WITH questionContext, closestQuestionScore, collect(node) AS docs, collect(score)[$maxTotalDocs - $maxQuestionDocs - 1] AS cutoffDocScore

    //Check whether question or closest doc is closer to the query
    WITH questionContext, closestQuestionScore, docs, cutoffDocScore,
    CASE WHEN cutoffDocScore < closestQuestionScore THEN $maxQuestionDocs ELSE 0 END AS contextCount,
    CASE WHEN cutoffDocScore < closestQuestionScore THEN true ELSE false END AS usedQuestion
    WITH questionContext[..contextCount] + [d in docs WHERE NOT d in questionContext[..contextCount] | d][..$maxTotalDocs - size(questionContext[..contextCount])] AS combinedList, usedQuestion
    
    //Return results
    UNWIND combinedList as doc
    RETURN doc.text AS text, usedQuestion, doc{.url} AS metadata""",
        {"query":query, "token": gcloud_token, "project": "neo4j-ps-202001", "maxQuestionDocs": 10, "maxTotalDocs": 12})
    return result

In [103]:
topic_results = query_docs_and_questions("What parameters are important for FastRP?")

In [104]:
topic_results

Unnamed: 0,text,usedQuestion,metadata
0,0 1-.018-1.042Zm-4.69 9.64a1.998 1.998 0 0 0 2...,True,{'url': 'https://github.com/danb-neo4j/gds-gui...
1,"quality, Neo4j's FastRP documentation provides...",True,{'url': 'https://github.com/danb-neo4j/gds-gui...
2,5. Creating FastRP node embeddings Next we use...,True,{'url': 'https://neo4j.com/docs/graph-data-sci...
3,"by incorporating node properties, making the g...",True,{'url': 'https://github.com/danb-neo4j/gds-gui...
4,2. Tuning algorithm parameters In order to imp...,True,{'url': 'https://neo4j.com/docs/graph-data-sci...
5,3.5 0 1 1-4.95-4.95l2.5-2.5a3.5 3.5 0 0 1 4.95...,True,{'url': 'https://github.com/danb-neo4j/gds-gui...
6,jobId String Generated internally yes An ID th...,True,{'url': 'https://neo4j.com/docs/graph-data-sci...
7,This has some implications on how to use FastR...,True,{'url': 'https://neo4j.com/docs/graph-data-sci...
8,Configuration used for running the algorithm. ...,True,{'url': 'https://neo4j.com/docs/graph-data-sci...
9,preProcessingMillis Integer Milliseconds for p...,True,{'url': 'https://neo4j.com/docs/graph-data-sci...


In [108]:
def query_documents(query):
    result = gds.run_cypher(
    """ 
    WITH genai.vector.encode($query, "VertexAI", { token: $token, projectId: $project }) AS queryVector
    CALL db.index.vector.queryNodes("document-embeddings", $maxTotalDocs, queryVector) YIELD node, score
    RETURN node.text AS text, score, node{.url} AS metadata""",
        {"query":query, "token": gcloud_token, "project": "neo4j-ps-202001", "maxTotalDocs": 12})
    return result

In [109]:
doc_results = query_documents("What parameters are important for FastRP?")

In [110]:
doc_results

Unnamed: 0,text,score,metadata
0,3.5 0 1 1-4.95-4.95l2.5-2.5a3.5 3.5 0 0 1 4.95...,0.896098,{'url': 'https://github.com/danb-neo4j/gds-gui...
1,2. Tuning algorithm parameters In order to imp...,0.894735,{'url': 'https://neo4j.com/docs/graph-data-sci...
2,"by incorporating node properties, making the g...",0.886423,{'url': 'https://github.com/danb-neo4j/gds-gui...
3,jobId String Generated internally yes An ID th...,0.880479,{'url': 'https://neo4j.com/docs/graph-data-sci...
4,5. Creating FastRP node embeddings Next we use...,0.880283,{'url': 'https://neo4j.com/docs/graph-data-sci...
5,This has some implications on how to use FastR...,0.879461,{'url': 'https://neo4j.com/docs/graph-data-sci...
6,Configuration used for running the algorithm. ...,0.879033,{'url': 'https://neo4j.com/docs/graph-data-sci...
7,preProcessingMillis Integer Milliseconds for p...,0.876872,{'url': 'https://neo4j.com/docs/graph-data-sci...
8,embedding List of Float FastRP node embedding....,0.875791,{'url': 'https://neo4j.com/docs/graph-data-sci...
9,"GraphSAGE, or HashGNN.</li>\n<li><strong>Machi...",0.875726,{'url': 'https://github.com/danb-neo4j/gds-gui...


In [60]:
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a Neo4j expert providing information about Neo4j software.
                  Be as helpful as possible and return as much information as possible.
                  Do not answer any questions that do not relate to Neo4j.
                  Answer any questions based solely on the context below:
                  <context>
                  {context}
                  </context>"""),
    ("user", "{input}"),
])

In [126]:
chat = ChatOpenAI(openai_api_key = openai_api_key, temperature = 0, model = "gpt-3.5-turbo")
chat_chain = prompt | chat

In [175]:
def compare_retrieval(question):
    question_context = query_docs_and_questions(question)
    used_questions = question_context.loc[0,'usedQuestion']
    doc_context = query_documents(question)
    merge_df = question_context.merge(doc_context, how="outer", on="text")
    question_only_context = merge_df[pd.isna(merge_df['metadata_x'])]['text'].to_list()
    without_question_only_context = merge_df[pd.isna(merge_df['metadata_y'])]['text'].to_list()
    question_response = chat_chain.invoke({"context":". ".join(question_context['text'].to_list()), "input": question})
    doc_response = chat_chain.invoke({"context":". ".join(doc_context['text'].to_list()), "input": question})
    return {"used_questions": used_questions, 
            "with_question_response": question_response.content, 
            "without_question_response": doc_response.content,
            "question_only_context": question_only_context,
            "without_question_only_context": without_question_only_context}                                          
                                     

In [181]:
def display_results(question):
    result_dict = compare_retrieval(question)
    print("Used questions?", result_dict['used_questions'])
    print()
    print("With questions:", result_dict['with_question_response'])
    if result_dict['used_questions']:
        print()
        print("Context only used in question response:", result_dict['question_only_context'])
    print()
    print("Without questions:", result_dict['without_question_response'])
    if result_dict['used_questions']:
        print()
        print("Context only used in documents-only response:", result_dict['without_question_only_context'])   

In [179]:
display_results("Does Neo4j work with vectors?")

Used questions? True

With questions: Yes, Neo4j works with vectors through node embedding algorithms. Node embedding algorithms compute low-dimensional vector representations of nodes in a graph. These vectors, also called embeddings, can be used for machine learning tasks. The Neo4j Graph Data Science library contains several node embedding algorithms such as FastRP, GraphSAGE, Node2Vec, and HashGNN. These algorithms can generate vector representations of nodes in a graph, allowing for various machine learning applications.

Context only used in question response: ["(a) (()-[:KNOWS]->()){3,5} (b) (a)-[r*..5 {name: 'Filipa'}]->(b) (a) (()-[r {name: 'Filipa'}]->()){1,5} (b) Equijoins The variable of a variable-length relationship can be used in subsequent patterns to refer to the list of relationships the variable is bound to. This is the same as the equijoin for variables bound to single nodes or relationships. This section uses the following graph: To recreate the graph, run the foll

In [182]:
display_results("Why might people use virtual nodes with GDS?")

Used questions? False

With questions: People might use virtual nodes with GDS (Graph Data Science) for various reasons, such as:

1. **Graph Grouping**: Large graphs can be hard to understand or visualize. By grouping nodes based on certain properties into virtual nodes, users can aggregate information and create a more concise and easier-to-understand representation of the graph.

2. **Aggregate Relationships**: Virtual nodes can be used to aggregate relationships between groups of nodes, providing a summary view of the connections within the graph.

3. **Projection of Data**: Virtual nodes can be used to project data in a different way, such as aggregating relationships into one or collapsing intermediate nodes into virtual relationships. This can help simplify the graph structure for specific analysis or visualization purposes.

4. **Enhanced Security**: Virtual nodes can be used to hide away sensitive properties or intermediate nodes/relationships for security reasons, ensuring th

In [185]:
display_results("How can a graph be restored from a backup in Neo4j?")

Used questions? True

With questions: To restore a graph from a backup in Neo4j, you can follow these steps:

1. First, ensure that you have backed up your graph with the necessary metadata included.
2. Use the "gds.alpha.backup" procedure to back up your graph to disk.
3. To restore the backed-up graph, you can use the "gds.alpha.restore" procedure.
4. Provide the name of the graph you want to restore as a parameter to the restore procedure.
5. The restore procedure will load the graph back into memory, allowing you to access the previously backed-up data.

By following these steps, you can easily restore a graph from a backup in Neo4j.

Context only used in question response: [':exit; Restore the database backup You use the neo4j-admin database restore command to restore the database backup, and then the Cypher command CREATE DATABASE name to create the restored database in the system database. For information about the command syntax, options, and usage, see Restore a database backu

In [186]:
display_results("Please explain the APOC Core Temporal Functions and provide examples of their usage.")

Used questions? False

With questions: The APOC Core Temporal Functions provide support for working with temporal types, timestamps, and date string values in Neo4j. Here are the main functions along with examples of their usage:

1. `apoc.temporal.toZonedTemporal(time String, format String, timezone String)`:
   - This function parses the given date string using the specified format into the given time zone.
   - Example:
     ```
     RETURN apoc.temporal.toZonedTemporal("2022-01-15 12:30:00", "yyyy-MM-dd HH:mm:ss", "America/New_York") AS zonedTime
     ```

2. `apoc.temporal.format(temporal Any, format String)`:
   - This function formats the given temporal value into the specified time format.
   - Example:
     ```
     RETURN apoc.temporal.format(datetime(), "yyyy-MM-dd HH:mm:ss") AS formattedTime
     ```

3. `apoc.temporal.formatDuration(input Any, format String)`:
   - This function formats the given duration into the specified time format.
   - Example:
     ```
     RETURN a

In [187]:
display_results("What are the most important differences between APOC Core and APOC Extended?")

Used questions? True

With questions: The most important differences between APOC Core and APOC Extended are as follows:

1. **Official Support**: APOC Core is officially supported by Neo4j, while APOC Extended is maintained by members of the community. This means that APOC Core procedures are officially endorsed and guaranteed to work seamlessly with Neo4j, while APOC Extended procedures may not have the same level of official support.

2. **Dependencies**: APOC Core contains battle-hardened procedures and functions with no external dependencies. On the other hand, APOC Extended may contain additional procedures that require external dependencies, which may need to be manually installed.

3. **Documentation**: APOC Core has its own dedicated documentation site, separate from the APOC Extended documentation. Each procedure in the documentation is tagged with an "Extended" tag if it is part of the APOC Extended edition.

4. **Installation Process**: If you are using procedures from the 

In [188]:
display_results("How does Weakly Connected Components (WCC) help you find disconnected parts in a network?")

Used questions? True

With questions: Weakly Connected Components (WCC) algorithm helps you find disconnected parts or islands within a network by identifying clusters in a graph where each node can reach every other node if the relationship direction is ignored. This means that nodes within the same component can reach each other, but there may not be connections between different components. By running the WCC algorithm on a graph, you can identify separate islands or components of nodes that are not connected to each other, thus helping you understand the overall connectivity of the network.

Context only used in question response: []

Without questions: The Weakly Connected Components (WCC) algorithm helps you find disconnected parts or islands within a network by identifying sets of connected nodes where each node can reach every other node if the relationship direction is ignored. This means that nodes within the same component can reach each other, but there may not be connectio

In [189]:
display_results("When should I use Leiden instead of Louvain for community detection?")

Used questions? False

With questions: You should consider using Leiden instead of Louvain for community detection when you want to use a hierarchical approach to community detection and when a slightly higher computational cost is an acceptable tradeoff. Leiden algorithm provides improvements over Louvain for detecting smaller communities within a graph. It incorporates a refinement phase that enhances its ability to detect smaller communities compared to Louvain. Additionally, Leiden is a good choice when you need to identify communities in large-scale networks and are interested in understanding their hierarchical structure.

Without questions: Leiden is a likely the optimal choice when you want to use a hierarchical approach to community detection and when a slightly higher computational cost is an acceptable tradeoff. Leiden algorithm performs the same hierarchical community detection as the Louvain algorithm but with improvements that address Louvain's challenges with detecting s

In [190]:
display_results("When should I use choose the Leiden algorithm for community detection?")

Used questions? True

With questions: You should choose the Leiden algorithm for community detection when you want to use a hierarchical approach to community detection and when a slightly higher computational cost is an acceptable tradeoff. Leiden algorithm performs hierarchical community detection similar to Louvain but with improvements that address Louvain's challenges with detecting small communities. It incorporates a 'refinement' phase into the algorithm, enhancing its ability to detect smaller communities within the graph.

Context only used in question response: ['3. Examples In this section we will show examples of running the Leiden community detection algorithm on a concrete graph. The intention is to illustrate what the results look like and to provide a guide in how to make use of the algorithm in a real setting. We will do this on a small social network graph of a handful nodes connected in a particular pattern. The example graph looks like this: The following Cypher sta