In [142]:
from graphdatascience import GraphDataScience
from getpass import getpass
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, HumanMessagePromptTemplate, PromptTemplate
from langchain_openai import ChatOpenAI
import instructor
from pydantic import BaseModel, Field, ValidationError
from typing import List, Tuple
from typing_extensions import Annotated
from pydantic.functional_validators import BeforeValidator
import pandas as pd
import time
from openai import OpenAI
import os

In [2]:
neo4j_password = getpass()

 ········


In [3]:
openai_api_key = getpass()

 ········


In [59]:
os.environ['OPENAI_API_KEY'] = openai_api_key

In [4]:
gds = GraphDataScience('neo4j+s://bc9f751a.databases.neo4j.io', auth=("neo4j", neo4j_password))
gds.set_database("neo4j")

# Find groups of documents that have been used as context and the questions associated with those document groups

In [5]:
g_context, result = gds.graph.cypher.project("""
MATCH (q:Message)-[:NEXT]->(a:Message)-[:HAS_CONTEXT]->(d:Document) WHERE NOT  coalesce(a.rating, 'Unrated') = 'Bad'
RETURN gds.graph.project("message_context", q, d, {sourceNodeLabels: ["Message"], targetNodeLabels: ["Document"]})""")

In [6]:
result

{'relationshipCount': 2486,
 'graphName': 'message_context',
 'query': '\nMATCH (q:Message)-[:NEXT]->(a:Message)-[:HAS_CONTEXT]->(d:Document) WHERE NOT  coalesce(a.rating, ******) = ******\nRETURN gds.graph.project(******, q, d, {sourceNodeLabels: [******], targetNodeLabels: [******]})',
 'projectMillis': 23,
 'configuration': {'readConcurrency': 4,
  'undirectedRelationshipTypes': [],
  'jobId': 'a5b1a3f0-991c-4b54-8d21-8e7f48028c97',
  'logProgress': True,
  'query': '\nMATCH (q:Message)-[:NEXT]->(a:Message)-[:HAS_CONTEXT]->(d:Document) WHERE NOT  coalesce(a.rating, ******) = ******\nRETURN gds.graph.project(******, q, d, {sourceNodeLabels: [******], targetNodeLabels: [******]})',
  'inverseIndexedRelationshipTypes': [],
  'creationTime': neo4j.time.DateTime(2024, 3, 11, 13, 42, 5, 149016517, tzinfo=<UTC>)},
 'nodeCount': 1791}

In [7]:
gds.nodeSimilarity.mutate(g_context, similarityCutoff=1.0, mutateRelationshipType="HAS_SAME_CONTEXT", mutateProperty="similarity")

preProcessingMillis                                                       0
computeMillis                                                            12
mutateMillis                                                              7
postProcessingMillis                                                      0
nodesCompared                                                           287
relationshipsWritten                                                     54
similarityDistribution    {'min': 1.0, 'p5': 1.0, 'max': 1.0, 'p99': 1.0...
configuration             {'mutateProperty': 'similarity', 'jobId': 'edb...
Name: 0, dtype: object

In [8]:
gds.graph.relationships.stream(g_context, "HAS_SAME_CONTEXT")

Unnamed: 0,sourceNodeId,targetNodeId,relationshipType
0,14477,14481,HAS_SAME_CONTEXT
1,14481,14477,HAS_SAME_CONTEXT
2,14491,14514,HAS_SAME_CONTEXT
3,14509,16218,HAS_SAME_CONTEXT
4,14509,16234,HAS_SAME_CONTEXT
5,14509,17460,HAS_SAME_CONTEXT
6,14509,17468,HAS_SAME_CONTEXT
7,14514,14491,HAS_SAME_CONTEXT
8,16001,16021,HAS_SAME_CONTEXT
9,16001,17391,HAS_SAME_CONTEXT


In [9]:
gds.wcc.mutate(g_context, nodeLabels=["Message"], relationshipTypes=["HAS_SAME_CONTEXT"], mutateProperty="wccId")

mutateMillis                                                             0
nodePropertiesWritten                                                  287
componentCount                                                         271
componentDistribution    {'min': 1, 'p5': 1, 'max': 5, 'p999': 5, 'p99'...
postProcessingMillis                                                     3
preProcessingMillis                                                      0
computeMillis                                                            2
configuration            {'mutateProperty': 'wccId', 'jobId': 'f5049188...
Name: 0, dtype: object

In [186]:
question_context_df = gds.run_cypher("""
CALL gds.graph.nodeProperties.stream("message_context", "wccId", "Message")
YIELD nodeId, propertyValue
WITH propertyValue as wccId, gds.util.asNode(nodeId) AS q
MATCH (q)-[:NEXT]->()-[:HAS_CONTEXT]->(d:Document)
RETURN wccId,
count(distinct trim(q.content)) AS questionCount,
count(distinct q) AS messageCount,
collect(distinct trim(q.content)) AS questions,
collect(distinct q.id) AS questionIds,
collect(distinct d.index) AS docIndex,
collect(distinct d.text) AS docText
ORDER BY messageCount DESC
""")

In [187]:
question_context_df

Unnamed: 0,wccId,questionCount,messageCount,questions,questionIds,docIndex,docText
0,11,1,5,[what are the most important hyperparameters f...,"[user-5d55f3ef-ff91-4f45-bd26-b7f0ced11e09, us...","[7138, 13623, 9620, 4959, 9627, 9617, 7152, 95...","[by incorporating node properties, making the ..."
1,117,1,4,[Can you explain HashGNN to me?],"[user-12f4d979-4539-47bf-902d-aca3e431058f, us...","[4963, 4947, 4953, 4217, 4982, 4933, 4222, 493...","[2.3. Feature generation, 1.6. Orientation, 1...."
2,220,1,3,[What is GDS?],"[user-9a27580d-994a-41c2-af6c-16e6507ba6fb, us...","[8dd6c3d4-6372-4c91-a914-d1a366e33513, 9565, 1...","[I run this query that I had prepared here, th..."
3,248,1,3,[what is gds?],"[user-7bd84ecd-45cc-4240-a72e-a0d9a6e3ba35, us...","[9779, 3973, 8273c1fe-ca9c-4750-b782-a4f60934a...",[Production-quality Indicates that the featur...
4,223,1,2,[How does GDS work on a secondary node in a Ne...,"[user-f8f519de-13f6-425c-b4b2-49caf3178731, us...","[9201, 9193, 9194, 9198, 1401, 1398, 9197, 139...",[single Core member and a Read Replica is a va...
...,...,...,...,...,...,...,...
266,265,1,1,"[Great,.what asset should i focus more in my a...",[user-1ef61129-a186-4ba4-887d-eb813438e954],"[f3972647-8a96-4a30-9afa-beca5f262933, f089acd...",['s sections there's Parts there's definitions...
267,266,1,1,[okay how do i find that],[user-9a582e4c-e38f-4c34-9555-d6ee443ce30f],"[4fdc0bf5-3f56-449d-804e-d68d02dce1a6, a53f004...",[lower partNeo4jManhattan if you were to look ...
268,267,1,1,[Hello Agent Neo4j!],[user-18f032bd-87aa-46b1-b5ad-620020bce2b4],"[10643, 7906, 12248, 33c0a3b5-b2ba-4aa2-bdb1-b...","[Let’s start with a simple test. chain.run(""""""..."
269,268,1,1,[Can you tell me about betweeness centrality],[user-d707b734-d20c-4af8-9e57-9d533718bf8b],"[1081, 7352, 7869, 4699, 7349, 3306, 1019, 330...",[Weighted in-degree (total amount received) We...


# Generate new questions

In [128]:
client = instructor.patch(OpenAI())

In [137]:
def ends_in_question_mark(s: str) -> str:
    assert s[-1:] == "?", f'{s} does not end in a question mark'
    return s

In [165]:
class Question(BaseModel):
    text: str = Field(
        description="A single sentence ending in a question mark asking a question about Neo4j software.",
        pattern=r'.*\?$')

class GeneratedQuestions(BaseModel):
    questions: List[Question]
    

In [182]:
def get_questions(questions, texts):
    message = f"""You are a Neo4j expert providing information about Neo4j software.
      Below is a list of one or more questions and a list of texts that could contain the answer for those questions.
      Create list of up to three new questions that could be answered with the same texts. 
      Do not repeat a question that was provided to you.
      Return the questions as a pipe-delimited list.
      <questions>
      {questions}
      </quesitons>
      <texts>
      {texts}
      </texts>"""
    question_result = client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model = GeneratedQuestions,
        max_retries=2,
        messages=[{"role":"user", "content":message}])
    return [q.text for q in question_result.questions]

In [185]:
def write_questions_to_neo4j(row):
    try:
        generated_questions = get_questions(row['questions'], row['docText'])
        gds.run_cypher("""
            UNWIND $generatedQuestions AS generatedQuestion
            MERGE (gq:GeneratedQuestion {content: trim(generatedQuestion)})
            WITH gq
            UNWIND $docIndex AS index
            MATCH (d:Document {index:index}) 
            MERGE (gq)-[:GENERATED_FROM_CONTEXT]->(d)
            WITH DISTINCT gq
            UNWIND $questionIds AS id
            MATCH (m:Message {id:id}) MERGE (gq)-[:COPIED_CONTEXT_FROM]->(m)
            """,
                       {"generatedQuestions": generated_questions, "docIndex": row['docIndex'], "questionIds": row['questionIds']})
        return generated_questions
    except Exception as e:
        print(e)
        print(row['docText'])
        return e

In [132]:
gds.run_cypher("""
CREATE CONSTRAINT generated_question_node_key IF NOT EXISTS FOR (g:GeneratedQuestion) require g.content IS NODE KEY""")

In [189]:
get_questions(question_context_df.loc[31,'questions'], question_context_df.loc[31,'docText'])

['want you to act as an experienced graph data scientist who works at Neo4j. A customer asks you how large language models (LLMs) like ChatGPT can assist with graph data science, specifically using Neo4j Graph Data Science algorithms. How would you advise this customer to explore integrating LLMs into their graph data science workflows? What would likely be the easiest or most impactful ways in which an LLM can make them more productive and effective?']

In [192]:
question_context_df['generated_questions'] = question_context_df.apply(write_questions_to_neo4j, axis=1)

1 validation error for GeneratedQuestions
questions.0.text
  String should match pattern '.*\?$' [type=string_pattern_mismatch, input_value='What considerations are ...down data comparisons? ', input_type=str]
    For further information visit https://errors.pydantic.dev/2.5/v/string_pattern_mismatch
['choice where explainability is important and you can narrow down the universe of comparisons to a subset of your\xa0data.', 'We can ask another follow-up question. Here, I got pretty excited. GPT-4 ability to infer new Cypher queries based on the training examples is astounding. It has a good understanding of the Cypher itself, and when it grasps a given graph schema, it performs very well. We have also added training examples that generate Cypher statements that update the database. Additionally, we can also let it know which movies we like.', 'setting. One thing to note is that follow-up questions, where the model has to rely on previous dialogue to understand the context of the questi

In [193]:
question_context_df

Unnamed: 0,wccId,questionCount,messageCount,questions,questionIds,docIndex,docText,generated_questions
0,11,1,5,[what are the most important hyperparameters f...,"[user-5d55f3ef-ff91-4f45-bd26-b7f0ced11e09, us...","[7138, 13623, 9620, 4959, 9627, 9617, 7152, 95...","[by incorporating node properties, making the ...",[What are the key parameters for hyperparamete...
1,117,1,4,[Can you explain HashGNN to me?],"[user-12f4d979-4539-47bf-902d-aca3e431058f, us...","[4963, 4947, 4953, 4217, 4982, 4933, 4222, 493...","[2.3. Feature generation, 1.6. Orientation, 1....","[Can you explain HashGNN to me?, What is the H..."
2,220,1,3,[What is GDS?],"[user-9a27580d-994a-41c2-af6c-16e6507ba6fb, us...","[8dd6c3d4-6372-4c91-a914-d1a366e33513, 9565, 1...","[I run this query that I had prepared here, th...",[What is GDS?]
3,248,1,3,[what is gds?],"[user-7bd84ecd-45cc-4240-a72e-a0d9a6e3ba35, us...","[9779, 3973, 8273c1fe-ca9c-4750-b782-a4f60934a...",[Production-quality Indicates that the featur...,"[what is gds?, What are the different tiers fo..."
4,223,1,2,[How does GDS work on a secondary node in a Ne...,"[user-f8f519de-13f6-425c-b4b2-49caf3178731, us...","[9201, 9193, 9194, 9198, 1401, 1398, 9197, 139...",[single Core member and a Read Replica is a va...,[What is the role of a Read Replica instance i...
...,...,...,...,...,...,...,...,...
266,265,1,1,"[Great,.what asset should i focus more in my a...",[user-1ef61129-a186-4ba4-887d-eb813438e954],"[f3972647-8a96-4a30-9afa-beca5f262933, f089acd...",['s sections there's Parts there's definitions...,[What asset should i focus more in my asset gr...
267,266,1,1,[okay how do i find that],[user-9a582e4c-e38f-4c34-9555-d6ee443ce30f],"[4fdc0bf5-3f56-449d-804e-d68d02dce1a6, a53f004...",[lower partNeo4jManhattan if you were to look ...,[How to find information in Neo4j software?]
268,267,1,1,[Hello Agent Neo4j!],[user-18f032bd-87aa-46b1-b5ad-620020bce2b4],"[10643, 7906, 12248, 33c0a3b5-b2ba-4aa2-bdb1-b...","[Let’s start with a simple test. chain.run(""""""...",1 validation error for GeneratedQuestions\nque...
269,268,1,1,[Can you tell me about betweeness centrality],[user-d707b734-d20c-4af8-9e57-9d533718bf8b],"[1081, 7352, 7869, 4699, 7349, 3306, 1019, 330...",[Weighted in-degree (total amount received) We...,[Can you tell me about Eigenvector Centrality?...


# Examine generated questions

In [None]:
generated_question_df = gds.run_cypher("""
MATCH (gq:GeneratedQuestion)-[:COPIED_CONTEXT_FROM]->(m)
RETURN m.content AS question, collect(DISTINCT gq.content) AS generated_questions
""")
generated_question_df