In [2]:
!pip install --quiet langchain langchain-community langchain-anthropic neo4j

In [1]:
import pandas as pd
# Load the benchmark questions; the path is named so it is easy to spot and
# change. The trailing expression previews the first rows.
QUESTIONS_CSV = "text2cypher_questions.csv"
questions = pd.read_csv(QUESTIONS_CSV)
questions.head()

  from pandas.core import (


Unnamed: 0,question,type,database
0,What are the top 5 movies with a runtime great...,Simple Retrieval Queries,recommendations
1,List the first 3 directors born before 1950.,Simple Retrieval Queries,recommendations
2,Which 5 users have rated more than 20 movies?,Simple Retrieval Queries,recommendations
3,Identify the top 5 actors who have acted in mo...,Simple Retrieval Queries,recommendations
4,What are the top 3 genres associated with movi...,Simple Retrieval Queries,recommendations


In [2]:
# Keep only the questions that target one of these databases.
selected_databases = [
    "recommendations",
    "companies",
    "twitch",
    "twitter",
    "gameofthrones",
    "movies",
    "neoflix",
]
is_selected = questions["database"].isin(selected_databases)
selected_questions = questions[is_selected]
print(len(selected_questions))


4991


In [3]:
# Build a lookup from database name to its schema string. The schema is later
# substituted into the prompt for every question that targets that database.
# The two columns are zipped directly instead of iterating row by row; if a
# database name appears more than once, the last occurrence wins.
schemas = pd.read_csv("text2cypher_schemas.csv")
schema_dict = dict(zip(schemas["database"], schemas["schema"]))

In [4]:
import getpass
import os
from typing import List, Union

from langchain.chains.graph_qa.cypher_utils import CypherQueryCorrector, Schema
from langchain_anthropic import ChatAnthropic
from langchain_community.graphs import Neo4jGraph
from langchain_core.messages import (
    AIMessage,
    SystemMessage,
    ToolMessage,
)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnablePassthrough

# Never commit an API key to the notebook. Use the key already present in the
# environment; only when it is missing, ask for it interactively (the input is
# not echoed, so it does not end up in the saved cell output).
if not os.environ.get("ANTHROPIC_API_KEY"):
    os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Anthropic API key: ")

# LLMs
# temperature=0.0 asks for the least random output; a request timeout of 20
# (presumably seconds -- confirm against the ChatAnthropic docs) keeps a single
# stuck call from stalling the whole generation loop.
cypher_llm = ChatAnthropic(
    model="claude-3-opus-20240229",
    temperature=0.0,
    default_request_timeout=20,
)

# Human-turn template: the graph schema comes first, then the user's question,
# and the text ends with a cue so the model continues with the query itself.
# `{schema}` and `{question}` are filled in when the prompt is invoked.
cypher_template = (
    "Based on the Neo4j graph schema below, write a Cypher query that would answer the user's question:\n"  # noqa: E501
    "{schema}\n"
    "\n"
    "Question: {question}\n"
    "Cypher query:"
)

# System-turn instructions steering the model towards current Cypher syntax.
# The doubled braces ({{ }}) are escapes so the prompt template treats them as
# literal braces rather than as input variables.
cypher_system_instructions = """Given an input question, convert it to a Cypher query. No pre-amble.
Additional instructions:
- Ensure that queries checking for non-null properties use `IS NOT NULL` in a straightforward manner.
- Don't use `size((n)--(m))` for counting relationships. Instead use the new `count{{(n)--(m)}}` syntax.
- Incorporate the new existential subqueries in examples where the query needs to check for the existence of a pattern.
  Example: MATCH (p:Person)-[r:IS_FRIENDS_WITH]->(friend:Person)
            WHERE exists{{ (p)-[:WORKS_FOR]->(:Company {{name: 'Neo4j'}})}}
            RETURN p, r, friend"""

# Two-message chat prompt: the fixed system instructions followed by the
# human turn built from `cypher_template`.
cypher_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", cypher_system_instructions),
        ("human", cypher_template),
    ]
)

# Text-to-Cypher chain: render the prompt, send it to the model, and reduce the
# model's reply to a plain string.
cypher_response = cypher_prompt | cypher_llm | StrOutputParser()

In [None]:
# Generate one Cypher statement per selected question.
#
# A failing call (timeout, API error, ...) must not abort the whole run, so
# errors are caught per question. Instead of only printing them, every failure
# is also recorded in `failed_generations`, so the questions missing from
# `cypher_responses` can be inspected or retried afterwards.
cypher_responses = []
failed_generations = []
for i, row in selected_questions.iterrows():
    print(i)  # progress indicator: index label of the question being processed
    schema = schema_dict[row["database"]]
    try:
        cypher = cypher_response.invoke(
            {"question": row["question"], "schema": schema}
        )
    except Exception as e:
        print(f"{i}: generation failed: {e}")
        failed_generations.append(
            {
                "question": row["question"],
                "database": row["database"],
                "type": row["type"],
                "error": str(e),
            }
        )
        continue
    cypher_responses.append(
        {
            "question": row["question"],
            "database": row["database"],
            "cypher": cypher,
            "type": row["type"],
        }
    )

In [29]:
# Connection URL passed to every Neo4jGraph created for query evaluation.
DEMO_URL = "neo4j+s://demo.neo4jlabs.com"

# One row per generated statement: question, database, cypher and type.
combined_df = pd.DataFrame.from_records(cypher_responses)

In [None]:
# Error code checked to recognise a query that exceeded the configured
# transaction timeout.
TIMEOUT_ERROR_CODE = "Neo.ClientError.Transaction.TransactionTimedOutClientConfiguration"
# Substring of the ValueError message that marks a rejected (invalid) statement.
SYNTAX_ERROR_MARKER = "Generated Cypher Statement is not valid"


def _evaluate_cypher(graph, cypher):
    """Run one generated statement and classify what happened.

    Args:
        graph: object exposing ``query(cypher)`` (here a ``Neo4jGraph``).
        cypher: the Cypher statement to execute.

    Returns:
        A ``(syntax_error, timeout, returns_results)`` tuple of booleans.
        ``returns_results`` is True only when the query ran and produced a
        non-empty result. Exactly one list entry per call is guaranteed to the
        caller because every exception is classified here and never re-raised.
    """
    try:
        data = graph.query(cypher)
    except ValueError as e:
        is_syntax_error = SYNTAX_ERROR_MARKER in str(e)
        if is_syntax_error:
            print(f"Syntax error in Cypher query: {e}")
        else:
            print(f"Other ValueError: {e}")
        return is_syntax_error, False, False
    except Exception as e:
        # Not every exception carries a `.code` attribute; reading it directly
        # would raise AttributeError inside the handler and abort the loop.
        timed_out = getattr(e, "code", None) == TIMEOUT_ERROR_CODE
        if not timed_out:
            # Any other failure is NOT a timeout; report it instead of
            # silently counting it as one.
            print(f"Other error: {e}")
        return False, timed_out, False
    return False, False, bool(data)


# Per-row outcome flags, kept in the same order as the rows of `combined_df`.
syntax_error = []
returns_results = []
timeouts = []
last_graph = ""
for i, row in combined_df.reset_index().iterrows():
    if i % 100 == 0:
        print(i)  # coarse progress indicator
    # To avoid a new driver for every request, reconnect only when the target
    # database differs from the previous row's.
    if row["database"] != last_graph:
        last_graph = row["database"]
        print(last_graph)
        graph = Neo4jGraph(
            url=DEMO_URL,
            username=row["database"],
            password=row["database"],
            database=row["database"],
            refresh_schema=False,
            timeout=10,
        )
    has_syntax_error, timed_out, has_results = _evaluate_cypher(graph, row["cypher"])
    syntax_error.append(has_syntax_error)
    timeouts.append(timed_out)
    returns_results.append(has_results)
            
        
    

In [47]:
# Attach the per-query outcome flags as boolean columns; each list holds one
# entry per row of `combined_df`, in row order.
for column_name, flags in (
    ("syntax_error", syntax_error),
    ("timeout", timeouts),
    ("returns_results", returns_results),
):
    combined_df[column_name] = flags

In [48]:
# Columns (and their order) kept in the exported result table.
final_columns = [
    "question",
    "cypher",
    "type",
    "database",
    "syntax_error",
    "timeout",
    "returns_results",
]
final_df = combined_df[final_columns]
final_df.head()

Unnamed: 0,question,cypher,type,database,syntax_error,timeout,returns_results
0,What are the top 5 movies with a runtime great...,MATCH (m:Movie)\nWHERE m.runtime > 120\nRETURN...,Simple Retrieval Queries,recommendations,False,False,True
1,List the first 3 directors born before 1950.,MATCH (d:Director)\nWHERE d.born < date('1950-...,Simple Retrieval Queries,recommendations,False,False,True
2,Which 5 users have rated more than 20 movies?,"MATCH (u:User)-[r:RATED]->(m:Movie)\nWITH u, c...",Simple Retrieval Queries,recommendations,False,False,True
3,Identify the top 5 actors who have acted in mo...,MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\nWITH a...,Simple Retrieval Queries,recommendations,False,False,True
4,What are the top 3 genres associated with movi...,MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)\nWHERE ...,Simple Retrieval Queries,recommendations,False,False,True


In [49]:
# Count True/False occurrences for each outcome flag of the final table and
# print the three distributions in a fixed order.
flag_columns = ("syntax_error", "timeout", "returns_results")
distribution_col1, distribution_col2, distribution_col3 = (
    final_df[flag].value_counts() for flag in flag_columns
)

for label, counts in zip(
    flag_columns, (distribution_col1, distribution_col2, distribution_col3)
):
    print(f"Distribution for {label}:\n", counts)

Distribution for syntax_error:
 syntax_error
False    4772
True      194
Name: count, dtype: int64
Distribution for timeout:
 timeout
False    4898
True       68
Name: count, dtype: int64
Distribution for returns_results:
 returns_results
True     3987
False     979
Name: count, dtype: int64


In [50]:
# Persist the evaluated queries; the DataFrame index is left out of the file.
RESULTS_CSV = "text2cypher_claudeopus.csv"
final_df.to_csv(RESULTS_CSV, index=False)