In [1]:
import os
from dotenv import load_dotenv

import textwrap

from langchain_community.graphs import Neo4jGraph
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain

# Load variables from a .env file (python-dotenv) before reading them below.
load_dotenv()

# Neo4j connection settings; each is None when the variable is not set.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE")

# OpenAI credential, read here alongside the other settings.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Global constants: names of the vector index, node label and properties.
VECTOR_INDEX_NAME = "form_10k_chunks"
VECTOR_NODE_LABEL = "Chunk"
VECTOR_SOURCE_PROPERTY = "text"
VECTOR_EMBEDDING_PROPERTY = "textEmbedding"


# Graph handle used by the kg.query(...) cells and by the Cypher QA chain.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)



In [2]:
# What companies do you know about?
# Matches every node labelled Company and returns its companyName, lower-cased
# and then passed through apoc.text.capitalizeAll, under the alias `companyName`.
kg.query("""
  MATCH (com:Company)
  RETURN apoc.text.capitalizeAll(toLower(com.companyName)) as companyName
""")

[{'companyName': 'Netapp Inc'},
 {'companyName': 'Palo Alto Networks Inc'},
 {'companyName': 'Fedex Corp'},
 {'companyName': 'Gsi Technology Inc'},
 {'companyName': 'News Corp New'},
 {'companyName': 'Nike Inc'},
 {'companyName': 'Western Digital Corp.'},
 {'companyName': 'Seagate Technology'},
 {'companyName': 'Atlassian Corp Plc'},
 {'companyName': 'Apple Inc'}]

In [None]:
# Where is Blackrock located?
# Searches the full-text index "fullTextManagerNames" for "Blackrock", then
# returns each path from a matched node over LOCATED_AT to an Address node.
kg.query("""
  CALL db.index.fulltext.queryNodes("fullTextManagerNames", "Blackrock") YIELD node, score
  MATCH p=(node)-[:LOCATED_AT]->(address:Address)
  RETURN p
""")

In [None]:
# How many investment firms are at the same address as Blackrock?
# Starts from the full-text matches for "Blackrock", follows LOCATED_AT to an
# Address, and counts the Manager nodes that also have LOCATED_AT to that Address.
# NOTE(review): the count aggregates over every node the index search yields --
# confirm the search returns a single Blackrock entry if a per-firm figure is wanted.
kg.query("""
  CALL db.index.fulltext.queryNodes("fullTextManagerNames", "Blackrock") YIELD node, score
  MATCH p=(node)-[:LOCATED_AT]->(address:Address)<-[:LOCATED_AT]-(other:Manager)
  RETURN count(other) as numManagers
""")

In [None]:
# Which state has the most investment firms?
# Groups Manager -> Address paths by address.state, counts the paths in each
# group, and keeps the ten states with the highest counts.
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")

In [None]:
# What are the cities in California with the most investment firms?
# Same grouping as the per-state query, but restricted to addresses whose state
# is 'California' and grouped by address.city; returns the top ten cities.
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")

In [None]:
# What are the top investment firms in San Francisco, California?
# For each Manager located at an Address in the given city and state, sums the
# `value` property of its OWNS_STOCK_IN relationships and returns the ten
# largest totals.
# Both $city and $state are applied in the WHERE clause (the state parameter was
# previously passed but unused), and the manager name is returned under the
# alias `managerName` instead of the misleading `city`.
kg.query("""
  MATCH p=(mgr:Manager)-[:LOCATED_AT]->(address:Address),
         (mgr)-[owns:OWNS_STOCK_IN]->(:Company)
         WHERE address.city = $city AND address.state = $state
  RETURN mgr.managerName as managerName, sum(owns.value) as totalInvestmentValue
    ORDER BY totalInvestmentValue DESC
    LIMIT 10
""", params={"city": "San Francisco", "state": "California"})

In [5]:
# Prompt text for the model that writes the Cypher. It contains two
# placeholders, {schema} and {question}, and tells the model to answer with a
# Cypher statement only.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

The question is:
{question}"""

# Wrap the raw template, declaring the same two placeholders as input variables.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

# Question-answering chain built from a ChatOpenAI model (temperature=0), the
# `kg` graph and the custom Cypher-generation prompt. With verbose=True each run
# prints its trace (the "Generated Cypher" / "Full Context" sections that
# appear in the recorded outputs).
cypherChain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0),
    graph=kg,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
)

def prettyCypherChain(question: str) -> None:
    """Ask `cypherChain` a question and print the answer wrapped at 80 columns.

    Args:
        question: Natural-language question handed to `cypherChain.run`.

    Returns:
        None. The answer is printed, not returned, so a cell ending with this
        call produces no `Out[...]` value.
    """
    response = cypherChain.run(question)
    # textwrap.fill re-flows the answer so long responses stay readable.
    print(textwrap.fill(response, 80))


In [6]:
# Same question as the hand-written Company query, but the chain writes the Cypher.
prettyCypherChain("What companies do you know about?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (c:Company)
RETURN c[0m
Full Context:
[32;1m[1;3m[{'c': {'cusip': '64110D104', 'names': ['Netapp Inc', 'NETAPP INC'], 'companyAddress': 'Headquarters Dr, San Jose, CA 95134, USA', 'companyName': 'NETAPP INC', 'cusip6': '64110D', 'location': POINT(-121.952086 37.4167918)}}, {'c': {'cusip': '697435105', 'names': ['Palo Alto Networks Inc.', 'PALO ALTO NETWORKS INC', 'PALO ALTO NETWORKS INC PUT', 'None'], 'companyAddress': '3000 Tannery Way, Santa Clara, CA 95054, USA', 'companyName': 'PALO ALTO NETWORKS INC', 'cusip6': '697435', 'location': POINT(-121.9828954 37.383283)}}, {'c': {'cusip': '31428X106', 'names': ['FedEx Corp', 'FEDEX CORP'], 'companyAddress': 'Delaware, USA', 'companyName': 'FEDEX CORP', 'cusip6': '31428X', 'location': POINT(-75.52766989999999 38.9108325)}}, {'c': {'cusip': '36241U106', 'names': ['GSI TECHNOLOGY INC'], 'companyAddress': '1213 Elko Dr, Sunnyvale, CA 94089, USA', 'com

In [7]:
# "Investment firms" is phrased in plain English; in the recorded run the
# generated Cypher maps it to Manager nodes.
prettyCypherChain("What investment firms do you know about?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (m:Manager)-[:OWNS_STOCK_IN]->(c:Company)
RETURN DISTINCT m.managerName[0m
Full Context:
[32;1m[1;3m[{'m.managerName': 'CSS LLC/IL'}, {'m.managerName': 'BOKF, NA'}, {'m.managerName': 'BANK OF NOVA SCOTIA'}, {'m.managerName': 'Jefferies Financial Group Inc.'}, {'m.managerName': 'DEUTSCHE BANK AG\\'}, {'m.managerName': 'TORONTO DOMINION BANK'}, {'m.managerName': 'STATE BOARD OF ADMINISTRATION OF FLORIDA RETIREMENT SYSTEM'}, {'m.managerName': 'NISA INVESTMENT ADVISORS, LLC'}, {'m.managerName': 'ONTARIO TEACHERS PENSION PLAN BOARD'}, {'m.managerName': 'STATE STREET CORP'}][0m

[1m> Finished chain.[0m
I know about CSS LLC/IL, BOKF, NA, BANK OF NOVA SCOTIA, Jefferies Financial
Group Inc., DEUTSCHE BANK AG, TORONTO DOMINION BANK, STATE BOARD OF
ADMINISTRATION OF FLORIDA RETIREMENT SYSTEM, NISA INVESTMENT ADVISORS, LLC,
ONTARIO TEACHERS PENSION PLAN BOARD, and STATE STREET CORP. These are some
inves

In [None]:
# Same question as the hand-written full-text lookup, left to the generated Cypher.
prettyCypherChain("Where is Blackrock located?")

In [None]:
# Single-property lookup phrased in natural language.
prettyCypherChain("What is Blackrock's cik?")

In [None]:
# City filter expressed in natural language (San Francisco).
prettyCypherChain("What investment firms are in San Francisco?")

In [None]:
# Same question shape as the San Francisco cell, for New York.
prettyCypherChain("What investment firms are in New York?")

In [None]:
# Calls the chain directly, so the cell displays run()'s return value as-is
# rather than the wrapped printout from prettyCypherChain.
cypherChain.run("What firms in San Francisco have the most investment value?")

In [None]:
# Which state has the most public companies listed?
# Groups Company -> Address paths by address.state and returns every state with
# its count, largest first (no LIMIT, so all states are returned).
kg.query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numCompanies
    ORDER BY numCompanies DESC
""")

In [None]:
# Same question as the hand-written per-state Company query, via the chain.
prettyCypherChain("Which state has the most public companies listed?")

In [None]:
# Rephrased to ask for the per-state counts rather than only the top state.
prettyCypherChain("How many public companies listed per state?")

In [None]:
# Which city in California has the most companies listed?
# Restricts Company -> Address paths to addresses whose state is 'California',
# groups them by address.city and returns all cities, largest count first.
kg.query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numCompanies
    ORDER BY numCompanies DESC
""")

In [None]:
# Where is Fedex?
# Searches the full-text index "fullTextCompanyNames" for the $companyName
# parameter and returns each match's `location` and `companyAddress` properties.
kg.query("""
  CALL db.index.fulltext.queryNodes("fullTextCompanyNames", $companyName) 
    YIELD node, score
  WITH node as com
  RETURN com.location, com.companyAddress
""", params={"companyName": "Fedex"})

In [None]:
# Which investment firms are near Fedex?
# Finds Fedex through the full-text company index, then keeps every Manager
# whose `location` is less than $withinMeters from the company's `location`.
# The distance is divided by 1000 and truncated with toInteger, then formatted
# with a "km" suffix. withinMeters is passed as 100 * 1000.
kg.query("""
  CALL db.index.fulltext.queryNodes("fullTextCompanyNames", $companyName) 
         YIELD node, score
  WITH node as com
  MATCH (mgr:Manager)
    WHERE point.distance(mgr.location, com.location) < $withinMeters
  WITH mgr, 
    toInteger(point.distance(mgr.location, com.location) / 1000) as distanceKm
  RETURN mgr.managerName, mgr.managerAddress, 
        apoc.number.format(distanceKm) + "km" as distance

""", params={"companyName": "Fedex", "withinMeters": 100 * 1000})