In [2]:
import os
from dotenv import load_dotenv

import googlemaps

import textwrap

from langchain_community.graphs import Neo4jGraph
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Credentials and endpoints, all read from the environment (None when unset).
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
GOOGLE_MAPS_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")

# Names of the vector index and of the node label/properties it is built on.
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

# Shared clients used by the cells below.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)
gmaps = googlemaps.Client(key=GOOGLE_MAPS_API_KEY)


In [None]:
# Is Blackrock an investment firm in the database?
# Searches the "fullTextManagerNames" full-text index for "Blackrock" and
# returns each matching node's managerName together with its match score.
kg.query("""
CALL db.index.fulltext.queryNodes("fullTextManagerNames", "Blackrock") YIELD node, score
RETURN node.managerName, score
""")

In [None]:
# Where is Blackrock located?
# Follows LOCATED_AT from every full-text match to its Address node and
# returns the whole path (manager, relationship, address).
kg.query("""
  CALL db.index.fulltext.queryNodes("fullTextManagerNames", "Blackrock") YIELD node, score
  MATCH p=(node)-[:LOCATED_AT]->(address:Address)
  RETURN p
""")

In [None]:
# How many investment firms are at the same address as Blackrock?
# NOTE: the full-text CALL has no LIMIT, so the count is taken across every
# node it yields for "Blackrock", not just the single best match.
kg.query("""
  CALL db.index.fulltext.queryNodes("fullTextManagerNames", "Blackrock") YIELD node, score
  MATCH p=(node)-[:LOCATED_AT]->(address:Address)<-[:LOCATED_AT]-(other:Manager)
  RETURN count(other) as numManagers
""")

In [None]:
# Which state has the most investment firms?
# Groups Manager -> Address links by address.state and keeps the ten states
# with the highest counts.
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")

In [None]:
# What are the cities in California with the most investment firms?
# Same grouping as the per-state query, restricted to addresses whose state
# is 'California' and grouped by city; top ten only.
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")

In [None]:
# What are top investment firms in San Francisco?
# Firms are ranked by the summed value of their OWNS_STOCK_IN holdings; city
# and state are bound as query parameters rather than inlined in the Cypher.
# The firm name is returned as `managerName` (it was previously mislabeled
# with the alias `city`).
kg.query("""
  MATCH p=(mgr:Manager)-[:LOCATED_AT]->(address:Address),
         (mgr)-[owns:OWNS_STOCK_IN]->(:Company)
         WHERE address.city = $city
            AND address.state = $state
  RETURN mgr.managerName as managerName, sum(owns.value) as totalInvestmentValue
    ORDER BY totalInvestmentValue DESC
    LIMIT 10
""", params={"city": "San Francisco", "state": "California"})

In [None]:
# Wrap the existing Neo4j knowledge graph in a LangChain vector store that is
# backed by the chunk index configured in the constants above.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

# Retriever view over that vector store.
retriever = neo4j_vector_store.as_retriever()

# Question-answering chain (with sources) that reads through the retriever.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=retriever,
)

def prettychain(question: str) -> None:
    """Ask the retrieval QA chain a question and print the wrapped answer.

    Args:
        question: Natural-language question passed to ``chain`` under the
            ``"question"`` key.

    Returns:
        None. The chain's ``'answer'`` is printed, wrapped at 80 columns.
    """
    # ``invoke`` replaces the deprecated direct call ``chain({...})`` and
    # matches how the other helpers in this notebook call LangChain objects.
    response = chain.invoke({"question": question}, return_only_outputs=True)
    print(textwrap.fill(response['answer'], 80))

# Bare chat model, used without any retriever or graph context.
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY)

def prettyllm(question: str) -> None:
    """Send a question directly to the chat model and print the wrapped reply.

    Args:
        question: Prompt text passed unchanged to ``llm.invoke``.

    Returns:
        None. The reply's ``content`` is printed, wrapped at 80 columns.
    """
    response = llm.invoke(question)
    print(textwrap.fill(response.content, 80))

# Prompt that frames the model as an industry analyst; the user's question
# fills the {input} slot.
companyPrompt = ChatPromptTemplate.from_messages([
    ("system", "You are an industry analyst."),
    ("user", "{input}")
])

# Pipe the prompt into the chat model.
companyChain = companyPrompt | llm

def prettyCompanyChain(question: str) -> None:
    """Ask the industry-analyst chain a question and print the wrapped reply.

    Args:
        question: Text substituted for ``{input}`` in ``companyPrompt``.

    Returns:
        None. The reply's ``content`` is printed, wrapped at 80 columns.
    """
    response = companyChain.invoke({"input": question})
    print(textwrap.fill(response.content, 80))


In [None]:
# Cypher used as the retrieval query of the structured-summary vector store.
# It takes the ten manager/company holdings with the largest owns.shares,
# renders each as an English sentence (holding, value, manager city/state),
# and returns the sentences joined into one `text` value with a fixed score
# of 1.0 and constant metadata.
structured_summary_retrieval_query = """
MATCH (mgr:Manager)-[owns:OWNS_STOCK_IN]->(com:Company),
    (mgr:Manager)-[:LOCATED_AT]->(address:Address)
WITH mgr, owns, com, address ORDER BY owns.shares DESC LIMIT 10
WITH 
    mgr.managerName + " owns " + owns.shares + " of " + com.companyName + 
    " at a value of $" + apoc.number.format(owns.value) + ". " +
    mgr.managerName + " is located in " + address.city + ", " + address.state + ". " 
    AS sentence
WITH collect(sentence) AS sentences
RETURN apoc.text.join(sentences, "\n") + "\n"  AS text,
        1.0 as score,
        {
            type: "structured_summary",
            source: "cypher"
        } as metadata
"""


# Vector store over the same chunk index, but with a custom retrieval query
# that returns the structured summary instead of the raw chunk text.
# The database comes from NEO4J_DATABASE (as it does for `kg`) rather than a
# hard-coded "neo4j", so both clients always talk to the same database.
structured_summary_vector_store = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name=VECTOR_INDEX_NAME,
    text_node_property=VECTOR_SOURCE_PROPERTY,
    retrieval_query=structured_summary_retrieval_query,
)

# Retriever view over the structured-summary vector store.
structured_retriever = structured_summary_vector_store.as_retriever()

# Question-answering chain (with sources) that reads through that retriever.
structured_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=structured_retriever,
)

def prettyStructuredChain(question: str) -> None:
    """Ask the structured-summary QA chain a question and print the answer.

    Args:
        question: Natural-language question passed to ``structured_chain``
            under the ``"question"`` key.

    Returns:
        None. The chain's ``'answer'`` is printed, wrapped at 80 columns.
    """
    # ``invoke`` replaces the deprecated direct call ``structured_chain({...})``.
    response = structured_chain.invoke(
        {"question": question}, return_only_outputs=True
    )
    print(textwrap.fill(response['answer'], 80))

In [None]:
# Answered by structured_chain, whose retriever runs structured_summary_retrieval_query.
prettyStructuredChain("What are the top 10 investment firms in San Francisco?")

In [None]:
# Few-shot prompt for the Cypher-generating LLM. {schema} and {question} are
# the only placeholders; the example queries must not contain curly braces,
# because the template is formatted with them.
# The "most investments" example sorts by total value and limits the result,
# so the model learns to rank firms instead of listing them in arbitrary order.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:
# What investment firms are in San Francisco?
MATCH (m:Manager)-[:LOCATED_AT]->(a:Address)
    WHERE a.city = 'San Francisco'
RETURN m.managerName

# What firms in San Francisco have the most investments?
MATCH (m:Manager)-[:LOCATED_AT]->(a:Address),
        (m:Manager)-[owns:OWNS_STOCK_IN]->(c:Company)
    WHERE a.city = 'San Francisco'
WITH m, sum(owns.value) as totalInvestmentValue
    ORDER BY totalInvestmentValue DESC
    LIMIT 10
RETURN m.managerName + " owns $" + apoc.number.format(totalInvestmentValue) + " worth of shares."


The question is:
{question}"""

# Prompt object built from the template; it expects `schema` and `question`.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    template=CYPHER_GENERATION_TEMPLATE,
    input_variables=["schema", "question"],
)

# Chain that has the LLM write Cypher from the prompt and queries `kg` with it.
cypherChain = GraphCypherQAChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    graph=kg,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
    verbose=True,
)

def prettyCypherChain(question: str) -> None:
    """Ask the Cypher-generating chain a question and print the wrapped answer.

    Args:
        question: Natural-language question passed to ``cypherChain`` under
            its ``"query"`` input key.

    Returns:
        None. The chain's ``"result"`` is printed, wrapped at 80 columns.
    """
    # ``invoke`` replaces the deprecated ``Chain.run``; the answer text is
    # under the chain's "result" output key.
    response = cypherChain.invoke({"query": question})
    print(textwrap.fill(response["result"], 80))


In [None]:
# Answered by cypherChain: the LLM writes a Cypher query from the few-shot prompt and kg runs it.
prettyCypherChain("What investment firms are in San Francisco?")

In [None]:
# Same chain, with a city that does not appear in the prompt's few-shot examples.
prettyCypherChain("What investment firms are in New York?")

In [None]:
# Call the chain directly (no text wrapping) so the raw answer string is the
# cell's output. ``invoke`` replaces the deprecated ``Chain.run``.
cypherChain.invoke({"query": "What firms in San Francisco have the most investments?"})["result"]

In [None]:
# Answered by companyChain (industry-analyst prompt piped into the chat model).
prettyCompanyChain("In a single sentence, what is Fedex's industry segment?")

In [None]:
# Put to the bare chat model (llm) with no retriever or graph involved.
prettyllm("What is the address of Blackrock?")


In [None]:
# Answered by `chain`, i.e. retrieval over the form_10k_chunks vector index.
prettychain("What companies do you know about?")

In [None]:
# Location question put to the chunk-retrieval chain (`chain`), not to the graph.
prettychain("What companies are in San Jose?")

In [None]:
# Same question as the prettyllm cell, this time answered through the retrieval chain.
prettychain("What is the address of Blackrock?")

In [None]:
# Headquarters question answered through the retrieval chain (`chain`).
prettychain("Where is Netapp headquartered?")

In [None]:
# Rephrased San Jose question ("headquartered in"), again through the retrieval chain.
prettychain("What companies are headquartered in San Jose?")

In [None]:
# Industry-segment question answered through the retrieval chain (`chain`).
prettychain("In a single sentence, what is Netapp's industry segment?")

In [None]:
# Fetch every Company node as a {cusip6, companyName} map.
company_rows = kg.query("""
  MATCH (com:Company)
  RETURN com { .cusip6, .companyName } as company
""")

# Unwrap the `company` column so each entry is the map itself.
companies = [row['company'] for row in company_rows]

# Show the first entry.
print(companies[0])

In [None]:
# Cypher that stores a geocoded headquarters on a Company node: the formatted
# address, a point location, and a LOCATED_AT link to a (merged) Address node.
set_company_location_cypher = """
  MATCH (com:Company {cusip6: $cusip6})
  SET com.companyAddress = $companyAddress
  SET com.location = point({latitude: $latitude, longitude: $longitude})
  MERGE (addr:Address {city: $city, state: $state})
  MERGE (com)-[:LOCATED_AT]->(addr)
"""

# For each company: ask the retrieval chain where it is headquartered, geocode
# the answer with Google Maps, and write the first geocode hit to the graph.
# NOTE(review): get_location / get_city / get_state / get_country are not
# defined in any cell visible here - confirm they are defined before this cell
# runs, otherwise a fresh-kernel "Run All" fails with NameError.
for company in companies:
    company_name = company['companyName']

    # ``invoke`` replaces the deprecated direct call ``chain("...")``.
    result = chain.invoke({"question": f"Where is {company_name} headquartered?"})
    address_statement = result['answer']
    address_geocodes = gmaps.geocode(address_statement)
    if not address_geocodes:
        print(f"no geocode found for {company_name} at {address_statement}")
        continue

    # Only the first geocoding candidate is used.
    address_geocode = address_geocodes[0]

    print(f"{company_name} is located at {address_geocode['formatted_address']}")

    cusip6 = company['cusip6']

    location = get_location(address_geocode)
    city = get_city(address_geocode)
    state = get_state(address_geocode)
    country = get_country(address_geocode)

    # Fall back one level up when the finer-grained component is missing.
    cityOrState = city if city else state
    stateOrCountry = state if state else country

    if not (location and cityOrState and stateOrCountry):
        # Report the skip instead of silently dropping the company.
        print(f"skipping {company_name}: geocode result lacks a location or city/state/country")
        continue

    kg.query(set_company_location_cypher, params={
        "companyAddress": address_geocode['formatted_address'],
        "cusip6": cusip6,
        "latitude": location['lat'],
        "longitude": location['lng'],
        "city": cityOrState['long_name'],
        "state": stateOrCountry['long_name']
    })

In [None]:
# Which state has the most public companies listed?
# Groups Company -> Address links by address.state, highest count first
# (no LIMIT, so every state present is returned).
kg.query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numCompanies
    ORDER BY numCompanies DESC
""")

In [None]:
# Which city in California has the most companies listed?
# Same grouping as the per-state query, restricted to addresses whose state
# is 'California' and grouped by city.
kg.query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numCompanies
    ORDER BY numCompanies DESC
""")

In [None]:
# What location and address are stored for Fedex?
# Looks the name up in the "fullTextCompanyNames" full-text index and returns
# the location and companyAddress properties of each match.
kg.query("""
  CALL db.index.fulltext.queryNodes("fullTextCompanyNames", $companyName) 
    YIELD node, score
  WITH node as com
  RETURN com.location, com.companyAddress
""", params={"companyName": "Fedex"})

In [None]:
# Which investment firms are near Fedex?
# For every full-text company match, keeps the managers whose location is
# within $withinMeters of the company's location (100 * 1000 m here) and
# reports the distance truncated to whole kilometres. Results are not ordered.
kg.query("""
  CALL db.index.fulltext.queryNodes("fullTextCompanyNames", $companyName) 
         YIELD node, score
  WITH node as com
  MATCH (mgr:Manager)
    WHERE point.distance(mgr.location, com.location) < $withinMeters
  WITH mgr, 
    toInteger(point.distance(mgr.location, com.location) / 1000) as distanceKm
  RETURN mgr.managerName, mgr.managerAddress, 
        apoc.number.format(distanceKm) + "km" as distance

""", params={"companyName": "Fedex", "withinMeters": 100 * 1000})