# Setup

In [None]:
%run 'shared.ipynb'

In [None]:
import googlemaps

# Google Maps client used by the geocoding calls in the cells below.
# NOTE(review): GOOGLE_MAPS_API_KEY is not defined in this cell - presumably it
# is provided by the shared.ipynb notebook run above; confirm.
gmaps = googlemaps.Client(key=GOOGLE_MAPS_API_KEY)

# Geocoding an address
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

# Bare expression so the notebook displays the raw geocoding response.
geocode_result

In [None]:
def filter_city(address_component):
    """Tell whether an address component describes a city-like place.

    The component's ``types`` list must be exactly equal (same entries, same
    order) to one of the city-like type lists below.

    Args:
        address_component: mapping with a ``'types'`` entry, as found in the
            ``address_components`` of a geocode result.

    Returns:
        True when ``types`` equals one of the city-like lists, else False.

    Raises:
        KeyError: if the component has no ``'types'`` entry.
    """
    city_type_lists = (
        ['locality', 'political'],
        ['administrative_area_level_3', 'political'],
        ['postal_town'],
        ['political', 'sublocality', 'sublocality_level_1'],
        ['neighborhood', 'political'],
    )
    return address_component['types'] in city_type_lists

def get_city(geocode):
    found_city = None
    administrative_area_level_3 = None
    locality = None
    postal_town = None
    sublocality_level_1 = None
    neighborhood = None

    for address_component in geocode['address_components']:
        match address_component['types']:
            case ['administrative_area_level_3', 'political']:
                administrative_area_level_3 = address_component
                continue
            case ['locality', 'political']:
                locality = address_component
                continue
            case ['postal_town']:
                postal_town = address_component
                continue
            case ['political', 'sublocality', 'sublocality_level_1']:
                sublocality_level_1 = address_component
                continue
            case ['neighborhood', 'political']:
                neighborhood = address_component
                continue
    possible_city = [locality, administrative_area_level_3, sublocality_level_1, postal_town, neighborhood]
    found_city = list(filter(None, possible_city))
    return found_city[0] if found_city else None

def filter_state(address_component):
    """Return True when the component's ``types`` list is exactly
    ``['administrative_area_level_1', 'political']``, otherwise False."""
    state_types = ['administrative_area_level_1', 'political']
    return bool(address_component['types'] == state_types)

def get_state(geocode):
    """Return the first entry of ``geocode['address_components']`` accepted by
    ``filter_state``, or None when no entry is accepted."""
    matches = [
        component
        for component in geocode['address_components']
        if filter_state(component)
    ]
    return matches[0] if matches else None
    
def filter_postalcode(address_component):
    """Return True when the component's ``types`` list is exactly
    ``['postal_code']``, otherwise False."""
    postal_types = ['postal_code']
    return bool(address_component['types'] == postal_types)
    
def get_postalcode(geocode):
    """Return the first entry of ``geocode['address_components']`` accepted by
    ``filter_postalcode``, or None when no entry is accepted."""
    matches = [
        component
        for component in geocode['address_components']
        if filter_postalcode(component)
    ]
    return matches[0] if matches else None
    
def filter_country(address_component):
    """Return True when the component's ``types`` list is exactly
    ``['country', 'political']``, otherwise False."""
    country_types = ['country', 'political']
    return bool(address_component['types'] == country_types)

def get_country(geocode):
    """Return the first entry of ``geocode['address_components']`` accepted by
    ``filter_country``, or None when no entry is accepted."""
    matches = [
        component
        for component in geocode['address_components']
        if filter_country(component)
    ]
    return matches[0] if matches else None

def get_location(geocode):
    """Return the ``location`` entry nested under ``geometry`` in a geocode.

    Raises KeyError when either ``'geometry'`` or ``'location'`` is missing.
    """
    geometry = geocode['geometry']
    return geometry['location']


def long_name(address_component):
    """Return the component's ``long_name``, or None when unavailable.

    None is returned both for a falsy component (e.g. None) and for a
    component whose ``long_name`` value is itself falsy (e.g. an empty string).
    """
    if not address_component:
        return None
    return address_component['long_name'] or None

def print_address(name, address, city, state, postal, country):
    """Print a two-line summary of where ``name`` is located.

    The first line gives the name and full address; the second, tab-indented
    line lists the individual city / state / postal / country components.
    """
    headline = f"{name} is located at {address}"
    print(headline)
    breakdown = f"\tcomponents: {city}, {state} {postal}, {country}"
    print(breakdown)
    
# example_geocode = geocode_result[0]
# print("city", get_city(example_geocode))
# print("state", get_state(example_geocode))
# print("zip", get_postalcode(example_geocode))
# print("country", get_country(example_geocode))
# print("location", get_location(example_geocode))



In [None]:
# Geocode the same sample address from inside the database by calling the
# apoc.spatial.geocode procedure, keeping only the first returned record.
# NOTE(review): gdb is not defined in this cell - presumably it is the database
# driver set up by shared.ipynb; confirm.
cypher_geoloc = gdb.execute_query("CALL apoc.spatial.geocode('1600 Amphitheatre Parkway, Mountain View, CA');").records[0]

# Show the coordinates read from the record's latitude / longitude fields.
print(f"latitude: {cypher_geoloc['latitude']} longitude: {cypher_geoloc['longitude']}")

# Dump the whole record for inspection.
print(f"address_components: {cypher_geoloc}")

In [None]:

# Create a point index named address_locations on Address.location.
# IF NOT EXISTS keeps the statement from failing when the cell is re-run.
gdb.execute_query("""
  CREATE POINT INDEX address_locations IF NOT EXISTS
  FOR (n:Address) ON (n.location)
""")

# List the database indexes so the new one can be checked in the cell output.
gdb.execute_query("SHOW INDEXES").records

In [None]:
# Fetch every Manager node as a map holding its cik, name, address and location.
get_managers_cypher = """
  MATCH (mgr:Manager)
  RETURN mgr { .cik, .name, .address, .location}
"""

manager_rows = gdb.execute_query(get_managers_cypher).records

# Unwrap the 'mgr' column of each row; later cells attach a 'geocode' entry to
# these manager maps.
managers = [row['mgr'] for row in manager_rows]

In [None]:
# Peek at the first manager to check the shape of the fetched data.
managers[0]

In [None]:
# Attach a Google Maps geocode to every manager that does not have one yet.
# A manager whose address yields no geocoding results is left without a
# 'geocode' entry.

for manager in managers:
  if 'geocode' in manager:
    continue  # already geocoded
  geocode_for_address = gmaps.geocode(manager['address'])
  if geocode_for_address:
    manager['geocode'] = geocode_for_address[0]  # accept first result


In [None]:
# Create a composite index on (city, state) for Address nodes. Later cells
# MERGE Address nodes on exactly this city/state pair.
# IF NOT EXISTS keeps the statement from failing when the cell is re-run.

gdb.execute_query("""
CREATE INDEX composite_address_index IF NOT EXISTS
FOR (n:Address) 
ON (n.city, n.state)
""")

In [None]:
# Cypher that stores a manager's coordinates on the Manager node and links the
# manager to a city-level Address node (merged on city + state).
# The Address location is written on creation as well as on match; if it were
# only written ON MATCH, an Address with a single manager would never get a
# location and would be invisible to distance queries on Address.location.
set_manager_location_cypher = """
  MATCH (mgr:Manager {cik: $managerCik})
  SET mgr.location = point({latitude: $latitude, longitude: $longitude})
  MERGE (addr:Address {city: $city, state: $state})
   ON CREATE SET addr.country = $country,
                 addr.location = point({latitude: $latitude, longitude: $longitude})
   ON MATCH SET addr.location = point({latitude: $latitude, longitude: $longitude})
  MERGE (mgr)-[:LOCATED_AT]->(addr)
"""

for manager in managers:
  if 'geocode' not in manager:
    continue  # no geocode was attached to this manager; nothing to store

  location = get_location(manager['geocode'])
  city = get_city(manager['geocode'])
  state = get_state(manager['geocode'])
  country = get_country(manager['geocode'])
  postal = get_postalcode(manager['geocode'])

  # Fall back one administrative level when a component is missing, so the
  # Address merge key (city, state) can still be filled.
  cityOrState = city if city else state
  stateOrCountry = state if state else country

  print_address(manager['name'], manager['address'],
      long_name(cityOrState),
      long_name(stateOrCountry),
      long_name(postal),
      long_name(country)
  )

  if location and cityOrState and stateOrCountry:
    gdb.execute_query(set_manager_location_cypher,
      managerCik=manager['cik'],
      latitude=location['lat'],
      longitude=location['lng'],
      city=long_name(cityOrState),
      state=long_name(stateOrCountry),
      country=long_name(country)
    )
  else:
    print (f"no location for {manager['name']} {manager['cik']} within {manager['geocode']}")


In [None]:
# Where is Blackrock located?
# Looks up nodes through the "fullTextManagerNames" full-text index and returns
# the LOCATED_AT path from each hit to its Address.
# NOTE(review): the full-text index is not created in the visible cells -
# presumably it already exists in the database; confirm.
gdb.execute_query("""
  CALL db.index.fulltext.queryNodes("fullTextManagerNames", "Blackrock") YIELD node, score
  MATCH p=(node)-[:LOCATED_AT]->(address:Address)
  RETURN p
""").records

In [None]:
# How many investment firms are at the same address as Blackrock?
# Counts the other Manager nodes that have a LOCATED_AT relationship to the
# same Address node as a full-text hit for "Blackrock".
gdb.execute_query("""
  CALL db.index.fulltext.queryNodes("fullTextManagerNames", "Blackrock") YIELD node, score
  MATCH p=(node)-[:LOCATED_AT]->(address:Address)<-[:LOCATED_AT]-(other:Manager)
  RETURN count(other) as numManagers
""").records

In [None]:
# Which state has the most investment firms?
# Groups Manager -> Address links by the address state and keeps the ten
# largest groups. Unlike the neighbouring cells, `.records` is not taken here,
# so the notebook shows the whole object returned by execute_query.
gdb.execute_query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")

In [None]:
# What are the cities in California with the most investment firms?
# Same grouping as the per-state query, restricted to addresses whose state is
# 'California' and grouped by city; keeps the ten largest groups.
gdb.execute_query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")

In [None]:
# What are the top investment firms in San Francisco?
# Ranks the managers located in the given city/state by the summed value of
# their OWNS_STOCK_IN relationships and keeps the ten largest.
# The first column holds the manager's name, so it is labelled `manager`
# (it was previously mislabelled `city`).
gdb.execute_query("""
  MATCH p=(mgr:Manager)-[:LOCATED_AT]->(address:Address),
         (mgr)-[owns:OWNS_STOCK_IN]->(:Company)
         WHERE address.city = $city
            AND address.state = $state
  RETURN mgr.name as manager, sum(owns.value) as totalInvestmentValue
    ORDER BY totalInvestmentValue DESC
    LIMIT 10
""",
  city="San Francisco",
  state="California"
).records

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain

# Few-shot prompt for turning a natural-language question into Cypher.
# {schema} and {question} are the two placeholders declared as input variables
# of the PromptTemplate below; the examples demonstrate the LOCATED_AT and
# OWNS_STOCK_IN patterns built up earlier in this notebook.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher statements for particular questions:
# What investment firms are in San Francisco?
MATCH (mgr:Manager)-[:LOCATED_AT]->(a:Address)
    WHERE a.city = 'San Francisco'
RETURN mgr.name

# What firms in San Francisco have the most investments?
MATCH (mgr:Manager)-[:LOCATED_AT]->(a:Address),
        (mgr:Manager)-[owns:OWNS_STOCK_IN]->(com:Company)
    WHERE a.city = 'San Francisco'
WITH mgr, sum(owns.value) as totalInvestmentValue
RETURN mgr.name + " owns $" + apoc.number.format(totalInvestmentValue) + " worth of shares."


The question is:
{question}"""

# Wrap the template so the chain can fill in the schema and the question.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

# Graph connection handed to the question-answering chain.
# NOTE(review): Neo4jGraph, ChatOpenAI, prettifyChain and the NEO4J_* settings
# are not imported or defined in this cell - presumably they come from
# shared.ipynb; confirm.
kg=Neo4jGraph(
        url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
    )
# Chain that generates Cypher with the custom prompt above and runs it against
# `kg`; verbose=True is passed so the chain reports its intermediate steps.
cypherChain = prettifyChain(GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0),
    graph=kg,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
))


In [None]:
# Ask the exact question used as the first few-shot example in the prompt.
cypherChain("What investment firms are in San Francisco?")

In [None]:
# Same question for a city that does not appear in the prompt's examples.
cypherChain("What investment firms are in New York?")

In [None]:
# Ask the exact question used as the second few-shot example in the prompt.
cypherChain("What firms in San Francisco have the most investments?")

In [None]:
# Fetch every Company node as a map holding its cusip6 and name.
company_rows = gdb.execute_query("""
  MATCH (com:Company)
  RETURN com { .cusip6, .name } as company
""").records

# Unwrap the 'company' column of each row.
companies = [row['company'] for row in company_rows]

# Show the first company to check the shape of the fetched data.
print(companies[0])

In [None]:
# Create a langchain vector store from the existing Neo4j knowledge graph.
# NOTE(review): Neo4jVector, RetrievalQAWithSourcesChain, embeddings_api,
# chat_api and the VECTOR_* / NEO4J_* settings are not defined in this cell -
# presumably they come from shared.ipynb; confirm.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=embeddings_api,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)

# Create a retriever from the vector store
retriever = neo4j_vector_store.as_retriever()

# Create a chatbot Question & Answer chain from the retriever.
# The result of calling this chain is later read through its 'answer' entry.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    chat_api, chain_type="stuff", retriever=retriever
)


In [None]:
# Try the retrieval chain on a single company before running it for all of them.
chain("Where is Fedex headquartered?")

In [None]:
# Cypher that stores a company's formatted address and coordinates on the
# Company node and links it to a city-level Address node (merged on
# city + state), whose country and location are (re)written on every run.
set_company_location_cypher = """
  MATCH (com:Company {cusip6: $cusip6})
  SET com.address = $companyAddress
  SET com.location = point({latitude: $latitude, longitude: $longitude})
  MERGE (addr:Address {city: $city, state: $state} )
  SET addr.country = $country
  SET addr.location = point({latitude: $latitude, longitude: $longitude})
  MERGE (com)-[:LOCATED_AT]->(addr)
"""

for company in companies:
    # Ask the QA chain where the company is headquartered, then geocode the
    # free-text answer.
    result = chain(f"Where is {company['name']} headquartered?")
    address_statement = result['answer']
    address_geocodes = gmaps.geocode(address_statement)
    if not address_geocodes:
        # Company maps only carry 'cusip6' and 'name'; there is no
        # 'companyName' key to read here.
        print(f"no geocode found for {company['name']} at {address_statement}")
        continue

    address_geocode = address_geocodes[0]  # accept first result

    cusip6 = company['cusip6']

    location = get_location(address_geocode)
    city = get_city(address_geocode)
    state = get_state(address_geocode)
    postal = get_postalcode(address_geocode)
    country = get_country(address_geocode)

    # Fall back one administrative level when a component is missing, so the
    # Address merge key (city, state) can still be filled.
    cityOrState = city if city else state
    stateOrCountry = state if state else country

    print_address(company['name'], address_geocode['formatted_address'],
        long_name(cityOrState),
        long_name(stateOrCountry),
        long_name(postal),
        long_name(country)
    )

    if location and cityOrState and stateOrCountry:
        kg.query(set_company_location_cypher, params={
            "companyAddress": address_geocode['formatted_address'],
            "cusip6": cusip6,
            "latitude": location['lat'],
            "longitude": location['lng'],
            "city": long_name(cityOrState),
            "state": long_name(stateOrCountry),
            "country": long_name(country)
        })
    else:
        # Report skipped companies instead of dropping them silently, the same
        # way the manager loop does.
        print(f"no location for {company['name']} {cusip6} within {address_geocode}")

In [None]:
# Which state has the most public companies listed?
# Groups Company -> Address links by the address state, largest group first.
gdb.execute_query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numCompanies
    ORDER BY numCompanies DESC
""").records

In [None]:
# Which city in California has the most companies listed?
# Groups Company -> Address links by city for addresses whose state is
# 'California', largest group first.
gdb.execute_query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numCompanies
    ORDER BY numCompanies DESC
""").records

In [None]:
# Look up a company through the "fullTextCompanyNames" full-text index and show
# the location and address stored on the first hit.
# NOTE(review): indexing `.records[0]` will fail if the search returns no rows.
gdb.execute_query("""
  CALL db.index.fulltext.queryNodes("fullTextCompanyNames", $companyName) 
    YIELD node, score
  WITH node as com
  RETURN com.location, com.address
""", companyName="Fedex").records[0]

In [None]:
# Which investment firms are near Fedex?
# Compares each Manager's own location with the company's location and keeps
# managers closer than $withinMeters (here 100 * 1000). The distance is divided
# by 1000 and truncated to an integer before being formatted as "<n>km".
gdb.execute_query("""
  CALL db.index.fulltext.queryNodes("fullTextCompanyNames", $companyName) 
         YIELD node, score
  WITH node as com
  MATCH (mgr:Manager)
    WHERE point.distance(mgr.location, com.location) < $withinMeters
  WITH mgr, 
    toInteger(point.distance(mgr.location, com.location) / 1000) as distanceKm
  RETURN mgr.name, mgr.address, 
        apoc.number.format(distanceKm) + "km" as distance

""", companyName="Fedex", withinMeters= 100 * 1000).records

In [None]:
# Investment firms within an hour drive of Netapp?
# Compares the company's Address location with each manager's Address location
# (city-level nodes) and keeps pairs closer than $withinMeters (here 50 * 1000).
# Managers are counted per manager Address and distance. The count is done in
# its own WITH so the final, non-aggregating RETURN can sort on the numeric
# distanceKm; sorting on the formatted "<n>km" string would order the rows
# lexicographically (e.g. "10km" before "2km").
gdb.execute_query("""
  CALL db.index.fulltext.queryNodes("fullTextCompanyNames", $companyName)
         YIELD node, score
  WITH node as com
  MATCH (com)-[:LOCATED_AT]->(comAddr:Address),
         (mgr:Manager)-[:LOCATED_AT]->(mgrAddr:Address)
    WHERE point.distance(comAddr.location, mgrAddr.location) < $withinMeters
  WITH mgr, mgrAddr,
    toInteger(point.distance(comAddr.location, mgrAddr.location) / 1000) as distanceKm
  WITH mgrAddr, distanceKm, count(mgr.name) as numManagers
  RETURN numManagers, mgrAddr.city + ", " + mgrAddr.state as cityState,
        apoc.number.format(distanceKm) + "km" as distance
    ORDER BY distanceKm ASC LIMIT 10

""", companyName="Netapp", withinMeters= 50 * 1000).records

In [None]:
# List the database indexes, this time through the langchain graph wrapper `kg`.
kg.query("SHOW INDEXES")