# Construct With Cypher

Can the entire knowledge graph construction happen with Cypher? 

## Prepare environment

In [None]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI

# Load settings from the local .env file. override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
# An unset or empty NEO4J_DATABASE falls back to 'neo4j'.
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'

print(f"Connecting to Neo4j at {NEO4J_URI} as {NEO4J_USERNAME}")

# Base location of the data files. Other cells append relative paths to it
# without adding a separator, so it is expected to end with "/".
DATA_URL = os.getenv('DATA_URL')

if DATA_URL is None:
    print("No DATA_URL environment variable set. Please set this to the URL of the data to be loaded.")
    # Raise SystemExit directly: the `exit` helper is intended for interactive
    # sessions and is not a reliable way to stop a program.
    raise SystemExit(1)

print(f"Loading data from {DATA_URL}")


# API key that is later passed as a query parameter to the embedding step.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if OPENAI_API_KEY is None:
    print("No OPENAI_API_KEY environment variable set. Please set this to the OpenAI API key.")
    raise SystemExit(1)

# Create a knowledge graph using Langchain's Neo4j integration.
# This will be used for direct querying of the knowledge graph.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

def checkPlugin(pluginName):
    """Stop the run unless the database exposes anything from a plugin.

    Queries SHOW FUNCTIONS and SHOW PROCEDURES through the module-level
    ``kg`` connection, keeping only names that start with ``pluginName``.
    When both result lists are empty, a hint is printed and the process
    exits with status 1.

    Args:
        pluginName: Name prefix to look for, e.g. ``'apoc'``.

    Raises:
        SystemExit: With code 1 when no function or procedure matches.
    """
    pluginFunctions = kg.query('''
        SHOW FUNCTIONS yield name
        WHERE name STARTS WITH $pluginName''',
    params={'pluginName': pluginName})

    pluginProcedures = kg.query('''
        SHOW PROCEDURES yield name
        WHERE name STARTS WITH $pluginName''',
    params={'pluginName': pluginName})

    # A plugin counts as present if it contributes at least one function
    # or one procedure.
    if not pluginFunctions and not pluginProcedures:
        print(f"No {pluginName} procedures or functions found. Has the {pluginName} plugin been installed and enabled?")
        # Raise SystemExit directly: the `exit` helper is intended for
        # interactive sessions and is not a reliable way to stop a program.
        raise SystemExit(1)

# Check each plugin prefix in turn; the first missing one stops the run.
for requiredPlugin in ('gds', 'genai', 'apoc'):
    checkPlugin(requiredPlugin)

# Reset the graph, if needed

In [None]:
# Drop all constraints and indexes
# Constraint names are backtick-quoted, the same way index names are below,
# so a name that is not a valid bare identifier can still be dropped.
for constraint in kg.query('SHOW CONSTRAINTS'):
    kg.query(f"DROP CONSTRAINT `{constraint['name']}`")

# SHOW INDEXES is queried only after the constraints above have been dropped.
for index in kg.query('SHOW INDEXES'):
    print(f"Removing index {index['name']}:")
    kg.query(f"""
        DROP INDEX `{index['name']}`
    """)

# Remove all nodes and relationships
kg.query("""
        MATCH (all)
        DETACH DELETE all
""")

# Import Form 10-K files


In [None]:
# Make formId unique across Form nodes; the IF NOT EXISTS clause keeps the
# statement safe to run again.
kg.query(
    'CREATE CONSTRAINT unique_form IF NOT EXISTS '
    'FOR (n:Form) REQUIRE n.formId IS UNIQUE'
)


In [None]:
# JSON documents to import, one per Form 10-K filing.
form10KFiles = [
    '0000106040-23-000024.json',
    '0000320187-23-000039.json',
    '0000950170-23-027948.json',
    '0000950170-23-033201.json',
    '0001096906-23-001489.json',
    '0001137789-23-000049.json',
    '0001327567-23-000024.json',
    '0001558370-23-011516.json',
    '0001564708-23-000368.json',
    '0001650372-23-000040.json'
]

for form10KFile in form10KFiles:
    form10kUrl = f"{DATA_URL}form10k/{form10KFile}"
    print(f"Loading {form10kUrl}")
    # Raw string: the regex backslashes must reach Cypher unchanged, and
    # `\/` / `\.` are not valid escape sequences in a regular Python string.
    # The regex captures the file name without ".json" and uses it as formId.
    # ON CREATE first replaces the node's properties with the loaded JSON
    # map, then sets formId again so the key used by MERGE is kept.
    kg.query(r"""
      WITH apoc.text.regexGroups($URL,'([^\/]*)\.json')[0][1] AS formId
      CALL apoc.load.json($URL) YIELD value
      MERGE (f:Form {formId: formId}) 
        ON CREATE SET f = value, f.formId = formId
    """,
    params={'URL': form10kUrl})


In [None]:
# Make chunkId unique across Chunk nodes (no-op if the constraint exists).
uniqueChunkCypher = """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
"""
kg.query(uniqueChunkCypher)


In [None]:
# move all text into separate chunk nodes. these are "mega chunks" that are not yet split
# For every Form node and every listed item, one Chunk node is merged with the
# id "<formId>-<item>-chunk0000". On creation it receives chunkSeqId "0000" and
# the text stored in the form's property of the same name; the form is then
# linked to it by a SECTION relationship whose f10kItem property is the item name.
for item in ['item1','item1a','item7','item7a']:
  # note this is using a string template to insert the item name into the query
  # icky but convenient here because we can't parameterize a property key
  # Doubled braces ({{ }}) are f-string escapes that produce literal Cypher braces.
  kg.query(f"""
  MATCH (f:Form)
  WITH f, "0000" as chunkSeqId
  WITH f, chunkSeqId, f.formId + "-{item}-chunk" + chunkSeqId as chunkId
  MERGE (section:Chunk {{chunkId: chunkId}})
  ON CREATE SET 
      section.chunkSeqId = chunkSeqId,
      section.text = f.{item}
  MERGE (f)-[:SECTION {{f10kItem: "{item}"}}]->(section)
  """)



In [None]:
# split the chunks into a linked list of smaller chunks
#
# For each (Form)-[:SECTION]->(first Chunk) pair the query:
#   1. splits the chunk text on whitespace and regroups the tokens into
#      pieces of up to 1000 tokens, each joined with single spaces;
#   2. creates a new Chunk node for every piece after the first, with
#      chunkId "<formId>-<f10kItem>-chunk<idx formatted as #0000>" and the
#      piece as its text (only chunkId and text are set on these nodes);
#   3. shrinks the first chunk's text to the first piece;
#   4. attaches each new node to the form with PART_OF, chains the new nodes
#      with NEXT, and links the first chunk to the head of that chain.
#
# Raw string: "\s+" must reach Cypher unchanged, and `\s` is not a valid
# escape sequence in a regular Python string.
kg.query(r"""
MATCH (f:Form)-[s:SECTION]->(first:Chunk)
WITH f, s, first
WITH f, s, first, apoc.text.split(first.text, "\s+") as tokens
CALL apoc.coll.partition(tokens, 1000) YIELD value
WITH f, s, first, apoc.text.join(value, " ") as chunk
WITH f, s, first, collect(chunk) as chunks
CALL {
    WITH f, s, first, chunks
    WITH f, s, first, chunks, [idx in range(1, size(chunks) -1) | 
         { chunkId: f.formId + "-" + s.f10kItem + "-chunk" + apoc.number.format(idx, "#0000"), text: chunks[idx] }] as chunkProps 
    CALL apoc.create.nodes(["Chunk"], chunkProps) yield node
    SET first.text = head(chunks)
    MERGE (node)-[:PART_OF]->(f)
    WITH first, collect(node) as chunkNodes
    CALL apoc.nodes.link(chunkNodes, 'NEXT')
    WITH first, head(chunkNodes) as nextNode
    MERGE (first)-[:NEXT]->(nextNode)
}
RETURN f.formId
 """)

In [None]:
# Vector index over Chunk.textEmbedding: 1536 dimensions, cosine similarity.
vectorIndexCypher = """
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
"""
kg.query(vectorIndexCypher)

In [None]:
# Embed the text of every Chunk that has no textEmbedding yet.
# The work is committed in batches of 10 rows, so an interrupted run can be
# restarted and only the chunks still missing an embedding are processed.
embeddingCypher = """
  MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
  CALL {
    WITH chunk
    WITH chunk, genai.vector.encode(chunk.text, "OpenAI", {token: $openAiApiKey}) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)    
  } IN TRANSACTIONS OF 10 ROWS
  """
embeddingParams = {"openAiApiKey": OPENAI_API_KEY}
kg.query(embeddingCypher, params=embeddingParams)

# Import Form 13 CSV

In [None]:
# Schema statements for the Form 13 data, executed in the order listed.
form13SchemaStatements = (
    # Uniqueness constraint on the cusip6 property of Company nodes
    """
    CREATE CONSTRAINT unique_company 
        IF NOT EXISTS FOR (com:Company) 
        REQUIRE com.cusip6 IS UNIQUE
""",
    # Uniqueness constraint on the managerCik property of Manager nodes
    """
CREATE CONSTRAINT unique_manager 
  IF NOT EXISTS
  FOR (n:Manager) 
  REQUIRE n.managerCik IS UNIQUE
""",
    # Full-text index over Manager names
    """
CREATE FULLTEXT INDEX fullTextManagerNames
  IF NOT EXISTS
  FOR (mgr:Manager) 
  ON EACH [mgr.managerName]
""",
)

for schemaStatement in form13SchemaStatements:
    kg.query(schemaStatement)


In [None]:
# Load the Form 13 CSV row by row. Each row merges a Company (keyed by
# cusip6), a Manager (keyed by managerCik) and an OWNS_STOCK_IN relationship
# keyed by reportCalendarOrQuarter; the other properties are written only
# when the node or relationship is first created.
form13Url = DATA_URL + "form13.csv"
form13Cypher = """
LOAD CSV WITH HEADERS FROM $URL as row
MERGE (com:Company {cusip6: row.cusip6})
  ON CREATE SET com.companyName = row.companyName,
                com.cusip = row.cusip
MERGE (mgr:Manager {managerCik: row.managerCik})
    ON CREATE SET mgr.managerName = row.managerName,
            mgr.managerAddress = row.managerAddress
MERGE (mgr)-[owns:OWNS_STOCK_IN { 
    reportCalendarOrQuarter: row.reportCalendarOrQuarter }]->(com)
    ON CREATE
      SET owns.value  = toFloat(row.value), 
          owns.shares = toInteger(row.shares)
"""
kg.query(form13Cypher, params={'URL': form13Url})

# Connect the two datasets

In [None]:
# Pair every Company with the Form that has the same cusip6, copy the
# form's `names` onto the company, and record a FILED relationship
# from the company to the form.
connectCypher = """
  MATCH (com:Company), (form:Form)
    WHERE com.cusip6 = form.cusip6
  SET com.names = form.names
  MERGE (com)-[:FILED]->(form)
"""
kg.query(connectCypher)