# Ingest

## Setup

In [2]:
# Load Credentials
from neo4j import GraphDatabase
from dotenv import load_dotenv
import os

# Pull settings from cred.env; override=True makes values from the file win
# over variables that are already present in the process environment.
load_dotenv('cred.env', override=True)

# AWS key pair. os.getenv yields None for a variable that is not set.
ACCESS_KEY, SECRET_KEY = (
    os.getenv(variable) for variable in ('ACCESS_KEY', 'SECRET_KEY')
)

# Neo4j connection settings, read the same way.
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(variable)
    for variable in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
)

# Single driver object shared by every query cell in this notebook.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
)

## Create Indexes

In [3]:
# Unique Ids
# Every statement uses IF NOT EXISTS, so re-running this cell is a no-op once
# the constraints are in place.
driver.execute_query('CREATE CONSTRAINT unique_agreement_id IF NOT EXISTS FOR (n:Agreement) REQUIRE (n.contract_id) IS UNIQUE')
driver.execute_query('CREATE CONSTRAINT unique_country_id IF NOT EXISTS FOR (n:Country) REQUIRE (n.name) IS UNIQUE')
driver.execute_query('CREATE CONSTRAINT unique_organization_id IF NOT EXISTS FOR (n:Organization) REQUIRE (n.name) IS UNIQUE')
# ContractClause gets no uniqueness constraint on `type`: the load statement
# CREATEs one ContractClause per clause of each agreement, so the same type
# value appears on many nodes.
driver.execute_query('CREATE CONSTRAINT unique_excerpt_id IF NOT EXISTS FOR (n:Excerpt) REQUIRE (n.text) IS UNIQUE')
# ClauseType nodes are written and full-text indexed by their `name` property,
# so uniqueness has to be enforced on `name` for the constraint to have any effect.
# NOTE(review): if a constraint called unique_clause_type_id already exists on
# another property, IF NOT EXISTS presumably leaves it untouched - drop it first.
driver.execute_query('CREATE CONSTRAINT unique_clause_type_id IF NOT EXISTS FOR (n:ClauseType) REQUIRE (n.name) IS UNIQUE')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x7f45867bff50>, keys=[])

## Load Graph

In [4]:
# Cypher statement that loads one contract into the graph.
#
# Parameter:
#   $data - map with an `agreement` entry; the statement reads contract_id,
#           agreement_name, effective_date, expiration_date, agreement_type,
#           renewal_term, governing_law, parties and clauses from it.
#
# What it writes:
#   * Agreement, merged on contract_id; its properties are set only on creation.
#   * Country for the governing law, linked with GOVERNED_BY_LAW (state on the rel).
#   * Per party: Organization merged on name, IS_PARTY_TO (role on the rel),
#     Country of incorporation and INCORPORATED_IN (state on the rel).
#   * Per clause with exists = true: a new ContractClause, HAS_CLAUSE, its
#     Excerpt nodes and a shared ClauseType node.
#
# Excerpt nodes are merged on `text` alone and only then linked to the clause.
# Merging the whole (clause)-[:HAS_EXCERPT]->(excerpt) pattern would create a
# second Excerpt whenever the same text shows up under another clause, which
# the uniqueness constraint on Excerpt.text rejects.
#
# ContractClause nodes are CREATEd, so running the statement twice for the same
# contract adds a second set of clause nodes.
#
# NOTE(review): the Country MERGEs assume governing_law.country and
# party.incorporation_country are never null - confirm against the input JSON.
CREATE_GRAPH_STATEMENT = """
WITH $data AS data
WITH data.agreement as a

// todo proper global id for the agreement, perhaps from filename
MERGE (agreement:Agreement {contract_id: a.contract_id})
ON CREATE SET 
  agreement.name = a.agreement_name,
  agreement.effective_date = a.effective_date,
  agreement.expiration_date = a.expiration_date,
  agreement.agreement_type = a.agreement_type,
  agreement.renewal_term = a.renewal_term,
  agreement.most_favored_country = a.governing_law.most_favored_country
  //agreement.Notice_period_to_Terminate_Renewal = a.Notice_period_to_Terminate_Renewal
  

MERGE (gl_country:Country {name: a.governing_law.country})
MERGE (agreement)-[gbl:GOVERNED_BY_LAW]->(gl_country)
SET gbl.state = a.governing_law.state


FOREACH (party IN a.parties |
  // todo proper global id for the party
  MERGE (p:Organization {name: party.name})
  MERGE (p)-[ipt:IS_PARTY_TO]->(agreement)
  SET ipt.role = party.role
  MERGE (country_of_incorporation:Country {name: party.incorporation_country})
  MERGE (p)-[incorporated:INCORPORATED_IN]->(country_of_incorporation)
  SET incorporated.state = party.incorporation_state
)

WITH a, agreement, [clause IN a.clauses WHERE clause.exists = true] AS valid_clauses
FOREACH (clause IN valid_clauses |
  CREATE (cl:ContractClause {type: clause.clause_type})
  MERGE (agreement)-[clt:HAS_CLAUSE]->(cl)
  SET clt.type = clause.clause_type
  // ON CREATE SET c.excerpts = clause.excerpts
  FOREACH (excerpt IN clause.excerpts |
    // reuse an existing Excerpt with this text, then attach it to the clause
    MERGE (e:Excerpt {text: excerpt})
    MERGE (cl)-[:HAS_EXCERPT]->(e)
  )
  //link clauses to a Clause Type label
  MERGE (clType:ClauseType{name: clause.clause_type})
  MERGE (cl)-[:HAS_TYPE]->(clType)
)"""

In [7]:
import json

# Load every extracted contract JSON from ./output/ into Neo4j.
#
# os.listdir returns names in arbitrary order, so the list is sorted: each file
# then receives the same contract_id on every run. Agreement nodes are merged
# on contract_id, so an unstable order could attach one file's parties and
# clauses to an agreement that was created from a different file.
json_contracts = sorted(
    filename for filename in os.listdir("./output/") if filename.endswith('.json')
)

# contract_id is a 1-based position in the sorted file list.
for contract_id, json_contract in enumerate(json_contracts, start=1):
    with open("./output/" + json_contract, 'r') as file:
        json_data = json.load(file)
    # The Cypher statement reads the id from data.agreement.contract_id.
    json_data['agreement']['contract_id'] = contract_id
    # The file is already closed here; only the parsed data goes to the database.
    driver.execute_query(CREATE_GRAPH_STATEMENT, data=json_data)

## Create FullText Indexes

In [8]:
# Index DDL, executed in the order listed. The first five are full-text indexes
# on the text properties used for search; the last one is a regular
# (non-fulltext) index on Agreement.contract_id. All use IF NOT EXISTS.
index_statements = (
    "CREATE FULLTEXT INDEX excerptTextIndex IF NOT EXISTS FOR (e:Excerpt) ON EACH [e.text]",
    "CREATE FULLTEXT INDEX agreementTypeTextIndex IF NOT EXISTS FOR (a:Agreement) ON EACH [a.agreement_type]",
    "CREATE FULLTEXT INDEX clauseTypeNameTextIndex IF NOT EXISTS FOR (ct:ClauseType) ON EACH [ct.name]",
    "CREATE FULLTEXT INDEX contractClauseTypeTextIndex IF NOT EXISTS FOR (c:ContractClause) ON EACH [c.type]",
    "CREATE FULLTEXT INDEX organizationNameTextIndex IF NOT EXISTS FOR (o:Organization) ON EACH [o.name]",
    "CREATE INDEX agreementContractId IF NOT EXISTS FOR (a:Agreement) ON (a.contract_id) ",
)

for index_statement in index_statements:
    last_index_result = driver.execute_query(index_statement)

# Display the result of the final statement as the cell output.
last_index_result

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x7f45c81c0750>, keys=[])

## Generate Text Embeddings & Create Vector Index

In [14]:
# Cypher that encodes the text of every Excerpt (where text is not null) through
# genai.vector.encode with the 'Bedrock' provider and stores the resulting
# vector in the node's "embedding" property.
EMBED_EXCERPTS_QUERY = """
MATCH (e:Excerpt) 
WHERE e.text IS NOT NULL
WITH e, genai.vector.encode( e.text, 'Bedrock',
    {accessKeyId:$accessKeyId, secretAccessKey:$secretAccessKey}
) AS vector
CALL db.create.setNodeVectorProperty(e, "embedding", vector)
"""

# The AWS key pair is passed as query parameters rather than being written
# into the query text.
bedrock_credentials = {
    "accessKeyId": ACCESS_KEY,
    "secretAccessKey": SECRET_KEY,
}

driver.execute_query(EMBED_EXCERPTS_QUERY, parameters_=bedrock_credentials)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x7f458573cf10>, keys=[])

In [17]:
# Create vector index
# NOTE(review): 1536 is presumably the output size of the default Bedrock
# embedding model - confirm if another model is configured.
embedding_dimension = 1536 #default

# The index must cover the property the embeddings are stored in. They are
# written to `embedding` by db.create.setNodeVectorProperty, so the index is
# declared on n.embedding; an index on any other property would hold no vectors.
# IF NOT EXISTS keeps an index that already has this name, so an
# excerpt_embeddings index built on a different property has to be dropped
# before this statement can take effect.
driver.execute_query('''
CREATE VECTOR INDEX excerpt_embeddings IF NOT EXISTS FOR (n:Excerpt) ON (n.embedding)
OPTIONS {indexConfig: {
 `vector.dimensions`: toInteger($dimension),
 `vector.similarity_function`: 'cosine'
}}''', parameters_={'dimension': embedding_dimension})

#wait for index to come online
driver.execute_query('CALL db.awaitIndex("excerpt_embeddings", 300)')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x7f4585898510>, keys=[])