# SEC Edgar data prep



## Imports

### Script

As before, you'll start by importing some packages to set up the notebook.

In [12]:
from dotenv import load_dotenv
import os
import json
import textwrap

from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter


## Set up Neo4j

In [6]:
# Read connection settings and API keys from the project-level .env file.
# override=True makes values in the file take precedence over variables
# that are already present in the process environment.
load_dotenv('../.env', override=True)

NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Names shared by the vector index and the nodes it covers.
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

# Directory holding the Form 10-K JSON files to import.
IMPORT_DATA_DIRECTORY = '../data/sample/form10k/'

In [21]:
from neo4j import GraphDatabase


# Low-level Neo4j driver used by the admin helpers defined in this cell.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
)

def isNeo4jReady() -> bool:
  """Report whether the notebook-level `driver` can reach Neo4j.

  Returns:
    True when `driver.verify_connectivity()` succeeds, False when it raises.
  """
  try:
    driver.verify_connectivity()
    return True
  except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still interrupt the notebook.
    return False

def showIndexes():
  """Print one line per index reported by `SHOW INDEXES`.

  Each line shows the index type, name, labels/types and properties.
  This is a best-effort helper: on any error it prints a FAILED message
  (including the underlying error) instead of raising.
  """
  try:
    records, _, _ = driver.execute_query("SHOW INDEXES")
    for record in records:
      print(f'{record["type"]} INDEX: {record["name"]} {record["labelsOrTypes"]} {record["properties"]}')
  except Exception as error:
    # Surface the cause; a bare "FAILED" gives no hint of what went wrong.
    print(f"FAILED: {error}")

def dropGraph():
    """Delete every node (and its relationships) in batches of 10000 rows.

    `CALL { ... } IN TRANSACTIONS` is only allowed in an implicit
    (auto-commit) transaction. `driver.execute_query` wraps the query in a
    managed transaction, so the query is sent through `session.run` instead.
    """
    with driver.session() as session:
        result = session.run("""
          MATCH (n)
          CALL { WITH n
            DETACH DELETE n
          } IN TRANSACTIONS OF 10000 ROWS
          """)
        # Exhaust the result so the delete has completed before returning.
        result.consume()
    
def dropIndexesAndConstraints():
    """Drop every constraint, then every remaining index, in the database.

    `driver.execute_query` returns a `(records, summary, keys)` result, so the
    records must be unpacked before iterating; looping over the result itself
    would yield the records list, the summary and the keys instead of rows.
    """
    constraints, _, _ = driver.execute_query('SHOW CONSTRAINTS')
    for constraint in constraints:
        # Backticks keep names with special characters valid, matching the
        # quoting used for indexes below.
        driver.execute_query(f"DROP CONSTRAINT `{constraint['name']}` IF EXISTS")

    # Listed after the constraints are gone, so indexes that only backed a
    # constraint are no longer reported here.
    indexes, _, _ = driver.execute_query('SHOW INDEXES')
    for index in indexes:
        driver.execute_query(f"DROP INDEX `{index['name']}` IF EXISTS")
    
def resetDatabase():
    """Wipe the database: delete all graph data, then all indexes and constraints."""
    dropGraph()
    dropIndexesAndConstraints()

# Use the connectivity check instead of discarding its result: only list the
# indexes when the database is actually reachable.
if isNeo4jReady():
  showIndexes()
else:
  print("Neo4j is not reachable - check NEO4J_URI and the credentials in ../.env")

VECTOR INDEX: form_10k_chunks ['Chunk'] ['textEmbedding']
LOOKUP INDEX: index_343aff4e None None
LOOKUP INDEX: index_f7700477 None None
RANGE INDEX: unique_chunk ['Chunk'] ['chunkId']


In [5]:

# LangChain's Neo4j wrapper, connected with the settings loaded from .env.
# The rest of the notebook runs its Cypher queries through this object.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [8]:
# Splitter used to break each Form 10-K section into chunks:
# a chunk size of 2000 with an overlap of 200, both measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [19]:
# Create the vector index (VECTOR_INDEX_NAME) on the VECTOR_EMBEDDING_PROPERTY
# property of nodes labeled VECTOR_NODE_LABEL, reusing the notebook constants
# so the index definition cannot drift from them.
# Index, label and property names cannot be passed as Cypher parameters, so
# they are interpolated into the query text (Cypher braces are doubled for the
# f-string).
# NOTE(review): 1536 presumably matches the embedding model's output size - confirm.
EMBEDDING_DIMENSIONS = 1536
kg.query(f"""
    CREATE VECTOR INDEX `{VECTOR_INDEX_NAME}` IF NOT EXISTS
    FOR (n:{VECTOR_NODE_LABEL}) ON (n.{VECTOR_EMBEDDING_PROPERTY})
    OPTIONS {{ indexConfig: {{
        `vector.dimensions`: {EMBEDDING_DIMENSIONS},
        `vector.similarity_function`: 'cosine'
    }} }}
""")

# Create a uniqueness constraint on the chunkId property of Chunk nodes
kg.query('CREATE CONSTRAINT unique_chunk IF NOT EXISTS FOR (n:Chunk) REQUIRE n.chunkId IS UNIQUE')

showIndexes()


INDEX: form_10k_chunks VECTOR ['Chunk'] ['textEmbedding']
INDEX: index_343aff4e LOOKUP None None
INDEX: index_f7700477 LOOKUP None None
INDEX: unique_chunk RANGE ['Chunk'] ['chunkId']


In [13]:
def split_form10k_data_from_file(file, items=('item1', 'item1a', 'item7', 'item7a'),
                                 max_chunks_per_item=20, splitter=None):
    """Split selected sections of a Form 10-K JSON file into chunk records.

    Args:
        file: Path to the JSON file. The file name without its extension is
            used as the form id.
        items: Keys of the JSON document whose text is split.
        max_chunks_per_item: Only the first this-many chunks of each item are
            kept.
        splitter: Object with a `split_text(text)` method. Defaults to the
            notebook-level `text_splitter`.

    Returns:
        A list of dicts, one per chunk, holding the chunk text plus metadata
        (`f10kItem`, `chunkSeqId`, `formId`, `chunkId`, and the `names`,
        `cik`, `cusip6`, `source` values copied from the file).

    Raises:
        KeyError: If the JSON document lacks one of `items` or one of the
            copied metadata keys.
    """
    if splitter is None:
        splitter = text_splitter

    # Use a context manager so the file handle is closed right after parsing.
    with open(file) as json_file:
        file_as_object = json.load(json_file)

    # The form id depends only on the file name, so compute it once; os.path
    # also copes with paths that contain no '/' at all.
    form_id = os.path.splitext(os.path.basename(file))[0]

    chunks_with_metadata = []  # accumulates the chunk records
    for item in items:
        print(f'Processing {item} from {file}')
        item_text = file_as_object[item]
        item_text_chunks = splitter.split_text(item_text)[:max_chunks_per_item]
        for chunk_seq_id, chunk in enumerate(item_text_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                # metadata from looping...
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': f'{form_id}',  # pulled from the filename
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata from file...
                'names': file_as_object['names'],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
        print(f'\tSplit into {len(item_text_chunks)} chunks')
    return chunks_with_metadata

def create_nodes_for_all_chunks(chunks_with_metadata_list):
    """Merge one `:Chunk` node per chunk record via the notebook-level `kg`.

    Each record is passed to the query as `$chunkParam`. Nodes are matched on
    `chunkId`; the remaining properties are only set when the node is created.
    Progress and a final count of processed records are printed.
    """
    cypher = """
    MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
        ON CREATE SET 
            mergedChunk.names = $chunkParam.names,
            mergedChunk.formId = $chunkParam.formId, 
            mergedChunk.cik = $chunkParam.cik, 
            mergedChunk.cusip6 = $chunkParam.cusip6, 
            mergedChunk.source = $chunkParam.source, 
            mergedChunk.f10kItem = $chunkParam.f10kItem, 
            mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
            mergedChunk.text = $chunkParam.text
    RETURN mergedChunk
    """
    processed = 0
    for record in chunks_with_metadata_list:
        print(f"Creating `:Chunk` node for chunk ID {record['chunkId']}")
        query_params = {'chunkParam': record}
        kg.query(cypher, params=query_params)
        processed = processed + 1
    print(f"Created {processed} nodes")

In [14]:
%%time

# Pick up only the JSON files, in sorted order: os.listdir returns entries in
# arbitrary order and may include stray files (e.g. .DS_Store) that would
# break the JSON parser.
all_file_names = sorted(
    os.path.join(IMPORT_DATA_DIRECTORY, name)
    for name in os.listdir(IMPORT_DATA_DIRECTORY)
    if name.lower().endswith('.json')
)

for counter, file_name in enumerate(all_file_names, start=1):
    print(f'=== Processing {counter} of {len(all_file_names)} ===')
    # get and split text data
    print('Reading and splitting Form10k file...')
    chunk_list = split_form10k_data_from_file(file_name)
    # load nodes
    print('Creating Chunk Nodes...')
    create_nodes_for_all_chunks(chunk_list)
    print(f'Done Processing {file_name}')

# Check the number of nodes in the graph
kg.query("MATCH (n:Chunk) RETURN count(n) as chunkCount")

=== Processing 1 of 10 ===
Reading and splitting Form10k file...
Processing item1 from ../data/sample/form10k/0001650372-23-000040.json
	Split into 20 chunks
Processing item1a from ../data/sample/form10k/0001650372-23-000040.json
	Split into 20 chunks
Processing item7 from ../data/sample/form10k/0001650372-23-000040.json
	Split into 20 chunks
Processing item7a from ../data/sample/form10k/0001650372-23-000040.json
	Split into 4 chunks
Creating Chunk Nodes...
Creating `:Chunk` node for chunk ID 0001650372-23-000040-item1-chunk0000
Creating `:Chunk` node for chunk ID 0001650372-23-000040-item1-chunk0001
Creating `:Chunk` node for chunk ID 0001650372-23-000040-item1-chunk0002
Creating `:Chunk` node for chunk ID 0001650372-23-000040-item1-chunk0003
Creating `:Chunk` node for chunk ID 0001650372-23-000040-item1-chunk0004
Creating `:Chunk` node for chunk ID 0001650372-23-000040-item1-chunk0005
Creating `:Chunk` node for chunk ID 0001650372-23-000040-item1-chunk0006
Creating `:Chunk` node for 

[{'chunkCount': 544}]