# Load Form 10k Chunk Nodes

## Imports

In [43]:
from dotenv import load_dotenv
import os

# Common data processing
import json
from pandas import DataFrame
import pandas as pd
from typing import List, Tuple, Union
from numpy.typing import ArrayLike

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
# from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
# from langchain_community.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI

## Neo4j Utility Functions

In [48]:
def neo4j_version(kg: Neo4jGraph) -> Tuple[int, int, int]:
  """Return the Neo4j server version as a ``(major, minor, patch)`` tuple of ints.

  The version string is taken from the first row returned by
  ``CALL dbms.components()``. Anything after the first ``-`` (for example
  ``-aura``) is ignored and missing components are filled with ``0``, so
  ``"5.15-aura"`` and ``"5.15.0-aura"`` both yield ``(5, 15, 0)``.

  Args:
    kg: graph wrapper whose ``query`` method runs the Cypher call.

  Returns:
    A tuple of exactly three ints.

  Raises:
    ValueError: if the version string does not start with a number.
  """
  version = kg.query("CALL dbms.components()")[0]["versions"][0]
  # Drop any build/edition suffix before looking at the dotted numbers.
  numeric_part = version.split("-", 1)[0]
  components = []
  for piece in numeric_part.split("."):
    if not piece.isdecimal():
      break
    components.append(int(piece))
  if not components:
    raise ValueError(f"Unrecognised Neo4j version string: {version!r}")
  # Always hand back three components, whatever the server reported.
  padded = (components + [0, 0])[:3]
  return tuple(padded)

def neo4j_create_vector_index(kg: Neo4jGraph, index_name: str,
                              for_label: str, on_property: str,
                              vector_dimensions: int = 1536,
                              similarity_function: str = 'cosine') -> None:
  """Create a Neo4j vector index if it does not exist yet.

  Cypher cannot take index names, labels or property keys as query
  parameters, so they are interpolated into the statement. Each one is
  wrapped in backticks (with embedded backticks doubled) so that names
  containing spaces or other special characters stay valid.

  Args:
    kg: graph wrapper whose ``query`` method runs the statement.
    index_name: name of the index to create.
    for_label: node label the index covers.
    on_property: node property holding the vector.
    vector_dimensions: value for the ``vector.dimensions`` index setting.
    similarity_function: value for ``vector.similarity_function``.
  """
  def quote(identifier: str) -> str:
    # Leave identifiers the caller already wrapped in backticks untouched.
    if len(identifier) >= 2 and identifier.startswith('`') and identifier.endswith('`'):
      return identifier
    return '`' + identifier.replace('`', '``') + '`'

  kg.query(f"""CREATE VECTOR INDEX {quote(index_name)} IF NOT EXISTS
    FOR (n:{quote(for_label)}) ON (n.{quote(on_property)})
    OPTIONS {{indexConfig: {{
      `vector.dimensions`: {vector_dimensions},
      `vector.similarity_function`: '{similarity_function}'
    }}}}"""
  )

def neo4j_show_properties(kg: Neo4jGraph, on_label: str) -> List:
  """Report the property types found on one node carrying ``on_label``.

  A single node with the given label is matched and its property map is
  passed to ``apoc.meta.cypher.types``; the rows returned by ``kg.query``
  are handed back unchanged.
  """
  sample_query = f"""
    MATCH (n:{on_label}) WITH n LIMIT 1
    RETURN apoc.meta.cypher.types(properties(n)) 
    """
  property_types = kg.query(sample_query)
  return property_types

def neo4j_vector_search(kg: Neo4jGraph, embeddings_model: OpenAIEmbeddings,
                        index_name: str, query: str, top_k: int = 10) -> List:
  """Embed ``query`` and look up its nearest nodes in a Neo4j vector index.

  The query text is embedded with ``embeddings_model.embed_query`` and the
  resulting vector, the index name and ``top_k`` are sent as parameters to
  ``db.index.vector.queryNodes``. Each returned row exposes the matched
  node's ``text`` property under the key ``result``.
  """
  query_params = {
    'embedding': embeddings_model.embed_query(query),
    'index_name': index_name,
    'top_k': top_k,
  }
  cypher = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $embedding) yield node, score
    RETURN node.text AS result
  """
  return kg.query(cypher, params=query_params)

## Set up Neo4j and Langchain

In [49]:
# Read the connection settings from the local .env file; override=True asks
# load_dotenv to replace variables that are already set in the process.
load_dotenv('.env', override=True)
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
# Fall back to the 'neo4j' database when the variable is unset or empty.
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE') or 'neo4j'

# Names used by the vector index, the vector store and the example queries
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'


In [50]:
# Graph handle from Langchain's Neo4j integration, used below for running
# Cypher directly against the knowledge graph.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

# OpenAI model that turns text into embedding vectors
embeddings_model = OpenAIEmbeddings()

# Splitter that cuts long text into chunks of up to 2000 characters
# (measured with len), with 200 characters of overlap between chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)


In [51]:
# Create the vector index over the embedding property of the chunk nodes.
# The label and property come from the shared constants so the index, the
# vector store and the loader cannot drift apart.
neo4j_create_vector_index(kg, VECTOR_INDEX_NAME, VECTOR_NODE_LABEL, VECTOR_EMBEDDING_PROPERTY)

# Create a uniqueness constraint on the chunkId property of the chunk nodes
kg.query(f'CREATE CONSTRAINT unique_chunk IF NOT EXISTS FOR (n:{VECTOR_NODE_LABEL}) REQUIRE n.chunkId IS UNIQUE')


[]

In [53]:

# Create a langchain vector store from the existing Neo4j knowledge graph.
# The database is passed explicitly so the store targets the same database
# that `kg` was opened on.
vector_store = Neo4jVector.from_existing_graph(
    embedding=embeddings_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)
# Create a retriever from the vector store
retriever = vector_store.as_retriever()

# Create a chatbot Question & Answer chain from the retriever
chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), chain_type="stuff", retriever=retriever
)


## Basic Cypher Queries

In [54]:
# Refresh the schema held by the graph wrapper, then print its text summary
kg.refresh_schema()
print(kg.schema)

Node properties are the following:

Relationship properties are the following:

The relationships are the following:



In [55]:
# kg.query() runs any Cypher statement: replace the string below to try
# other queries. This one counts every node in the graph.
kg.query("MATCH (n) RETURN count(n) as count")

[{'count': 0}]

## Data loading utility functions 

In [19]:

def batches(xs, n=100):
    """Cut ``xs`` into consecutive slices holding at most ``n`` items each.

    ``n`` values below 1 are treated as 1. The result is a list of slices
    of ``xs``; the last one may be shorter than ``n``.
    """
    step = max(1, n)
    chunks = []
    for start in range(0, len(xs), step):
        chunks.append(xs[start:start + step])
    return chunks

def make_set_clause(prop_names: ArrayLike, element_name='n', item_name='rec'):
    """Build a Cypher ``SET`` clause copying properties from a record to an element.

    For every name in ``prop_names`` the clause contains
    ``<element_name>.<name> = <item_name>.<name>``; the assignments are
    joined with ``', '`` and prefixed with ``'SET '``.
    """
    assignments = ', '.join(
        f'{element_name}.{name} = {item_name}.{name}' for name in prop_names
    )
    return 'SET ' + assignments


def make_node_merge_query(node_key_name: str, node_label: str, cols: ArrayLike):
    """Build the Cypher statement that merges one node per record in ``$recs``.

    Nodes are merged on ``node_key_name``. Every other column in ``cols`` is
    copied onto the node through a ``SET`` clause, which is left out when
    there are no other columns. The statement returns the number of nodes
    touched as ``nodeLoadedCount``.
    """
    statement_lines = [
        'UNWIND $recs AS rec',
        f'MERGE(n:{node_label} {{{node_key_name}: rec.{node_key_name}}})',
    ]
    non_key_props = [col for col in cols if col != node_key_name]
    if non_key_props:
        statement_lines.append(make_set_clause(non_key_props))
    statement_lines.append('RETURN count(n) AS nodeLoadedCount')
    return '\n'.join(statement_lines)

def load_nodes(graph: Neo4jGraph, node_df: pd.DataFrame, node_key_col: str, node_label: str, batch_size=1000):
    """Merge the rows of ``node_df`` into the graph as ``node_label`` nodes.

    Each row becomes one record; records are sent to ``graph.query`` in
    batches of ``batch_size`` using the statement produced by
    ``make_node_merge_query``. Progress is printed after every batch, based
    on the ``nodeLoadedCount`` value of the first result row.
    """
    records = node_df.to_dict('records')
    print(f'======  loading {node_label} nodes  ======')
    total = len(records)
    print(f'staging {total:,} records')
    merge_query = make_node_merge_query(node_key_col, node_label, node_df.columns.copy())
    print(f'\nUsing This Cypher Query:\n```\n{merge_query}\n```\n')
    loaded_so_far = 0
    for record_batch in batches(records, batch_size):
        result = graph.query(merge_query, params={'recs': record_batch})
        loaded_so_far += result[0].get('nodeLoadedCount')
        print(f'Loaded {loaded_so_far:,} of {total:,} nodes')


def get_and_split_txt_data(file_names: List[str]) -> DataFrame:
    """Load Form 10-K JSON files and split selected items into text chunks.

    For every file the items ``item1``, ``item1a``, ``item7`` and ``item7a``
    are split with the notebook-level ``text_splitter``. Each chunk becomes
    one row carrying the chunk text plus metadata copied from the file
    (``cik``, ``cusip6``, ``source``).

    Args:
        file_names: paths of the JSON files to read.

    Returns:
        A DataFrame with one row per chunk and the columns ``formId``,
        ``chunkId``, ``cik``, ``cusip6``, ``source``, ``f10kItem``,
        ``chunkSeqId`` and ``text``. ``chunkId`` has the form
        ``<formId>-<item>-chunk<NNNN>``.

    Raises:
        KeyError: if a file lacks one of the expected items or metadata keys.
    """
    form_items = ('item1', 'item1a', 'item7', 'item7a')
    doc_data_list = []
    for file_name in file_names:
        # The form id is the file name without directory or extension.
        # os.path handles paths that contain no '/' at all.
        form_id = os.path.splitext(os.path.basename(file_name))[0]
        # Decode explicitly as UTF-8 rather than the platform default.
        with open(file_name, encoding='utf-8') as f:
            f10_k = json.load(f)
        for item in form_items:
            split_txts = text_splitter.split_text(f10_k[item])
            # The sequence id restarts at 0 for every item of every form.
            for chunk_seq_id, split_txt in enumerate(split_txts):
                doc_data_list.append({'formId': form_id,
                                      'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                                      'cik': f10_k['cik'],
                                      'cusip6': f10_k['cusip6'],
                                      'source': f10_k['source'],
                                      'f10kItem': item,
                                      'chunkSeqId': chunk_seq_id,
                                      'text': split_txt})
    return pd.DataFrame(doc_data_list)

def add_text_embeddings(df):
    """Embed the ``text`` column of ``df`` and store the vectors in place.

    The texts are sent to the notebook-level ``embeddings_model`` in batches
    of 100 as plain lists of strings. The resulting vectors are written to a
    new ``textEmbedding`` column on ``df`` itself; nothing is returned.
    Progress is printed once each batch has been embedded.
    """
    # Hand the model plain Python lists rather than pandas Series slices.
    texts = df['text'].tolist()
    total = len(texts)
    count = 0
    embeddings = []
    for docs in batches(texts, n=100):
        embeddings.extend(embeddings_model.embed_documents(docs))
        # Report only after the batch has actually been embedded.
        count += len(docs)
        print(f'Embedded {count} of {total}')
    df['textEmbedding'] = embeddings



## Load Form 10k documents

1. Iterate through all the files in the directory.
2. Process the files in batches.
3. For each file, load the content and split the text into chunks.
4. For each chunk, compute a text embedding and create a graph node that includes the metadata, the chunk text, and the embedding.

In [56]:
%%time

FORM10K_DIR = './data/form10k/'
# os.listdir() returns entries in arbitrary order and may include hidden
# files or sub-directories: sort for a reproducible run and keep only
# regular, visible files.
all_file_names = [
    FORM10K_DIR + x
    for x in sorted(os.listdir(FORM10K_DIR))
    if not x.startswith('.') and os.path.isfile(FORM10K_DIR + x)
]
counter = 0
for file_names in batches(all_file_names, 20):
    start = counter
    counter += len(file_names)
    print(f'=== Processing {start}:{counter} of {len(all_file_names)} ===')
    # get and split text data
    print('Loading and splitting Text Files...')
    doc_df = get_and_split_txt_data(file_names)
    # perform text embedding
    print('Performing Text Embedding...')
    add_text_embeddings(doc_df)
    # load nodes (without the embedding column; vectors are set separately below)
    print('Loading Nodes...')
    load_nodes(kg, doc_df.drop(columns='textEmbedding'), 'chunkId', VECTOR_NODE_LABEL)

    # Merge text embeddings using set vector property
    records = doc_df[['chunkId', 'textEmbedding']].to_dict('records')
    print('======  loading Document text embeddings ======')
    total = len(records)
    print(f'staging {total:,} records')
    cumulative_count = 0
    for recs in batches(records, n=100):
        # The label cannot be a Cypher parameter, so it is interpolated;
        # the property name is passed as a parameter.
        res = kg.query(f'''
        UNWIND $recs AS rec
        MATCH(n:{VECTOR_NODE_LABEL} {{chunkId: rec.chunkId}})
        CALL db.create.setNodeVectorProperty(n, $embedding_property, rec.textEmbedding)
        RETURN count(n) AS propertySetCount
        ''', params={'recs': recs, 'embedding_property': VECTOR_EMBEDDING_PROPERTY})
        cumulative_count += res[0].get('propertySetCount')
        print(f'Set {cumulative_count:,} of {total:,} text embeddings')
    # Report completion only once nodes and embeddings are both written.
    print(f'Done Processing {start}:{counter}')

=== Processing 0:20 of 45 ===
Loading and splitting Text Files...
Performing Text Embedding...
Embedded 100 of 2033
Embedded 200 of 2033
Embedded 300 of 2033
Embedded 400 of 2033
Embedded 500 of 2033
Embedded 600 of 2033
Embedded 700 of 2033
Embedded 800 of 2033
Embedded 900 of 2033
Embedded 1000 of 2033
Embedded 1100 of 2033
Embedded 1200 of 2033
Embedded 1300 of 2033
Embedded 1400 of 2033
Embedded 1500 of 2033
Embedded 1600 of 2033
Embedded 1700 of 2033
Embedded 1800 of 2033
Embedded 1900 of 2033
Embedded 2000 of 2033
Embedded 2033 of 2033
Loading Nodes...
staging 2,033 records

Using This Cypher Query:
```
UNWIND $recs AS rec
MERGE(n:Chunk {chunkId: rec.chunkId})
SET n.formId = rec.formId, n.cik = rec.cik, n.cusip6 = rec.cusip6, n.source = rec.source, n.f10kItem = rec.f10kItem, n.chunkSeqId = rec.chunkSeqId, n.text = rec.text
RETURN count(n) AS nodeLoadedCount
```

Loaded 1,000 of 2,033 nodes
Loaded 2,000 of 2,033 nodes
Loaded 2,033 of 2,033 nodes
Done Processing 0:20
staging 2,033 

## Example queries

In [58]:
# Refresh the schema held by the graph wrapper, then print its text summary
kg.refresh_schema()
print(kg.schema)

Node properties are the following:
Chunk {f10kItem: STRING, chunkSeqId: INTEGER, textEmbedding: LIST, chunkId: STRING, cik: STRING, cusip6: STRING, text: STRING, formId: STRING, source: STRING}
Relationship properties are the following:

The relationships are the following:



In [59]:
# Question text reused by the example searches in the following cells
question = 'Who makes GPUs?'

In [60]:
# Vector search using our utility function: embeds the question and asks
# the vector index for the 3 closest nodes
neo4j_vector_search(kg, embeddings_model, VECTOR_INDEX_NAME, question, top_k=3)

[{'result': 'The growth in demand for associative processing computing solutions is being driven by the increasing market adoption and usage of graphics processing unit (“GPU”) and CPU farms for AI processing of large data collections, including parallel computing in scientific research.  However, the large-scale usage of GPU and CPU farms for AI processing of data is demonstrating the limits of GPU and CPU processing speeds and resulting in ever higher energy consumption. The amounts of data being processed, which is coming from increasing numbers of users and continuously increasing amounts of collected data, has resulted in efforts to split and store the processed data among multiple databases, through a process called sharding.  Sharding substantially increases processing costs and worsens the power consumption factors associated with processing so much data.  As the environmental impacts of data processing are becoming increasingly important, and complex workloads are migrating to

In [61]:
# Same search through the langchain vector store, which also returns a
# score alongside each matching document
docs_with_score = vector_store.similarity_search_with_score(question, k=3)

rule = "-" * 80
for doc, score in docs_with_score:
    # One print call per hit: rule, score, content, rule (one per line)
    print(rule, f"Score:  {score}", doc.page_content, rule, sep="\n")

--------------------------------------------------------------------------------
Score:  0.9043025970458984

text: The growth in demand for associative processing computing solutions is being driven by the increasing market adoption and usage of graphics processing unit (“GPU”) and CPU farms for AI processing of large data collections, including parallel computing in scientific research.  However, the large-scale usage of GPU and CPU farms for AI processing of data is demonstrating the limits of GPU and CPU processing speeds and resulting in ever higher energy consumption. The amounts of data being processed, which is coming from increasing numbers of users and continuously increasing amounts of collected data, has resulted in efforts to split and store the processed data among multiple databases, through a process called sharding.  Sharding substantially increases processing costs and worsens the power consumption factors associated with processing so much data.  As the environmental 

In [62]:
# Vector search using the langchain retriever over the Neo4j vector store.
# `invoke` replaces the deprecated `get_relevant_documents`; show the top hit.
retriever.invoke(question)[0]

Document(page_content='\ntext: The growth in demand for associative processing computing solutions is being driven by the increasing market adoption and usage of graphics processing unit (“GPU”) and CPU farms for AI processing of large data collections, including parallel computing in scientific research.  However, the large-scale usage of GPU and CPU farms for AI processing of data is demonstrating the limits of GPU and CPU processing speeds and resulting in ever higher energy consumption. The amounts of data being processed, which is coming from increasing numbers of users and continuously increasing amounts of collected data, has resulted in efforts to split and store the processed data among multiple databases, through a process called sharding.  Sharding substantially increases processing costs and worsens the power consumption factors associated with processing so much data.  As the environmental impacts of data processing are becoming increasingly important, and complex workload

In [63]:
# Ask the question through the Q&A chain. `invoke` replaces the deprecated
# direct call `chain(...)`; return_only_outputs=True is passed through unchanged.
chain.invoke(
    {"question": question},
    return_only_outputs=True,
)

{'answer': 'NVIDIA Corporation and Intel Corporation are competitors in the GPU market.\n',
 'sources': 'https://www.sec.gov/Archives/edgar/data/1126741/000155837023011516/0001558370-23-011516-index.htm'}