In [130]:
# import os
# from dotenv import load_dotenv
# from langchain_community.graphs import Neo4jGraph


# load_dotenv(override=True)
# NEO4J_URI = os.getenv("NEO4J_URI")
# NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
# NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
# NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# OPENAI_ENDPOINT = os.getenv("OPENAI_ENDPOINT")
# OPENAI_EMBEDDING_ENDPOINT = os.getenv("OPENAI_EMBEDDING_ENDPOINT")

# graph = Neo4jGraph(
#     url=NEO4J_URI,
#     username=NEO4J_USERNAME,
#     password=NEO4J_PASSWORD,
#     database=NEO4J_DATABASE,
# )

# graph.query(
#     """
#   MATCH (n)
#   MATCH (m:Movie)
#   RETURN count(n) AS numberOfNodes, count(m) AS numberOfMovies
#   """
# )

# graph.query(
#     """
# MATCH (tomCruise:Person {name: "Tom Cruise"})-[:ACTED_IN|DIRECTED]->(movie:Movie)
# RETURN movie.title, movie.tagline, movie.released
# ORDER BY movie.released DESC
#   """
# )

# graph.query(
#     """
#   CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS
#   FOR (m:Movie) ON (m.taglineEmbedding) 
#   OPTIONS { indexConfig: {
#     `vector.dimensions`: 1536,
#     `vector.similarity_function`: 'cosine'
#   }}"""
# )

# graph.query(
#     """
#   SHOW VECTOR INDEXES
#   """
# )

# graph.query(
#     """
#     MATCH (m:Movie) 
#     WHERE m.tagline IS NOT NULL
#     RETURN m.tagline, m.taglineEmbedding
#     LIMIT 1        
#     """
# )

# # from openai import OpenAI
# # from tqdm import tqdm

# # # Initialize OpenAI client
# # client = OpenAI(api_key=OPENAI_API_KEY)

# # # Fetch all movie taglines
# # result = graph.query(
# #     """
# #     MATCH (m:Movie)
# #     WHERE m.tagline IS NOT NULL
# #     RETURN m.title, m.tagline
# #     """
# # )

# # # Generate embeddings and update the graph
# # for movie in tqdm(result):
# #     title = movie["m.title"]
# #     tagline = movie["m.tagline"]

# #     # Generate embedding
# #     response = client.embeddings.create(input=tagline, model="text-embedding-ada-002")
# #     embedding = response.data[0].embedding

# #     # Update the graph
# #     graph.query(
# #         """
# #         MATCH (m:Movie {title: $title})
# #         SET m.taglineEmbedding = $embedding
# #         """,
# #         params={"title": title, "embedding": embedding},
# #     )

# # print("Embeddings updated successfully.")

# graph.query(
#     """
#     MATCH (movie:Movie) WHERE movie.tagline IS NOT NULL
#     WITH movie, genai.vector.encode(
#         movie.tagline, 
#         "OpenAI", 
#         {
#           token: $openAiApiKey,
#           endpoint: $openAiEndpoint
#         }) AS vector
#     CALL db.create.setNodeVectorProperty(movie, "taglineEmbedding", vector)
#     """,
#     params={"openAiApiKey": OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT},
# )

# question = "What movies are about love?"

# graph.query(
#     """
#     WITH genai.vector.encode(
#         $question, 
#         "OpenAI", 
#         {
#           token: $openAiApiKey,
#           endpoint: $openAiEndpoint
#         }) AS question_embedding
#     CALL db.index.vector.queryNodes(
#         'movie_tagline_embeddings', 
#         $top_k, 
#         question_embedding
#         ) YIELD node AS movie, score
#     RETURN movie.title, movie.tagline, score
#     """,
#     params={
#         "openAiApiKey": OPENAI_API_KEY,
#         "openAiEndpoint": OPENAI_ENDPOINT,
#         "question": question,
#         "top_k": 5,
#     },
# )

In [131]:
import os

from dotenv import load_dotenv

from bs4 import BeautifulSoup

from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.schema import Document


load_dotenv(override=True)
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_ENDPOINT = os.getenv("OPENAI_ENDPOINT")
OPENAI_EMBEDDING_ENDPOINT = os.getenv("OPENAI_EMBEDDING_ENDPOINT")

VECTOR_INDEX_NAME = "form_10k_chunks"
VECTOR_NODE_LABEL = "Chunk"
VECTOR_SOURCE_PROPERTY = "text"
VECTOR_EMBEDDING_PROPERTY = "textEmbedding"

graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

graph.query("MATCH (n) DETACH DELETE n")

for answer in graph.query("SHOW CONSTRAINTS;"):
    graph.query(f"DROP CONSTRAINT `{answer['name']}`;")

for answer in graph.query("SHOW INDEXES;"):
    graph.query(f"DROP INDEX `{answer['name']}`;")

print("Database purged")

Database purged


In [132]:
with open("../data/GM_10k/gm-20231231.html", "r") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html.parser")
text_lst = soup.find_all("div", style="text-align:justify;text-indent:9pt")
text_lst = [text.get_text().encode("ascii", "ignore").decode("ascii") for text in text_lst]
text_lst = [text for text in text_lst if text]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
)

documents = [Document(page_content=text) for text in text_lst]
chunks = text_splitter.split_documents(documents)

print(f"Number of chunks: {len(chunks)}")

Number of chunks: 432


In [133]:
for chunkId, chunk in enumerate(chunks):
    chunkText = chunk.page_content
    graph.query(
        """
        MERGE(mergedChunk:Chunk {chunkId: $chunkId})
        ON CREATE SET 
            mergedChunk.text = $chunkText,
            mergedChunk.source = $source
        RETURN mergedChunk  
        """,
        params={"chunkId": chunkId,
                "chunkText": chunkText,
                "source": "GM_10k"},
    )

graph.query(
    """
    CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
        FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
    """
)

result = graph.query(
    """
    MATCH (n)
    RETURN count(n) as nodeCount
    """
)
print(result)

graph.query(
    """
    CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
        FOR (c:Chunk) ON (c.textEmbedding) 
        OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
        }}
    """
)

result = graph.query(
    """
    SHOW INDEXES
    """
)
print(result)

graph.query(
    """
    MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
    WITH chunk, genai.vector.encode(
      chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY,
            "openAiEndpoint": OPENAI_ENDPOINT}
)

graph.refresh_schema()
print(graph.schema)

[{'nodeCount': 432}]
[{'id': 4, 'name': 'form_10k_chunks', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'VECTOR', 'entityType': 'NODE', 'labelsOrTypes': ['Chunk'], 'properties': ['textEmbedding'], 'indexProvider': 'vector-2.0', 'owningConstraint': None, 'lastRead': None, 'readCount': None}, {'id': 10, 'name': 'unique_chunk', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'RANGE', 'entityType': 'NODE', 'labelsOrTypes': ['Chunk'], 'properties': ['chunkId'], 'indexProvider': 'range-1.0', 'owningConstraint': 'unique_chunk', 'lastRead': None, 'readCount': None}]
Node properties:
Chunk {textEmbedding: LIST, chunkId: INTEGER, text: STRING, source: STRING}
Relationship properties:

The relationships:



In [134]:
def neo4j_vector_search(question):
  """Search for similar nodes using the Neo4j vector index"""
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """
  similar = graph.query(vector_search_query, 
                     params={
                      'question': question, 
                      'openAiApiKey':OPENAI_API_KEY,
                      'openAiEndpoint': OPENAI_ENDPOINT,
                      'index_name':VECTOR_INDEX_NAME, 
                      'top_k': 10}
                      )
  return similar

question = "What is the company's mission?"
similar = neo4j_vector_search(question)
print(similar)

[{'score': 0.9159088134765625, 'text': 'The foundation of GMs business is our Purpose: We pioneer the innovations that move and connect people to what matters. It is why we exist. Our Purpose, growth strategy and culture all help us on our path towards achieving our vision of a world with zero crashes, zero emissions and zero congestion. Our people are our most valuable asset, and we must continue to attract and retain the best talent in the world in order to achieve this vision. As a result, we strive to create a Workplace of Choice to attract, retain and develop top talent by adhering to a responsible employer philosophy, which includes, among other things, commitments to create job opportunities, pay workers fairly, ensure safety and well-being and promote diversity, equity and inclusion (DEI). Fundamental to these commitments are our company values.'}, {'score': 0.88677978515625, 'text': 'Safety and Well-Being  The safety and well-being of our employees is also a critical component

In [135]:
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)

retriever = neo4j_vector_store.as_retriever()

chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), 
    chain_type="stuff", 
    retriever=retriever
)

question = "How many employees does General Motors have?"
result = chain({"question": question})
print("Question:", question)
print("Answer:", result['answer'])
print("Sources:", result['sources'])

Question: How many employees does General Motors have?
Answer: General Motors has 164,000 employees.

Sources: GM_10k


In [152]:
graph.query(
    """
MERGE (form:Form {formId: 0})
    ON CREATE
        SET form.formId = 0
        SET form.source = $formParam.source
RETURN form
""",
params={"formParam": {"source": VECTOR_SOURCE_PROPERTY}}
)

[{'form': {'formId': 0, 'source': 'text'}}]

In [155]:
graph.query(
    """
MATCH (chunk:Chunk)
SET chunk.formId = 0
"""
)

[]

In [160]:
graph.query(
    """
MATCH (chunk:Chunk)
WHERE chunk.formId = $chunkFormId
ORDER BY chunk.chunkId ASC
WITH collect(chunk) as chunkList
    CALL apoc.nodes.link(
        chunkList, 
        "NEXT", 
        {avoidDuplicates: true}
    )
RETURN size(chunkList)
""",
params={"chunkFormId": 0}
)

[{'size(chunkList)': 432}]

In [162]:
graph.query(
    """
  MATCH (chunk:Chunk), (form:Form)
    WHERE chunk.formId = form.formId
  MERGE (chunk)-[newRelationship:PART_OF]->(form)
  RETURN count(newRelationship)
"""
)

[{'count(newRelationship)': 432}]

In [163]:
graph.refresh_schema()
print(graph.schema)


Node properties:
Chunk {textEmbedding: LIST, formId: INTEGER, chunkId: INTEGER, text: STRING, source: STRING}
Form {formId: INTEGER, source: STRING}
Relationship properties:

The relationships:
(:Chunk)-[:PART_OF]->(:Form)
(:Chunk)-[:NEXT]->(:Chunk)


In [176]:
graph.query(
    """
    MATCH (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk)
    ORDER BY c1.chunkId ASC
    LIMIT 10
    RETURN c1.chunkId, c2.chunkId, c3.chunkId
    """
)

[{'c1.chunkId': 0, 'c2.chunkId': 1, 'c3.chunkId': 2},
 {'c1.chunkId': 1, 'c2.chunkId': 2, 'c3.chunkId': 3},
 {'c1.chunkId': 2, 'c2.chunkId': 3, 'c3.chunkId': 4},
 {'c1.chunkId': 3, 'c2.chunkId': 4, 'c3.chunkId': 5},
 {'c1.chunkId': 4, 'c2.chunkId': 5, 'c3.chunkId': 6},
 {'c1.chunkId': 5, 'c2.chunkId': 6, 'c3.chunkId': 7},
 {'c1.chunkId': 6, 'c2.chunkId': 7, 'c3.chunkId': 8},
 {'c1.chunkId': 7, 'c2.chunkId': 8, 'c3.chunkId': 9},
 {'c1.chunkId': 8, 'c2.chunkId': 9, 'c3.chunkId': 10},
 {'c1.chunkId': 9, 'c2.chunkId': 10, 'c3.chunkId': 11}]

In [180]:
graph.query(
    """
    MATCH window = (c1:Chunk)-[:NEXT*0..1]->(c2:Chunk)-[:NEXT*0..1]->(c3:Chunk)
    ORDER BY c1.chunkId ASC
    LIMIT 10
    RETURN c1.chunkId, c2.chunkId, c3.chunkId, length(window)
    """
)

[{'c1.chunkId': 0, 'c2.chunkId': 0, 'c3.chunkId': 0, 'length(window)': 0},
 {'c1.chunkId': 0, 'c2.chunkId': 0, 'c3.chunkId': 1, 'length(window)': 1},
 {'c1.chunkId': 0, 'c2.chunkId': 1, 'c3.chunkId': 1, 'length(window)': 1},
 {'c1.chunkId': 0, 'c2.chunkId': 1, 'c3.chunkId': 2, 'length(window)': 2},
 {'c1.chunkId': 1, 'c2.chunkId': 1, 'c3.chunkId': 1, 'length(window)': 0},
 {'c1.chunkId': 1, 'c2.chunkId': 1, 'c3.chunkId': 2, 'length(window)': 1},
 {'c1.chunkId': 1, 'c2.chunkId': 2, 'c3.chunkId': 2, 'length(window)': 1},
 {'c1.chunkId': 1, 'c2.chunkId': 2, 'c3.chunkId': 3, 'length(window)': 2},
 {'c1.chunkId': 2, 'c2.chunkId': 2, 'c3.chunkId': 2, 'length(window)': 0},
 {'c1.chunkId': 2, 'c2.chunkId': 2, 'c3.chunkId': 3, 'length(window)': 1}]

In [182]:
graph.query(
    """
    MATCH window = (c1:Chunk)-[:NEXT*0..1]->(c2:Chunk)-[:NEXT*0..1]->(c3:Chunk)
    ORDER BY length(window) DESC
    LIMIT 1
    RETURN c1.chunkId, c2.chunkId, c3.chunkId, length(window)
    """
)

[{'c1.chunkId': 251,
  'c2.chunkId': 252,
  'c3.chunkId': 253,
  'length(window)': 2}]

In [197]:
retrieval_query_extra_text = """
WITH node, score, "BMW builds better cars than GM. " as extraText
RETURN extraText + "\n" + node.text as text, score, node {.source} AS metadata
"""

vector_store_extra_text = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name=VECTOR_INDEX_NAME,
    text_node_property=VECTOR_SOURCE_PROPERTY,
    retrieval_query=retrieval_query_extra_text,
)

retriever_extra_text = vector_store_extra_text.as_retriever()

chain_extra_text = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), 
    chain_type="stuff", 
    retriever=retriever_extra_text
)

result = chain_extra_text({"question": "What does GM do and how do their products compare to BMW?"})
print(result["answer"])


GM meets the demands of customers in North America and outside North America with vehicles developed, manufactured, and/or marketed under various brands. They also provide automotive financing services through GM Financial. BMW is considered to build better cars than GM.



In [204]:
result = graph.query(
    """
    MATCH window=(:Chunk)-[:NEXT*0..1]->(node)-[:NEXT*0..1]->(:Chunk)
    WITH window as longestWindow 
    ORDER BY length(window) DESC LIMIT 1
    WITH nodes(longestWindow) as chunkList
    UNWIND chunkList as chunkRows
    WITH collect(chunkRows.text) as textList
    RETURN apoc.text.join(textList, " \n ") as text
"""
)
print(result[0]["text"])

The following table reconciles our effective tax rate under U.S. GAAP to ETR-adjusted: 
 We define return on equity (ROE) as Net income attributable to stockholders for the trailing four quarters divided by average equity for the same period. Management uses average equity to provide comparable amounts in the calculation of ROE. The following table summarizes the calculation of ROE (dollars in billions): 
 We caution readers not to place undue reliance on forward-looking statements. Forward-looking statements speak only as of the date they are made, and we undertake no obligation to update publicly or otherwise revise any forward-looking statements, whether as a result of new information, future events or other factors, except where we are expressly required to do so by law.


In [205]:
retrieval_query_extra_text = """
  MATCH window=
      (:Chunk)-[:NEXT*0..1]->(node)-[:NEXT*0..1]->(:Chunk)
  WITH node, score, window as longestWindow 
    ORDER BY length(window) DESC LIMIT 1
  WITH nodes(longestWindow) as chunkList, node, score
    UNWIND chunkList as chunkRows
  WITH collect(chunkRows.text) as textList, node, score
  RETURN apoc.text.join(textList, " \n ") as text,
      score,
      node {.source} AS metadata
"""

vector_store_extra_text = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name=VECTOR_INDEX_NAME,
    text_node_property=VECTOR_SOURCE_PROPERTY,
    retrieval_query=retrieval_query_extra_text,
)

retriever_extra_text = vector_store_extra_text.as_retriever()

chain_extra_text = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), 
    chain_type="stuff", 
    retriever=retriever_extra_text
)

result = chain_extra_text({"question": "What does GM do and how do their products compare to BMW?"})
print(result["answer"])

GM produces trucks, crossovers, cars, and automobile parts that are marketed through retail dealers in North America and distributors and dealers outside of North America. They also provide automotive financing services through GM Financial. GM's products are developed, manufactured, and/or marketed under various brands such as Buick, Cadillac, Chevrolet, and GMC. In comparison to BMW, GM offers a wider range of vehicles across different brands and has a focus on meeting the demands of customers in North America and other countries. BMW, on the other hand, is a German multinational company that primarily focuses on luxury vehicles and motorcycles under the BMW, Mini, and Rolls-Royce brands.



In [206]:
graph.refresh_schema()
print(graph.schema)


Node properties:
Chunk {textEmbedding: LIST, formId: INTEGER, chunkId: INTEGER, text: STRING, source: STRING}
Form {formId: INTEGER, source: STRING}
Relationship properties:

The relationships:
(:Chunk)-[:PART_OF]->(:Form)
(:Chunk)-[:NEXT]->(:Chunk)


In [207]:
graph.query(
    """
    MATCH (:Chunk)-[next:NEXT]->(:Chunk)
    RETURN COUNT(next) as nextCount
    """
)

[{'nextCount': 431}]

In [211]:
graph.query(
    """
    MATCH (:Chunk)-[relation]->(:Form)
    RETURN COUNT(relation) as relationCount
    """
)

[{'relationCount': 432}]

In [213]:
graph.query(
    """
    MATCH ()-[relation]->()
    RETURN COUNT(relation) as relationCount
    """
)

[{'relationCount': 863}]