In [1]:
# Install required packages using pip
%pip install pypdf langchain_community langchain langchain_openai langchain_experimental IPython neo4j yfiles_jupyter_graphs yfiles_jupyter_graphs_for_neo4j==1.3.1

Note: you may need to restart the kernel to use updated packages.


## Initialization
This section initializes the notebook by importing necessary libraries and loading environment variables.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
import ast
from IPython.display import clear_output

# Enable Colab's custom widget manager when running in Google Colab.
# Outside Colab the import fails and this cell does nothing.
try:
    import google.colab
    from google.colab import output
except ImportError:
    # Not running in Google Colab: nothing to enable.
    pass
else:
    try:
        output.enable_custom_widget_manager()
    except Exception as exc:
        # Best effort: keep the notebook running, but report the failure
        # instead of hiding it (and never swallow KeyboardInterrupt/SystemExit).
        print(f"Could not enable the Colab custom widget manager: {exc}")

In [3]:
# Load credentials from 'credentials.env' when the file exists. The variables
# are then read from the process environment in every case, so the names below
# are always defined (possibly as None) even when the file is missing and the
# values were exported some other way.
if os.path.exists('credentials.env'):
    load_dotenv('credentials.env', override=True)
else:
    print("File 'credentials.env' not found.")

# Neo4j credentials
uri = os.getenv('NEO4J_URI')
username = os.getenv('NEO4J_USERNAME')
password = os.getenv('NEO4J_PASSWORD')
database = os.getenv('NEO4J_DATABASE')

# OpenAI credentials
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_API_KEY is not None:
    # os.environ only accepts strings; assigning None would raise TypeError.
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# Report missing settings up front instead of failing later with an
# unrelated-looking error in the cells that use them.
missing_credentials = [
    name
    for name, value in (
        ('NEO4J_URI', uri),
        ('NEO4J_USERNAME', username),
        ('NEO4J_PASSWORD', password),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
    )
    if not value
]
if missing_credentials:
    print("Missing credentials: " + ", ".join(missing_credentials))

## Create Text Chunks and Generate Embeddings
This section splits the PDF document into chunks and generates embeddings for each chunk using OpenAI.

In [4]:
# Configure the text splitter: chunks of up to 800 units with an overlap of
# 100, where length is measured with the built-in len().
chunk_size = 800
chunk_overlap = 100
splitter_options = {
    'chunk_size': chunk_size,
    'chunk_overlap': chunk_overlap,
    'length_function': len,
    'is_separator_regex': False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [5]:
# Load the PDF and split its content into chunks with `text_splitter`.
doc_path = 'microsoft-blogpost.pdf'
all_chunks = []

loader = PyPDFLoader(doc_path)
# Use load() rather than load_and_split(): the latter runs its own default
# splitter first, so the documents would be split twice and `len(pages)` would
# not necessarily be the page count reported below.
# NOTE(review): assumes PyPDFLoader.load() yields one document per page - confirm
# against the installed langchain_community version.
pages = loader.load()
for page in pages:
    all_chunks.extend(text_splitter.split_text(page.page_content))

print(f"Parsed: {doc_path}")
print(f"Chunked {len(pages)} pages into {len(all_chunks)} chunks")

Parsed: microsoft-blogpost.pdf
Chunked 8 pages into 25 chunks


In [6]:
# Sanity check: print the first two chunks to inspect the split.
for label, position in (('First', 0), ('Second', 1)):
    print(f'\n{label} chunk: \n{all_chunks[position]}')


First chunk: 
The Microsoft Ecosystem and Its Advantages
Alpesh Patel·Follow
9 min read·May 22, 2023
Listen Share
Digital business transformation, in today’s hi-tech marketplace has become a
business reality. For our businesses to create value for customers, it is imperative
that our evolving business and rapidly changing technology landscape adopts
transformation to create value for the customer. Therefore, every business today
must transform to address new competitive forces and create new value. Digital
business transformation, if done strategically, brings a deep understanding of
customer, thereby, enabling you to design product and services that best meet their
needs, streamline operations through application that improves responsiveness,

Second chunk: 
needs, streamline operations through application that improves responsiveness,
service levels, and reduce costs, and delivers more empowered teams by providing
them the right information at the right time.
There are 4 pillars of 

In [7]:
# Generate one embedding per chunk with the OpenAI embedding model.
model = 'text-embedding-3-small'
embeddings_model = OpenAIEmbeddings(
    model = model,
    openai_api_key = OPENAI_API_KEY
)
# embed_documents() embeds the whole list in batched requests instead of one
# API round trip per chunk; embeddings[i] still corresponds to all_chunks[i].
# The guard avoids calling the API at all when there is nothing to embed.
embeddings = embeddings_model.embed_documents(all_chunks) if all_chunks else []

In [8]:
# Sanity check: print the first embedding vector.
print(f'\nFirst embedding: \n{embeddings[0]}')


First embedding: 
[0.021190209314227104, -0.007180939894169569, 0.055880360305309296, 0.06044750660657883, 0.05364156514406204, -0.04005207121372223, 0.012850691564381123, 0.08686530590057373, -0.015783514827489853, 0.013231287710368633, 0.04728338122367859, -0.021626774221658707, -0.03772372007369995, -0.0207536444067955, 0.012615618295967579, 0.04952218011021614, -0.02565660886466503, -0.053552012890577316, -0.016779780387878418, 0.012783528305590153, -0.0011138012632727623, 0.008406681008636951, 0.01438426785171032, 0.048850540071725845, -0.013858150690793991, -0.01445143111050129, -0.04791024699807167, 0.031947627663612366, 0.005252776201814413, -0.048134125769138336, 0.023708855733275414, -0.019141709432005882, -0.014596953056752682, 0.018559623509645462, 0.024716313928365707, 0.023104380816221237, -0.023440198972821236, -0.021884236484766006, 0.03819386661052704, -0.02599242702126503, 0.006817135494202375, 0.023238707333803177, 0.011888009496033192, 0.05055202543735504, 0.018727

## Write Data to Neo4j
This section shows how to clear the existing database, create document nodes, and link text chunks with their embeddings in Neo4j.

In [9]:
# Open the Neo4j driver, then wipe the database.
# WARNING: the query below deletes every node and relationship.
neo4j_auth = (username, password)
driver = GraphDatabase.driver(uri, auth=neo4j_auth, database=database)

wipe_query = 'MATCH (n) DETACH DELETE n'
driver.execute_query(wipe_query)
print("Database cleared.")

Database cleared.


In [10]:
# Create a Document node for the PDF.
# The name is passed as a query parameter rather than concatenated into the
# Cypher text, so quotes or backslashes in the path cannot break the query.
driver.execute_query(
    'CREATE (d:Document {name: $name})',
    parameters_={'name': doc_path},
)
print('Document created.')

Document created.


In [11]:
# Create one Chunk node per text chunk (text, embedding, index) and link it to
# the Document node via HAS_CHUNK.
if len(embeddings) != len(all_chunks):
    raise ValueError(
        f"Expected one embedding per chunk, got {len(embeddings)} embeddings "
        f"for {len(all_chunks)} chunks."
    )

chunk_rows = [
    {'index': index, 'text': chunk, 'embedding': embedding}
    for index, (chunk, embedding) in enumerate(zip(all_chunks, embeddings))
]

# All values travel as query parameters: chunk text containing quotes or
# backslashes would otherwise corrupt the Cypher statement. UNWIND writes
# every chunk in a single query instead of one round trip per chunk.
driver.execute_query(
    """
    MATCH (d:Document)
    WHERE d.name = $doc_name
    UNWIND $rows AS row
    CREATE (d)-[:HAS_CHUNK]->(c:Chunk)
    SET c.text = row.text
    SET c.embedding = row.embedding
    SET c.index = row.index
    """,
    parameters_={'doc_name': doc_path, 'rows': chunk_rows},
)

print('Embeddings set.')

Embeddings set.


## Visualize the Graph
Use yFiles Jupyter graphs for Neo4j to visualize the document and its chunks.

In [12]:
# Render the Document node together with its Chunk nodes.
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget

document_chunks_query = "MATCH (d:Document)-[r]->(c:Chunk) RETURN d,r,c"
widget = Neo4jGraphWidget(driver, overview_enabled=False, context_start_with=None)
widget.show_cypher(document_chunks_query)

GraphWidget(layout=Layout(height='760px', width='100%'))

## Extract Graph from Text
Use LangChain's `LLMGraphTransformer` to extract entities and relationships from the text chunks.

In [13]:
# Imports needed for LLM-based graph extraction
import os
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

# Graph transformer backed by gpt-4o at temperature 0
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
llm_transformer = LLMGraphTransformer(llm=llm)

# Accumulators for the extracted graph, keyed by node / relationship hash
nodes, rels = {}, {}

# Define functions to generate unique hashes for nodes and relationships
def get_node_hash(node):
    """Return a stable integer identifier for a node, derived from its id and type.

    The built-in ``hash()`` is salted per interpreter process for strings, so it
    yields different ids after every kernel restart. This uses the first
    8 bytes of a SHA-256 digest instead, giving the same signed 64-bit integer
    for the same ``id``/``type`` pair on every run.

    Args:
        node: Object exposing ``id`` and ``type`` attributes.

    Returns:
        int: Signed 64-bit integer identifying the ``id``/``type`` pair.
    """
    import hashlib

    # f-string formatting also accepts non-string ids, which `+` would reject.
    key = f'{node.id}:{node.type}'
    digest = hashlib.sha256(key.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big', signed=True)

def get_rel_hash(rel):
    """Return a stable integer identifier for a relationship.

    The identifier is derived from the source node's id and type, the
    relationship type, and the target node's id and type. The built-in
    ``hash()`` is salted per interpreter process for strings, so this uses the
    first 8 bytes of a SHA-256 digest to get the same signed 64-bit integer on
    every run.

    Args:
        rel: Object exposing ``type`` plus ``source`` and ``target`` attributes,
            each of which exposes ``id`` and ``type``.

    Returns:
        int: Signed 64-bit integer identifying the relationship.
    """
    import hashlib

    # f-string formatting also accepts non-string ids, which `+` would reject.
    key = (
        f'{rel.source.id}:{rel.source.type}:{rel.type}:'
        f'{rel.target.id}:{rel.target.type}'
    )
    digest = hashlib.sha256(key.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big', signed=True)

# Run the graph transformer on every chunk and merge the results into the
# `nodes` / `rels` accumulators, recording which chunk indices mention each item.
for index, chunk in enumerate(all_chunks):
    graph_documents = llm_transformer.convert_to_graph_documents(
        [Document(page_content=chunk)]
    )

    # Register each node once; repeat sightings only add the chunk index
    for node in graph_documents[0].nodes:
        node_hash = get_node_hash(node)
        if node_hash not in nodes:
            nodes[node_hash] = {
                'id': node_hash,
                'name': node.id,
                'label': node.type,
                'chunks': [index],
            }
        else:
            nodes[node_hash]['chunks'].append(index)

    # Same de-duplication for relationships, keyed by endpoints and type
    for rel in graph_documents[0].relationships:
        rel_hash = get_rel_hash(rel)
        if rel_hash not in rels:
            rels[rel_hash] = {
                'id': rel_hash,
                'source': get_node_hash(rel.source),
                'target': get_node_hash(rel.target),
                'type': rel.type,
                'chunks': [index],
            }
        else:
            rels[rel_hash]['chunks'].append(index)

    print(f"Loaded chunk. Current nodes: {len(nodes)}, relationships: {len(rels)}...")

print('Complete')

Loaded chunk. Current nodes: 10, relationships: 8...
Loaded chunk. Current nodes: 18, relationships: 17...
Loaded chunk. Current nodes: 33, relationships: 33...
Loaded chunk. Current nodes: 50, relationships: 47...
Loaded chunk. Current nodes: 69, relationships: 62...
Loaded chunk. Current nodes: 69, relationships: 64...
Loaded chunk. Current nodes: 74, relationships: 68...
Loaded chunk. Current nodes: 83, relationships: 77...
Loaded chunk. Current nodes: 89, relationships: 88...
Loaded chunk. Current nodes: 90, relationships: 91...
Loaded chunk. Current nodes: 101, relationships: 102...
Loaded chunk. Current nodes: 102, relationships: 104...
Loaded chunk. Current nodes: 105, relationships: 111...
Loaded chunk. Current nodes: 121, relationships: 128...
Loaded chunk. Current nodes: 136, relationships: 142...
Loaded chunk. Current nodes: 154, relationships: 157...
Loaded chunk. Current nodes: 161, relationships: 161...
Loaded chunk. Current nodes: 167, relationships: 168...
Loaded chunk.

In [14]:
# Write the extracted nodes to Neo4j and link each one to the chunks it was
# found in (matched on Chunk.index) via an IN relationship.
# Property values are sent as query parameters, so names containing quotes or
# backslashes no longer break the statement.
## TODO -> still one query per node; batching would need grouping by label.
for node in nodes.values():
    # Labels cannot be passed as parameters here, so the label is interpolated
    # inside backticks with embedded backticks doubled to keep it a single identifier.
    label = str(node['label']).replace('`', '``')
    driver.execute_query(
        f"""
        CREATE (n:`{label}`)
        SET n.name = $name
        SET n.id = $id
        WITH n
        UNWIND $chunks AS chunk_index
        MATCH (c:Chunk)
        WHERE c.index = chunk_index
        CREATE (n)-[:IN]->(c)
        """,
        parameters_={
            'name': str(node['name']),
            # Stored as a string, matching how relationships look nodes up by id.
            'id': str(node['id']),
            'chunks': node['chunks'],
        },
    )

print('Nodes created and linked to chunks.')

Nodes created and linked to chunks.


In [15]:
# Write the extracted relationships to Neo4j, connecting the nodes whose `id`
# property equals the stored source / target hashes.
# Ids and chunk lists are sent as query parameters rather than interpolated.
## TODO -> still one query per relationship; batching would need grouping by type.
for rel in rels.values():
    # Relationship types cannot be passed as parameters here, so the type is
    # interpolated inside backticks with embedded backticks doubled; this also
    # accepts types containing spaces or other special characters.
    rel_type = str(rel['type']).replace('`', '``')
    driver.execute_query(
        f"""
        MATCH (n), (m)
        WHERE n.id = $source AND m.id = $target
        CREATE (n)-[r:`{rel_type}`]->(m)
        SET r.chunks = $chunks
        """,
        parameters_={
            # Node ids are stored as strings, so compare against strings.
            'source': str(rel['source']),
            'target': str(rel['target']),
            'chunks': rel['chunks'],
        },
    )

print('Relationships created.')

Relationships created.


In [16]:
# Render the extracted entity graph, leaving out the structural IN and
# HAS_CHUNK relationships.
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget

entity_graph_query = "MATCH (d)-[r]->(c) WHERE type(r) <> 'IN' AND type(r) <> 'HAS_CHUNK' RETURN d,r,c"
widget = Neo4jGraphWidget(driver, overview_enabled=False, context_start_with=None)
widget.show_cypher(entity_graph_query)

GraphWidget(layout=Layout(height='800px', width='100%'))

## Time for Querying

In [17]:
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI

# Connect LangChain to the same Neo4j instance with the enhanced schema
# enabled, then print the schema it reports.
graph_connection = {
    "url": uri,
    "username": username,
    "password": password,
    "enhanced_schema": True,
}
enhanced_graph = Neo4jGraph(**graph_connection)
print(enhanced_graph.schema)

Node properties:
- **Document**
  - `name`: STRING Available options: ['The Microsoft Ecosystem And Its Advantages', 'microsoft-blogpost.pdf']
  - `id`: STRING Available options: ['-6128678441651722737']
- **Chunk**
  - `text`: STRING Example: "The Microsoft Ecosystem and Its Advantages Alpesh "
  - `index`: INTEGER Min: 0, Max: 24
- **Concept**
  - `name`: STRING Example: "Organizations"
  - `id`: STRING Example: "-1729061487026209417"
- **Person**
  - `name`: STRING Example: "Moore"
  - `id`: STRING Example: "-7144879061524072736"
- **Product**
  - `name`: STRING Example: "Customer Service"
  - `id`: STRING Example: "4683732313084330625"
- **Data**
  - `name`: STRING Available options: ['Business Application Data']
  - `id`: STRING Available options: ['-1658853085048990278']
- **Group**
  - `name`: STRING Available options: ['500M Sales Professionals', 'Enterprise Customers', 'Enterprise Organization']
  - `id`: STRING Available options: ['7068519713195080833', '8550369440340091076',

In [18]:
# Build a question-answering chain over the graph, with verbose output enabled.
qa_llm = ChatOpenAI(temperature=0)
chain = GraphCypherQAChain.from_llm(qa_llm, graph=enhanced_graph, verbose=True)

# Ask a sample question; the result dict is displayed as the cell output.
question = "What does Microsoft Offer"
chain.invoke({"query": question})



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (o:Organization {name: "Microsoft"})-[:OFFERS]->(s:Service)
RETURN s.name[0m
Full Context:
[32;1m[1;3m[{'s.name': 'Machine Learning'}, {'s.name': 'Internet Of Things (Iot)'}, {'s.name': 'Cognitive Services'}][0m

[1m> Finished chain.[0m


{'query': 'What does Microsoft Offer',
 'result': 'Microsoft offers Machine Learning, Internet Of Things (IoT), and Cognitive Services.'}