In [74]:
from langchain_openai import ChatOpenAI
from langchain_community.graphs import Neo4jGraph
import os
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_text_splitters import TokenTextSplitter
from langchain_core.output_parsers import JsonOutputParser

# Load environment variables from the backend .env file; later cells read the
# LLM configuration and the Neo4j connection settings via os.environ.
load_dotenv('../backend/.env')

True

Read the document data and create chunks

In [75]:
# Load the source PDF; the loop below treats the result as a sequence of
# page-level documents exposing `page_content`.
loader = PyMuPDFLoader('../data/Apple stock during pandemic.pdf')
pages = loader.load()

# Full document text: every page's content prefixed by a single space.
# (Not consumed by the cells shown in this notebook; kept for inspection.)
texts = "".join(" " + page.page_content for page in pages)

# Split the loaded pages into overlapping chunks (chunk_size=200, chunk_overlap=20).
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
chunks = text_splitter.split_documents(pages)

Initialize OpenAI LLM

In [77]:
# The LLM configuration is stored in a single environment variable formatted as
# "<model_name>,<api_key>". Validate it explicitly so that a missing or malformed
# value produces a clear error instead of an AttributeError on None.split.
LLM_CONFIG_ENV_VAR = 'LLM_MODEL_CONFIG_openai_gpt_4o'
llm_config = os.environ.get(LLM_CONFIG_ENV_VAR)
if not llm_config or "," not in llm_config:
    raise ValueError(
        f"Environment variable {LLM_CONFIG_ENV_VAR} must be set to '<model_name>,<api_key>'"
    )

# Split on the first comma only and trim stray whitespace around both parts.
model_name, api_key = (part.strip() for part in llm_config.split(",", 1))

# temperature=0 is passed through unchanged from the original configuration.
llm = ChatOpenAI(
    api_key=api_key,
    model=model_name,
    temperature=0,
)

Generate graph documents via LLMGraphTransformer

In [78]:
# Build the transformer around the configured LLM, then convert the chunks
# into graph documents (nodes and relationships).
graph_transformer = LLMGraphTransformer(llm)
graph_documents = graph_transformer.convert_to_graph_documents(chunks)

In [80]:
# Display the extracted graph documents (nodes and relationships per chunk).
graph_documents

[GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Pandemic', type='Event', properties={}), Node(id='Wendy Sun', type='Person', properties={}), Node(id='Kinglee High School', type='Organization', properties={}), Node(id='Zheng Zhou', type='Location', properties={}), Node(id='He Nan', type='Location', properties={}), Node(id='China', type='Location', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Pandemic', type='Event', properties={}), type='EXPERIENCED_FLUCTUATION', properties={}), Relationship(source=Node(id='Wendy Sun', type='Person', properties={}), target=Node(id='Kinglee High School', type='Organization', properties={}), type='AFFILIATION', properties={}), Relationship(source=Node(id='Kinglee High School', type='Organization', properties={}), target=Node(id='Zheng Zhou', type='Location', properties={}), type='LOCATED_IN', properties={}), Relationship(source=Node(id='Zheng 

Add nodes and relations to Neo4j DB

In [81]:
# Connection settings: URI and credentials come from the environment;
# the target database name is fixed to "neo4j".
neo4j_settings = {
    "url": os.environ.get('NEO4J_URI'),
    "database": "neo4j",
    "username": os.environ.get('NEO4J_USERNAME'),
    "password": os.environ.get('NEO4J_PASSWORD'),
}
graph = Neo4jGraph(**neo4j_settings)

In [82]:
# Write the extracted nodes and relationships to the connected Neo4j database.
graph.add_graph_documents(graph_documents)

Get the distinct node labels and relationship types from the graph documents

In [83]:
# Distinct node types found across all graph documents.
node_labels = {
    node.type
    for graph_doc in graph_documents
    for node in graph_doc.nodes
}

# Distinct relationship types found across all graph documents.
rel_labels = {
    relationship.type
    for graph_doc in graph_documents
    for relationship in graph_doc.relationships
}

In [86]:
# System prompt asking the LLM to merge semantically similar labels.
# NOTE: this string is used as a ChatPromptTemplate message, so the literal JSON
# braces in the example outputs are doubled ({{ and }}) to escape them from
# template-variable substitution. The examples are valid JSON objects, matching
# the "Return a JSON object" instruction.
system_prompt = """Please consolidate the following list of types into a smaller set of more general, semantically 
related types. The consolidated types must be drawn from the original list; do not introduce new types.  
Return a JSON object representing the mapping of original types to consolidated types. Every key is the consolidated type
and value is list of the original types that were merged into the consolidated type. Prioritize using the most generic and 
repeated term when merging. If a type doesn't merge with any other type, it should still be included in the output, 
mapped to itself.

**Input:** A list of strings representing the types to be consolidated. These types may represent either node 
labels or relationship labels. Your algorithm should do appropriate groupings based on semantic similarity.

Example 1:
Input: 
[ "Person", "Human", "People", "Company", "Organization", "Product"]
Output :
{{"Person": ["Person", "Human", "People"], "Organization": ["Company", "Organization"], "Product": ["Product"]}}

Example 2:
Input :
["CREATED_FOR", "CREATED_TO", "CREATED", "PLACE", "LOCATION", "VENUE"]
Output:
{{"CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"], "PLACE": ["PLACE", "LOCATION", "VENUE"]}}
"""


Get list of more general, semantically related labels from LLM

In [87]:
# Parse the LLM reply as JSON.
parser = JsonOutputParser()

# Append the parser's format instructions to the system message so they are
# actually sent to the model (previously the partial variable was declared but
# never referenced by any message). `.partial` pre-fills the placeholder, so the
# chain still only needs `input` at invoke time.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt + "\n{format_instructions}"),
        ("human", "{input}"),
    ]
).partial(format_instructions=parser.get_format_instructions())

chain = prompt | llm | parser

# Pass sorted lists rather than raw sets: the prompt describes the input as a
# list, and a sorted list renders in a stable order across kernel restarts.
nodes_dict = chain.invoke({'input': sorted(node_labels)})
relation_dict = chain.invoke({'input': sorted(rel_labels)})

In [88]:
# Display the consolidated node-label mapping: consolidated label -> original labels.
nodes_dict

{'Economy': ['Economy', 'Economic status'],
 'List': ['List'],
 'Index': ['Index'],
 'Publication': ['Publication'],
 'Concept': ['Concept'],
 'Location': ['Location'],
 'Person': ['Person'],
 'Price': ['Price', 'Value'],
 'Time': ['Time', 'Date'],
 'Group': ['Group'],
 'Event': ['Event'],
 'Product': ['Product'],
 'Company': ['Company', 'Organization', 'Industry'],
 'License': ['License'],
 'Stock': ['Stock'],
 'Financial_metric': ['Financial_metric'],
 'Url': ['Url']}

In [66]:
# Display the consolidated relationship-type mapping: consolidated type -> original types.
relation_dict

{'RECOVERED': ['RECOVERED', 'RECOVERED_FROM'],
 'COMPONENTS_PRODUCED_IN': ['COMPONENTS_PRODUCED_IN'],
 'MENTIONS': ['MENTIONS'],
 'PRICE': ['OPENING_PRICE_132.76',
  'LOWEST_PRICE',
  'OPENING_PRICE_57.02',
  'CLOSING_PRICE_134.18',
  'OPENING_PRICE_76.07',
  'CLOSING_PRICE_56.09',
  'CLOSING_PRICE_80.01'],
 'PART_OF': ['PART_OF'],
 'MARKET_VALUE': ['MARKET_VALUE',
  'MARKET_VALUE_HIGHER_THAN',
  'MARKET_VALUE_RECORD',
  'MARKET_VALUE_EXCEEDED'],
 'FOUNDED_ON': ['FOUNDED_ON'],
 'AUTHOR': ['AUTHOR'],
 'IS': ['IS'],
 'SURPASSED': ['SURPASSED_IN_BRAND_VALUE',
  'SURPASSED_BY_APPLE_IN_BRAND_VALUE',
  'SURPASSED'],
 'TRANSACTION': ['HIGHER_TRANSACTION_NUMBER'],
 'CAUSES': ['CAUSES_SHRINKAGE',
  'INCREASE_CAUSES_STAGNATION',
  'DECREASE_CAUSES_STAGNATION',
  'DECLINE_CAUSES_STAGNATION'],
 'DATA_OBSERVED_ON': ['DATA_OBSERVED_ON'],
 'AFFECTED': ['AFFECTED_BY', 'AFFECTED'],
 'HAS': ['HAS'],
 'DISTRIBUTED_UNDER': ['DISTRIBUTED_UNDER'],
 'COMPARISON': ['VOLUME_COMPARISON',
  'STOCK_PRICE_FALL_COM

In [89]:
# Invert the LLM mappings: original label -> consolidated label.
# If an original label is listed under several consolidated labels, the last
# occurrence in iteration order wins.
node_match = {
    original: consolidated
    for consolidated, originals in nodes_dict.items()
    for original in originals
}

relation_match = {
    original: consolidated
    for consolidated, originals in relation_dict.items()
    for original in originals
}

Apply the consolidated node labels to the database

In [92]:
# Relabel nodes: every node carrying an original label gets the consolidated
# label, and the original label is removed.
for old_label, new_label in node_match.items():
    # Labels mapped to themselves must be skipped: running SET n:X followed by
    # REMOVE n:X would strip the label from every node that has it.
    if old_label == new_label:
        continue
    graph.query("""MATCH (n:$($label))
                    SET n:$($new_label)
                    REMOVE n:$($label)
                    """,
                    params={'label':old_label, 'new_label':new_label})

Apply the consolidated relationship types to the database

In [93]:
# Retype relationships: a relationship's type cannot be changed in place, so each
# matching relationship is recreated with the consolidated type and then deleted.
for old_label, new_label in relation_match.items():
    # Types mapped to themselves need no change; skipping them avoids pointlessly
    # deleting and recreating every such relationship.
    if old_label == new_label:
        continue
    # Copy the original relationship's properties onto the replacement before
    # deleting it, so no property data is lost.
    graph.query("""MATCH (n)-[r:$($label)]->(m)
                    CREATE (n)-[r2:$($new_label)]->(m)
                    SET r2 = properties(r)
                    DELETE r
                    """,
                    params={'label':old_label, 'new_label':new_label})

TODO:

1. Correct the relationship-update query so that relationship properties are copied to the new relationship.
2. Send node and relationship labels to the LLM in batches when their number exceeds X.
