In [8]:
import os
from langchain.document_loaders import JSONLoader, DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.callbacks import get_openai_callback
import requests
from langchain.prompts import ChatPromptTemplate
from langchain.prompts.chat import SystemMessage, HumanMessagePromptTemplate

from langchain.chat_models import AzureChatOpenAI
from dotenv import load_dotenv
load_dotenv()


True

## Defining the chat model of LLM for entity relationship extraction

In [9]:
def generate_system_message() -> str:
    return """
You are a data scientist working for a company that is building a graph database. Your task is to extract information from data and convert it into a graph database.
Provide a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of relationships in the form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES].
It is important that the ENTITY_ID_1 and ENTITY_ID_2 exists as nodes with a matching ENTITY_ID. If you can't pair a relationship with a pair of nodes don't add it.
When you find a node or relationship you want to add try to create a generic TYPE for it that  describes the entity you can also think of it as a label.

Example:
Data: Interest Rate: shall mean, for any Interest Period, (x) the Spread plus the
Benchmark for such Interest Period or (y) when applicable pursuant to this Agreement or any other
Loan Document, the Default Rate.
Spread: shall mean 4.75%.
Benchmark: shall mean, initially, the London interbank offered rate for U.S. dollars
with a 1-month tenor; provided that if a Benchmark Transition Event or an Early Opt-in Election,
as applicable, and its related Benchmark Replacement Date have occurred with respect to the then current Benchmark, then “Benchmark” means the applicable Benchmark Replacement to the
extent that such Benchmark Replacement has replaced such prior benchmark rate pursuant to
Section 2.10 hereof. Lender shall determine the Benchmark (and the applicable Reference Time)
as in effect from time to time, and each such determination by Lender shall be conclusive and
binding absent manifest error. While the Benchmark remains the London interbank offered rate
for U.S. dollars with a 1-month tenor, Lender shall determine the same in accordance with the
defined term “LIBOR.”
LIBOR: shall mean, with respect to any Interest Period, a rate per annum (expressed
as a percentage per annum rounded upwards, if necessary, to the nearest one hundredth (1/100th)
of one percent (1%)) for deposits in U.S. Dollars for a one (1) month period that appears on Reuters
Screen LIBOR01 Page as of 11:00 a.m., London time. Notwithstanding the
foregoing, in no event shall LIBOR be an amount less than the Index Floor.
Index Floor: shall mean one-quarter of one percent (0.25%) per annum.
Nodes: ["Interest Rate", "FinancialTerm", {"name":"Interest Rate"}], ["Spread", "FinancialTerm", {"value": "4.75%", "name": "Spread"}], ["Benchmark", "FinancialTerm", {"name": "Benchmark"}], ["LIBOR", "FinancialTerm", {"name": "LIBOR"}], ["IndexFloorRate", "FinancialTerm", {"name": "Index Floor Rate", "value": "0.25%"}]
Relationships: ["Interest Rate", "SUM_OF", "Spread", {}], ["Interest Rate", "SUM_OF", "Benchmark", {}], ["Benchmark", "Initial Benchmark", "LIBOR", {}],  ["LIBOR", "MINIMUM_VALUE", "Index Floor Rate", {}]
"""

def generate_prompt(data) -> str:
    return f"""
Data: {str(data)}
"""

def generate_cypher() -> str:
    return """Return the Cypher code for the given entity relationships. Only write the code."""

def generate_prompt_for_question(entity_relationship: str, question: str) -> str:
    return f"""
You are a graph data scientist working for a company that is building a graph database. Your task is to extract information from questions and convert it to cypher query.
The following is the nodes and relationships of the graph database:

{entity_relationship}

Example:\
Question: How is the interest rate calculated?\
Cypher: MATCH (a:FinancialTerm)-[b:SUM_OF]->(c:FinancialTerm), (d:FinancialTerm)-[e:Initial_Benchmark]->(f:LIBOR), (g:FinancialTerm)-[h:MINIMUM_VALUE]->(i:FinancialTerm)  RETURN a,b,c,d,e,f,g,h,i\

Now do the same for the following questions:

The following is the question:
{question}\

Return the cypher code to query the graph database. Only write the code.
"""

def generate_prompt_cypher2text(cypher: str, context: str):
    return f"""
You are a graph data scientist working for a company that is building a graph database. Your task is to convert cypher query to natural language text.
The following is the cypher query:\
{cypher}\

Using the logic above, extract the answer to the question from the below context:\
{context}\
"""

In [10]:
import os
import openai
from dotenv import load_dotenv
load_dotenv()

openai.api_type = "azure"
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = "2023-05-15"
openai.api_key = os.getenv("OPENAI_API_KEY")
engine="PROD-GPT-16K-TURBO"

def text2triplets(text: str):
    entity_rel_output = openai.ChatCompletion.create(
    engine=engine, # engine = "deployment_name".
    messages=[
        {"role": "system", "content": generate_system_message()},
        {"role": "user", "content": generate_prompt(text)}
        ]
    )
    entity_rel_output = entity_rel_output['choices'][0]['message']['content']
    return entity_rel_output

    """cypher_output = openai.ChatCompletion.create(
        engine=engine,
        messages=[
            {"role": "system", "content": generate_cypher()},
            {"role": "user", "content": entity_rel_output}
        ]
    )
        cypher_output = cypher_output['choices'][0]['message']['content']

        return cypher_output """

def triplet2cypher(entity_rel_output: str):
    cypher_output = openai.ChatCompletion.create(
        engine=engine,
        messages=[
            {"role": "system", "content": generate_cypher()},
            {"role": "user", "content": entity_rel_output}
        ]
    )
    cypher_output = cypher_output['choices'][0]['message']['content']

    return cypher_output

def question2cypher(entity_relationship: str, question: str) -> str:
    cypher_output = openai.ChatCompletion.create(
        engine=engine,
        messages=[
            {"role": "system", "content": generate_prompt_for_question(question=question, entity_relationship=entity_relationship)},
            {"role": "user", "content": question}
        ]
    )
    cypher_output = cypher_output['choices'][0]['message']['content']
    return cypher_output

def cypher2text(cypher: str, question: str, text: str) -> str:
    output =  openai.ChatCompletion.create(
        engine=engine,
        messages=[
            {"role": "system", "content": generate_prompt_cypher2text(cypher=cypher, context=text)},
            {"role": "user", "content": question}
        ]
    )
    output = output['choices'][0]['message']['content']
    return output

### read the text context file

In [11]:
with open('text.txt', encoding='utf-8') as f:
    text = f.read()

## Creating the Graph Database

### Convert text context to entity relationship triplets

In [12]:

#extracted_entities,query_output = text2cypher(text)
output=text2triplets(text)
print("Here all the entity relationship found: \n" + output )

Here all the entity relationship found: 
Nodes: 
["Interest Rate", "FinancialTerm", {"name":"Interest Rate"}], 
["Spread", "FinancialTerm", {"value": "4.75%", "name": "Spread"}], 
["Benchmark", "FinancialTerm", {"name": "Benchmark"}], 
["LIBOR", "FinancialTerm", {"name": "LIBOR"}], 
["IndexFloorRate", "FinancialTerm", {"name": "Index Floor Rate", "value": "0.25%"}]

Relationships: 
["Interest Rate", "SUM_OF", "Spread", {}], 
["Interest Rate", "SUM_OF", "Benchmark", {}], 
["Benchmark", "Initial Benchmark", "LIBOR", {}],  
["LIBOR", "MINIMUM_VALUE", "Index Floor Rate", {}]


### Convert entity relationship triplets to cypher query

In [16]:
print("\n Here are converted queries from triplets that can be executed on graph database: \n")
query_output = triplet2cypher(output)
print(query_output)


 Here are converted queries from triplets that can be executed on graph database: 

CREATE (ir:InterestRate:FinancialTerm {name: "Interest Rate"})
CREATE (s:Spread:FinancialTerm {value: "4.75%", name: "Spread"})
CREATE (b:Benchmark:FinancialTerm {name: "Benchmark"})
CREATE (l:LIBOR:FinancialTerm {name: "LIBOR"})
CREATE (ifr:IndexFloorRate:FinancialTerm {name: "Index Floor Rate", value: "0.25%"})
CREATE (ir)-[:SUM_OF]->(s)
CREATE (ir)-[:SUM_OF]->(b)
CREATE (b)-[:Initial_Benchmark]->(l)
CREATE (l)-[:MINIMUM_VALUE]->(ifr)


### Connect the neo4j and create the knowledge graph from the cypher

In [2]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.13.0.tar.gz (192 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/192.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/192.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.3/192.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-5.13.0-py3-none-any.whl size=265313 sha256=03531df22aa0feef76aaad29a8f93f1d8ab3dbc5349fe04ff36a0644b5d39830
  Stored in directory: /root/.cache/pip/wheels/7b/1d/b6/1be3a1e9de57bc83

In [17]:
import neo4j

# Connect to the Neo4j database

uri = "neo4j+s://52890117.databases.neo4j.io"
username = "neo4j"
password = "UxotX-MJuZFoNVyeCfh-nJTWM79IWMYJEbQHpBBcbVA"

# Connect to the Neo4j database
driver = neo4j.GraphDatabase.driver(uri, auth=(username, password))

# Create a session
session = driver.session()

# for deleting all nodes and relationships
# query = """MATCH (n)
# # DETACH DELETE n"""
# session.run(query)

# Run the query
query = query_output

# Execute the query
session.run(query)

# # visualize all the nodes and relationships
# query = """MATCH (a)-[r]->(b)
# RETURN a, r, b"""


<neo4j._sync.work.result.Result at 0x7ff601e68430>

Hey after this the graph visualization step is manual here is a link you can log in with above credentials to check your graph :
1) https://workspace-preview.neo4j.io/connection/connect


In [None]:
session.close()

## Generate answer from the questions

In [15]:
entity_rel = text2triplets(text)

question = "How is the interest rate calculated?"
cypher_output = question2cypher(entity_relationship= entity_rel, question=question)

print(cypher_output)

answer = cypher2text(cypher=cypher_output, question=question, text=text)
print(answer)

MATCH (a:FinancialTerm)-[b:SUM_OF]->(c:FinancialTerm), (c:FinancialTerm)-[d:Initial_Benchmark]->(e:FinancialTerm) RETURN a,b,c,d,e
The interest rate is calculated by adding the spread to the benchmark rate. The spread is a fixed percentage, while the benchmark rate is initially the London interbank offered rate (LIBOR) for U.S. dollars with a 1-month tenor. However, if there is a benchmark transition event or an early opt-in election, the benchmark may be replaced with a benchmark replacement. The lender determines the benchmark and its applicable reference time. The interest rate is determined by the lender and is binding unless there is a manifest error. The benchmark rate is calculated based on the rate for deposits in U.S. dollars for a one-month period that appears on the Reuters Screen LIBOR01 Page at 11:00 a.m. London time, two business days before the start of the interest period. If the rate is not available, the lender will request rates from four prime banks in the London in