## Importing Libraries

In [99]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain import PromptTemplate, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter

import os

Set the OpenAI API key as an environment variable

In [100]:
## Setting the environment variables of the OpenAI
# The key must never be written into the notebook: it is taken from the
# environment when already set, otherwise it is requested without echo.
# NOTE(review): a literal key used to be hardcoded in this cell; treat that key
# as leaked and revoke it.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

### Defining the Custom Recursive Character Text Splitter

The custom text splitter subclasses `RecursiveCharacterTextSplitter` and adds a page number to the metadata of every chunk it produces.

In [101]:
class CustomRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter that tags every chunk with a page number."""

    def split_documents(self, documents):
        """Split each document and record a 1-based 'page_number' on its chunks.

        Args:
            documents: Documents to split, one per page in reading order.

        Returns:
            A flat list of chunks. Each chunk keeps the metadata produced by the
            parent splitter and gains a 'page_number' entry.
        """
        texts = []
        for position, doc in enumerate(documents, start=1):
            # Prefer the page index recorded by the loader so numbering restarts
            # for every file when several PDFs are loaded; fall back to the
            # position in the input list when no such index is available.
            # NOTE(review): assumes the loader stores a zero-based 'page' key
            # (as PyPDFLoader is documented to do) - confirm for other loaders.
            source_page = doc.metadata.get('page')
            page_number = source_page + 1 if isinstance(source_page, int) else position
            for chunk in super().split_documents([doc]):
                # Extend the metadata instead of replacing it, so fields such
                # as the source file path are not lost.
                chunk.metadata = {**chunk.metadata, 'page_number': page_number}
                texts.append(chunk)
        return texts

## Creating the FAISS knowledge base and embeddings

In [102]:
DATA_PATH = 'data-legal/'
DB_FAISS_PATH = 'vectorstore/db_faiss'

# Load every PDF found directly under DATA_PATH.
loader = DirectoryLoader(DATA_PATH,
                         glob='*.pdf',
                         loader_cls=PyPDFLoader)
documents = loader.load()
if not documents:
    # Stop here with a readable message rather than failing later while
    # building an index out of nothing.
    raise FileNotFoundError(f"No PDF pages were loaded from {DATA_PATH!r}")

# Split the loaded pages into chunks that carry their page number.
text_splitter = CustomRecursiveCharacterTextSplitter(chunk_size=1500,
                                                     chunk_overlap=20)
texts = text_splitter.split_documents(documents)

# The embedding model is chosen with the `model` argument; `model_kwargs` is
# meant for extra request parameters, not for the model name.
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

# Embed the chunks, build the FAISS index and persist it to disk.
db = FAISS.from_documents(texts, embeddings)
db.save_local(DB_FAISS_PATH)

## Initializing the retriever

In [103]:
## Information retrieval from the FAISS knowledge base
db = FAISS.load_local(DB_FAISS_PATH, embeddings)

# Search parameters handed to the vector store on every query:
#   k       - number of chunks returned for a query
#   fetch_k - size of the candidate pool fetched before any post-selection
#
# The former 'maximal_marginal_relevance' and 'distance_metric' entries were
# dropped: FAISS similarity search does not appear to read them, so they only
# suggested behaviour that never took place.
# NOTE(review): to really use maximal marginal relevance, pass
# search_type='mmr' to as_retriever; that changes which chunks are retrieved,
# so re-validate the answers before switching.
search_kwargs = {
    'k': 30,
    'fetch_k': 100,
}

retriever = db.as_retriever(search_kwargs=search_kwargs)

# Optional hybrid (keyword + vector) retrieval, currently disabled:
# bm25_retriever = BM25Retriever.from_documents(texts)
# bm25_retriever.k = 5
# ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.6, 0.4])

# Initialize the LLM; temperature=0 is used for the answers.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

In [104]:
# Build an LLM-based document filter from the chat model and place it in front
# of the FAISS retriever; the page-lookup cells query this wrapped retriever.
compressor = LLMChainFilter.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)


## Defining the prompt for retrieving the answer for the legal queries from the knowledge base

In [197]:
# Prompt used by the answer-generation chain. It sets the persona, gives
# per-question-type answering rules and a few worked examples, and exposes two
# template slots: {context} (retrieved text) and {question} (the user query).
# The text is sent to the model verbatim, so edit it with care.
custom_prompt_template = """Imagine you are the general counsel of a tech company and you need to go through an agreement and answer a few questions.
Use the following pieces of information to answer the question.

If the question asks for a specific piece of information like numerical values, answer by the specific value followed by the context of the question.

For question asking for description, answer with detailed bulleted description. Do not summarize the bullet points and remember to include all the points as mentioned in the source context. 

For questions asking for interest rate of Spread, search the named entity Spread and find the interest rate, followed by any relation to other variables.

For questions related to interest rate, answer with the rate of Spread and the initial benchmark and how they are related. Use the proper terminology of initial benchmark as mentioned in the context. 

For questions asking for initial maturity date or stated maturity date, search the named entity Stated Maturity Date and find the maturity date. The Stated Maturity Date should be an independent entity under definitions header.

For questions related to prepayability of loan, mention the numbers and facts like, Sections under which they are related. Do not summarize any response.

For example: 

Question: What is the name of the borrower of the loan agreement?
Answer: XYZ ('Borrower')

Question: What is the name of the lender of the loan agreement?
Answer: XYZ

Question: What is the maximum principal amount of loan in the loan agreement and the initial advance?
Answer: $15151515.00, with an initial advance of $43531.00
Mention the initial advance if it is mentioned

Question: what is the maximum amount of loan and the initial advance on loan or closing date advance amount?
Answer: $21323213.00; initial advance on loan or closing date advance amount: $43531.00 ('Closing Date Advance')

Question: what amount shall remain unfunded on the closing date in an unfunded reserve for payment and completion of approved initial capex?
Answer: $21323213.00 for payment and completion of the Approved Initial CapEx.

Question: What is the closing date of the loan agreement?
Answer: Date (the 'Closing Date')

Question: What is the Stated Maturity Date of the loan in the agreement?
Answer: Date

If you don't know the answer, return None, don't try to make up an answer.
Context information is below.
    ---------------------
    {context}
    ---------------------
    Given the context information,
    answer the question: {question}
    
    Answer:
"""

# Wrap the text in a PromptTemplate; the declared input variables must match
# the {context} and {question} placeholders used in the template above.
prompt = PromptTemplate(template=custom_prompt_template,
                            input_variables=['context', 'question'])


## Defining the document-relevancy prompt used to fetch the page numbers of the most relevant pages

In [194]:
# Prompt used to pick the page(s) that support an answer. {context} receives a
# listing of candidate documents, each followed by its "Page Number:" line, and
# {query} receives the user question; the model is asked to reply with page
# numbers only. The text is sent to the model verbatim, so edit it with care.
custom_prompt_for_document_relevancy = """
A list of documents relevant to the query is shown below. Each document has a page number associated with it. 
Strictly respond with the page number of the document you should consult to answer the question, in order of relevance and return the page number of the most relevant document containing the query.
Do not include any documents that are not relevant to the query. Return only the page number of the relevant document as Answer.

If there are multiple pages containing the information, return the pages in a list of page numbers like [<page_number_1>, <page_number_2>,].

Example format:
Document 1:
<summary of document 1>
Page Number: <page_number of document 1>
Document 2:
<summary of document 2>
Page Number: <page_number of document 2>

Query: <query>
Page Number:

Let's try this now:
{context}
Query: {query}
Page Number:
"""

# Wrap the text in a PromptTemplate; the declared input variables must match
# the {context} and {query} placeholders used in the template above.
retrieval_prompt = PromptTemplate(template=custom_prompt_for_document_relevancy,
                            input_variables=['context', 'query'])


In [195]:
## Retrieval QA chain for retrieval from the FAISS
# All retrieved chunks are "stuffed" into the custom answer prompt in a single
# call, and the source documents are returned next to the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type='stuff',
    chain_type_kwargs={'prompt': prompt},
    return_source_documents=True,
)

## LLM chain for information relevancy from the returned query
# Plain prompt -> LLM chain; it is fed the page-annotated context built in the
# page-lookup cells.
relevance_qa_chain = LLMChain(prompt=retrieval_prompt, llm=llm)


## Running the queries from the legal document

**Question**: "What is the name of the borrower of the loan agreement?"

In [108]:
# Ask the retrieval-QA chain and print only the generated answer text.
query = "What is the name of the borrower of the loan agreement?"
res = qa_chain({"query": query})
print(res["result"])

TIDES ON BOCA RATON SOUTH OWNER, LLC ('Borrower')


Fetching the page number from the most relevant document

In [109]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them with their page numbers. Numbering starts at 1 to match the
# example format shown in the relevancy prompt.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)



Page Number: 1


**Question**: "What is the name of the lender of the loan agreement?"

In [110]:
# Ask the retrieval-QA chain and print only the generated answer text.
query = "What is the name of the lender of the loan agreement?"
res = qa_chain({"query": query})
print(res["result"])

ACM CRE FUND I-L, LP


Fetching the page number from the most relevant document

In [111]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them with their page numbers. Numbering starts at 1 to match the
# example format shown in the relevancy prompt.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 1


**Question**: "What is the maximum principal amount of loan in the loan agreement and the initial advance on loan amount?"

In [112]:
# Ask the retrieval-QA chain and print only the generated answer text.
query = "What is the maximum principal amount of loan in the loan agreement and the initial advance on loan amount?"
res = qa_chain({"query": query})
print(res["result"])

The maximum principal amount of the loan in the loan agreement is $13,817,798.00. The initial advance on the loan amount is $12,274,640.00.


In [113]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 34


**Question**: "What amount shall remain unfunded on the closing date in an unfunded reserve for payment and completion of approved initial capex? Answer with the amount and the purpose."

In [114]:
# Ask the retrieval-QA chain and print only the generated answer text.
query = "what amount shall remain unfunded on the closing date in an unfunded reserve for payment and completion of approved initial capex? Answer with the amount and the purpose."
res = qa_chain({"query": query})
print(res["result"])

$1,543,158.00 for payment and completion of the Approved Initial CapEx.


In [115]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 51


**Question**: "What is the closing date of the loan agreement?"

In [116]:
# Ask the retrieval-QA chain and print only the generated answer text.
query = "What is the closing date of the loan agreement?"
res = qa_chain({"query": query})
print(res["result"])

June 2, 2022 (the "Closing Date")


In [117]:

# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 6


**Question**: "What is the Stated Maturity Date of the loan in the agreement?"

In [118]:
# Ask the retrieval-QA chain and print the generated answer text, as every
# other question cell does; previously the answer was computed but never shown.
query = "What is the Stated Maturity Date of the loan in the agreement?"
res = qa_chain({'query': query})
print(res['result'])


In [119]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 31


**Question**: Is the loan prepayable, answer yes or no? If yes, mention the under which circumstances and what are the conditions of prepayment? What is 'Exit Fee' and the conditions related to it? What is 'Prepayment Premium' and the conditions related to it? Explain in detailed bulleted points, all the conditions of prepayment under 'Optional Prepayments' of the loan agreement. Strictly do not summarize.

In [130]:
# Ask the retrieval-QA chain; the answer text is the cell's displayed value.
query = "Is the loan prepayable, answer yes or no? If yes, mention the under which circumstances and what are the conditions of prepayment? What is 'Exit Fee' and the conditions related to it? What is 'Prepayment Premium' and the conditions related to it? Explain in detailed bulleted points, all the conditions of prepayment under 'Optional Prepayments' of the loan agreement. Strictly do not summarize."
res = qa_chain({"query": query})
res["result"]

'Yes, the loan is prepayable. The conditions of prepayment are as follows:\n\n- Borrower may elect to prepay the Loan in full as set forth in Section 2.3.3; provided, that contemporaneously with such prepayment Borrower shall pay to Lender the Prepayment Premium, if any.\n- If the Loan is accelerated for any reason, other than Casualty or Condemnation at a time that no Event of Default is continuing, Borrower shall pay to Lender, in addition to all other amounts outstanding under the Loan Documents, the Prepayment Premium that would be payable on the date of acceleration calculated as if Borrower had elected to make a voluntary prepayment of the Loan pursuant to Section 2.3.3.\n- Borrower acknowledges that Lender is making the Loan in consideration of the receipt by Lender of all interest and other benefits intended to be conferred by the Loan Documents and if payments of Principal are made to Lender on or prior to the Stated Maturity Date, for any reason whatsoever, whether voluntary,

In [131]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 37


**Question**: "What are the spread and the index floor rate, and how does the spread relate to the benchmark?"

In [215]:
# Ask the retrieval-QA chain; the answer text is the cell's displayed value.
query = "What is rate of spread (the 'Spread'). What is the relation with benchmark with Spread to form the interest rate. What is the floor index rate? Write in sentence format and numbers"
res = qa_chain({"query": query})
res["result"]

"The rate of spread (the 'Spread') is 4.20%. The interest rate is determined by adding the Spread to the benchmark for the interest period. The floor index rate is 1.12606% per annum."

In [216]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 7


**Question**: Describe the use of future advances in the loan agreement. Explain in detailed bulleted points, all the conditions under 'Initial CapEx Reserve' of the loan agreement. Strictly do not summarize.

In [134]:
# Ask the retrieval-QA chain; the answer text is the cell's displayed value.
query = "Describe use of future advances in the loan agreement. Explain in detailed bulleted points, all the conditions under 'Initial CapEx Reserve' of the loan agreement. Strictly do not summarize."
res = qa_chain({"query": query})
res["result"]

"- The loan agreement allows for future advances from the Initial CapEx Reserve.\n- The advances can be made within ten (10) business days after Lender's determination that all conditions to advance have been satisfied.\n- The advances can be made in increments of at least $25,000, except for the last advance which may be of a lesser amount.\n- Each advance from the Initial CapEx Reserve is subject to the satisfaction of certain conditions.\n- If Lender has not yet advanced or disbursed $1,000,000 from the Initial CapEx Reserve for the costs of the Approved Initial CapEx, the advance is for Capital Expenses actually incurred in respect of Approved Initial CapEx.\n- If Lender has advanced or disbursed $1,000,000 or more from the Initial CapEx Reserve for the costs of the Approved Initial CapEx, the advance is for 50% of the Capital Expenses actually incurred in respect of Approved Initial CapEx.\n- Borrower must provide evidence acceptable to Lender that the remaining 50% of such Capita

In [135]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 51


**Question**: "What amount shall remain unfunded on the closing date in an unfunded reserve for payment and completion of approved initial capex?"

In [126]:
# Ask the retrieval-QA chain; the answer text is the cell's displayed value.
query = "what amount shall remain unfunded on the closing date in an unfunded reserve for payment and completion of approved initial capex?"
res = qa_chain({"query": query})
res["result"]

'$1,543,158.00'

In [127]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 51


In [217]:
# Ask the retrieval-QA chain; the answer text is the cell's displayed value.
query = "Describe the extension options in the loan agreement? Explain in detailed bulleted points, all the conditions of extension under 'Extension Conditions' of the loan agreement. Strictly do not summarize."
res = qa_chain({"query": query})
res["result"]

'- The extension options in the loan agreement allow the borrower to extend the term and stated maturity date of the loan.\n- The extension can only be granted if certain conditions, known as the "Extension Conditions," are satisfied.\n- The Extension Conditions include the following:\n  1. No monetary or material non-monetary default or event of default exists at the time of the extension request and on the originally scheduled stated maturity date or the first extended maturity date.\n  2. The borrower must provide an officer\'s certificate confirming the accuracy of the statement in condition 1.\n  3. On or before the originally scheduled stated maturity date or the first extended maturity date, the borrower must either extend the term of the interest rate protection agreement or enter into a new agreement that expires no earlier than the first extended maturity date. The extension or new agreement must be in respect of the maximum principal amount and on the same terms as set forth

In [218]:
# Retrieve the chunks that pass the LLM relevance filter for "question + answer"
# and list them, numbered from 1, together with their page numbers.
compressed_docs = compression_retriever.get_relevant_documents(query+'\n'+res['result'])
context = "".join(
    f"Document {index}\n{doc.page_content}\nPage Number: {doc.metadata['page_number']}\n\n"
    for index, doc in enumerate(compressed_docs, start=1)
)

# The page answer gets its own name: overwriting `res` with a string would make
# `res['result']` fail if this cell were executed a second time.
page_answer = relevance_qa_chain.predict(context=context, query=query)
print(page_answer)

Page Number: 46
