In [2]:
import os
from langchain.llms import GooglePalm
from langchain.vectorstores import FAISS
from langchain.embeddings import GooglePalmEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI, ChatAnthropic
from langchain.llms import AzureOpenAI
from langchain.document_loaders import DirectoryLoader,PyPDFLoader, UnstructuredPDFLoader, MathpixPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter
# from langchain.document_loaders import UnstructuredExcelLoader
# from langchain.vectorstores import DocArrayInMemorySearch
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain import PromptTemplate
# from langchain.vectorstores import Chroma
# from langchain.agents.tools import Tool
# from langchain.experimental.plan_and_execute import PlanAndExecute, load_agent_executor, load_chat_planner
# from langchain import OpenAI, VectorDBQA
# from langchain.chains.router import MultiRetrievalQAChain
import streamlit as st
import pandas as pd
from tqdm import tqdm
import tiktoken
# from langchain.document_loaders import UnstructuredPDFLoader
from langchain.callbacks import StdOutCallbackHandler

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from langchain.chains import SequentialChain
from langchain.chains import LLMChain

In [3]:
os.environ["ANTHROPIC_API_KEY"] = "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"
claude_models = ["claude-instant-1","claude-2"]
anthropic_llm = ChatAnthropic(model=claude_models[1],temperature= 0,max_tokens_to_sample = 512)

os.environ["OPENAI_API_KEY"] = "36ed7c12ee2344d18a3f71ddeb477ce6"
os.environ["OPENAI_API_TYPE"] ="azure"
os.environ["OPENAI_API_VERSION"] ="2023-05-15"
os.environ["OPENAI_API_BASE"] = "https://testavinx.openai.azure.com/"

openai_llm = AzureChatOpenAI(deployment_name="gpt-35-turbo",model_name="gpt-35-turbo",temperature=0)
embeddings = OpenAIEmbeddings(deployment="embedding1",model="text-embedding-ada-002",openai_api_base="https://testavinx.openai.azure.com/",openai_api_type="azure",chunk_size = 1)

In [4]:
def load_doc(path,pages=[]):
    if path.endswith(".pdf"):
        doc = UnstructuredPDFLoader(file_path=path)
    else:
        doc = DirectoryLoader(path=path,glob="**/*.pdf")
    document = doc.load()
    if len(pages)!=0:
        document = [document[i] for i in pages]
    context = "\n\n".join([document[i].page_content for i in range(len(document))])
    return context

def token_len(string,model="text-embedding-ada-002"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(string))


def create_embedding(path,embeddings):
    if path.endswith(".pdf"):
        doc = PyPDFLoader(file_path=path)
    else:
        doc = DirectoryLoader(path=path,glob="**/*.pdf")
    document = doc.load()
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100,length_function = token_len)
    # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)/
    # text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
    # document = text_splitter.split_documents(document)
    file_name = os.path.splitext(os.path.split(path)[-1])[0]
    if (f"{file_name}.pkl" not in os.listdir("./FAISS_VS/")) or (f"{file_name}.faiss" not in os.listdir("./FAISS_VS/")):
        docsearch = FAISS.from_documents(document, embeddings)
        docsearch.save_local(folder_path='FAISS_VS', index_name=f"{file_name}")
    return document

def get_relevant_context(question,docs,top_k=5):
    context = dict()
    for doc_name,doc_info in docs.items():
        relevant_docs = doc_info["embeddings"].similarity_search(question,k=top_k)
        pages = [d.metadata["page"] for d in relevant_docs]
        relevant_pages = sorted(set().union(*[{p,p+1,p-1} for p in pages]).difference({-1}))
        context[doc_name] = "\n\n".join([d.page_content for d in doc_info["documents"] if d.metadata["page"] in relevant_pages])
    return context

In [5]:
document_paths = [
    './data/Home Bancorp (HB).pdf',
    './data/National Bank of Canada (NBC).pdf',
    './data/Bank of Montreal (BMO).pdf',
    './data/Basel Capital Adequacy Reporting (BCAR).pdf',
    './data/Versa Bank (VB)'
    ]

In [6]:
institute = "Bank of Montreal (BMO)"
client = "Basel Capital Adequacy Reporting (BCAR)"

docs = {
    f"{institute} Annual Report Document"   : {
        "embeddings":FAISS.load_local(folder_path='./FAISS_VS', embeddings=embeddings, index_name=institute),
        "documents" :create_embedding(f'./data/{institute}.pdf',embeddings),
        },
    f"{client} Document"                    : {
        "embeddings":FAISS.load_local(folder_path='./FAISS_VS', embeddings=embeddings, index_name=client),
        "documents" :create_embedding(f'./data/{client}.pdf',embeddings),
        } 
    }

In [7]:
# def compare_answer(chat_llm,question,docs,top_k=5):
    
#     relevant_context = get_relevant_context(question,docs,top_k)
    
#     retrival_system_template = """You are a helpful assistant, You need to extract as much text as you can which is relater or relevant to the answer of the given question from the context provided:
    
# Question: "{question}"

# Context: {doc_name}

# {context}

# DO NOT answer the question.
# Only extract the relevant text from the provided context.
# DO NOT use previous knowledge only use the provided context.
# """
    
#     retrival_system_prompt = SystemMessagePromptTemplate.from_template(template=retrival_system_template)
#     retrival_messages = [retrival_system_prompt]
#     retrival_chat_prompt = ChatPromptTemplate.from_messages(retrival_messages)
    
#     summary = dict()
#     for doc_name,doc_txt in tqdm(relevant_context.items()):
#         retrival_llm = ChatAnthropic(model=claude_models[1],temperature= 0,max_tokens_to_sample = 512)
#         summary[doc_name] = retrival_llm(retrival_chat_prompt.format_prompt(question=question,doc_name=doc_name,context=doc_txt).to_messages()).content

#     compare_context = "\n\n".join([f"Relevant points from {doc_name}:\n\n{doc_summary}" for doc_name,doc_summary in summary.items()])
#     print("\n\n",compare_context,"\n\n")
    
#     compare_system_template = """You are a helpful chatbot who has to answer question of a user from the institute {institute}.
# You will be given relevant points from various documents that will help you answer the user question.
# Below is a list of relevant points along with the name of the document from where thoes points are from.
# Consider all the documents provided to you and answer the question by choosing all the relevant points to the question.
# You might have to compare points from more than one document to answer the question.

# {context}"""

#     compare_system_prompt = SystemMessagePromptTemplate.from_template(template=compare_system_template)
#     compare_messages = [compare_system_prompt,HumanMessage(content=question)]
#     compare_chat_prompt = ChatPromptTemplate.from_messages(compare_messages)
#     response = chat_llm(compare_chat_prompt.format_prompt(institute=institute,context=compare_context).to_messages()).content
#     return response

In [8]:
question = "According to the Fiscal year end mentioned in the annual report on what months should BMO file BCAR?"

In [9]:
bank_system_template = """You are a helpful assistant who has been given the question "{question}".
Corresponding to previous question you have to extract the information from {institute} Annual Report provided below.
Just extract the relevent information or text from the context below that one required to answer the above question.
You are not question answering chatbot you just extract similar text from the context.

{institute} Annual Report:

{bank_context}
"""
    
bank_system_prompt = SystemMessagePromptTemplate.from_template(template=bank_system_template)
bank_chat_prompt = ChatPromptTemplate.from_messages([bank_system_prompt])
bank_chain = LLMChain(llm=anthropic_llm, prompt=bank_chat_prompt, output_key="bank_summary")

In [10]:
bcar_system_template = """You are a helpful assistant who has been given the question "{question}".
Corresponding to previous question you have to extract the information from Basel Capital Adequacy Reporting (BCAR) Document provided below.
Just extract the relevent information or text from the context below that one required to answer the above question.
You are not question answering chatbot you just extract similar text from the context.

Basel Capital Adequacy Reporting (BCAR) Document:

{bcar_context}
"""
    
bcar_system_prompt = SystemMessagePromptTemplate.from_template(template=bank_system_template)
bcar_chat_prompt = ChatPromptTemplate.from_messages([bcar_system_prompt])
bcar_chain = LLMChain(llm=anthropic_llm, prompt=bcar_chat_prompt, output_key="bcar_summary")

In [11]:
compare_system_template = """You are a helpful chatbot who has to answer question of a user from the institute {institute}.
You will be given relevant points from various documents that will help you answer the user question.
Below is a list of relevant points along with the name of the document from where thoes points are from.
Consider all the documents provided to you and answer the question by choosing all the relevant points to the question.
You might have to compare points from more than one document to answer the question.

{institute} Annual Report:
{bank_summary}

Basel Capital Adequacy Reporting (BCAR) Document:
{bcar_summary}
"""

compare_system_prompt = SystemMessagePromptTemplate.from_template(template=compare_system_template)
compare_messages = [compare_system_prompt,HumanMessage(content=question)]
compare_chat_prompt = ChatPromptTemplate.from_messages(compare_messages)
compare_chain = LLMChain(llm=anthropic_llm, prompt=compare_chat_prompt, output_key="final_answer")

In [12]:
overall_chain = SequentialChain(
    chains=[bank_chain,bcar_chain,compare_chain],
    input_variables=["bank_context","bcar_context","institute","question"],
    output_variables=["bank_summary","bcar_summary","final_answer"]
)

In [13]:
bank_context, bcar_context = get_relevant_context(question,docs).values()

In [14]:
response = overall_chain({"question":question,"institute":institute,"bank_context":bank_context,"bcar_context":bcar_context})

In [15]:
print(response['final_answer'])

 Based on the information provided in the Bank of Montreal (BMO) Annual Report and Basel Capital Adequacy Reporting (BCAR) Document, the relevant points are:

- The annual report states that BMO's fiscal year end is October 31.

- The annual report mentions that BMO is required to meet the minimum TLAC Ratio and TLAC Leverage Ratio effective November 1, 2021, as calculated under OSFI's TLAC Guideline. 

- OSFI's TLAC Guideline states that D-SIBs like BMO must file BCAR on a quarterly basis.

- Therefore, based on a fiscal year ending October 31, BMO should file BCAR on the following months:

1) January 31 (for the quarter ending January 31)
2) April 30 (for the quarter ending April 30)  
3) July 31 (for the quarter ending July 31)
4) October 31 (for the quarter ending October 31)

In summary, according to its fiscal year end of October 31 mentioned in the annual report, BMO should file BCAR on January 31, April 30, July 31, and October 31 each year.


In [16]:
import streamlit  as st