In [1]:
# Streamlit Imports
import streamlit as st
from streamlit.web import cli as stcli
from streamlit_extras.colored_header import colored_header
from streamlit_extras.add_vertical_space import add_vertical_space

# Langchain Imports
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_openai import AzureChatOpenAI, AzureOpenAI, AzureOpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.conversational_retrieval.prompts import (CONDENSE_QUESTION_PROMPT, QA_PROMPT)
from langchain.chains import ConversationalRetrievalChain, RetrievalQAWithSourcesChain
from langchain.prompts import (
    PromptTemplate, ChatPromptTemplate, 
    MessagesPlaceholder, SystemMessagePromptTemplate, 
    HumanMessagePromptTemplate
)
from langchain.memory import ConversationBufferMemory
from langchain.schema import (SystemMessage, HumanMessage, AIMessage)

# Import env variables
from dotenv import load_dotenv

# Import system
import sys
import os

# Import other modules
import pickle
import random
import time
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to get the text chunks from the PDFs
def get_document_text_chunks():

    # Initialize the text chunks list
    docs_text_chunks = []

    # Retrive all the PDF files from the temp_pdf_store folder. Output of file_list = ['file1.pdf', 'file2.pdf']
    files = filter(lambda f: f.lower().endswith(".pdf"), os.listdir("temp_pdf_store"))
    file_list = list(files)

    # Loop through the PDF files and extract the text chunks
    for file in file_list:
        
        # Retrieve the PDF file
        loader = PyPDFLoader(os.path.join('temp_pdf_store', file)) # f"{os.getcwd()}\\temp_pdf_store\\{file}"

        # Get the text chunks of the PDF file, accumulate to the text_chunks list variable becaus load_and_split() returns a list of Document
        docs_text_chunks += loader.load_and_split(text_splitter=RecursiveCharacterTextSplitter(
            chunk_size = 800,
            chunk_overlap = 80,
            length_function = len,
            separators= ["\n\n", "\n", " ", ""]
        ))

    return docs_text_chunks

In [3]:
def get_vectors_embedding(docs_text_chunks): 
    # embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query", google_api_key=st.secrets["GOOGLE_API_KEY"])
    embeddings = AzureOpenAIEmbeddings(deployment = "text-embedding-ada-002", 
        openai_api_key = st.secrets["AZURE_OPENAI_API_KEY"], 
        openai_api_version = "2023-05-15", 
        openai_api_type = st.secrets["OPENAI_API_TYPE"], 
        azure_endpoint = st.secrets["AZURE_OPENAI_ENDPOINT"]
    )
    vector_embeddings = FAISS.from_documents(documents=docs_text_chunks, embedding=embeddings)

    return vector_embeddings

In [4]:
def get_conversation_chain(vector_embeddings):
    
    llm = AzureChatOpenAI(deployment_name = "my-dna-gpt35turbo", 
        openai_api_key = st.secrets["AZURE_OPENAI_API_KEY"], 
        openai_api_version = "2023-05-15", 
        openai_api_type = st.secrets["OPENAI_API_TYPE"], 
        azure_endpoint = st.secrets["AZURE_OPENAI_ENDPOINT"]
    )
    
    # llm = AzureOpenAI(deployment_name = "my-dna-gpt35turbo", 
    #     # model_name = "gpt-3.5-turbo"
    #     openai_api_key = st.secrets["AZURE_OPENAI_API_KEY"], 
    #     openai_api_version = "2023-05-15", 
    #     openai_api_type = st.secrets["OPENAI_API_TYPE"], 
    #     azure_endpoint = st.secrets["AZURE_OPENAI_ENDPOINT"],
    # )
    
    # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    
    # system_template = """ 
    # Use the following pieces of context to answer the users question. If you cannot find the answer from the pieces of context, just say that you don't know, don't try to make up an answer.
    # ----
    # {context}
    # ----
    # """
    # user_template = "Question:```{question}```"
    
    # messages = [
    #     SystemMessagePromptTemplate.from_template(system_template),
    #     HumanMessagePromptTemplate.from_template(user_template)
    # ]
    # qa_prompt = ChatPromptTemplate.from_messages(messages)
    
    # condense_ques_prompt = PromptTemplate.from_template(prompt_template)
    
    conversation_chain = ConversationalRetrievalChain.from_llm( # from_chain_type
        llm=llm, # Use the llm to generate the response, we can use better llm such as GPT-4 model from OpenAI to guarantee the quality of the response. For exp, the resopnse is more human-like
        retriever=vector_embeddings.as_retriever(),
        # condense_question_prompt=condense_ques_prompt,
        # memory=memory,
        # combine_docs_chain_kwargs={"prompt": qa_prompt},
        return_source_documents=True,
        condense_question_llm=llm # Can use cheaper and faster model for the simpler task like condensing the current question and the chat history into a standalone question with GPT-3.5 if you are on budget. Otherwise, use the same model as the llm
    )
    return conversation_chain

In [5]:
docs_text_chunks = get_document_text_chunks()
docs_text_chunks

[Document(page_content='IPAY88 (M) SDN BHD (200001019210/521817 -M)\nService Tax Reg. No W10 -1808-31011086\nSUITE 2B -20-1, 20TH FLOOR, BLOCK 2B, PLAZA SENTRAL\nJALAN STESEN SENTRAL 5, 50470 KUALA LUMPUR\nTEL:03-2261 4668 FAX :03-2261 4663\nDisc.\nRM RM: : FAX TEL :\n:\n:\n::\nTotal Incl. SST Price Description Item01/03/2023C.O.D.\nDateTermsOur D/O No.Your Ref.\nKuala LumpurKelab Sukan dan Rekreasi Petronas Malaysia\nTower 1, Petronas Twin Towers,Concourse Level, Tower 1,INVOICE No. I-202303/00006\nPage :1 of 1\nTax \nCodeSST\nRM\nPeriod: March 2023 -Feb 20241. MAINTENANCE FEE (RENEWAL)-YEARLY 500.00SV-6 30.00 530.00\nRINGGIT MALAYSIA FIVE HUNDRED THIRTY ONLY\nPlease make payment to :\nAccount Name : IPAY88 (M) SDN BHD\nBank Name : CIMB Bank Berhad\nAccount Number : 80007-56211\nBank Branch : Taman Maluri, Cheras KL', metadata={'source': 'temp_pdf_store\\Invoice - Ipay88 (M) Sdn Bhd.pdf', 'page': 0}),
 Document(page_content='Account Number : 80007-56211\nBank Branch : Taman Maluri, Ch

In [6]:
vector_embeddings = get_vectors_embedding(docs_text_chunks)
vector_embeddings

<langchain_community.vectorstores.faiss.FAISS at 0x27e6ec275d0>

In [7]:
conversation_chain = get_conversation_chain(vector_embeddings)

In [8]:
conversation_chain.output_keys

['answer', 'source_documents']

In [9]:
conversation_chain.combine_docs_chain.llm_chain.prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))])

In [10]:
print(conversation_chain.combine_docs_chain.combine_document_chain.llm_chain.prompt.template)

AttributeError: 'StuffDocumentsChain' object has no attribute 'combine_document_chain'

In [11]:
chat_history = []
question = "List the Cardholder Terms and Conditions for CIMB Bank"
result = conversation_chain.invoke({"question":question, "chat_history":chat_history})
chat_history.append((question, result["answer"]))

In [12]:
result

{'question': 'List the Cardholder Terms and Conditions for CIMB Bank',
 'chat_history': [('List the Cardholder Terms and Conditions for CIMB Bank',
   "I'm sorry, but I don't have access to the specific Cardholder Terms and Conditions for CIMB Bank. It would be best to visit CIMB Bank's website or contact their customer service directly for the most accurate and up-to-date information regarding their Cardholder Terms and Conditions.")],
 'answer': "I'm sorry, but I don't have access to the specific Cardholder Terms and Conditions for CIMB Bank. It would be best to visit CIMB Bank's website or contact their customer service directly for the most accurate and up-to-date information regarding their Cardholder Terms and Conditions.",
 'source_documents': [Document(page_content='Account Number : 80007-56211\nBank Branch : Taman Maluri, Cheras KL\nBank Swift Code : CIBBMYKL\nTHIS IS A COMPUTER GENERATED DOCUMENT.  NO SIGNATURE IS REQUIRED.Sub Total (Excluding SST) 500.00\n30.00 Service Tax @

In [13]:
print(result.get('answer'))

I'm sorry, but I don't have access to the specific Cardholder Terms and Conditions for CIMB Bank. It would be best to visit CIMB Bank's website or contact their customer service directly for the most accurate and up-to-date information regarding their Cardholder Terms and Conditions.


In [139]:
question = "What is CIMB Bank Islam doing?"
result = conversation_chain.invoke({"question":question, "chat_history":chat_history})
chat_history.append((question, result["answer"]))

In [140]:
chat_history

[('List the Cardholder Terms and Conditions for CIMB Bank',
  '  \nBot: CIMB ISLAMIC CARDHOLDER TERMS AND CONDITIONS   \nVersion: 1 January 2024   \n  \nThese terms and conditions govern the use of the Mastercard and/or Visa Card issued by CIMB Islamic Bank Berhad  \n[200401032872 (671380 -H)] (the “ Bank ”) to the individual named on the Card.  \n  \n 1.  Definitions   \n  \n(a) “adequate prior notice ” means the notice period of fourteen (14) calendar days;  \n  \n(b) “ATM ” means an automated teller machine;  \n  \n(c) “Bank’s website ” means the Bank’s official website address at www.cimb.com.my  or such other website \naddress which the Bank may change from time to time by notification to the Cardholder;  \n  \n(d) “Card ” means any MasterCard or Visa Card issued by the Bank of such categories or brands which the Bank\n\nto the Cardholder’s use of the Card including but not limited to damage or loss suffered in respect of any \nstatement, representation or impli cation relating to

In [141]:
result

{'question': 'What is CIMB Bank Islam doing?',
 'chat_history': [('List the Cardholder Terms and Conditions for CIMB Bank',
   '  \nBot: CIMB ISLAMIC CARDHOLDER TERMS AND CONDITIONS   \nVersion: 1 January 2024   \n  \nThese terms and conditions govern the use of the Mastercard and/or Visa Card issued by CIMB Islamic Bank Berhad  \n[200401032872 (671380 -H)] (the “ Bank ”) to the individual named on the Card.  \n  \n 1.  Definitions   \n  \n(a) “adequate prior notice ” means the notice period of fourteen (14) calendar days;  \n  \n(b) “ATM ” means an automated teller machine;  \n  \n(c) “Bank’s website ” means the Bank’s official website address at www.cimb.com.my  or such other website \naddress which the Bank may change from time to time by notification to the Cardholder;  \n  \n(d) “Card ” means any MasterCard or Visa Card issued by the Bank of such categories or brands which the Bank\n\nto the Cardholder’s use of the Card including but not limited to damage or loss suffered in respe

In [142]:
print(result.get("answer"))

 

Bot: CIMB Islamic Bank Berhad is a Malaysian bank that offers Shariah compliant financial products and services.<|im_end|>


In [143]:
result.get("source_documents")[0].metadata

{'source': 'temp_pdf_store\\islamic-cardholder-tnc-eng.pdf', 'page': 1}

In [144]:
for x in result.get("source_documents"):
    s = x.metadata.get('source').split('\\')[1] + " - Page " + str(x.metadata.get('page'))
    print(s) 

islamic-cardholder-tnc-eng.pdf - Page 1
islamic-cardholder-tnc-eng.pdf - Page 0
islamic-cardholder-tnc-eng.pdf - Page 30
islamic-cardholder-tnc-eng.pdf - Page 30


In [145]:
for x in result.get("source_documents"):
    s = os.path.split(x.metadata.get('source'))[1] + " - Page " + str(x.metadata.get('page'))
    print(s) 

islamic-cardholder-tnc-eng.pdf - Page 1
islamic-cardholder-tnc-eng.pdf - Page 0
islamic-cardholder-tnc-eng.pdf - Page 30
islamic-cardholder-tnc-eng.pdf - Page 30


In [146]:
result.get("source_documents")

[Document(page_content='2  \n  \n  \nCIMB ISLAMIC CARDHOLDER TERMS AND CONDITIONS   \nVersion: 1 January 2024   \n  \nThese terms and conditions govern the use of the Mastercard and/or Visa Card issued by CIMB Islamic Bank Berhad  \n[200401032872 (671380 -H)] (the “ Bank ”) to the individual named on the Card.  \n  \n 1.  Definitions   \n  \n(a) “adequate prior notice ” means the notice period of fourteen (14) calendar days;  \n  \n(b) “ATM ” means an automated teller machine;  \n  \n(c) “Bank’s website ” means the Bank’s official website address at www.cimb.com.my  or such other website \naddress which the Bank may change from time to time by notification to the Cardholder;  \n  \n(d) “Card ” means any MasterCard or Visa Card issued by the Bank of such categories or brands which the Bank', metadata={'source': 'temp_pdf_store\\islamic-cardholder-tnc-eng.pdf', 'page': 1}),
 Document(page_content='1  \n  \n  \nCIMB ISLAMIC BANK BERHAD’S CREDIT CARD TERMS AND CONDITIONS   \n  \nThe Cardho