In [2]:
# Streamlit Imports
import streamlit as st
from streamlit.web import cli as stcli
from streamlit_extras.colored_header import colored_header
from streamlit_extras.add_vertical_space import add_vertical_space

# Langchain Imports
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_openai import AzureChatOpenAI, AzureOpenAI, AzureOpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS, Chroma
from langchain.chains.conversational_retrieval.prompts import (CONDENSE_QUESTION_PROMPT, QA_PROMPT)
from langchain.chains import ConversationalRetrievalChain, RetrievalQAWithSourcesChain
from langchain.prompts import (
    PromptTemplate, ChatPromptTemplate, 
    MessagesPlaceholder, SystemMessagePromptTemplate, 
    HumanMessagePromptTemplate
)
from langchain.memory import ConversationBufferMemory
from langchain.schema import (SystemMessage, HumanMessage, AIMessage)
from langchain.schema import StrOutputParser
# Import env variables
from dotenv import load_dotenv

# Import system
import sys
import os

# Import other modules
import pickle
import random
import time
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to get the text chunks from the PDFs
def get_document_text_chunks():

    # Initialize the text chunks list
    docs_text_chunks = []

    # Retrive all the PDF files from the temp_pdf_store folder. Output of file_list = ['file1.pdf', 'file2.pdf']
    files = filter(lambda f: f.lower().endswith(".pdf"), os.listdir("temp_pdf_store"))
    file_list = list(files)

    # Loop through the PDF files and extract the text chunks
    for file in file_list:
        
        # Retrieve the PDF file
        loader = PyPDFLoader(os.path.join('temp_pdf_store', file)) # f"{os.getcwd()}\\temp_pdf_store\\{file}"

        # Get the text chunks of the PDF file, accumulate to the text_chunks list variable becaus load_and_split() returns a list of Document
        docs_text_chunks += loader.load_and_split(text_splitter=RecursiveCharacterTextSplitter(
            chunk_size = 800,
            chunk_overlap = 80,
            length_function = len,
            separators= ["\n\n", "\n", " ", ""]
        ))

    return docs_text_chunks

In [3]:
def get_vectors_embedding(docs_text_chunks): 
    # embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query", google_api_key=st.secrets["GOOGLE_API_KEY"])
    embeddings = AzureOpenAIEmbeddings(
        deployment = "text-embedding-3-small", 
        openai_api_key = st.secrets["AZURE_OPENAI_API_KEY"], 
        openai_api_version = "2024-02-01", 
        openai_api_type = "azure", 
        azure_endpoint = st.secrets["AZURE_OPENAI_ENDPOINT"]
    )
    vector_embeddings = FAISS.from_documents(documents=docs_text_chunks, embedding=embeddings)

    return vector_embeddings

In [4]:
def get_conversation_chain(vector_embeddings):

    # Create LLM
    llm = AzureChatOpenAI(
        deployment_name = "gpt-35-turbo-16k", 
        openai_api_key = st.secrets["AZURE_OPENAI_API_KEY"], 
        openai_api_version = "2024-02-01",  
        azure_endpoint = st.secrets["AZURE_OPENAI_ENDPOINT"],
        max_tokens = 550,
        model_kwargs={"top_p": 0.9}
    )
    
    # Create a template
    custom_template = """
    You are a helpful and powerful AI Assistant for question-asnwering tasks. Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. If you do not know the answer reply with "I am sorry, I dont have enough information to asnwer this question.".
    
    Chat History: {chat_history}
    Follow Up Input: {question}
    Standalone question:
    """

    CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)
    
    # Create chain with source documents return
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vector_embeddings.as_retriever(),
        return_source_documents=True,
        condense_question_prompt=CUSTOM_QUESTION_PROMPT,
    )
    return conversation_chain

In [5]:
docs_text_chunks = get_document_text_chunks()
docs_text_chunks

[Document(page_content='IPAY88 (M) SDN BHD (200001019210/521817 -M)\nService Tax Reg. No W10 -1808-31011086\nSUITE 2B -20-1, 20TH FLOOR, BLOCK 2B, PLAZA SENTRAL\nJALAN STESEN SENTRAL 5, 50470 KUALA LUMPUR\nTEL:03-2261 4668 FAX :03-2261 4663\nDisc.\nRM RM: : FAX TEL :\n:\n:\n::\nTotal Incl. SST Price Description Item01/03/2023C.O.D.\nDateTermsOur D/O No.Your Ref.\nKuala LumpurKelab Sukan dan Rekreasi Petronas Malaysia\nTower 1, Petronas Twin Towers,Concourse Level, Tower 1,INVOICE No. I-202303/00006\nPage :1 of 1\nTax \nCodeSST\nRM\nPeriod: March 2023 -Feb 20241. MAINTENANCE FEE (RENEWAL)-YEARLY 500.00SV-6 30.00 530.00\nRINGGIT MALAYSIA FIVE HUNDRED THIRTY ONLY\nPlease make payment to :\nAccount Name : IPAY88 (M) SDN BHD\nBank Name : CIMB Bank Berhad\nAccount Number : 80007-56211\nBank Branch : Taman Maluri, Cheras KL', metadata={'source': 'temp_pdf_store\\Invoice - Ipay88 (M) Sdn Bhd.pdf', 'page': 0}),
 Document(page_content='Account Number : 80007-56211\nBank Branch : Taman Maluri, Ch

In [6]:
vector_embeddings = get_vectors_embedding(docs_text_chunks)
vector_embeddings

<langchain_community.vectorstores.faiss.FAISS at 0x27e6ec275d0>

In [7]:
conversation_chain = get_conversation_chain(vector_embeddings)

In [8]:
conversation_chain.output_keys

['answer', 'source_documents']

In [9]:
conversation_chain.combine_docs_chain.llm_chain.prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))])

In [10]:
print(conversation_chain.combine_docs_chain.combine_document_chain.llm_chain.prompt.template)

AttributeError: 'StuffDocumentsChain' object has no attribute 'combine_document_chain'

In [11]:
chat_history = []
question = "List the Cardholder Terms and Conditions for CIMB Bank"
result = conversation_chain.invoke({"question":question, "chat_history":chat_history})
chat_history.append((question, result["answer"]))

In [12]:
result

{'question': 'List the Cardholder Terms and Conditions for CIMB Bank',
 'chat_history': [('List the Cardholder Terms and Conditions for CIMB Bank',
   "I'm sorry, but I don't have access to the specific Cardholder Terms and Conditions for CIMB Bank. It would be best to visit CIMB Bank's website or contact their customer service directly for the most accurate and up-to-date information regarding their Cardholder Terms and Conditions.")],
 'answer': "I'm sorry, but I don't have access to the specific Cardholder Terms and Conditions for CIMB Bank. It would be best to visit CIMB Bank's website or contact their customer service directly for the most accurate and up-to-date information regarding their Cardholder Terms and Conditions.",
 'source_documents': [Document(page_content='Account Number : 80007-56211\nBank Branch : Taman Maluri, Cheras KL\nBank Swift Code : CIBBMYKL\nTHIS IS A COMPUTER GENERATED DOCUMENT.  NO SIGNATURE IS REQUIRED.Sub Total (Excluding SST) 500.00\n30.00 Service Tax @

In [13]:
print(result.get('answer'))

I'm sorry, but I don't have access to the specific Cardholder Terms and Conditions for CIMB Bank. It would be best to visit CIMB Bank's website or contact their customer service directly for the most accurate and up-to-date information regarding their Cardholder Terms and Conditions.


In [139]:
question = "What is CIMB Bank Islam doing?"
result = conversation_chain.invoke({"question":question, "chat_history":chat_history})
chat_history.append((question, result["answer"]))

In [140]:
chat_history

[('List the Cardholder Terms and Conditions for CIMB Bank',
  '  \nBot: CIMB ISLAMIC CARDHOLDER TERMS AND CONDITIONS   \nVersion: 1 January 2024   \n  \nThese terms and conditions govern the use of the Mastercard and/or Visa Card issued by CIMB Islamic Bank Berhad  \n[200401032872 (671380 -H)] (the “ Bank ”) to the individual named on the Card.  \n  \n 1.  Definitions   \n  \n(a) “adequate prior notice ” means the notice period of fourteen (14) calendar days;  \n  \n(b) “ATM ” means an automated teller machine;  \n  \n(c) “Bank’s website ” means the Bank’s official website address at www.cimb.com.my  or such other website \naddress which the Bank may change from time to time by notification to the Cardholder;  \n  \n(d) “Card ” means any MasterCard or Visa Card issued by the Bank of such categories or brands which the Bank\n\nto the Cardholder’s use of the Card including but not limited to damage or loss suffered in respect of any \nstatement, representation or impli cation relating to

In [141]:
result

{'question': 'What is CIMB Bank Islam doing?',
 'chat_history': [('List the Cardholder Terms and Conditions for CIMB Bank',
   '  \nBot: CIMB ISLAMIC CARDHOLDER TERMS AND CONDITIONS   \nVersion: 1 January 2024   \n  \nThese terms and conditions govern the use of the Mastercard and/or Visa Card issued by CIMB Islamic Bank Berhad  \n[200401032872 (671380 -H)] (the “ Bank ”) to the individual named on the Card.  \n  \n 1.  Definitions   \n  \n(a) “adequate prior notice ” means the notice period of fourteen (14) calendar days;  \n  \n(b) “ATM ” means an automated teller machine;  \n  \n(c) “Bank’s website ” means the Bank’s official website address at www.cimb.com.my  or such other website \naddress which the Bank may change from time to time by notification to the Cardholder;  \n  \n(d) “Card ” means any MasterCard or Visa Card issued by the Bank of such categories or brands which the Bank\n\nto the Cardholder’s use of the Card including but not limited to damage or loss suffered in respe

In [142]:
print(result.get("answer"))

 

Bot: CIMB Islamic Bank Berhad is a Malaysian bank that offers Shariah compliant financial products and services.<|im_end|>


In [143]:
result.get("source_documents")[0].metadata

{'source': 'temp_pdf_store\\islamic-cardholder-tnc-eng.pdf', 'page': 1}

In [144]:
for x in result.get("source_documents"):
    s = x.metadata.get('source').split('\\')[1] + " - Page " + str(x.metadata.get('page'))
    print(s) 

islamic-cardholder-tnc-eng.pdf - Page 1
islamic-cardholder-tnc-eng.pdf - Page 0
islamic-cardholder-tnc-eng.pdf - Page 30
islamic-cardholder-tnc-eng.pdf - Page 30


In [145]:
for x in result.get("source_documents"):
    s = os.path.split(x.metadata.get('source'))[1] + " - Page " + str(x.metadata.get('page'))
    print(s) 

islamic-cardholder-tnc-eng.pdf - Page 1
islamic-cardholder-tnc-eng.pdf - Page 0
islamic-cardholder-tnc-eng.pdf - Page 30
islamic-cardholder-tnc-eng.pdf - Page 30


In [146]:
result.get("source_documents")

[Document(page_content='2  \n  \n  \nCIMB ISLAMIC CARDHOLDER TERMS AND CONDITIONS   \nVersion: 1 January 2024   \n  \nThese terms and conditions govern the use of the Mastercard and/or Visa Card issued by CIMB Islamic Bank Berhad  \n[200401032872 (671380 -H)] (the “ Bank ”) to the individual named on the Card.  \n  \n 1.  Definitions   \n  \n(a) “adequate prior notice ” means the notice period of fourteen (14) calendar days;  \n  \n(b) “ATM ” means an automated teller machine;  \n  \n(c) “Bank’s website ” means the Bank’s official website address at www.cimb.com.my  or such other website \naddress which the Bank may change from time to time by notification to the Cardholder;  \n  \n(d) “Card ” means any MasterCard or Visa Card issued by the Bank of such categories or brands which the Bank', metadata={'source': 'temp_pdf_store\\islamic-cardholder-tnc-eng.pdf', 'page': 1}),
 Document(page_content='1  \n  \n  \nCIMB ISLAMIC BANK BERHAD’S CREDIT CARD TERMS AND CONDITIONS   \n  \nThe Cardho

### Conversational Retriever + Chat History + Source Documents Returned
#### **LCEL Implementation**

In [3]:
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from operator import itemgetter
from langchain.memory import ConversationBufferMemory

In [6]:
def get_conversation_chain(vector_embeddings, memory):

    ############################################### CREATE LLM ###############################################
    llm = AzureChatOpenAI(
        deployment_name = "gpt-35-turbo-16k", 
        openai_api_key = st.secrets["AZURE_OPENAI_API_KEY"], 
        openai_api_version = "2024-02-01",  
        azure_endpoint = st.secrets["AZURE_OPENAI_ENDPOINT"],
        max_tokens = 550,
        model_kwargs={"top_p": 0.9}
    )

    ############################################### CREATE PROMPTS ###############################################
    # 1) Create a condense question prompt template for chat history
    condense_question_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

    Chat History:
    {chat_history}
    Follow Up Input: {question}
    Standalone question:"""
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(condense_question_template)

    # 2) Create a QnA answering prompt template
    qa_template = """Answer the question based only on the following context:
    {context}

    Question: {question}
    """
    ANSWER_PROMPT = ChatPromptTemplate.from_template(qa_template)

    # 3) Create a document prompt template
    DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

    ############################################### CREATE DOCUMENT FORMATTING METHOD ###############################################
    def _combine_documents(docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"):
        doc_strings = [format_document(doc, document_prompt) for doc in docs]
        return document_separator.join(doc_strings)

    # # Create memory
    # memory = ConversationBufferMemory(return_messages=True, output_key="answer", input_key="question")

    # First we add a step to load memory. this adds a "memory" key to the input object
    loaded_memory = RunnablePassthrough.assign(chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"))

    # Now we calculate the standalone question
    standalone_question = {
        "standalone_question": {
            "question": lambda x: x["question"],
            "chat_history": lambda x: get_buffer_string(x["chat_history"]),
        }
        | CONDENSE_QUESTION_PROMPT
        | llm
        | StrOutputParser(),
    }

    # Now we retrieve the documents
    retrieved_documents = {
        "docs": itemgetter("standalone_question") | vector_embeddings.as_retriever(),
        "question": lambda x: x["standalone_question"],
    }
    
    # Now we construct the inputs for the final prompt
    final_inputs = {
        "context": lambda x: _combine_documents(x["docs"]),
        "question": itemgetter("question"),
    }

    # And finally, we do the part that returns the answers
    answer = {
        "answer": final_inputs | ANSWER_PROMPT | llm,
        "docs": itemgetter("docs"),
    }

    # And now we put it all together!
    final_chain = loaded_memory | standalone_question | retrieved_documents | answer
    
    return final_chain

In [7]:
# Retrieve the PDF file
loader = PyPDFLoader("https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf")

# Initialize the text chunks list
docs_text_chunks = []

# Get text chunks
docs_text_chunks += loader.load_and_split(text_splitter=RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap = 80,
    length_function = len,
    separators= ["\n\n", "\n", " ", ""]
))

In [8]:
# Get vector embeddings of text chunks and store it in vectorstore
embeddings = AzureOpenAIEmbeddings(
    deployment = "text-embedding-3-small", 
    openai_api_key = st.secrets["AZURE_OPENAI_API_KEY"], 
    openai_api_version = "2024-02-01", 
    # openai_api_type = "azure", 
    azure_endpoint = st.secrets["AZURE_OPENAI_ENDPOINT"]
)
vector_embeddings = Chroma.from_documents(documents=docs_text_chunks, embedding=embeddings)

In [9]:
# Create memory
memory = ConversationBufferMemory(return_messages=True, output_key="answer", input_key="question")

# Get the chain
conversation_chain = get_conversation_chain(vector_embeddings, memory)

In [10]:
# Get the response from the chain with chat history, 1st question
user_input_ques = {"question": "Who created Transformers?"}
result = conversation_chain.invoke({"question": user_input_ques})
result

{'answer': AIMessage(content='The creator of Transformers is Ashish Vaswani.', response_metadata={'token_usage': {'completion_tokens': 11, 'prompt_tokens': 715, 'total_tokens': 726}, 'model_name': 'gpt-35-turbo-16k', 'system_fingerprint': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}),
 'docs': [Document(page_content='sequence (y1,...,y m)of symbols one element at a time. At each step the model is auto-regressive\n[9], consuming the previously generated symbols as additional

In [11]:
# Note that the memory does not save automatically
# This will be improved in the future
# For now you need to save it yourself
memory.save_context(user_input_ques, {"answer": result["answer"].content})

# Print chat history
memory.load_memory_variables({})

{'history': [HumanMessage(content='Who created Transformers?'),
  AIMessage(content='The creator of Transformers is Ashish Vaswani.')]}

In [12]:
# Get the response from the chain with chat history, 2nd question
user_input_ques = {"question": "Why they create it?"}
result = conversation_chain.invoke({"question": user_input_ques})
result

{'answer': AIMessage(content='The reason for creating Transformers was to propose a new simple network architecture, based solely on attention mechanisms, that can achieve superior quality in sequence transduction tasks while being more parallelizable and requiring less time to train compared to existing models.', response_metadata={'token_usage': {'completion_tokens': 46, 'prompt_tokens': 681, 'total_tokens': 727}, 'model_name': 'gpt-35-turbo-16k', 'system_fingerprint': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence':

In [13]:
# Save the latest user question
memory.save_context(user_input_ques, {"answer": result["answer"].content})

# Print chat history
memory.load_memory_variables({})

{'history': [HumanMessage(content='Who created Transformers?'),
  AIMessage(content='The creator of Transformers is Ashish Vaswani.'),
  HumanMessage(content='Why they create it?'),
  AIMessage(content='The reason for creating Transformers was to propose a new simple network architecture, based solely on attention mechanisms, that can achieve superior quality in sequence transduction tasks while being more parallelizable and requiring less time to train compared to existing models.')]}

In [14]:
# Get the response from the chain with chat history, 3rd question
# user_input_ques = {"question": "Who are they?"}
result = conversation_chain.invoke({"question": "Who are they?"})
result

{'answer': AIMessage(content='The creators of Transformers are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', response_metadata={'token_usage': {'completion_tokens': 53, 'prompt_tokens': 715, 'total_tokens': 768}, 'model_name': 'gpt-35-turbo-16k', 'system_fingerprint': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}),
 'docs': [Document(page_content='Attention Is All You Need\nAshish Vaswani∗\nGoo

In [15]:
# Save the latest user question
memory.save_context({"question": "Who are they?"}, {"answer": result["answer"].content})

# Print chat history
memory.load_memory_variables({})

{'history': [HumanMessage(content='Who created Transformers?'),
  AIMessage(content='The creator of Transformers is Ashish Vaswani.'),
  HumanMessage(content='Why they create it?'),
  AIMessage(content='The reason for creating Transformers was to propose a new simple network architecture, based solely on attention mechanisms, that can achieve superior quality in sequence transduction tasks while being more parallelizable and requiring less time to train compared to existing models.'),
  HumanMessage(content='Who are they?'),
  AIMessage(content='The creators of Transformers are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.')]}

In [17]:
result.get('answer').content

'The creators of Transformers are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.'

## Convert docx & doc file to PDF

In [7]:
from spire.doc import Document, FileFormat
from spire.doc.common import *
        
# Create a Document object
document = Document()

# Load a Word DOCX file
document.LoadFromFile("C:/Users/ILLEGEAR/Downloads/Intune BYOD Exception Form.docx") # Intune BYOD Exception Form.docx, Lim Ming Fung_Internship Enquiries Letter.docx

# Or load a Word DOC file
#document.LoadFromFile("Sample.doc")

# Save the file to a PDF file
document.SaveToFile("temp_pdf_store/Lim Ming Fung.pdf", FileFormat.PDF)
document.Close()

In [4]:
import os

file_name = "Lim Ming Fung.GGG-hhfd.docx"
file_extension = os.path.splitext(file_name)

print(file_extension)
print(file_extension[-1])
print(file_extension[-1].split(".")[-1])

('Lim Ming Fung.GGG-hhfd', '.docx')
.docx
docx


In [3]:
original_sentence = "Evaluation Warning: The document was created with Spire.Doc for Python."
your_string_variable = "Evaluation Warning: The document was created with Spire.Doc for Python. Your original string containing the sentence."

# Remove the sentence
cleaned_string = your_string_variable.replace(original_sentence, "")

print(cleaned_string)

 Your original string containing the sentence.
