# Experimenting thenlper/gte-base embedding model
### It ranked 6th on leaderboard, has 768 embedding dimension, & 51.14 score for retrieval task.

In [2]:
# import libraries
import streamlit as st
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
import vertexai
import langchain
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import VertexAI
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate

In [3]:
# define LLM
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=500,
    temperature=0,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

In [4]:
# load file and split in pages
def fileloader(pdffile):
    loader = PyPDFLoader(pdffile)
    pages = loader.load_and_split()
    return pages
pages = fileloader('./gcs-bucket/reports/2021/2021_AAPL.pdf')

In [5]:
# create embeddings
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer, 
                                                                 chunk_size=150, 
                                                                 chunk_overlap=20, 
                                                                 separator="\n")
documents = text_splitter.split_documents(pages)
embeddings = HuggingFaceEmbeddings()

In [6]:
# indexing
vectordb_singleDoc = None
persist_directory = "vectordb_singleDoc1"
vectordb_singleDoc1 = Chroma.from_documents(documents=documents,
                                 embedding=embeddings,
                                 persist_directory=persist_directory)
vectordb_singleDoc1.persist()

In [7]:
qa = RetrievalQA.from_chain_type(llm=llm, 
                                 chain_type="stuff", 
                                 retriever=vectordb_singleDoc1.as_retriever(search_type="mmr", 
                                                                           search_kwargs={'fetch_k': 30}), 
                                 return_source_documents=True)

In [8]:
result = qa({"query": "what is the title of this document and who produced it?"})
result.keys()

dict_keys(['query', 'result', 'source_documents'])

In [9]:
result["result"]

"The title of this document is Apple's 2021 Environmental, Social and Governance Report. It was produced by Apple."

In [10]:
result["source_documents"][0].metadata

{'page': 1, 'source': './gcs-bucket/reports/2021/2021_AAPL.pdf'}

In [11]:
query_list = ["What are the company's views on climate change?",
              "What are the company' plans to become more energy efficient?",
              "Which are the highlights of this report?",
              "How the company is going to achieve carbon neutrality?",
              "How the company manages its water consumption?",
              "How the company manages its energy consumption?",
              "How is waste handled by the company?",
              "Which future targets are mentioned in the report?",
              "What are the company's plan to achive their targets?"
              "How many targets the company have achieved? Which are those?",
              "Which targets the company could not meet?"
              "How did the company support their employes during the Covid Pandemic?"]

In [13]:
def QA(query):
    pagelist = []
    result = qa({"query": query})
    result["result"]
    for i in range(len(result["source_documents"])): 
        pagelist.append((result["source_documents"][i].metadata.get('page')))
    return result["result"], pagelist

In [14]:
## With embedding model: thenlper/gte-base
for q in range(len(query_list)):
    ans = QA(query_list[q])
    print('Query: ', query_list[q])
    print('Response:', ans[0])
    print('Source (page numbers): ', ans[1])
    print("===============================")

Query:  What are the company's views on climate change?
Response: The company's views on climate change are that it is a serious issue that needs to be addressed. The company is committed to reducing its carbon footprint and is working to develop new technologies that will help to reduce emissions. The company is also working with other companies and organizations to address climate change.
Source (page numbers):  [3, 1, 9, 42]
Query:  What are the company' plans to become more energy efficient?
Response: The company's plans to become more energy efficient are to use energy more efficiently across retail stores, offices, data centers, and manufacturing sites.
Source (page numbers):  [6, 8, 7, 9]
Query:  Which are the highlights of this report?
Response: The highlights of this report are:
- Apple's commitment to diversity and inclusion
- Apple's commitment to human rights
- Apple's commitment to ethics and compliance
- Apple's commitment to tax payments
- Apple's commitment to stakehold

# Now adding prompt

In [16]:
# Now adding prompt
prompt_template = """Use the following pieces of context to answer the question at the end.\ 
    If you don't know the answer, just say that information is not available, don't try to make up an answer.\
    Keep the answer as concise as possible and present as a bullet points.\ 
    Always say "thanks for asking!" at the end of the answer, in new line. 
    {context}
    Question: {question}
    Answer:"""
        
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"] )

chain_type_kwargs = {"prompt": PROMPT}
    
qa_retriever = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",
                                           retriever=vectordb_singleDoc1.as_retriever(), 
                                           chain_type_kwargs=chain_type_kwargs, 
                                           return_source_documents=True, verbose=False )

In [17]:
query = "How the company is going to achieve carbon neutrality?"
#result = qa_retriever({"query": query})
result = QA(query)
print('Response:', result[0])
print('Source (page numbers): ', result[1])

Response: The company is going to achieve carbon neutrality by reducing its energy use, sourcing 100 percent renewable energy, and investing in working forests and ecosystem restoration.
Source (page numbers):  [2, 8, 6, 9]


## ConversationalRetrievalChain is useful for chat like continuous conversation where memory object is used. 
### As a source it returns name of pdf file. 
### To get page numbers, input parameters varies.

In [19]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa = ConversationalRetrievalChain.from_llm(llm, vectordb_singleDoc1.as_retriever(), memory=memory)

In [20]:
query = "what is the title of this document?"
result = qa({"question": query})

In [21]:
result

{'question': 'what is the title of this document?',
 'chat_history': [HumanMessage(content='what is the title of this document?'),
  AIMessage(content="The title of this document is Apple's 2021 ESG Report.")],
 'answer': "The title of this document is Apple's 2021 ESG Report."}

In [22]:
query = "How is waste handled by the company?"
result = qa({"question": query})
result

{'question': 'How is waste handled by the company?',
 'chat_history': [HumanMessage(content='what is the title of this document?'),
  AIMessage(content="The title of this document is Apple's 2021 ESG Report."),
  HumanMessage(content='How is waste handled by the company?'),
  AIMessage(content='Apple has a zero waste to landfill goal, which means that they aim to minimize overall waste generated and eliminate waste sent to landfill from key manufacturing facilities, corporate offices, data centers, and retail stores. They also work with their suppliers to help them eliminate waste from manufacturing.')],
 'answer': 'Apple has a zero waste to landfill goal, which means that they aim to minimize overall waste generated and eliminate waste sent to landfill from key manufacturing facilities, corporate offices, data centers, and retail stores. They also work with their suppliers to help them eliminate waste from manufacturing.'}

In [24]:
from langchain.prompts.prompt import PromptTemplate

custom_template = """Given the following conversation and a follow up question, \
rephrase the follow up question to be a standalone question, in English language.\
If you do not know the answer reply with 'I do not have the information'.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
        
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
qa_prompt = ConversationalRetrievalChain.from_llm(llm, vectordb_singleDoc1.as_retriever(),
    condense_question_prompt=CUSTOM_QUESTION_PROMPT,
    memory=memory
)

In [25]:
chat_history = [(query, result["answer"])]
query = "what is the title of this document?"
result = qa_prompt({"question": query, "chat_history": chat_history})
result

{'question': 'what is the title of this document?',
 'chat_history': [HumanMessage(content='what is the title of this document?'),
  AIMessage(content="The title of this document is Apple's 2021 ESG Report.")],
 'answer': "The title of this document is Apple's 2021 ESG Report."}

In [28]:
chat_history = [(query, result["answer"])]
query = "How company is going to achieve carbon neutrality?"
result = qa({"question": query, "chat_history": chat_history})
result

{'question': 'How company is going to achieve carbon neutrality?',
 'chat_history': [HumanMessage(content='what is the title of this document?'),
  AIMessage(content="The title of this document is Apple's 2021 ESG Report."),
  HumanMessage(content='How is waste handled by the company?'),
  AIMessage(content='Apple has a zero waste to landfill goal, which means that they aim to minimize overall waste generated and eliminate waste sent to landfill from key manufacturing facilities, corporate offices, data centers, and retail stores. They also work with their suppliers to help them eliminate waste from manufacturing.'),
  HumanMessage(content='How company is going to achieve carbon neutrality?'),
  AIMessage(content='The company is going to achieve carbon neutrality by decarbonizing materials used in their products, rapidly deploying renewable energy across their operations and their supply chain, and launching a first-of-its-kind fund to invest in nature-based carbon removal solutions.')

In [29]:
result.keys()

dict_keys(['question', 'chat_history', 'answer'])

In [27]:
result['answer']

'The company is going to achieve carbon neutrality by decarbonizing materials used in their products, rapidly deploying renewable energy across their operations and their supply chain, and launching a first-of-its-kind fund to invest in nature-based carbon removal solutions.'

In [30]:
result['chat_history']

[HumanMessage(content='what is the title of this document?'),
 AIMessage(content="The title of this document is Apple's 2021 ESG Report."),
 HumanMessage(content='How is waste handled by the company?'),
 AIMessage(content='Apple has a zero waste to landfill goal, which means that they aim to minimize overall waste generated and eliminate waste sent to landfill from key manufacturing facilities, corporate offices, data centers, and retail stores. They also work with their suppliers to help them eliminate waste from manufacturing.'),
 HumanMessage(content='How company is going to achieve carbon neutrality?'),
 AIMessage(content='The company is going to achieve carbon neutrality by decarbonizing materials used in their products, rapidly deploying renewable energy across their operations and their supply chain, and launching a first-of-its-kind fund to invest in nature-based carbon removal solutions.'),
 HumanMessage(content='How company is going to achieve carbon neutrality?'),
 AIMessage

In [35]:
qa_prompt_source = ConversationalRetrievalChain.from_llm(llm, vectordb_singleDoc1.as_retriever(),
                                                condense_question_prompt=CUSTOM_QUESTION_PROMPT,
                                                return_source_documents=True
)

In [45]:
chat_history = []
chat_history = [(query, result["answer"])]
query = "which are defined targets?"
result = qa_prompt_source({"question": query, "chat_history": chat_history})

In [46]:
chat_history = [(query, result["answer"])]
query = "which sections this document has?"
result = qa_prompt_source({"question": query, "chat_history": chat_history})
result.keys()

dict_keys(['question', 'chat_history', 'answer', 'source_documents'])

In [47]:
print("response: ", result["answer"])
print("\n")
print('source: ', result["source_documents"][0].metadata)
print("\n")
print('chat_hist: ', result['chat_history'])

response:  The sections of the document are: Our Business Governance, Our commitment to human rights, Ethics and compliance, Tax payments, Stakeholder engagement, Appendix, Awards and recognition, United Nations Sustainable Development Goals, and About the report.


source:  {'page': 1, 'source': './gcs-bucket/reports/2021/2021_AAPL.pdf'}


chat_hist:  [('which are defined targets?', 'The defined targets are:\n\n• Commitment to competitive minimum wage\n• Support for Malala Fund\n• Donations help to alleviate poverty\n• End hunger, achieve food security \nand improved nutrition and promote \nsustainable agriculture\n• Donations to food banks, soup kitchens, and meals on wheels\n• Ensure healthy lives and promote \nwell-being for all at all ages\n• Support for product (RED)\n• Our work to promote health\n• Smarter chemistry  in our products\n• Environmental, health, and safety practices at Apple  and at supplier facilities\n• Build resilient infrastructure, promote \ninclusive and susta

In [48]:
### checking chat history: keep earlier query-history in memory

In [49]:
chat_history = [(query, result["answer"])]
query = "what is the title?"
result = qa_prompt_source({"question": query, "chat_history": chat_history})

In [50]:
print("response: ", result["answer"])
print("\n")
print('source: ', result["source_documents"][0].metadata)
print("\n")
print('chat_hist: ', result['chat_history'])

response:  The title of the document is Apple's 2021 ESG Report.


source:  {'page': 1, 'source': './gcs-bucket/reports/2021/2021_AAPL.pdf'}


chat_hist:  [('which sections this document has?', 'The sections of the document are: Our Business Governance, Our commitment to human rights, Ethics and compliance, Tax payments, Stakeholder engagement, Appendix, Awards and recognition, United Nations Sustainable Development Goals, and About the report.')]
