# Experimenting sentence-transformers/all-MiniLM-L6-v2 embedding model

### It ranked 40th on leaderboard, has 384 embedding dimension, & 41.95 score for retrieval task.

In [1]:
# import libraries
import streamlit as st
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
import vertexai
import langchain
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import VertexAI
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate

In [2]:
# define LLM
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=500,
    temperature=0,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

In [3]:
# load file and split in pages
def fileloader(pdffile):
    loader = PyPDFLoader(pdffile)
    pages = loader.load_and_split()
    return pages
pages = fileloader('./gcs-bucket/reports/2021/2021_AAPL.pdf')

In [4]:
# create embeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [6]:
# indexing
vectordb_singleDoc = None
persist_directory = "vectordb_singleDoc"
vectordb_singleDoc = Chroma.from_documents(documents=pages,
                                 embedding=embeddings,
                                 persist_directory=persist_directory)
vectordb_singleDoc.persist()

In [7]:
qa = RetrievalQA.from_chain_type(llm=llm, 
                                 chain_type="stuff", 
                                 retriever=vectordb_singleDoc.as_retriever(search_type="mmr", 
                                                                           search_kwargs={'fetch_k': 30}), 
                                 return_source_documents=True)

In [11]:
result = qa({"query": "what is the title of this document and who produced it?"})
result.keys()

dict_keys(['query', 'result', 'source_documents'])

In [12]:
result["result"]

"The title of this document is Apple's 2021 ESG Report and it was produced by Apple."

In [13]:
result["source_documents"][0].metadata

{'page': 54, 'source': './gcs-bucket/reports/2021/2021_AAPL.pdf'}

In [14]:
query_list = ["What are the company's views on climate change?",
              "What are the company' plans to become more energy efficient?",
              "Which are the highlights of this report?",
              "How the company is going to achieve carbon neutrality?",
              "How the company manages its water consumption?",
              "How the company manages its energy consumption?",
              "How is waste handled by the company?",
              "Which future targets are mentioned in the report?",
              "What are the company's plan to achive their targets?"
              "How many targets the company have achieved? Which are those?",
              "Which targets the company could not meet?"
              "How did the company support their employes during the Covid Pandemic?"]

In [15]:
def QA(query):
    pagelist = []
    result = qa({"query": query})
    result["result"]
    for i in range(len(result["source_documents"])): 
        pagelist.append((result["source_documents"][i].metadata.get('page')))
    return result["result"], pagelist

In [16]:
## With embedding model: thenlper/gte-base
for q in range(len(query_list)):
    ans = QA(query_list[q])
    print('Query: ', query_list[q])
    print('Response:', ans[0])
    print('Source (page numbers): ', ans[1])
    print("===============================")

Query:  What are the company's views on climate change?
Response: Apple has committed to reducing its carbon footprint by 75% by 2030. The company has also issued about $4.7 billion in green bonds to finance renewable energy projects.
Source (page numbers):  [42, 5, 63, 6]
Query:  What are the company' plans to become more energy efficient?
Response: Apple's energy efficiency goals extend well beyond its products. The company is focused on using less energy across its operations and in its supply chain. At its facilities, Apple tracks energy use and explores ways to save energy: renovating and retrofitting older locations, designing new facilities with energy efficiency in mind, and working with local utilities on energy efficiency strategies.
Source (page numbers):  [7, 5, 16, 34]
Query:  Which are the highlights of this report?
Response: The highlights of this report are:
- Apple has over 160,000 talented employees across the world.
- We are taking meaningful actions for more diverse

# Now adding prompt

In [17]:
# Now adding prompt
prompt_template = """Use the following pieces of context to answer the question at the end.\ 
    If you don't know the answer, just say that information is not available, don't try to make up an answer.\
    Keep the answer as concise as possible and present as a bullet points.\ 
    Always say "thanks for asking!" at the end of the answer, in new line. 
    {context}
    Question: {question}
    Answer:"""
        
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"] )

chain_type_kwargs = {"prompt": PROMPT}
    
qa_retriever = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",
                                           retriever=vectordb_singleDoc.as_retriever(), 
                                           chain_type_kwargs=chain_type_kwargs, 
                                           return_source_documents=True, verbose=False )

In [18]:
query = "How the company is going to achieve carbon neutrality?"
#result = qa_retriever({"query": query})
result = QA(query)
print('Response:', result[0])
print('Source (page numbers): ', result[1])

Response: Apple is working to achieve carbon neutrality by using low-carbon materials, driving energy efficiency, and switching to clean energy. The company has also committed to making its products using only recycled or renewable materials.
Source (page numbers):  [7, 3, 59, 34]


## ConversationalRetrievalChain is useful for chat like continuous conversation where memory object is used. 
### As a source it returns name of pdf file. 
### To get page numbers, input parameters varies.

In [19]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa = ConversationalRetrievalChain.from_llm(llm, vectordb_singleDoc.as_retriever(), 
                                           memory=memory)

In [20]:
query = "what is the title of this document?"
result = qa({"question": query})

In [21]:
result

{'question': 'what is the title of this document?',
 'chat_history': [HumanMessage(content='what is the title of this document?'),
  AIMessage(content="The title of this document is Apple's 2021 ESG Report.")],
 'answer': "The title of this document is Apple's 2021 ESG Report."}

In [22]:
query = "How is waste handled by the company?"
result = qa({"question": query})
result

{'question': 'How is waste handled by the company?',
 'chat_history': [HumanMessage(content='what is the title of this document?'),
  AIMessage(content="The title of this document is Apple's 2021 ESG Report."),
  HumanMessage(content='How is waste handled by the company?'),
  AIMessage(content='Apple is committed to reducing waste and increasing recycling in its supply chain. The company has a zero waste to landfill goal for key manufacturing facilities, corporate offices, data centers, and retail stores. In fiscal year 2020, Apple sent 39,000 metric tons of e-waste to recycling.')],
 'answer': 'Apple is committed to reducing waste and increasing recycling in its supply chain. The company has a zero waste to landfill goal for key manufacturing facilities, corporate offices, data centers, and retail stores. In fiscal year 2020, Apple sent 39,000 metric tons of e-waste to recycling.'}

In [23]:
from langchain.prompts.prompt import PromptTemplate

custom_template = """Given the following conversation and a follow up question, \
rephrase the follow up question to be a standalone question, in English language.\
If you do not know the answer reply with 'I do not have the information'.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
        
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
qa_prompt = ConversationalRetrievalChain.from_llm(llm, 
                                                  vectordb_singleDoc.as_retriever(),
                                                  condense_question_prompt=CUSTOM_QUESTION_PROMPT,
                                                  memory=memory
                                                  )

In [24]:
chat_history = [(query, result["answer"])]
query = "what is the title of this document?"
result = qa_prompt({"question": query, "chat_history": chat_history})
result

{'question': 'what is the title of this document?',
 'chat_history': [HumanMessage(content='what is the title of this document?'),
  AIMessage(content="The title of this document is Apple's 2021 ESG Report.")],
 'answer': "The title of this document is Apple's 2021 ESG Report."}

In [25]:
chat_history = [(query, result["answer"])]
query = "How company is going to achieve carbon neutrality?"
result = qa({"question": query, "chat_history": chat_history})
result

{'question': 'How company is going to achieve carbon neutrality?',
 'chat_history': [HumanMessage(content='what is the title of this document?'),
  AIMessage(content="The title of this document is Apple's 2021 ESG Report."),
  HumanMessage(content='How is waste handled by the company?'),
  AIMessage(content='Apple is committed to reducing waste and increasing recycling in its supply chain. The company has a zero waste to landfill goal for key manufacturing facilities, corporate offices, data centers, and retail stores. In fiscal year 2020, Apple sent 39,000 metric tons of e-waste to recycling.'),
  HumanMessage(content='How company is going to achieve carbon neutrality?'),
  AIMessage(content='Apple is going to achieve carbon neutrality by reducing its emissions by 75 percent compared to 2015, and by investing in carbon removal solutions for the remaining emissions.')],
 'answer': 'Apple is going to achieve carbon neutrality by reducing its emissions by 75 percent compared to 2015, and

In [26]:
result.keys()

dict_keys(['question', 'chat_history', 'answer'])

In [27]:
result['answer']

'Apple is going to achieve carbon neutrality by reducing its emissions by 75 percent compared to 2015, and by investing in carbon removal solutions for the remaining emissions.'

In [28]:
result['chat_history']

[HumanMessage(content='what is the title of this document?'),
 AIMessage(content="The title of this document is Apple's 2021 ESG Report."),
 HumanMessage(content='How is waste handled by the company?'),
 AIMessage(content='Apple is committed to reducing waste and increasing recycling in its supply chain. The company has a zero waste to landfill goal for key manufacturing facilities, corporate offices, data centers, and retail stores. In fiscal year 2020, Apple sent 39,000 metric tons of e-waste to recycling.'),
 HumanMessage(content='How company is going to achieve carbon neutrality?'),
 AIMessage(content='Apple is going to achieve carbon neutrality by reducing its emissions by 75 percent compared to 2015, and by investing in carbon removal solutions for the remaining emissions.')]

In [29]:
qa_prompt_source = ConversationalRetrievalChain.from_llm(llm, 
                                                vectordb_singleDoc.as_retriever(),
                                                condense_question_prompt=CUSTOM_QUESTION_PROMPT,
                                                return_source_documents=True
)

In [30]:
chat_history = []
chat_history = [(query, result["answer"])]
query = "which are defined targets?"
result = qa_prompt_source({"question": query, "chat_history": chat_history})

In [31]:
chat_history = [(query, result["answer"])]
query = "which sections this document has?"
result = qa_prompt_source({"question": query, "chat_history": chat_history})
result.keys()

dict_keys(['question', 'chat_history', 'answer', 'source_documents'])

In [32]:
print("response: ", result["answer"])
print("\n")
print('source: ', result["source_documents"][0].metadata)
print("\n")
print('chat_hist: ', result['chat_history'])

response:  The document has the following sections:
- Introduction
- Our Business
- Our People
- Suppliers
- Customers
- Communities
- Environment
- Appendix


source:  {'page': 58, 'source': './gcs-bucket/reports/2021/2021_AAPL.pdf'}


chat_hist:  [('which are defined targets?', 'Apple has defined targets to reduce emissions by 75% by 2030, and to achieve 100% renewable energy for Apple facilities by 2030.')]


In [48]:
### checking chat history: keep earlier query-history in memory

In [33]:
chat_history = [(query, result["answer"])]
query = "what is the title?"
result = qa_prompt_source({"question": query, "chat_history": chat_history})

In [34]:
print("response: ", result["answer"])
print("\n")
print('source: ', result["source_documents"][0].metadata)
print("\n")
print('chat_hist: ', result['chat_history'])

response:  The title of the document is Apple's 2021 ESG Report.


source:  {'page': 54, 'source': './gcs-bucket/reports/2021/2021_AAPL.pdf'}


chat_hist:  [('which sections this document has?', 'The document has the following sections:\n- Introduction\n- Our Business\n- Our People\n- Suppliers\n- Customers\n- Communities\n- Environment\n- Appendix')]
