In [52]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
# Load variables from a .env file into the process environment. As the cell's
# last expression, the boolean it returns is what the notebook displays below.
# NOTE(review): presumably this supplies API keys for the disabled ChatOpenAI
# option further down — confirm whether it is still needed.
load_dotenv()

True

In [53]:
# Load the 2023 sustainability report PDF from ./data. The glob names one
# specific file, so only that report is read (not every PDF in the directory).
loader = DirectoryLoader(
    "./data",
    glob="2023-sustainability-accessible-report.pdf",
    show_progress=True,
    use_multithreading=True,
)
raw_documents = loader.load()

100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


In [54]:
# Split the loaded report into small, overlapping chunks for embedding:
# each chunk is at most 256 units long and may share up to 100 with its
# neighbour (units are characters unless a custom length function is set).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=256)
documents = text_splitter.split_documents(raw_documents)

In [55]:
# Embedding model used to vectorise the chunks. No model name is passed, so
# whichever model HuggingFaceEmbeddings uses by default is loaded.
embeddings = HuggingFaceEmbeddings()

In [56]:
# Build a FAISS vector store from the split chunks, embedding them with `embeddings`.
db = FAISS.from_documents(documents, embeddings)

In [57]:
# Answer-generation model: google/flan-t5-large run through a Hugging Face
# text2text-generation pipeline, with generation capped via max_length=100.
# Alternative hosted model, left disabled:
# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
llm = HuggingFacePipeline.from_model_id(
    task="text2text-generation",
    model_id="google/flan-t5-large",
    pipeline_kwargs=dict(max_length=100),
)

In [58]:
# Retrieval-augmented QA chain: `db` supplies context chunks through its default
# retriever, `llm` writes the answer, and the retrieved chunks are returned too.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(),
    return_source_documents=True,
)

In [59]:
# Ask one question through the chain: the call takes a dict keyed by "query",
# and the generated answer is read back from the "result" key.
query = "What are the company's climate objectives?"
response = qa_chain({"query": query})
print(response['result'])

economy-wide change


In [60]:
# Show the retrieved chunks the chain returned alongside the answer
# (present because the chain was built with return_source_documents=True).
response["source_documents"]

[Document(page_content='26\n\n29 Be diverse and inclusive as a business and as\n\nan investor\n\n31 Align all our people behind our sustainability ambition\n\nHow we manage our sustainability activities\n\nFocus on climate\n\nDoing business responsibly\n\nSupplementary information', metadata={'source': 'data/2023-sustainability-accessible-report.pdf'}),
 Document(page_content='economy-wide change. Collaboration with our clients is critical to support our climate objectives and address related issues through our investments and stewardship.', metadata={'source': 'data/2023-sustainability-accessible-report.pdf'}),
 Document(page_content='Focus on climate\n\nDoing business responsibly\n\nSupplementary information\n\nM&G plc Sustainability Report 2022/23\n\n69\n\nAbout M&G\n\nGlossary\n\nKey terms and words\n\nCategory\n\nDefinition\n\nKey words and terms', metadata={'source': 'data/2023-sustainability-accessible-report.pdf'}),
 Document(page_content='our shareholders and other stakeholder