Import Libraries and Load the PDF

In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [2]:
# Load the PDF file into LangChain Document objects.

path = "INFORM Annual Report 2024.pdf"

# Fail fast on an empty path. Merely printing a message here would leave
# `data` undefined, so any later cell that reads `data` would die with a
# confusing NameError instead of this explicit error.
if not path:
    raise ValueError("Please upload a PDF File")

loader = UnstructuredPDFLoader(file_path=path)
data = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Inspect the text of the first loaded Document.
# NOTE(review): whether data[0] is a single page or the whole PDF depends on
# how the loader splits the file — confirm before relying on "first page".
first_document = data[0]
first_document.page_content



Split Text into Chunks and Create Vector Database

In [4]:
# Break the loaded document(s) into overlapping chunks ready for embedding.
splitter_settings = {"chunk_size": 5000, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(data)

# Report how many chunks the splitter produced.
chunk_count = len(chunks)
print(f"{chunk_count} have been made")

11 have been made


In [5]:
# Embed every chunk with the Ollama "nomic-embed-text" model and index the
# resulting vectors in a Chroma collection.
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="pdf_reader",
)

  embedding=OllamaEmbeddings(model="nomic-embed-text",show_progress=True),
OllamaEmbeddings: 100%|██████████| 11/11 [01:25<00:00,  7.73s/it]


Configure LLM and Retriever

In [6]:
# Chat model served by Ollama; the model name is kept in its own variable so
# it is easy to swap.
llm_model = "mistral"
llm = ChatOllama(model=llm_model)

  llm = ChatOllama(model=llm_model)


In [7]:
# Prompt that asks the LLM to rephrase the user's question in two different
# ways, one per line. Its only input variable is {question}.
# The label before the placeholder is spelled "Question:" (it was previously
# misspelled in the text sent to the model).
llm_prompt = PromptTemplate(
    input_variables=["question"],
    template=(
        "Your task is to generate two distinct rephrasings of the provided user "
        "question to retrieve relevant documents from a vector database. "
        "By offering multiple perspectives on the query, your goal is to help the "
        "user mitigate the limitations of distance-based similarity searches. "
        "Provide these alternative questions separated by newlines. "
        "Question: {question}"
    ),
)

# Wrap the vector store's retriever in a MultiQueryRetriever that is driven by
# `llm` and the rephrasing prompt `llm_prompt`.
base_retriever = vector_db.as_retriever()

retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=llm_prompt,
)

Chain Creation

In [8]:
# Answering prompt: the model is told to use only the supplied {context} when
# responding to {question}.
rag_template = (
    "Answer the question based on the following context only: {context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(rag_template)

In [9]:
# Chain Creation
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents, each exposing ``page_content``.

    Returns:
        The documents' text, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields Document objects. Without the formatting step their
# full repr (metadata and escaped newlines included) would be pasted into the
# prompt's {context} slot instead of the plain document text.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Chat with Document

In [10]:
# Helper for querying the document from a notebook cell

def chat(question):
    """
    Ask the RAG chain a question and show its answer in the notebook.

    Args:
        question: The input handed to ``chain.invoke``.

    Returns:
        Whatever ``display`` returns for the chain's answer.
    """
    answer = chain.invoke(question)
    return display(answer)

In [11]:
# First sample question: ask for a high-level summary of the document.
first_question = "What is this document about?"
chat(first_question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.33s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]


' This document provides an analysis of global risks, focusing on crises as driven by natural hazards and conflicts. It discusses the trends in annual exposure to sudden-onset natural hazards like river floods, coastal floods, cyclones, earthquakes, and tsunamis. The data indicates that climate-related hazards are responsible for most of the increases in exposure over the last decade, with Asia having the largest overall exposure and Africa experiencing the largest increase relative to 2014.\n\n   Additionally, the document discusses the impact of conflicts on various dimensions of risk, such as institutional, infrastructural coping capacity, and natural hazard exposure. It highlights that countries in conflict have a larger than average exposure to natural hazards, suggesting possible cross-correlation between the two. The document also mentions the INFORM Severity Index, which is used to analyze drivers of crises since its launch in 2020.'

In [12]:
# Second sample question: ask about a specific term.
second_question = "What is INFORM Risk Index?"
chat(second_question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.47s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it]


' The INFORM Risk Index is a collaboration between the Inter-Agency Standing Committee and the European Commission, with the Joint Research Centre of the European Commission serving as its scientific and technical lead. It measures the severity of humanitarian crises globally on an ongoing, up-to-date basis. The index aggregates information from various credible sources to provide a shared and objective understanding of crisis severity that can aid decisions regarding the allocation of resources. More information about INFORM can be found at https://drmkc.jrc.ec.europa.eu/inform-index.'