# Loading and Processing of PDFs with LangChain

**pip install langchain langchain-openai pymupdf**

**pip install "unstructured[local-inference]"**

Sources:

- Arslan Shahid, Chat with your PDFs using LangChain, https://medium.com/firebird-technologies/chat-with-your-pdfs-using-langchain-e57866b7926d

- Santhosh Reddy D, What are Langchain Document Loaders?, https://www.analyticsvidhya.com/blog/2024/07/langchain-document-loaders/

- LangChain, How to load PDFs, https://python.langchain.com/docs/how_to/document_loader_pdf/

- LangChain, Document Loaders, https://python.langchain.com/docs/integrations/document_loaders/


#### Example PDF:

For this post, we will be using the Public Sector Development Program PDF document. The document is a holistic view of Pakistan’s Ministry of Planning & Development’s core program (PSDP).

Download PDF from https://www.pc.gov.pk/uploads/archives/PSDP_2023-24.pdf

### Loading PDFs using different PDF loaders in LangChain

In [1]:
# import unstructured (may be incompatible with New ARC)
import pymupdf

# We will be using these PDF loaders but you can check out other loaded documents
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader, UnstructuredFileLoader

# This is the name of the report which should be in the directory
# You can download the precise PDF I am using from here https://www.pc.gov.pk/uploads/archives/PSDP_2023-24.pdf
file_path = 'data/PSDP_2023-24.pdf'

# This loader uses PyMuPDF
loader_py = PyMuPDFLoader(file_path)

#This loader uses Unstructured
# loader_un = UnstructuredPDFLoader(file_path)

# Storing the loaded documents as langChain Document object
pages_py = loader_py.load()

# pages_un = loader_un.load()

In [2]:
from pprint import pprint
pprint(pages_py[0])

Document(metadata={'producer': 'Wondershare PDFelement Pro', 'creator': 'Wondershare PDFelement Pro', 'creationdate': '2023-06-08T01:23:45-19:00', 'source': 'data/PSDP_2023-24.pdf', 'file_path': 'data/PSDP_2023-24.pdf', 'total_pages': 100, 'format': 'PDF 1.7', 'title': '', 'author': 'jacc8', 'subject': '', 'keywords': '', 'moddate': 'D', 'trapped': '', 'modDate': 'D', 'creationDate': "D:20230608012345-19'00", 'page': 0}, page_content='GOVERNMENT OF PAKISTAN \n \n \n \n \n \n \n \n \nPUBLIC SECTOR DEVELOPMENT \nPROGRAMME 2023-24 \n \n \n \n \n \n \n \n \n \nPLANNING COMMISSION \nMINISTRY OF PLANNING, DEVELOPMENT \n& SPECIAL INITIATIVES \n \n \n \n \nJune, 2023')


In [3]:
pprint(pages_py[0].metadata)

{'author': 'jacc8',
 'creationDate': "D:20230608012345-19'00",
 'creationdate': '2023-06-08T01:23:45-19:00',
 'creator': 'Wondershare PDFelement Pro',
 'file_path': 'data/PSDP_2023-24.pdf',
 'format': 'PDF 1.7',
 'keywords': '',
 'modDate': 'D',
 'moddate': 'D',
 'page': 0,
 'producer': 'Wondershare PDFelement Pro',
 'source': 'data/PSDP_2023-24.pdf',
 'subject': '',
 'title': '',
 'total_pages': 100,
 'trapped': ''}


In [4]:
pprint(pages_py[0].page_content)

('GOVERNMENT OF PAKISTAN \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 'PUBLIC SECTOR DEVELOPMENT \n'
 'PROGRAMME 2023-24 \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 'PLANNING COMMISSION \n'
 'MINISTRY OF PLANNING, DEVELOPMENT \n'
 '& SPECIAL INITIATIVES \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 'June, 2023')


In [None]:
# pprint(pages_un[0])

In [None]:
# pprint(pages_un[0].page_content)

### Data Cleaning

In [5]:
# text splitter
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    # shows how to seperate
    separator="\n",
    # Shows the document token length
    # chunk_size=1000,
    chunk_size=100,
    # How much overlap should exist between documents
    # chunk_overlap=150,
    chunk_overlap=15,
    # How to measure length
    length_function=len
)

# Applying the splitter
docs = text_splitter.split_documents(pages_py)
print(len(docs), len(pages_py))

Created a chunk of size 119, which is longer than the specified 100
Created a chunk of size 113, which is longer than the specified 100
Created a chunk of size 118, which is longer than the specified 100
Created a chunk of size 109, which is longer than the specified 100
Created a chunk of size 114, which is longer than the specified 100
Created a chunk of size 109, which is longer than the specified 100
Created a chunk of size 116, which is longer than the specified 100
Created a chunk of size 106, which is longer than the specified 100
Created a chunk of size 112, which is longer than the specified 100
Created a chunk of size 117, which is longer than the specified 100
Created a chunk of size 102, which is longer than the specified 100
Created a chunk of size 107, which is longer than the specified 100
Created a chunk of size 110, which is longer than the specified 100
Created a chunk of size 104, which is longer than the specified 100
Created a chunk of size 120, which is longer tha

3165 100


In [6]:
pprint(docs[0].page_content)

('GOVERNMENT OF PAKISTAN \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 'PUBLIC SECTOR DEVELOPMENT \n'
 'PROGRAMME 2023-24')


In [7]:
# a simple function that removes \n newline from the content
def remove_ws(d):
    text = d.page_content.replace('\n','')
    d.page_content = text
    return d

# applied on the docs
docs = [remove_ws(d) for d in docs]

### Building a Retrieval

In [9]:
import os

# Paste your API key here. Remember to not share publicly
# openai_api_key = open("/Users/mjack6/.secrets/openai_mjack.apikey", "r").read().strip()
openai_api_key = open(os.path.expanduser("~/.secrets/openai_pmolnar_gsu_edu_msa8700.apikey"), "r").read().strip()

os.environ["OPENAI_API_KEY"] = openai_api_key

In [10]:
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Uses OpenAI embeddings to build a retriever
embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

# Creates the document retriever using docs and embeddings
db = FAISS.from_documents(docs, embeddings)

  embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])


In [11]:
# Asking the retriever to do similarity search based on Query
query = "Foreign Aid for Lowari Road Tunnel & Access Roads Project (2nd Revised )"
answer = db.similarity_search(query)

for doc in answer:
    pprint(doc.page_content)

('1234567891065Lowari Road Tunnel & Access Roads Project (2nd Revised )ECNEC '
 '07.10.2022')
('Foreign AidRupeeTotal(Rupees Million)G.Sl.No.Name of the ProjectApproval '
 'Status')
('Foreign AidRupeeTotal(Rupees Million)G.Sl.No.Name of the ProjectApproval '
 'Status')
('Foreign AidRupeeTotalG.Sl.No.Name of the ProjectApproval StatusApproved '
 'CostEstimated')


In [12]:
# Building the retriever
retriever = db.as_retriever(search_kwargs={'k': 3})

### Context-Augmentation for the LLM

In [14]:
# Imports needed for the code to work.
# Using a simple output parser and chat prompt template
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# This is the prompt used
template = """

You are a information retrieval AI. Format the retrieved information as a table or text


Use only the context for your answers, do not make up information

query: {query}

{context} 
"""

# Converts the prompt into a prompt template
prompt = ChatPromptTemplate.from_template(template)
# Using OpenAI model, by default gpt 3.5 Turbo
model = ChatOpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Construction of the chain
chain = (
# The initial dictionary uses the retriever and user supplied query
    {"context":retriever,
     "query":RunnablePassthrough()}
# Feeds that context and query into the prompt then model & lastly 
# uses the ouput parser, do query for the data.
    |  prompt  | model | StrOutputParser()
 
)

In [15]:
# Asking for something inside the PDF image shown
pprint(
    chain.invoke("""Find the details Antimicrobial Resistance
        (AMR)containment and Infection
        prevention Control(IPC) program


        Break down the Approved Cost both Total and Foreign Aid, Throwforward and Estimated Expenditure 
        """)
    )

('| Approved Cost Total | Approved Cost Foreign Aid | Throwforward | Estimated '
 'Expenditure |\n'
 '|-----------------------|---------------------------|--------------|----------------------|\n'
 '| 361.96.000            | 267.41.594                | 94.54.500    | '
 '94.54.500            |')
