### Document Loaders
#### A LangChain document is bascically:
    - page_context: the text of teh document
    - metadata: info like source, page, etc


#### 1) PDF Loaders: PyPDFLoader

In [None]:
from langchain_community.document_loaders import PyPDFLoader

doc1 = PyPDFLoader('papers/2203.14465v2.pdf').load()
print(len(doc1))         # Nuber of Pages
print(doc1[0].metadata)

30
{'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1'}


In [None]:
from dotenv import load_dotenv
load_dotenv() # Loads variables from .env into environment

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader('Introduction_to_Python_Programming_-_WEB.pdf')
docs = loader.load()
docs

[Document(metadata={'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': 'Introduction_to_Python_Programming_-_WEB.pdf', 'total_pages': 415, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': 'Introduction_to_Python_Programming_-_WEB.pdf', 'total_pages': 415, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': 'Introduction_to_Python_Programming_-_WEB.pdf', 'total_pages': 415, 'page': 2, 'page_

In [None]:
print(type(docs)) # Docs is a list of Document type, each document has page_content (text), metadata (like page numbers, source)

<class 'list'>


In [6]:
print('Number of pages/documents:', len(docs))
print('First doc metadata:', docs[0].metadata)
print('10th doc content preview:\n', docs[10].page_content)

Number of pages/documents: 415
First doc metadata: {'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': 'Introduction_to_Python_Programming_-_WEB.pdf', 'total_pages': 415, 'page': 0, 'page_label': '1'}
10th doc content preview:
 Preface
About OpenStax
OpenStax is part of Rice University, which is a 501(c)(3) nonprofit charitable corporation. As an educational
initiative, it's our mission to improve educational access and learning for everyone. Through our partnerships
with philanthropic organizations and our alliance with other educational resource companies, we're breaking
down the most common barriers to learning. Because we believe that everyone should and can have access to
knowledge.
About OpenStax Resources
Customization
Introduction to Python Programming is licensed under a Creative Commons Attribution 4.0 International (CC
BY

### Loading multiple PDFs from a folder

In [8]:
import glob
from langchain_community.document_loaders import PyPDFLoader

all_docs = []

for path in glob.glob('papers/*.pdf'):
    all_docs.extend(PyPDFLoader(path).load())

print('Total loaded documents:', len(all_docs))
print(all_docs[0].metadata)

Total loaded documents: 52
{'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1'}


#### 2) Loading Text Files

In [21]:
from langchain_community.document_loaders import TextLoader

doc2 = TextLoader('notes.txt').load()
doc2


[Document(metadata={'source': 'notes.txt'}, page_content='Hello, my name is Allen Barry. \nI live in California, Los Angeles.\nI work at in a tech company as a Software Engineer.')]

#### 3) Loading CSV File

In [25]:
from langchain_community.document_loaders import CSVLoader

doc3 = CSVLoader('cleaned.csv').load()
print(doc3[0].page_content)
print('\n', doc3[100].page_content)
print('\n', len(doc3))

State_Code: AK
Issuer_Id: 38344
Issuer_Name: Premera Blue Cross Blue Shield of Alaska
Source_Name: HIOS
Import_Date: 10-01-2025
Market_Coverage: Individual
ServiceArea_Id: AKS001
Is_New_Plan: Existing
Plan_Type: PPO
Metal_Level: Gold
Notice_Pregnancy: No
Referral_Required: No
EHBPercent_TotalPremium: 0.9928
Out_Country_Coverage: Yes
Out_ServiceArea_Coverage: Yes
MedDrug_Deductibles_Int: Yes
MedDrug_Max_OOP_Int: Yes
HSA_Eligibility: No

 State_Code: AL
Issuer_Id: 46944
Issuer_Name: Blue Cross and Blue Shield of Alabama
Source_Name: HIOS
Import_Date: 25-03-2025
Market_Coverage: Individual
ServiceArea_Id: ALS001
Is_New_Plan: Existing
Plan_Type: PPO
Metal_Level: Silver
Notice_Pregnancy: No
Referral_Required: Yes
EHBPercent_TotalPremium: 1.0
Out_Country_Coverage: Yes
Out_ServiceArea_Coverage: Yes
MedDrug_Deductibles_Int: Yes
MedDrug_Max_OOP_Int: Yes
HSA_Eligibility: No

 23027


In [23]:
doc3

[Document(metadata={'source': 'cleaned.csv', 'row': 0}, page_content='State_Code: AK\nIssuer_Id: 38344\nIssuer_Name: Premera Blue Cross Blue Shield of Alaska\nSource_Name: HIOS\nImport_Date: 10-01-2025\nMarket_Coverage: Individual\nServiceArea_Id: AKS001\nIs_New_Plan: Existing\nPlan_Type: PPO\nMetal_Level: Gold\nNotice_Pregnancy: No\nReferral_Required: No\nEHBPercent_TotalPremium: 0.9928\nOut_Country_Coverage: Yes\nOut_ServiceArea_Coverage: Yes\nMedDrug_Deductibles_Int: Yes\nMedDrug_Max_OOP_Int: Yes\nHSA_Eligibility: No'),
 Document(metadata={'source': 'cleaned.csv', 'row': 1}, page_content='State_Code: AK\nIssuer_Id: 38344\nIssuer_Name: Premera Blue Cross Blue Shield of Alaska\nSource_Name: HIOS\nImport_Date: 10-01-2025\nMarket_Coverage: Individual\nServiceArea_Id: AKS001\nIs_New_Plan: Existing\nPlan_Type: PPO\nMetal_Level: Gold\nNotice_Pregnancy: No\nReferral_Required: No\nEHBPercent_TotalPremium: 0.9928\nOut_Country_Coverage: Yes\nOut_ServiceArea_Coverage: Yes\nMedDrug_Deductibles_I

#### 4) Loading JSON File

NOTE: 
- JSONLoader expects Document.page_content to be a string by default. If the jq_schema returns a JSON object (for example, using .data[]), LangChain raises an error because dictionaries are not valid text content. This can be fixed either by extracting a specific text field from the JSON (such as .data[].value) or by explicitly setting text_content=False to allow structured data.

Rule to remember:
- If your jq_schema returns a dict, set text_content=False.
- If your jq_schema returns text, keep the default behavior (text_content=True).

In [35]:
from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(file_path = 'sample.json', jq_schema= ".data[]", text_content=False)

doc4 = loader.load()

doc4


[Document(metadata={'source': '/Users/mohinikathrotiya/Desktop/Langchain/sample.json', 'seq_num': 1}, page_content='{"id": 1, "value": "A"}'),
 Document(metadata={'source': '/Users/mohinikathrotiya/Desktop/Langchain/sample.json', 'seq_num': 2}, page_content='{"id": 2, "value": "B"}')]

In [37]:
loader1 = JSONLoader(file_path= "sample1.json", jq_schema= '.[]', text_content=False)

doc5 = loader1.load()

doc5

[Document(metadata={'source': '/Users/mohinikathrotiya/Desktop/Langchain/sample1.json', 'seq_num': 1}, page_content='User Reviews'),
 Document(metadata={'source': '/Users/mohinikathrotiya/Desktop/Langchain/sample1.json', 'seq_num': 2}, page_content='[{"review_id": 101, "user": "Alice", "comment": "The product quality is excellent", "rating": 5}, {"review_id": 102, "user": "Bob", "comment": "Delivery was slow but the product is good", "rating": 4}]')]

In [None]:
# jq_schema retuens a dict
loader1 = JSONLoader(file_path= "sample1.json", jq_schema= '.reviews[]', text_content=False)

doc5 = loader1.load()

doc5

[Document(metadata={'source': '/Users/mohinikathrotiya/Desktop/Langchain/sample1.json', 'seq_num': 1}, page_content='{"review_id": 101, "user": "Alice", "comment": "The product quality is excellent", "rating": 5}'),
 Document(metadata={'source': '/Users/mohinikathrotiya/Desktop/Langchain/sample1.json', 'seq_num': 2}, page_content='{"review_id": 102, "user": "Bob", "comment": "Delivery was slow but the product is good", "rating": 4}')]

In [None]:
# jq_schema returns a text
# No need to add text_content, default True works

loader1 = JSONLoader(file_path= "sample1.json", jq_schema= '.reviews[].comment')

doc6 = loader1.load()

doc6

[Document(metadata={'source': '/Users/mohinikathrotiya/Desktop/Langchain/sample1.json', 'seq_num': 1}, page_content='The product quality is excellent'),
 Document(metadata={'source': '/Users/mohinikathrotiya/Desktop/Langchain/sample1.json', 'seq_num': 2}, page_content='Delivery was slow but the product is good')]

#### Practice Problems

#### Problem 1)

Task:
- You have 2 PDFs inside the papers/ folder.
- Load all PDFs using DirectoryLoader + PyPDFLoader

Print:
- Total number of Document objects
- Number of unique PDF files
- Page count per PDF


In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from collections import defaultdict
loader = DirectoryLoader(
    "papers", 
    glob= '**/*.pdf',
    loader_cls= PyPDFLoader
)

docs = loader.load()
print('Total Pages:', len(docs)) # Total number of pages in all the PDF files

# Number of unique PDF files
sources= [doc.metadata['source'] for doc in docs]
unique_pdfs = set(sources)
print('Unique PDFs:', len(unique_pdfs))

# Page count per PDF
page_count = defaultdict(int) # create a dict, if key does not exists it's value start from 0

for doc in docs:
    pdf_name = doc.metadata['source'].split('/')[-1] # extract the pdf file name
    page_count[pdf_name] += 1

print('Page Count per PDF:')
for pdf, count in page_count.items():
    print(f'{pdf}: {count} pages')

Total Pages: 52
Unique PDFs: 2
Page Count per PDF:
2203.14465v2.pdf: 30 pages
2501.12948v1.pdf: 22 pages


In [44]:
docs

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1'}, page_content='STaR: Self-Taught Reasoner\nBootstrapping Reasoning With Reasoning\nEric Zelikman∗1, Yuhuai Wu∗12, Jesse Mu1, Noah D. Goodman1\n1Department of Computer Science, Stanford University\n2 Google Research\n{ezelikman, yuhuai, muj, ngoodman}@stanford.edu\nAbstract\nGenerating step-by-step "chain-of-thought" rationales improves language model\nperformance on complex reasoning tasks like mathematics or commonsense\nquestion-answering. However, inducing language model rationale generation cur-\nrently requires either constructing massive rational

In [47]:
page_count

defaultdict(int, {'2203.14465v2.pdf': 30, '2501.12948v1.pdf': 22})

##### Problem 2) 

Task
1) From the loaded PDF documents:
    - Create a dictionary
2) Add a new metadata field to every document
3) Print one document to confirm the new metadata exists.

In [54]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from collections import defaultdict

loader = DirectoryLoader(
    "papers", 
    glob= '**/*.pdf',
    loader_cls= PyPDFLoader
)

docs = loader.load()

# Page count per PDF
page_count = defaultdict(int) # create a dict, if key does not exists it's value start from 0

for doc in docs:
    paper_name = doc.metadata['source'].split('/')[-1] # extract the pdf file name
    doc.metadata['paper_name'] = paper_name
    page_count[paper_name] += 1

page_count_dict = dict(page_count)

print("Page count dictionary:")
print(page_count_dict)

print("\nSample document metadata:")
print(docs[0].metadata)

Page count dictionary:
{'2203.14465v2.pdf': 30, '2501.12948v1.pdf': 22}

Sample document metadata:
{'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1', 'paper_name': '2203.14465v2.pdf'}


#### Problem 3) 
Task:
1) Load:    
    - PDFs from data/papers/
    - Text from data/notes.txt
2) Combine them into one list of Documents
3) Add metadata:
    - doc_type = pdf or text
4) Print:
    - Total documents
    - Count by doc_type

In [56]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader

# 1) Load pdf and txt files
pdf_loader = DirectoryLoader(
    'data/papers',
    glob = '**/*.pdf',
    loader_cls= PyPDFLoader
)
pdf_docs = pdf_loader.load()

# add doc_type metadata to pdf docs
for d in pdf_docs:
    d.metadata['doc_type'] = 'pdf'
    
# 2) Load text file

text_docs = TextLoader('data/notes.txt').load()

## add doc_type metadata to text docs
for d in text_docs:
    d.metadata['doc_type'] = 'text'

# 3) Combining all into one list
all_docs = pdf_docs + text_docs

# 4) Print Totals
pdf_count = sum(1 for d in all_docs if d.metadata.get('doc_type') == 'pdf')
text_count = sum(1 for d in all_docs if d.metadata.get('doc_type') == 'text')

print("Count by doc_type:")
print("pdf :", pdf_count)
print("text:", text_count)

# 5) Showing sample metadata
print("\nSample PDF doc metadata:")
print(pdf_docs[0].metadata)

print("\nSample Text doc metadata:")
print(text_docs[0].metadata)

Count by doc_type:
pdf : 52
text: 1

Sample PDF doc metadata:
{'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1', 'doc_type': 'pdf'}

Sample Text doc metadata:
{'source': 'data/notes.txt', 'doc_type': 'text'}


In [57]:
all_docs

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1', 'doc_type': 'pdf'}, page_content='STaR: Self-Taught Reasoner\nBootstrapping Reasoning With Reasoning\nEric Zelikman∗1, Yuhuai Wu∗12, Jesse Mu1, Noah D. Goodman1\n1Department of Computer Science, Stanford University\n2 Google Research\n{ezelikman, yuhuai, muj, ngoodman}@stanford.edu\nAbstract\nGenerating step-by-step "chain-of-thought" rationales improves language model\nperformance on complex reasoning tasks like mathematics or commonsense\nquestion-answering. However, inducing language model rationale generation cur-\nrently requires either const