# Document Loaders in LangChain

In [None]:
!pip install langchain-community pypdf

# Import Libraries

In [3]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader, WebBaseLoader, CSVLoader

# 1. `TextLoader`

In [4]:
# Build a loader for the plain-text file, reading it as UTF-8.
text_loader = TextLoader(
    file_path='data/text_data.txt',
    encoding='utf8',
)

# load() returns the file's content wrapped in a list of Document objects.
text_doc = text_loader.load()

In [5]:
# Show how many items the text loader returned, then the type of the container.
display(len(text_doc))
type(text_doc)

1

list

In [7]:
# Inspect the first loaded Document in full (metadata plus page_content).
text_doc[0]

Document(metadata={'source': 'data/text_data.txt'}, page_content='Agriculture is the practice of cultivating the soil, planting, raising, and harvesting both food and non-food crops, as well as livestock production. Broader definitions also include forestry and aquaculture. Agriculture was a key factor in the rise of sedentary human civilization, whereby farming of domesticated plants and animals created food surpluses that enabled people to live in the cities. While humans started gathering grains at least 105,000 years ago, nascent farmers only began planting them around 11,500 years ago. Sheep, goats, pigs, and cattle were domesticated around 10,000 years ago. Plants were independently cultivated in at least 11 regions of the world. In the 20th century, industrial agriculture based on large-scale monocultures came to dominate agricultural output.')

In [8]:
# Metadata dict attached to the first Document.
text_doc[0].metadata

{'source': 'data/text_data.txt'}

In [9]:
# Text content of the first Document.
text_doc[0].page_content

'Agriculture is the practice of cultivating the soil, planting, raising, and harvesting both food and non-food crops, as well as livestock production. Broader definitions also include forestry and aquaculture. Agriculture was a key factor in the rise of sedentary human civilization, whereby farming of domesticated plants and animals created food surpluses that enabled people to live in the cities. While humans started gathering grains at least 105,000 years ago, nascent farmers only began planting them around 11,500 years ago. Sheep, goats, pigs, and cattle were domesticated around 10,000 years ago. Plants were independently cultivated in at least 11 regions of the world. In the 20th century, industrial agriculture based on large-scale monocultures came to dominate agricultural output.'

# 2. `PyPDFLoader`

In [12]:
# Build a loader for the PDF file.
pdf_loader = PyPDFLoader(
    file_path='data/Mid_Notes.pdf',
)

# load() reads the PDF into a list of Document objects.
pdf_doc = pdf_loader.load()

In [14]:
# Show how many items the PDF loader returned, then the type of the container.
display(len(pdf_doc))
type(pdf_doc)

19

list

In [15]:
# Inspect the first loaded Document in full (metadata plus page_content).
pdf_doc[0]

Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-11-09T00:39:04+06:00', 'author': 'Shorna', 'moddate': '2025-11-09T00:39:04+06:00', 'source': 'data/Mid_Notes.pdf', 'total_pages': 19, 'page': 0, 'page_label': '1'}, page_content='Page | 1  \n \nLecture 1: The Systems Development Environment \n1. What is Systems Analysis and Design? \n\uf0b7 Definition: A complex organizational process used to develop and maintain computer-\nbased information systems. \n\uf0b7 Key Role: The Systems Analyst is the professional responsible for this process. \n\uf0b7 The Approach: It is driven by three components: \no Methodologies: A comprehensive process to follow (e.g., SDLC). \no Techniques: The methods for completing each step (e.g., data modeling). \no Tools: Software programs that automate the process (e.g., CASE tools). \n2. The Core Process: Systems Development Life Cycle (SDLC) \nThe SDLC is the traditional, structured process for devel

In [16]:
# Text content of the first Document from the PDF.
pdf_doc[0].page_content

'Page | 1  \n \nLecture 1: The Systems Development Environment \n1. What is Systems Analysis and Design? \n\uf0b7 Definition: A complex organizational process used to develop and maintain computer-\nbased information systems. \n\uf0b7 Key Role: The Systems Analyst is the professional responsible for this process. \n\uf0b7 The Approach: It is driven by three components: \no Methodologies: A comprehensive process to follow (e.g., SDLC). \no Techniques: The methods for completing each step (e.g., data modeling). \no Tools: Software programs that automate the process (e.g., CASE tools). \n2. The Core Process: Systems Development Life Cycle (SDLC) \nThe SDLC is the traditional, structured process for developing information systems. It consists of \nfive phases: \n1. Planning: Identify, prioritize, and plan organizational information system needs. \n2. Analysis: Study and structure system requirements. Understand the current system and its \nproblems. \n3. Design: \no Logical Design: Describ

# 3. `DirectoryLoader`

In [19]:
%%time
dir_pdf_loader = DirectoryLoader(path='data/pdf_dir', glob=['*.pdf'], loader_cls=PyPDFLoader)
dir_pdf_doc = dir_pdf_loader.load()
type(dir_pdf_doc)

CPU times: user 4.43 s, sys: 37 ms, total: 4.47 s
Wall time: 4.54 s


list

In [36]:
%%time
dir_pdf_loader = DirectoryLoader(path='data/pdf_dir', glob=['*.pdf'], loader_cls=PyPDFLoader)
dir_pdf_doc = dir_pdf_loader.lazy_load()
type(dir_pdf_doc)

CPU times: user 504 µs, sys: 0 ns, total: 504 µs
Wall time: 468 µs


generator

In [37]:
# Pull only the first two documents out of the lazy generator, then stop.
for doc_number, doc in enumerate(dir_pdf_doc, start=1):
    print(doc)
    if doc_number == 2:
        break


page_content='Chapter 5 Copyright © 2017 Pearson Education, Ltd. 5-1
Initiating and Planning Systems 
Development Projects
Modern Systems Analysis
and Design
Eighth Edition, Global Edition 
Joseph S. Valacich
Joey F. George' metadata={'producer': 'PyPDF', 'creator': 'Google', 'creationdate': '', 'title': 'Copy of Lecture 5.pptx', 'source': 'data/pdf_dir/Lecture 5.pdf', 'total_pages': 21, 'page': 0, 'page_label': '1'}
page_content='Chapter 5 Copyright © 2017 Pearson Education, Ltd. 5-2
Learning Objectives
✔ Describe the steps involved in the project initiation and planning process.
✔ List and describe various methods for assessing project feasibility.
✔ Describe the activities needed to build and review the baseline project plan.
✔ Describe the activities and participant roles within a structured walkthrough.' metadata={'producer': 'PyPDF', 'creator': 'Google', 'creationdate': '', 'title': 'Copy of Lecture 5.pptx', 'source': 'data/pdf_dir/Lecture 5.pdf', 'total_pages': 21, 'page': 1, 'p

# 4. `WebBaseLoader`

In [38]:
# Wikipedia pages to fetch.
url1 = 'https://en.wikipedia.org/wiki/Agriculture'
url2 = 'https://en.wikipedia.org/wiki/History_of_agriculture'

# Hand both URLs to a single loader.
web_loader = WebBaseLoader(
    web_path=[url1, url2],
)
web_doc = web_loader.load()

type(web_doc)

list

In [40]:
# Number of documents returned for the URLs given to the loader.
len(web_doc)

2

In [41]:
# Inspect the first loaded web Document.
web_doc[0]



# 5. `CSVLoader`

In [46]:
# Build a loader for the CSV file; csv_args sets the field delimiter and
# the quote character used when parsing.
csv_loader = CSVLoader(
    file_path="data/train.csv",
    csv_args={
        "delimiter": ",",
        "quotechar": '"'
    }
)
csv_doc = csv_loader.load()

# Inspect the loaded result (not the loader object), as the other sections do.
type(csv_doc)

In [48]:
# Number of Documents produced from the CSV file.
len(csv_doc)

1309

In [49]:
# Inspect the first Document produced from the CSV file.
csv_doc[0]

Document(metadata={'source': 'data/train.csv', 'row': 0}, page_content='Passengerid: 1\nAge: 22\nFare: 7.25\nSex: 0\nsibsp: 1\nzero: 0\nParch: 0\nPclass: 3\nEmbarked: 2\n2urvived: 0')