In [None]:
%pip install langchain, unstructured, pdfminer, pypdf, arxiv, wikipedia, pymupdf

# for UnstructuredPDFLoader
%pip install pdfminer.six, unstructured_inference, pi_heif, pdf2image

In [4]:
# Import the PDF loaders from langchain_community, the same package the
# Wikipedia and arXiv cells of this notebook already import from, instead of
# the deprecated `langchain.document_loaders` re-export.
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PDFMinerLoader

In [None]:
# Basic PDF loading - good for simple text extraction
# Reads "attention.pdf" relative to the notebook's working directory.
loader = PyPDFLoader("attention.pdf")
# load() returns a list of Document objects; in the recorded output each entry
# carries 'page', 'page_label' and 'total_pages' metadata.
documents = loader.load()
# Bare last expression so the notebook echoes the loaded list.
documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszk

In [None]:
# For complex layouts and better text extraction
# Reads the same "attention.pdf" file; the setup cell installs extra packages
# specifically for this loader.
loader = UnstructuredPDFLoader("attention.pdf")
# The result is stored but not displayed by this cell.
documents = loader.load()

In [8]:
# For detailed control over PDF parsing
loader = PDFMinerLoader("attention.pdf")
# NOTE(review): the recorded output's metadata has 'total_pages' but no
# per-page 'page' key, which suggests the whole PDF comes back as a single
# Document - confirm against the loader's documentation.
documents = loader.load()
# Bare last expression so the notebook echoes the loaded list.
documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': 'False', 'total_pages': 15, 'source': 'attention.pdf'}, page_content='3\n2\n0\n2\n\ng\nu\nA\n2\n\n]\nL\nC\n.\ns\nc\n[\n\n7\nv\n2\n6\n7\n3\n0\n.\n6\n0\n7\n1\n:\nv\ni\nX\nr\na\n\nProvided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\n\nAttention Is All You Need\n\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\n\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\n\nNiki Parmar∗\nGoogle Research\nnikip@google.com\n\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\n\nLlion Jones∗\nGoogle Research\nllion@google.com\n\nAidan N. Go

### Text and Markdown Files

In [10]:
# Import from langchain_community (consistent with the Wikipedia and arXiv
# cells) rather than the deprecated `langchain.document_loaders` re-export.
from langchain_community.document_loaders import TextLoader, UnstructuredMarkdownLoader

# Simple text files
# The encoding is passed explicitly so decoding does not depend on the
# platform's default.
loader = TextLoader("documentation.txt", encoding="utf-8")
documents = loader.load()
# Bare last expression so the notebook echoes the loaded list.
documents


[Document(metadata={'source': 'documentation.txt'}, page_content="Introduction\nLangChain is a framework for developing applications powered by large language models (LLMs).\n\nLangChain simplifies every stage of the LLM application lifecycle:\n\nDevelopment: Build your applications using LangChain's open-source components and third-party integrations. Use LangGraph to build stateful agents with first-class streaming and human-in-the-loop support.\nProductionization: Use LangSmith to inspect, monitor and evaluate your applications, so that you can continuously optimize and deploy with confidence.\nDeployment: Turn your LangGraph applications into production-ready APIs and Assistants with LangGraph Platform.\n\nLangChain implements a standard interface for large language models and related technologies, such as embedding models and vector stores, and integrates with hundreds of providers. See the integrations page for more.\n\n")]

In [None]:
# Markdown files with structure preservation
# Reads "README.md" relative to the notebook's working directory.
loader = UnstructuredMarkdownLoader("README.md")
documents = loader.load()
# Bare last expression so the notebook echoes the loaded list.
documents

### Microsoft Office Documents

In [None]:
# Import from langchain_community (consistent with the Wikipedia and arXiv
# cells) rather than the deprecated `langchain.document_loaders` re-export.
from langchain_community.document_loaders import (
    UnstructuredExcelLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

# NOTE: `loader` and `documents` are rebound by each section below, so after
# this cell runs only the PowerPoint result is still referenced.

# Word documents
loader = UnstructuredWordDocumentLoader("proposal.docx")
documents = loader.load()

# Excel files
loader = UnstructuredExcelLoader("data.xlsx")
documents = loader.load()

# PowerPoint presentations
loader = UnstructuredPowerPointLoader("presentation.pptx")
documents = loader.load()

### CSV and Structured Data

In [None]:
# Import from langchain_community (consistent with the Wikipedia and arXiv
# cells) rather than the deprecated `langchain.document_loaders` re-export.
from langchain_community.document_loaders import CSVLoader, UnstructuredCSVLoader

# NOTE: `loader` and `documents` are rebound by the second section, so after
# this cell runs only the UnstructuredCSVLoader result is still referenced.

# Basic CSV loading
loader = CSVLoader("sales_data.csv")
documents = loader.load()

# Advanced CSV with custom formatting
# NOTE(review): mode="elements" presumably yields one Document per parsed
# element instead of a single combined Document - confirm in the loader docs.
loader = UnstructuredCSVLoader("complex_data.csv", mode="elements")
documents = loader.load()

### Web-Based Document Loaders

In [None]:
import os

# This cell's recorded output shows "USER_AGENT environment variable not set".
# Set an identifying value before the loader is imported so requests are
# labelled; setdefault keeps any value already exported in the environment.
os.environ.setdefault("USER_AGENT", "langchain-document-loaders-notebook")

# Import from langchain_community (consistent with the Wikipedia and arXiv
# cells) rather than the deprecated `langchain.document_loaders` re-export.
from langchain_community.document_loaders import WebBaseLoader

# Simple web scraping
# Requires network access to fetch the page.
loader = WebBaseLoader("https://python.langchain.com/docs/introduction/")
documents = loader.load()
# Bare last expression so the notebook echoes the loaded list.
documents

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://python.langchain.com/docs/introduction/', 'title': 'Introduction | \uf8ffü¶úÔ∏è\uf8ffüîó LangChain', 'description': 'LangChain is a framework for developing applications powered by large language models (LLMs).', 'language': 'en'}, page_content='\n\n\n\n\nIntroduction | \uf8ffü¶úÔ∏è\uf8ffüîó LangChain\n\n\n\n\n\n\nSkip to main contentOur Building Ambient Agents with LangGraph course is now available on LangChain Academy!IntegrationsAPI ReferenceMoreContributingPeopleError referenceLangSmithLangGraphLangChain HubLangChain JS/TSv0.3v0.3v0.2v0.1\uf8ffüí¨SearchIntroductionTutorialsBuild a Question Answering application over a Graph DatabaseTutorialsBuild a simple LLM application with chat models and prompt templatesBuild a ChatbotBuild a Retrieval Augmented Generation (RAG) App: Part 2Build an Extraction ChainBuild an AgentTaggingBuild a Retrieval Augmented Generation (RAG) App: Part 1Build a semantic search engineBuild a Question/Answering system ove

In [None]:
from langchain_community.document_loaders import WikipediaLoader

# Fetch at most two Wikipedia articles matching the query (requires network access).
docs = WikipediaLoader(query="Agentic AI", load_max_docs=2).load()

# A bare `len(docs)` in the middle of a cell is evaluated and thrown away
# (only the final expression is echoed), so report the count explicitly.
print(f"Loaded {len(docs)} document(s)")
print(docs)

[Document(metadata={'title': 'Agentic AI', 'summary': 'Agentic AI is a class of artificial intelligence that focuses on autonomous systems that can make decisions and perform tasks without human intervention. The independent systems automatically respond to conditions, to produce process results. The field is closely linked to agentic automation, also known as agent-based process management systems, when applied to process automation. Applications include software development, customer support, cybersecurity and business intelligence. \n\n', 'source': 'https://en.wikipedia.org/wiki/Agentic_AI'}, page_content="Agentic AI is a class of artificial intelligence that focuses on autonomous systems that can make decisions and perform tasks without human intervention. The independent systems automatically respond to conditions, to produce process results. The field is closely linked to agentic automation, also known as agent-based process management systems, when applied to process automation.

### arXiv


In [1]:
from langchain_community.document_loaders import ArxivLoader
# The query is an arXiv identifier; the recorded output shows a single
# document loaded even though load_max_docs allows up to two.
docs = ArxivLoader(query="1706.03762", load_max_docs=2).load()
# Bare last expression so the notebook echoes the number of loaded documents.
len(docs)

1