# 1. Load from PDF file

In [13]:
# Local file or online file 
# 16 Pages
url = "https://arxiv.org/pdf/2312.16862.pdf"

# 1.1 Load per page

In [14]:
from langchain_community.document_loaders import PyPDFLoader

pdf_loader = PyPDFLoader(url)
docs = pdf_loader.load()
len(docs) 

16

In [15]:
docs[0]

Document(page_content='Published as a conference paper at COLM 2024\nTinyGPT-V: Efficient Multimodal Large Language Model\nvia Small Backbones\nZhengqing Yuan1, Zhaoxu Li2∗, Weiran Huang3, Yanfang Ye1, Lichao Sun2†\n1University of Notre Dame2Lehigh University3Shanghai Jiao Tong University\nAbstract\nIn recent years, multimodal large language models (MLLMs) such as GPT-\n4V have demonstrated remarkable advancements, excelling in a variety\nof vision-language tasks. Despite their prowess, the closed-source na-\nture and computational demands of such models limit their accessibility\nand applicability. This study introduces TinyGPT-V , a novel open-source\nMLLM, designed for efficient training and inference across various vision-\nlanguage tasks, including image captioning (IC) and visual question an-\nswering (VQA). Leveraging a compact yet powerful architecture, TinyGPT-\nV integrates the Phi-2 language model with pre-trained vision encoders,\nutilizing a unique mapping module for visua

## 1.2 Extract image as text

In [None]:
# !pip install rapidocr-onnxruntime

In [16]:
pdf_loader = PyPDFLoader(url, extract_images=True)
docs = pdf_loader.load()
len(docs) 

16

## 1.3 Load UnStructured PDF

In [19]:
from langchain_community.document_loaders import PDFMinerLoader
pdf_loader = PDFMinerLoader(url)
docs = pdf_loader.load()
len(docs) 

1

## 1.4 Load PDF from dir

In [43]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

path_dir = "./data_source"
pdf_loader = PyPDFDirectoryLoader(path_dir)
docs = pdf_loader.load()
len(docs) # 36 pages from 2 pdf files

36

# 2. Load from HTML file

## 2.1 Load UnStructured

In [37]:
from langchain_community.document_loaders import UnstructuredHTMLLoader

html_path = "./data_source/ai-oss.html"

html_loader = UnstructuredHTMLLoader(html_path)
docs = html_loader.load()
docs

[Document(page_content="What I learned from looking at 900 most popular open source AI tools\n\nMar 14, 2024\n      \n      \n        • Chip Huyen\n\n[Hacker News discussion, LinkedIn discussion, Twitter thread]\n\nFour years ago, I did an analysis of the open source ML ecosystem. Since then, the landscape has changed, so I revisited the topic. This time, I focused exclusively on the stack around foundation models.\n\nThe full list of open source AI repos is hosted at llama-police. The list is updated every 6 hours. You can also find most of them on my cool-llm-repos list on GitHub.\n\nData\n\n….\n\nHow to add missing repos\n\nThe New AI Stack\n\n….\n\nAI stack over time\n\n……..\n\nApplications\n\n……..\n\nAI engineering\n\n……..\n\nModel development\n\n……..\n\nInfrastructure\n\nOpen source AI developers\n\n….\n\nOne-person billion-dollar companies?\n\n….\n\n1 million commits\n\nThe growing China’s open source ecosystem\n\nLive fast, die young\n\nMy personal favorite ideas\n\nConclusion\

In [38]:
docs[0].metadata

{'source': './data_source/ai-oss.html'}

## 2.2 Load with BS4

In [40]:
from langchain_community.document_loaders import BSHTMLLoader

html_loader = BSHTMLLoader(html_path)
docs = html_loader.load()
docs

[Document(page_content="\n\n\n\n\nWhat I learned from looking at 900 most popular open source AI tools\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nWhat I learned from looking at 900 most popular open source AI tools | Chip Huyen\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nChip Huyen\n\n\n\n\n\n\n\n\n\n\n\n\n\nBlog\nBooks\nList 100\nLlama Police\nMLOps Guide\nTiếng Việt\n\n\n\n\n\n\n\n\nWhat I learned from looking at 900 most popular open source AI tools\n\n\n        \n        Mar 14, 2024\n      \n      \n        • Chip Huyen\n\n\n\n[Hacker News discussion, LinkedIn discussion, Twitter thread]\nFour years ago, I did an analysis of the open source ML ecosystem. Since then, the landscape has changed, so I revisited the topic. This time, I focused exclusively on the stack around foundation models.\nThe full list of open source AI repos is hosted at llama-police. The list is updated every 6 hours. You can also find most of them on my cool-llm-repos list on GitHub.\n\nTable of contents\nData\n…. How to

Using bs4, the title will extracted into metadata

In [41]:
docs[0].metadata

{'source': './data_source/ai-oss.html',
 'title': 'What I learned from looking at 900 most popular open source AI tools'}

# 3. Load from Markdown file

In [48]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

markdown_file = "./data_source/OpenAI_cookbook_README.md"
modes = ["single", "elements", "paged"]

"""
single: load content into a single document
elements: split each element of markdown file into a document
paged: each page is a document
"""

markdown_loader = UnstructuredMarkdownLoader(markdown_file, mode=modes[0])
docs = markdown_loader.load()
docs

[Document(page_content='openai\n\nopenai-cookbook\n\nPublic\n\nNotifications\n\nFork\n    8.8k\n\nStar\n          55.7k\n\n\n\nCode\n\nIssues\n          32\n\nPull requests\n          40\n\nActions\n\nSecurity\n\nInsights\n\nCode\n\nIssues\n\nPull requests\n\nActions\n\nSecurity\n\nInsights', metadata={'source': './data_source/OpenAI_cookbook_README.md'})]

# 4. Load from Web page

In [28]:
import bs4
from langchain_community.document_loaders import WebBaseLoader


web_paths = ["https://huyenchip.com/2023/10/10/multimodal.html"]

classes = ['post-content', 'post-title', 'post-header', 'page-content']
bs4_strainer = bs4.SoupStrainer(class_=classes)

web_loader = WebBaseLoader(
    web_paths=web_paths,
    bs_kwargs=dict(
        parse_only=bs4_strainer
    ),
)
docs = web_loader.load()
docs

[Document(page_content="\n\n\n\nMultimodality and Large Multimodal Models (LMMs)\n\n\n        \n        Oct 10, 2023\n      \n      \n        ‚Ä¢ Chip Huyen\n\n\n\nFor a long time, each ML model operated in one data mode ‚Äì text (translation, language modeling), image (object detection, image classification), or audio (speech recognition).\nHowever, natural intelligence is not limited to just a single modality. Humans can read and write text. We can see images and watch videos. We listen to music to relax and watch out for strange noises to detect danger. Being able to work with multimodal data is essential for us or any AI to operate in the real world.\nOpenAI noted in their GPT-4V system card that ‚Äúincorporating additional modalities (such as image inputs) into LLMs is viewed by some as a key frontier in AI research and development.‚Äù\nIncorporating additional modalities to LLMs (Large Language Models) creates LMMs (Large Multimodal Models). In the last year, every week, a major 