## Load document

In [1]:
from langchain_community.document_loaders import PyPDFLoader

pdf_url = "https://arxiv.org/pdf/2312.16862.pdf"

pdf_loader = PyPDFLoader(pdf_url)
pdf_pages = pdf_loader.load()

In [2]:
len(pdf_pages)

16

## Split document 

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


chunk_size = 300
chunk_overlap = 0

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    is_separator_regex=False,
)

docs = splitter.split_documents(pdf_pages)

In [4]:
len(docs)

198

In [5]:
docs[0]

Document(page_content='Published as a conference paper at COLM 2024\nTinyGPT-V: Efficient Multimodal Large Language Model\nvia Small Backbones\nZhengqing Yuan1, Zhaoxu Li2∗, Weiran Huang3, Yanfang Ye1, Lichao Sun2†\n1University of Notre Dame2Lehigh University3Shanghai Jiao Tong University\nAbstract', metadata={'source': 'https://arxiv.org/pdf/2312.16862.pdf', 'page': 0})

## Embedding model

In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings()

# 1. Chrom vector database

In [7]:
# %pip install langchain-chroma

In [8]:
from langchain_chroma import Chroma

In [9]:
chroma_db = Chroma.from_documents(docs, embedding=embedding_model)


Exception occurred invoking consumer for subscription ad0fd55aaed14b45ac2902e2379eeb3eto topic persistent://default/default/42fdc2c2-bbf3-4f90-a4f3-a0d26304b74a 'utf-8' codec can't encode character '\ud835' in position 86: surrogates not allowed


In [10]:
query = "what is multimodal large language models?"

In [11]:
similar_docs = chroma_db.similarity_search(query, k=4)

In [12]:
len(similar_docs)

4

In [13]:
similar_docs

[Document(page_content='1 Introduction\nIn recent years, the field of artificial intelligence has seen significant advancements through\nthe development of multimodal large language models (MLLMs), such as GPT-4V , which\nhave shown exceptional performance across a range of vision-language tasks (Yang', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2312.16862.pdf'}),
 Document(page_content='In recent years, multimodal large language models (MLLMs) such as GPT-\n4V have demonstrated remarkable advancements, excelling in a variety\nof vision-language tasks. Despite their prowess, the closed-source na-\nture and computational demands of such models limit their accessibility', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2312.16862.pdf'}),
 Document(page_content='Published as a conference paper at COLM 2024\nTinyGPT-V: Efficient Multimodal Large Language Model\nvia Small Backbones\nZhengqing Yuan1, Zhaoxu Li2∗, Weiran Huang3, Yanfang Ye1, Lichao Sun2†\n1University of Notre

# 2. FASII vector database

In [14]:
# %pip install faiss-cpu

In [15]:
from langchain_community.vectorstores import FAISS

In [16]:
faiss_db = FAISS.from_documents(docs, embedding=embedding_model)

In [17]:
query = "what is multimodal large language models?"

In [18]:
similar_docs = chroma_db.similarity_search(query, k=4)

In [19]:
len(similar_docs)

4

In [20]:
similar_docs

[Document(page_content='1 Introduction\nIn recent years, the field of artificial intelligence has seen significant advancements through\nthe development of multimodal large language models (MLLMs), such as GPT-4V , which\nhave shown exceptional performance across a range of vision-language tasks (Yang', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2312.16862.pdf'}),
 Document(page_content='In recent years, multimodal large language models (MLLMs) such as GPT-\n4V have demonstrated remarkable advancements, excelling in a variety\nof vision-language tasks. Despite their prowess, the closed-source na-\nture and computational demands of such models limit their accessibility', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2312.16862.pdf'}),
 Document(page_content='Published as a conference paper at COLM 2024\nTinyGPT-V: Efficient Multimodal Large Language Model\nvia Small Backbones\nZhengqing Yuan1, Zhaoxu Li2∗, Weiran Huang3, Yanfang Ye1, Lichao Sun2†\n1University of Notre