1. Load document

In [37]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
path_dir = './data_source'
pdf_loader = PyPDFDirectoryLoader(path_dir)
pdf_pages = pdf_loader.load()


2. Split document

In [38]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List

In [39]:

chunk_size = 300
chunk_overlap = 30
separators: List[str] = ['\n\n', '\n', ' ', ''] 

splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap,
    separators = separators,
    is_separator_regex = False,
    length_function = len,
)

docs = splitter.split_documents(pdf_pages)
      
cleaned_docs = [doc for doc in docs if isinstance(doc.page_content, str) and len(doc.page_content.strip()) > 0]

len(cleaned_docs)

180

3. Embedding Model

In [40]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [41]:
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

4. Chroma vector database

In [42]:
from langchain_community.vectorstores import Chroma

In [43]:
chroma_db = Chroma.from_documents(documents=cleaned_docs, embedding=embedding_model)

In [44]:
query = "what is GNNs"

In [45]:
similar_docs = chroma_db.similarity_search(query, k = 4)
similar_docs

[Document(metadata={'source': 'data_source/2.+Exploringthepotential.pdf', 'creator': 'Microsoft® Word 2010 Trial', 'page': 1, 'producer': 'Microsoft® Word 2010 Trial', 'author': 'Administrator', 'creationdate': '2025-06-06T15:21:03+07:00', 'moddate': '2025-06-06T15:21:03+07:00', 'total_pages': 11, 'page_label': '2'}, page_content='The core mechanism of GNNs is message passing  – a process in which information is exchanged between \nneighboring nodes through edges. In each computational layer, a node aggregates mess ages from its'),
 Document(metadata={'creationdate': '2025-06-06T15:21:03+07:00', 'creator': 'Microsoft® Word 2010 Trial', 'page': 1, 'moddate': '2025-06-06T15:21:03+07:00', 'total_pages': 11, 'producer': 'Microsoft® Word 2010 Trial', 'author': 'Administrator', 'page_label': '2', 'source': 'data_source/2.+Exploringthepotential.pdf'}, page_content='The core mechanism of GNNs is message passing  – a process in which information is exchanged between \nneighboring nodes through 