In [1]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

In [2]:
# Embedding backend, served by a local Ollama instance.
# NOTE(review): "llama3.2" is a general chat model; a dedicated embedding
# model may give better retrieval quality -- confirm before relying on scores.
embeddings = OllamaEmbeddings(model="llama3.2")

# Source PDF, relative to the notebook's working directory.
file_path = "../data/main.pdf"

# Load the PDF into a list of documents.
pdf_loader = PyPDFLoader(file_path=file_path)
docs = pdf_loader.load()

# Splitter configuration: 1000-character chunks with a 200-character overlap.
# add_start_index=True asks the splitter to record each chunk's offset in
# the chunk metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

# Split every loaded document into chunks.
all_splits = text_splitter.split_documents(docs)

# Set up the vector store. `persist_directory` keeps the collection on disk,
# so its contents survive kernel restarts and repeated runs of this cell.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name='chroma_db',
    persist_directory='../data/chroma_db',
)

# Give every chunk a deterministic id (source path, page, start offset and
# ordinal position). With auto-generated ids, each re-run of this cell would
# append another full copy of the document to the persisted collection and
# similarity searches would return duplicated chunks. With stable ids the
# store overwrites the existing entries instead of adding new ones.
# NOTE(review): chunks stored by earlier runs under auto-generated ids are not
# removed by this change; clear the collection once if it already holds them.
chunk_ids = [
    f"{file_path}:{split.metadata.get('page')}:{split.metadata.get('start_index')}:{position}"
    for position, split in enumerate(all_splits)
]
ids = vector_store.add_documents(documents=all_splits, ids=chunk_ids)

In [9]:
# Scores are provider-specific. For this store the value is a distance
# metric, so it moves inversely to similarity.

results = vector_store.similarity_search_with_score(
    "PHP with different filling ratio"
)
best_match = results[0]
doc, score = best_match
# One print call; the output is the same as printing the score line and the
# document separately.
print(f"Score: {score}\n", doc, sep="\n")

Score: 0.783714234828949

page_content='2.4 Machine learning prediction for CLPHP
The area of machine learning is a fast-developing discipline enabled based on data optimization
[15]. The decision-tree-based foundation of the random forest technique involves dividing the data
into several branches based on distinct subsets of the data. Several decision trees are built using
the random forest method, and the characteristics are also randomly chosen [1]. This method
is unique in that it identifies and ranks the data-sets most pertinent features in relation to the
categorical replies. This can be helpful as an addition to the research on how individual features
affect output response. The primary drawback of this technique is the lengthy computing time
needed for implementation, which goes up as there are more trees to construct [22].
3 Results and Discussion
3.1 Analysis of a PHP at constant heat input with different filling ratios
After the performing the four sets of experiments, the s

In [10]:
# Embed the query text explicitly, then search the store with the raw vector
# instead of the query string.
embedding = embeddings.embed_query("PHP with different filling ratio")

results = vector_store.similarity_search_by_vector(embedding=embedding)
top_match = results[0]
print(top_match)

page_content='2.4 Machine learning prediction for CLPHP
The area of machine learning is a fast-developing discipline enabled based on data optimization
[15]. The decision-tree-based foundation of the random forest technique involves dividing the data
into several branches based on distinct subsets of the data. Several decision trees are built using
the random forest method, and the characteristics are also randomly chosen [1]. This method
is unique in that it identifies and ranks the data-sets most pertinent features in relation to the
categorical replies. This can be helpful as an addition to the research on how individual features
affect output response. The primary drawback of this technique is the lengthy computing time
needed for implementation, which goes up as there are more trees to construct [22].
3 Results and Discussion
3.1 Analysis of a PHP at constant heat input with different filling ratios
After the performing the four sets of experiments, the series of experiments data 

In [23]:
# making retriever
from typing import List
from langchain_core.documents import Document
from langchain_core.runnables import chain

@chain
def retriever(query: str) -> List[Document]:
    """Look up the best-matching chunk for ``query`` in ``vector_store``.

    Args:
        query: Free-text question to search for.

    Returns:
        The documents returned by a similarity search limited to one result.
    """
    top_k = 1  # only the single best match is requested
    matches = vector_store.similarity_search(query=query, k=top_k)
    return matches


# Quick manual check of the retriever; the cell displays the returned list.
retriever.invoke("What is PHP?")

[Document(id='a9368920-5905-45e9-a5a7-37a23dbf8f5c', metadata={'author': '', 'creationdate': '2023-10-21T13:58:37+02:00', 'creator': 'LaTeX with hyperref', 'keywords': '', 'moddate': '2023-10-21T13:58:37+02:00', 'page': 1, 'page_label': '2', 'producer': 'pdfTeX-1.40.25', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'source': '../data/main.pdf', 'start_index': 0, 'subject': '', 'title': '', 'total_pages': 17, 'trapped': '/False'}, page_content='Figure 1: Schematic of the PHPs\nAs per the literature review of PHPs, the use of various convectional fluids as working fluids are\nnot new to enhance the thermal performance of PHPs as shown in Table 1. Water and ethanol are\nwidely used as conventional fluids to increase heat transfer capability of PHP as shown as Table 1.\nIn addition to this, the use of various nanofluids as working fluids are also not newer to increase\nthe thermal performance of PHPs as shown in Table 2.\nThe 