In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain_ollama.llms import OllamaLLM

In [None]:
# TODO: point this at the PDF file you want to query.
pdf_path = "./data/mining_of_massive_datasets.pdf"

In [None]:
# Read the PDF at `pdf_path` into a fresh list of documents.
loader = PyPDFLoader(pdf_path)
docs = list(loader.load())

In [None]:
# Splitter configured for chunks of size 1500 with an overlap of 150
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [None]:
# Replace the loaded documents with their split chunks.
# NOTE: `docs` is rebound here, so re-running this cell feeds the chunks
# (not the originally loaded documents) back into the splitter.
docs = text_splitter.split_documents(documents=docs)
len(docs)

In [None]:
# Embedding model handed to the Chroma vector store in the cells below.
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
# Embed the chunks in `docs` and store them in a Chroma collection that is
# persisted under ./chroma_db.
# NOTE(review): re-running this cell against an existing ./chroma_db
# presumably adds the same chunks again -- confirm, and clear the directory
# first if duplicates matter.
db = Chroma.from_documents(
    documents=docs,
    embedding=embedding_function,
    persist_directory="./chroma_db",
)

In [None]:
# Sanity-check retrieval straight against the vector store.
# The hits get their own name so the chunk list in `docs` is not overwritten;
# otherwise re-running the indexing cell would index only these few hits.
similar_docs = db.similarity_search("What is data mining?")

In [None]:
# Rebind `db` to a Chroma store opened on the ./chroma_db directory, using
# the same embedding function as when the collection was built.
db = Chroma(
    embedding_function=embedding_function,
    persist_directory="./chroma_db",
)

In [None]:
# LLM used to generate answers.
# NOTE(review): presumably requires a reachable Ollama server with the
# "llama3" model available -- confirm for your environment.
model = OllamaLLM(
    model="llama3",
)

In [None]:
# Build a retrieval-augmented QA chain: the retriever pulls chunks from the
# vector store and `model` answers using them as context.
# The number of retrieved chunks must be passed via `search_kwargs`; a bare
# `k=10` keyword on `as_retriever` is not a search parameter, so the
# retriever would fall back to its default result count.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=db.as_retriever(search_kwargs={"k": 10}),
)

In [None]:
# Ask the chain a question. `invoke` replaces the deprecated direct call
# (`qa_chain({...})`) and returns the same output dict, which the next cell
# reads under the "result" key.
question = "What is data mining?"
result = qa_chain.invoke({"query": question})


In [None]:
# Pull the value stored under "result" in the chain output and print it.
answer = result["result"]
print(answer)