In [1]:
import os

import chromadb
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_ollama import OllamaEmbeddings

# Load environment variables (e.g. DATA_DIR, read later via os.getenv) from the
# research .env file. The path is relative, so it depends on the working directory.
# NOTE(review): the recorded output of this cell is False, which per python-dotenv
# means no variables were loaded — confirm the path points at an existing file.
load_dotenv("../../.env.research")

False

## Load text embeddings in Chroma
In this notebook, we load text embeddings of the Rummikub rules into Chroma. We use the `RecursiveCharacterTextSplitter` to split the text into chunks of at most 1000 characters. The splitter works recursively: it tries a list of separators in order (paragraph breaks, then line breaks, then spaces, and finally individual characters) until the resulting chunks are small enough. Chroma is a vector store, which lets us store and retrieve embeddings. A vector store is a collection of embeddings that can be queried, and embeddings are numerical representations of words or sentences that capture their meanings and relationships.

In [8]:
def load_pdfs_from_directory(directory_path):
    """Load every PDF found directly inside a directory.

    The directory is scanned non-recursively. Each file whose name ends in
    ".pdf" (case-insensitive) is read with ``PDFPlumberLoader`` and the
    documents it yields are appended to a single flat list.

    Args:
        directory_path: Path of the directory that contains the PDF files.

    Returns:
        list: The loaded documents, grouped by file in sorted filename order.
        Empty when the directory contains no PDF files.

    Raises:
        ValueError: If ``directory_path`` is None.
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            read (e.g. it does not exist).
    """
    if directory_path is None:
        # os.listdir(None) lists the current working directory, which would
        # silently ingest the wrong files when an env variable is unset.
        raise ValueError("directory_path must not be None")

    all_documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting document order reproducible across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        # Match the extension case-insensitively so "RULES.PDF" is included.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        all_documents.extend(PDFPlumberLoader(file_path=file_path).load())
    return all_documents

In [9]:
# os.getenv returns None when DATA_DIR is unset; fail here with a clear message
# instead of handing None to the loader.
data_dir = os.getenv("DATA_DIR")
if not data_dir:
    raise RuntimeError(
        "DATA_DIR is not set; define it in the environment or in the .env file "
        "loaded at the top of this notebook."
    )

pdf_docs = load_pdfs_from_directory(data_dir)

# Split the loaded documents into chunks of at most 1000 characters, with no
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
all_splits = text_splitter.split_documents(pdf_docs)

In [10]:
# Single source of truth for the collection name used below.
COLLECTION_NAME = "rummikub_rules"

# Connect to the Chroma server expected at localhost:5000.
client = chromadb.HttpClient(host="localhost", port=5000)

# Drop any existing collection so that re-running this cell starts from a clean
# collection instead of adding to the previous one.
# list_collections() returns plain names in some chromadb releases and
# Collection objects in others, so normalise both shapes to names before
# testing membership (a bare `name in list_collections()` is always False
# when Collection objects are returned).
existing_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
}
if COLLECTION_NAME in existing_names:
    client.delete_collection(COLLECTION_NAME)

# Embed every chunk with the Ollama "nomic-embed-text" model and store the
# result in the collection.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name=COLLECTION_NAME,
    client=client,
)

# Retriever with the vector store's default search settings.
retriever = vectorstore.as_retriever()

In [11]:
# Smoke test: query the retriever with a rules question and display the
# returned documents as the cell output.
retriever.invoke("Explain manipulation")

[Document(metadata={'CreationDate': "D:20241118154334+01'00'", 'Creator': 'Adobe Acrobat Pro (32-bit) 24.4.20272', 'ModDate': "D:20241118212718+01'00'", 'Producer': 'Adobe Acrobat Pro (32-bit) 24.4.20272', 'Title': '2600-0236-0041 080719_rummikub rules', 'file_path': './data\\rummikub_rules.pdf', 'page': 1, 'source': './data\\rummikub_rules.pdf', 'total_pages': 13}, page_content='Manipulation:\nManipulation is the most exciting part of playing "Rummikub®". Players try to table the greatest amount\nof tiles by rearranging or adding to sets which are already on the table. Sets can be manipulated in many\nways (examples follow) as long as at the end of each round only legitimate sets remain and no loose tiles\nare left over.\nAdd one or more tiles from rack to make new set:\nTiles on rack Tiles on table\n3 8 4 5 6 8 8 8\nBlue 4,5,6 are on the table. The player'),
 Document(metadata={'CreationDate': "D:20241118154334+01'00'", 'Creator': 'Adobe Acrobat Pro (32-bit) 24.4.20272', 'ModDate': "

## Conclusion
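Loading the text embeddings works as expected: the retriever returns the relevant passage from the rules for the test query. In the next notebook, we will try to load the images as well.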