In [1]:
import chromadb

from sentence_transformers import SentenceTransformer

from src.text_utils import extract_text_from_pdf, text_splitter
from src.utils import list_files_per_subject

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the list of files for the 'literature' subject. The exact semantics of
# list_files_per_subject live in src.utils (not shown here); the ingestion loop
# consumes each entry as a mapping with 'file_path' and 'dirname' keys.
complete_file_list = list_files_per_subject('literature')

In [3]:
# Instantiate the persistent ChromaDB client; its data lives under ./chroma_db
client = chromadb.PersistentClient(path="./chroma_db")

# Fetch the collection, creating it on first use. get_or_create_collection
# replaces the earlier try/except Exception fallback, which treated *any*
# failure of get_collection (not only "collection does not exist") as a cue
# to create the collection and so could mask unrelated errors.
collection = client.get_or_create_collection("meus_docs")

# Load the sentence-embedding model used to vectorise the text chunks
model = SentenceTransformer("all-MiniLM-L6-v2")

In [4]:
# Ingest every listed file: extract text -> split into chunks -> embed -> store.
for file in complete_file_list:
    # Normalise to str: it is reused in record ids and metadata, which must be
    # plain scalar values.
    file_path = str(file['file_path'])

    print("Extracting text from: ", file['file_path'])
    text = extract_text_from_pdf(file['file_path'])
    print("Extracted", len(text), "characters.")

    splitted_text = text_splitter(text,
                                  chunk_size=1000,
                                  chunk_overlap=200)
    print("Split into", len(splitted_text), "chunks.")

    # Nothing to embed or store for this file; writing empty id/document lists
    # to the collection would fail, so move on to the next file.
    if not splitted_text:
        print("No chunks produced, skipping.")
        continue

    print("Embedding chunks...")
    embeddings = model.encode(splitted_text, show_progress_bar=True)

    # Ids must be unique across the whole collection. The previous bare
    # "chunk_{i}" ids restarted at 0 for every file, so chunks of each later
    # file collided with ids already stored for the first one. Prefixing the
    # file path keeps them distinct per file.
    chunk_ids = [f"{file_path}::chunk_{i}" for i in range(len(splitted_text))]

    # "source" and "page" are kept as before ("page" holds the chunk index);
    # "file_path" is added so a hit can be traced back to its exact file.
    chunk_metadatas = [
        {"source": file["dirname"], "page": i, "file_path": file_path}
        for i in range(len(splitted_text))
    ]

    print("Storing in ChromaDB...")
    # upsert instead of add: the store is persistent, so re-running this cell
    # rewrites the same ids rather than colliding with records from an earlier
    # run. NOTE: records written by earlier runs under the old bare
    # "chunk_{i}" ids are not removed by this loop.
    collection.upsert(
        ids=chunk_ids,
        documents=splitted_text,
        embeddings=embeddings.tolist(),
        metadatas=chunk_metadatas
    )

Extracting text from:  literature/teste/teste.pdf
Extracted 17019 characters.
Split into 24 chunks.
Embedding chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.79it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/MLOps Now - The MLOps Platform_ Revolutionising Machine Learning Efficiency.pdf
Extracted 17019 characters.
Split into 24 chunks.
Embedding chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00, 34.66it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/Hidden_technical_debt_in_machine_learning_systems.pdf
Extracted 36224 characters.
Split into 47 chunks.
Embedding chunks...


Batches: 100%|██████████| 2/2 [00:00<00:00, 31.56it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/MLOps Now - MLOps Best Practices and Challenges.pdf
Extracted 9387 characters.
Split into 13 chunks.
Embedding chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00, 49.72it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/MLOps Now - ML Engineer vs Data Scientist.pdf
Extracted 5187 characters.
Split into 7 chunks.
Embedding chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.84it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/Introduction to MLOps _ Paperspace Blog.pdf
Extracted 16110 characters.
Split into 23 chunks.
Embedding chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00, 36.57it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/What is MLOps_ - Machine Learning Operations Explained - AWS.pdf
Extracted 13901 characters.
Split into 19 chunks.
Embedding chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00, 44.38it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/MLOps Now - What is MLOps_ Demystifying Machine Learning Operations.pdf
Extracted 16730 characters.
Split into 23 chunks.
Embedding chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00, 37.58it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/practitioners_guide_to_mlops_whitepaper.pdf
Extracted 64873 characters.
Split into 90 chunks.
Embedding chunks...


Batches: 100%|██████████| 3/3 [00:00<00:00, 31.80it/s]

Storing in ChromaDB...





Extracting text from:  literature/mlops/Operationalizing_Machine_Learning_An_Interview_Study.pdf
Extracted 110095 characters.
Split into 144 chunks.
Embedding chunks...


Batches: 100%|██████████| 5/5 [00:00<00:00, 27.62it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/What is MLOps.txt
Extracted 28925 characters.
Split into 38 chunks.
Embedding chunks...


Batches: 100%|██████████| 2/2 [00:00<00:00, 56.08it/s]


Storing in ChromaDB...
Extracting text from:  literature/mlops/MLOps_ Continuous delivery and automation pipelines in machine learning  _  Cloud Architecture Center  _  Google Cloud Documentation.pdf
Extracted 36653 characters.
Split into 49 chunks.
Embedding chunks...


Batches: 100%|██████████| 2/2 [00:00<00:00, 35.42it/s]

Storing in ChromaDB...



