In [1]:
!pip install -q chromadb==1.0.7 llama-index llama-index-core llama-index-embeddings-huggingface tf-keras llama-index-vector-stores-chroma
!pip list | grep -e "index-core" -e "index-embeddings" -e "chroma"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
chroma-hnswlib                           0.7.6
chromadb                                 1.0.7
llama-index-core                         0.12.34.post1
llama-index-embeddings-huggingface       0.5.3
llama-index-embeddings-openai            0.3.1
llama-index-vector-stores-chroma         0.4.1


In [2]:
# Notebook configuration: the directory that holds the source documents and
# the file extension the directory reader is restricted to.
text_data: str = ".txt"
datasource_path: str = "/tmp/data_path/"

# Echo the settings so the recorded output documents what was used.
print(f"Text Data Path is: {datasource_path}, extensions are: {text_data}")

Text Data Path is: /tmp/data_path/, extensions are: .txt


In [3]:
# Import the LlamaIndex / ChromaDB building blocks used by every cell below.
try:
    from llama_index.core.ingestion import IngestionPipeline
    from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
    from llama_index.core.node_parser import SemanticSplitterNodeParser
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
    from llama_index.vector_stores.chroma import ChromaVectorStore
    from chromadb import Client, Collection
except Exception as e:
    # Report the failure, then re-raise: the following cells cannot work
    # without these names, and swallowing the error here would only surface
    # later as an unrelated NameError.
    print(f"Caught Exception: {e}")
    raise

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Build a LlamaIndex directory reader over the configured folder, limited to
# the configured file extension.
loader: SimpleDirectoryReader = SimpleDirectoryReader(
    input_dir=datasource_path,
    required_exts=[text_data],
)

# Read the files and report how many documents came back and what type the
# first one has.
data = loader.load_data()
print(f"Number of document loaded: {len(data)}")
print(f" -> Each document is of type: {type(data[0])}")

Number of document loaded: 2
 -> Each document is of type: <class 'llama_index.core.schema.Document'>


In [5]:
# One line per loaded document: its id, its embedding attribute and its
# metadata dictionary.
for document in data:
    print(f"{document.doc_id}, {document.embedding}, {document.metadata}")

558d297c-476c-4771-b18b-246743072018, None, {'file_path': '/tmp/data_path/rfc2104.txt', 'file_name': 'rfc2104.txt', 'file_type': 'text/plain', 'file_size': 22916, 'creation_date': '2025-05-05', 'last_modified_date': '2025-05-05'}
67791bf3-6e60-42b0-83dc-7d32cefddd1d, None, {'file_path': '/tmp/data_path/rfc6248.txt', 'file_name': 'rfc6248.txt', 'file_type': 'text/plain', 'file_size': 10531, 'creation_date': '2025-05-05', 'last_modified_date': '2025-05-05'}


In [None]:
# Local embedding function backed by a HuggingFace model; the model is
# selected by name and passed explicitly as `model_name`.
embedding_model: str = "all-MiniLM-L6-v2"
hf_embedder = HuggingFaceEmbedding(model_name=embedding_model)

In [None]:
# Local in-memory ChromaDB instance: create (or reuse) the named collection
# and wrap it in the LlamaIndex vector-store adapter.
collection: str = "jupyter"
chroma_client: Client = Client()
chroma_collection: Collection = chroma_client.get_or_create_collection(
    collection,
    # Collection metadata asking for the "cosine" space on the hnsw index.
    metadata={"hnsw:space": "cosine"},
)
vector_store: ChromaVectorStore = ChromaVectorStore(
    chroma_collection=chroma_collection,
)

# Print the store to confirm the vector DB is available.
print(vector_store)

In [None]:
# Ingestion pipeline with two transformations applied in order: the semantic
# splitter (driven by the HuggingFace embedder) and then the embedder itself.
# The Chroma-backed vector store is handed to the pipeline as its store.
semantic_splitter = SemanticSplitterNodeParser(embed_model=hf_embedder)
txt_pipe: IngestionPipeline = IngestionPipeline(
    transformations=[semantic_splitter, hf_embedder],
    vector_store=vector_store,
)

In [None]:
# Execute the ingestion pipeline over the loaded documents, then report how
# many nodes it returned and how many items the Chroma collection now holds.
res = txt_pipe.run(documents=data)
ingested_count = len(res)
print(f"Ingested {ingested_count} semantically chunked documents")
stored_count = chroma_collection.count()
print(f"Vector DB contains {stored_count} items")

In [None]:
# Text used to query the vector database.
QUERY_TEXT = "QUESTION"

# Embedding of the query text. The retrieval cell below passes QUERY_TEXT
# (not this vector) to the retriever, so `query` is informational here.
query = hf_embedder.get_text_embedding(QUERY_TEXT)

# Wrap the existing vector store in an index and expose it as a retriever
# that returns the 5 most similar entries.
index: VectorStoreIndex = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=hf_embedder,
)
retriever = index.as_retriever(
    similarity_top_k=5,
    embed_model=hf_embedder,
)

In [None]:
# Run the retrieval for the query text and report how many hits came back.
top_k = retriever.retrieve(QUERY_TEXT)
hit_count = len(top_k)
print(f"Found {hit_count} documents")

# For every hit, show its id and its similarity score with three decimals.
for hit in top_k:
    print(f"ID: [{hit.id_}] - Score: {hit.score:.3f}")

In [None]:
# Second pipeline that only performs semantic splitting: no embedding
# transformation and no vector store are attached.
node_splitter = SemanticSplitterNodeParser(embed_model=hf_embedder)
ns_pipeline: IngestionPipeline = IngestionPipeline(transformations=[node_splitter])

In [None]:
# Split the loaded documents into nodes and report how many were produced.
ns_res = ns_pipeline.run(documents=data)
produced_count = len(ns_res)
print(f"Produced {produced_count} Semantically Correlated Nodes")

In [None]:
# Split the semantically chunked nodes into exactly `batch_num` batches.
from numpy import cumsum

batch_num: int = 5
nodes_len: int = len(ns_res)
print(f"Splitting {nodes_len} nodes in {batch_num} batches")

# Every batch receives `step` nodes and the first `extra` batches take one
# more. Slicing in fixed strides of `step` would instead leave the remainder
# in an additional batch (12 nodes / 5 batches -> 6 batches of 2), so the
# requested batch count would not be honoured.
step, extra = divmod(nodes_len, batch_num)
batches: list = []
if step > 0:
    start = 0
    for batch_idx in range(batch_num):
        stop = start + step + (1 if batch_idx < extra else 0)
        batches.append(ns_res[start:stop])
        start = stop
else:
    # Fewer nodes than batches: at least one batch would be empty.
    print(f"Refusing to split: Cannot prepare batches of {step} length")

# Report the real batch sizes (they can differ by one) and the running total,
# whose last value equals the number of nodes when a split was performed.
batch_sizes = [len(batch) for batch in batches]
print(f"Generated {len(batches)} batches of sizes {batch_sizes}")
print(cumsum(batch_sizes))