In [None]:
!pip install -q chromadb==1.0.7 llama-index llama-index-core llama-index-embeddings-huggingface tf-keras llama-index-vector-stores-chroma
!pip list | grep -e "index-core" -e "index-embeddings" -e "chroma"

In [None]:
# notebook configuration: the directory to read from and the file extension to ingest
text_data: str = ".txt"
datasource_path: str = "/tmp/data_path/"

# echo the settings so the output records what this run used
print(f"Text Data Path is: {datasource_path}, extensions are: {text_data}")

In [None]:
# import libraries
# Import errors are intentionally NOT caught: printing and carrying on would only defer the
# failure to a confusing NameError in a later cell. Let the import cell fail loudly instead.
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from chromadb import Client, Collection

In [None]:
# load the directory with the llamaindex loader, restricted to the configured extension
loader: SimpleDirectoryReader = SimpleDirectoryReader(input_dir=datasource_path, required_exts=[text_data])

# ok, what's inside?
data = loader.load_data()
print(f"Number of document loaded: {len(data)}")

# fail with an actionable message instead of a bare IndexError on data[0] when nothing was loaded
assert data, f"No '{text_data}' documents were loaded from {datasource_path}"
print(f" -> Each document is of type: {type(data[0])}")

In [None]:
# show id, embedding and metadata of every loaded Document, one line each
for doc in data:
    line = f"{doc.doc_id}, {doc.embedding}, {doc.metadata}"
    print(line)

In [None]:
# local embedding function built on a HuggingFace model; the model is selected by name
embedding_model: str = "all-MiniLM-L6-v2"
hf_embedder = HuggingFaceEmbedding(model_name=embedding_model)

In [None]:
# create a local in-memory instance of ChromaDB
collection: str = "jupyter"

# Client() is a factory function rather than a class, so it is not a valid type annotation;
# the client variable is therefore left unannotated.
chroma_client = Client()

# reuse the collection if it already exists, otherwise create it with the cosine space setting
chroma_collection: Collection = chroma_client.get_or_create_collection(
    collection,
    metadata={"hnsw:space": "cosine"},
)

# adapter that lets LlamaIndex read from / write to the Chroma collection
vector_store: ChromaVectorStore = ChromaVectorStore(chroma_collection=chroma_collection)

# ok vector db available
print(vector_store)

In [None]:
# assemble the ingestion pipeline: semantic splitting first, then embedding,
# with the Chroma-backed vector store attached
semantic_splitter = SemanticSplitterNodeParser(embed_model=hf_embedder)
txt_pipe: IngestionPipeline = IngestionPipeline(
    transformations=[semantic_splitter, hf_embedder],
    vector_store=vector_store,
)

In [None]:
# execute the pipeline over the loaded documents
res = txt_pipe.run(documents=data)

# report how many chunks came back and how many items the collection now holds
ingested_msg = f"Ingested {len(res)} semantically chunked documents"
stored_msg = f"Vector DB contains {chroma_collection.count()} items"
print(ingested_msg, stored_msg, sep="\n")

In [None]:
# text to search the vector DB for
QUERY_TEXT = "QUESTION"

# embedding of the query text
# NOTE(review): the retrieval cell passes QUERY_TEXT to the retriever directly and does not
# consume `query` -- confirm whether this precomputed embedding is still needed.
query = hf_embedder.get_text_embedding(QUERY_TEXT)

# wrap the existing vector store in an index, then expose it as a top-5 retriever
index: VectorStoreIndex = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=hf_embedder,
)
retriever = index.as_retriever(
    similarity_top_k=5,
    embed_model=hf_embedder,
)

In [None]:
# run the retriever on the query text and report how many hits came back
top_k = retriever.retrieve(QUERY_TEXT)
print(f"Found {len(top_k)} documents")

# one line per hit: id and similarity score rounded to three decimals
for item in top_k:
    score_line = f"ID: [{item.id_}] - Score: {item.score:.3f}"
    print(score_line)