In [1]:
!pip install -U pip
!pip install langchain langchain_community sentence_transformers
!pip install chromadb_client httpx starlette
!pip install boto3 tqdm fastapi

Collecting langchain
  Downloading langchain-0.2.5-py3-none-any.whl.metadata (7.0 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.5-py3-none-any.whl.metadata (2.5 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.31-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting langchain-core<0.3.0,>=0.2.7 (from langchain)
  Downloading langchain_core-0.2.9-py3-none-any.whl.metadata (6.0 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.1-py3-none-any.whl.metadata (2.2 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.80-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting tran

In [2]:
# import required libraries
try:
    import os, yaml
    import boto3
    from tqdm import tqdm
    from typing import Callable
    from boto3.s3.transfer import TransferConfig
    from botocore.exceptions import ClientError
    from chromadb import HttpClient
    from chromadb.config import Settings
    from chromadb import PersistentClient, Collection
    from chromadb.utils import embedding_functions as ef
    from langchain_community.document_loaders import DirectoryLoader, TextLoader
    from langchain_text_splitters import RecursiveCharacterTextSplitter as rts
    import uuid
except Exception as e:
    print(f"Caught exception: {e}")
    exit()

In [3]:
# dictionary class that holds parameters
# load values from a yaml file
class Parameters(object):
    """Attribute-style view over a (possibly nested) dictionary.

    Every key of ``data`` becomes an attribute of the instance. Values that
    are themselves dictionaries are wrapped recursively in ``Parameters`` so
    nested settings can be read as ``params.section.option``.

    The original mapping is kept in ``self.data``. If the mapping contains a
    key literally named ``"data"``, that key's value replaces the raw mapping
    on the attribute (the key wins).

    Args:
        data: mapping of parameter names to values; ``dict`` or a subclass.

    Raises:
        TypeError: if ``data`` is not a dictionary, or if a key is not a
            string (raised by ``setattr``).
    """

    def __init__(self, data: dict):
        # isinstance (rather than an exact type comparison) also accepts dict
        # subclasses such as OrderedDict.
        if not isinstance(data, dict):
            raise TypeError(f"Parameters: expected 'dict', got {type(data)}.")
        self.data = data

        # Iterate over the local mapping, not self.data: a key named "data"
        # overwrites the attribute, and re-reading self.data afterwards would
        # call .get() on that key's value instead of the original mapping.
        for key, value in data.items():
            if isinstance(value, dict):
                value = Parameters(value)
            setattr(self, key, value)
                
# load parameters file and read values into a dictionary class
try:
    # Parse parameters.yaml (resolved relative to the current working
    # directory) with the safe loader.
    with open("parameters.yaml") as parms:
        config_parms = yaml.safe_load(parms)
    # `creds` is the module-level settings object the later cells read
    # (creds.params, creds.training_data, creds.chromadb, creds.embeddings).
    # Parameters raises TypeError when the parsed document is not a dict;
    # that error is handled by the generic branch below.
    creds = Parameters(config_parms)
except yaml.YAMLError as e:
    # Malformed YAML: report and stop.
    print(f"Error loading YAML file: {e}")
    # NOTE(review): inside a Jupyter kernel exit() presumably shuts the kernel
    # down rather than just stopping this cell; confirm this is intended
    # (re-raising would keep the traceback and the kernel).
    exit()
except Exception as e:
    # Any other failure (e.g. the file cannot be opened) is reported the same way.
    print(f"Caught exception: {e}")
    exit()

In [4]:
# declare document splitter functions
def load_text_documents(path: str = ".", pattern: str = "**/*.txt",
                        multithread: bool = False) -> list:
    """Load every text file under ``path`` whose name matches ``pattern``.

    Files are read through ``TextLoader`` with encoding auto-detection turned
    on. The directory loader is configured with ``silent_errors=True`` and
    ``show_progress=True``.

    Args:
        path: root directory to scan.
        pattern: glob expression selecting the files to load.
        multithread: forwarded to the loader as ``use_multithreading``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns (the loaded documents).
    """
    loader_options = {
        "glob": pattern,
        "loader_cls": TextLoader,
        "loader_kwargs": {'autodetect_encoding': True},
        "use_multithreading": multithread,
        "silent_errors": True,
        "show_progress": True,
    }
    return DirectoryLoader(path, **loader_options).load()

# document splitter
def split_text_documents(documents: list = None,
                         chunk_size: int = 1000,
                         chunk_overlap: int = 0) -> list:
    """Split loaded documents into chunks with a recursive character splitter.

    Args:
        documents: documents to split; ``None`` short-circuits the call.
        chunk_size: forwarded to the splitter as ``chunk_size``.
        chunk_overlap: forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` for the given input, or ``None``
        when ``documents`` is ``None``.
    """
    if documents is not None:
        chunker = rts(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return chunker.split_documents(documents)
    return None

In [5]:
# embedding function: use a sentence transformer to calculate embeddings locally
def s_transformer(model: str = "all-MiniLM-L6-v2"):
    """Create a sentence-transformer embedding function for ChromaDB.

    Args:
        model: model name handed to ``SentenceTransformerEmbeddingFunction``
            as ``model_name``.

    Returns:
        The constructed embedding function object.
    """
    embedding_fn = ef.SentenceTransformerEmbeddingFunction(model_name=model)
    return embedding_fn


In [6]:
# ChromaDB connection function
def chroma_client(host: str = "localhost",
                  port: int = 8080,
                  allow_reset: bool = False) -> HttpClient:
    """Create an HTTP client for a ChromaDB server.

    Args:
        host: server host name passed to ``HttpClient``.
        port: server port passed to ``HttpClient``.
        allow_reset: value placed in the client ``Settings`` as ``allow_reset``.

    Returns:
        The ``HttpClient`` built from the arguments above.
    """
    return HttpClient(host=host,
                      port=port,
                      settings=Settings(allow_reset=allow_reset))

In [7]:
# chromadb interaction client
class RemoteChromaClient(object):
    """Convenience wrapper around one collection on a remote ChromaDB server.

    On construction it opens an HTTP client through ``chroma_client`` and gets
    (or creates) the named collection bound to the supplied embedding function.

    Args:
        host: ChromaDB server host.
        port: ChromaDB server port.
        collection: name of the collection to get or create.
        embedding_function: callable used by the collection to embed
            documents; mandatory.

    Raises:
        ValueError: if ``embedding_function`` is ``None``. ``ValueError`` is a
            subclass of ``Exception``, so existing ``except Exception``
            handlers keep working.
    """

    def __init__(self, host: str = "localhost",
                 port: int = 8080,
                 collection: str = "default",
                 embedding_function: Callable = None):
        # Validate before touching the network so a misconfigured call fails
        # fast instead of first opening a connection it will never use.
        if embedding_function is None:
            raise ValueError("RemoteChromaClient: embedding_function cannot be None: you must specify an embedding function")

        self._client: HttpClient = chroma_client(host=host, port=port)
        self._collection: Collection = self._client.get_or_create_collection(
            collection, embedding_function=embedding_function)

    def Client(self) -> HttpClient:
        """Return the underlying ChromaDB HTTP client."""
        return self._client

    def Collection(self) -> Collection:
        """Return the collection this wrapper operates on."""
        return self._collection

    def Heartbeat(self) -> int:
        """Return the value of the client's ``heartbeat()`` call."""
        return self._client.heartbeat()

    def GenerateEmbeddings(self, training_data_path: str = ".",
                           pattern: str = "**/*.txt",
                           chunk_size: int = 1000, chunk_overlap: int = 0,
                           multithread: bool = False,
                           batch_size: int = 100):
        """Load text files, split them into chunks and add them to the collection.

        Args:
            training_data_path: directory scanned for documents.
            pattern: glob selecting the files to load.
            chunk_size: chunk size handed to the splitter.
            chunk_overlap: chunk overlap handed to the splitter.
            multithread: whether the directory loader uses multithreading.
            batch_size: number of chunks sent per ``add`` call; values below 1
                are treated as 1.

        Every chunk gets a fresh ``uuid1`` id, so calling this twice on the
        same data stores the chunks twice.
        """
        # load custom knowledge data and tokenize it
        knowledge_body = load_text_documents(path=training_data_path,
                                             pattern=pattern,
                                             multithread=multithread)
        print(f"Loaded {len(knowledge_body)} Documents...")
        tokenized_docs = split_text_documents(documents=knowledge_body,
                                              chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap)
        print(f"Tokenized documents number: {len(tokenized_docs)}.")

        if not tokenized_docs:
            return

        # Send chunks in batches: one request (and one embedding pass) per
        # batch instead of one per chunk.
        step = max(1, int(batch_size))
        with tqdm(total=len(tokenized_docs), ascii=True, desc="Ingesting...") as progress:
            for start in range(0, len(tokenized_docs), step):
                batch = tokenized_docs[start:start + step]
                self._collection.add(ids=[str(uuid.uuid1()) for _ in batch],
                                     documents=[doc.page_content for doc in batch],
                                     metadatas=[doc.metadata for doc in batch])
                progress.update(len(batch))

    def __str__(self) -> str:
        return f"ChromaDB Client: {self._client.database} - Collection: {self._collection}"


In [8]:
# download training data from s3 if needed
try:
    # connect to MinIO and prepare buckets
    # Only the endpoint is logged: the access key is a credential and must not
    # end up in the saved notebook output.
    print(f"Accessing S3 endpoint {creds.params.url}...")

    # instantiate connection
    minio_api = boto3.client("s3",
                             endpoint_url=creds.params.url,
                             aws_access_key_id=creds.params.accessKey,
                             aws_secret_access_key=creds.params.secretKey)
except Exception as e:
    print(f"Caught exception: {e}")

# create folder to store training data
os.makedirs(creds.training_data.path, exist_ok=True)

# download every object of the training bucket into the local folder
try:
    # list_objects_v2 returns results one page at a time; keep requesting
    # pages with the continuation token until the listing is complete.
    list_kwargs = {"Bucket": creds.training_data.trainingDataBucket}
    while True:
        data_files = minio_api.list_objects_v2(**list_kwargs)
        for item in data_files.get("Contents") or []:
            key = item.get('Key')
            # keys ending in "/" are folder placeholders, not files
            if key.endswith("/"):
                continue
            target = "/".join((creds.training_data.path, key))
            # keys may contain "/" prefixes: make sure the parent folder exists
            os.makedirs(os.path.dirname(target), exist_ok=True)
            minio_api.download_file(creds.training_data.trainingDataBucket,
                                    key,
                                    target)
        if not data_files.get("IsTruncated"):
            break
        list_kwargs["ContinuationToken"] = data_files.get("NextContinuationToken")
except Exception as e:
    print(f"Caught Exception {e}")

Accessing S3 endpoint http://minio-svc.minio.svc.cluster.local:9000 with ACCESS_KEY O3wC8Aoi1e46YSoJerUm...


In [9]:
# instantiate connection to ChromaDB
# Announce the target taken from the loaded parameters.
print(f"Connecting to Chroma instance @ {creds.chromadb.host} on port {creds.chromadb.port}")

try:
    # Build the HTTP client via the chroma_client helper (allow_reset keeps its default).
    # NOTE(review): the port is passed as configured here, while the ingestion
    # cell wraps it in int(); confirm which form the parameter file guarantees.
    chroma_instance = chroma_client(host=creds.chromadb.host, port=creds.chromadb.port)
except Exception as e:
    # The failure is only printed; `chroma_instance` is not assigned in that case.
    print(f"Caught Exception: {e}")

Connecting to Chroma instance @ chromadb-chromadb-helm.chromadb.svc.cluster.local on port 8080
Caught Exception: Could not connect to a Chroma server. Are you sure it is running?


In [13]:
# create embedding function and start dataset vectorization
# Build the embedding function from the model name held in the parameters.
embed_func = s_transformer(model=creds.embeddings.sentence_transformer.model)
try:
    # Connect to ChromaDB and get or create the configured collection.
    cc = RemoteChromaClient(host=creds.chromadb.host,
                            port=int(creds.chromadb.port),
                            collection=creds.chromadb.collection,
                            embedding_function=embed_func)
    # Report the collection size before ingestion.
    print(f"Objects in collection: {cc.Collection().count()}")
    # Load, split and add the training documents using the configured
    # path, glob pattern, chunk size and chunk overlap.
    cc.GenerateEmbeddings(training_data_path=creds.training_data.path,
                          pattern=creds.training_data.pattern,
                          chunk_size=creds.training_data.chunk_size,
                          chunk_overlap=creds.training_data.chunk_overlap)
    # Report the collection size again so the two counts can be compared.
    print(f"Objects in collection after ingestion: {cc.Collection().count()}")
except Exception as e:
    # Any failure above is printed, not re-raised; `cc` may be undefined afterwards.
    print(f"Caught exception: {e}")


Objects in collection: 0


100%|██████████| 3/3 [00:00<00:00, 2144.33it/s]


Loaded 3 Documents...
Tokenized documents number: 164.


Ingesting...: 100%|##########| 164/164 [00:14<00:00, 11.55it/s]

Objects in collection after ingestion: 164





In [14]:
# Query embedding
try:
    # try to retrieve data
    # Query the collection held by `cc` (created in the ingestion cell) with
    # the text "rfc2104", requesting up to 10 results.
    results = cc.Collection().query(query_texts=["rfc2104"], n_results=10)
    # The whole result dictionary is printed as-is.
    print(results)
except Exception as e:
    # Failures are reported as text only; nothing is re-raised.
    print(f"Caught exception: {e}")

{'ids': [['77990f6e-2e1b-11ef-b811-0a580a80027b', '71a7fd72-2e1b-11ef-b811-0a580a80027b', '7772df92-2e1b-11ef-b811-0a580a80027b', '775bc104-2e1b-11ef-b811-0a580a80027b', '78aea300-2e1b-11ef-b811-0a580a80027b', '776ac35c-2e1b-11ef-b811-0a580a80027b', '78709416-2e1b-11ef-b811-0a580a80027b', '78fe3794-2e1b-11ef-b811-0a580a80027b', '710d13ca-2e1b-11ef-b811-0a580a80027b', '7977c8c0-2e1b-11ef-b811-0a580a80027b']], 'distances': [[0.7568771427603788, 0.8441264629364014, 0.9610137250414995, 1.011700142846496, 1.0952370053196285, 1.1075994819235395, 1.1273425628787646, 1.1507273803510183, 1.1567634344100952, 1.1686783373034475]], 'embeddings': None, 'metadatas': [[{'source': '/tmp/training_data/rfc6238.txt'}, {'source': '/tmp/training_data/rfc2104.txt'}, {'source': '/tmp/training_data/rfc4226.txt'}, {'source': '/tmp/training_data/rfc4226.txt'}, {'source': '/tmp/training_data/rfc6238.txt'}, {'source': '/tmp/training_data/rfc4226.txt'}, {'source': '/tmp/training_data/rfc6238.txt'}, {'source': '/tm