In [None]:
# install dependencies
# use pip and virtualenv
!python --version
!pip install nltk matplotlib prettytable tqdm
!pip install langchain_community chromadb sentence_transformers

In [None]:
# global vars
TRAINING_DATA_PATH = "training_data.local/"

# import libraries
try:
    from langchain_core.documents import Document
    from langchain_community.document_loaders import DirectoryLoader, TextLoader
    from langchain_text_splitters import RecursiveCharacterTextSplitter as rts
    from prettytable import PrettyTable
    from chromadb.utils import embedding_functions as ef
    from chromadb import Client
    import uuid
    from tqdm.autonotebook import tqdm, trange
    from nltk import word_tokenize, sent_tokenize, download
    import os
    # word_tokenize / sent_tokenize need the Punkt sentence models. Recent NLTK
    # releases load them from the "punkt_tab" resource while older releases use
    # "punkt"; fetch both so the notebook works with whichever version pip resolved.
    download("punkt")
    download("punkt_tab")
    import matplotlib.pyplot as plt
except Exception as e:
    print(f"Caught Exception {e}")
    # Every later cell depends on these names; continuing after a failed import
    # would only surface as confusing NameErrors further down, so stop here.
    raise

In [None]:
# Text Processing Primitives
# Embedding function
def s_transformer(model: str = "all-MiniLM-L6-v2"):
    """Build a sentence-transformer embedding function.

    Args:
        model: name of the model handed to
            ``SentenceTransformerEmbeddingFunction`` as ``model_name``.

    Returns:
        The embedding function object created by
        ``ef.SentenceTransformerEmbeddingFunction``.
    """
    embedding_function = ef.SentenceTransformerEmbeddingFunction(model_name=model)
    return embedding_function

# load text documents from filesystem
def load_text_documents(path: str = ".", pattern: str = "**/*.txt",
                        multithread: bool = False) -> list:
    """Load every file under ``path`` that matches ``pattern``.

    Args:
        path: directory handed to ``DirectoryLoader`` as the search root.
        pattern: glob expression selecting the files to load.
        multithread: forwarded as ``use_multithreading``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    # Each file is read with TextLoader (encoding auto-detection enabled);
    # load errors are silenced and a progress bar is shown.
    loader_options = {
        "glob": pattern,
        "loader_cls": TextLoader,
        "loader_kwargs": {'autodetect_encoding': True},
        "use_multithreading": multithread,
        "silent_errors": True,
        "show_progress": True,
    }
    directory_loader = DirectoryLoader(path, **loader_options)
    return directory_loader.load()

# prepare the knowledge corpus for further processing
def prepare_corpus(raw_loader: list) -> tuple:
    """Compute per-document statistics and split every document into sentences.

    Args:
        raw_loader: loaded documents; each item must expose ``page_content``
            (the text) and ``metadata`` (a dict-like object whose ``"source"``
            entry is shown in the statistics table).

    Returns:
        A ``(tokenized_data, corpus)`` tuple:

        * ``tokenized_data`` - one ``Document`` per sentence, carrying the
          metadata of the document the sentence came from.
        * ``corpus`` - one dict per input document with the keys
          ``metadata``, ``raw_sentences``, ``sentence_count``, ``wordcount``,
          ``vocabulary`` (set of distinct tokens) and ``lexical_richness``
          (distinct tokens / total tokens, ``0.0`` for a document without tokens).

    Side effects:
        Prints a table with the statistics of every document.
    """
    # collect statistics in relation to the data corpus
    corpus = []
    for doc in raw_loader:
        document_tokens = word_tokenize(doc.page_content)
        document_sentences = sent_tokenize(doc.page_content)
        vocabulary = set(document_tokens)
        # an empty (or whitespace-only) file yields no tokens; report a richness
        # of 0.0 instead of failing the whole run with a ZeroDivisionError
        if document_tokens:
            lexical_richness = len(vocabulary) / len(document_tokens)
        else:
            lexical_richness = 0.0
        corpus.append({"metadata": doc.metadata,
                       "raw_sentences": document_sentences,
                       "sentence_count": len(document_sentences),
                       "wordcount": len(document_tokens),
                       "vocabulary": vocabulary,
                       "lexical_richness": lexical_richness})

    # display corpus
    data_table = PrettyTable()
    data_table.field_names = ["Dataset", "Word Count", "Sentence Count", "Vocabulary", "Lexical Richness"]
    for dataset in corpus:
        data_table.add_row([dataset.get("metadata").get("source"),
                            dataset.get("wordcount"),
                            dataset.get("sentence_count"),
                            len(dataset.get("vocabulary")),
                            dataset.get("lexical_richness")])

    # display dataset statistics
    print(data_table)

    # wrap every sentence in its own Document, keeping the parent's metadata
    tokenized_data = []
    for doc in tqdm(corpus, ascii=True, desc="Tokenizing..."):
        metadata = doc.get("metadata")
        for data in doc.get("raw_sentences"):
            tokenized_data.append(Document(metadata=metadata, page_content=data))

    # return corpus and metadata
    return tokenized_data, corpus

# split corpus in chunks for vectorization
def split_text_documents(documents: list = None,
                         chunk_size: int = 1000,
                         chunk_overlap: int = 0) -> list:
    """Split documents into chunks with the recursive character splitter.

    Args:
        documents: documents to split; ``None`` means "nothing to do".
        chunk_size: forwarded to the splitter as ``chunk_size``.
        chunk_overlap: forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` for the given documents, or ``None``
        when ``documents`` is ``None`` (an empty list is still passed to the
        splitter).
    """
    if documents is not None:
        text_splitter = rts(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        chunks = text_splitter.split_documents(documents)
        return chunks

    return None

In [None]:
# load every text document found below the training data directory
raw_loader = load_text_documents(path=TRAINING_DATA_PATH)

# prepare_corpus prints the statistics table and returns two lists:
#   corpus   -> one Document per sentence
#   metadata -> one statistics dict per loaded file
corpus, metadata = prepare_corpus(raw_loader=raw_loader)

In [None]:
# plot dataset statistics graph
# one point per loaded file; x is always the size of the file's vocabulary
x_vals = [len(x.get("vocabulary")) for x in metadata]
# lexical richness is a ratio; scale it by 100 to plot it as a percentage
y_vals = [100 * x.get("lexical_richness") for x in metadata]
y2_vals = [x.get("sentence_count") for x in metadata]

# two stacked panels on one figure, addressed through explicit Axes objects
fig, (richness_ax, sentences_ax) = plt.subplots(2, 1)

# plot vocabulary vs richness data
richness_ax.scatter(x_vals, y_vals)
richness_ax.set(title="Lexical richness vs vocabulary size",
                xlabel="Vocabulary",
                ylabel="Lexical Richness (%)")

# plot sentence count vs vocabulary
sentences_ax.scatter(x_vals, y2_vals)
sentences_ax.set(title="Sentence count vs vocabulary size",
                 xlabel="Vocabulary",
                 ylabel="Sentence Count")

# without this the x label of the upper panel collides with the lower panel
fig.tight_layout()
plt.show()

In [None]:
# chunking parameters handed to split_text_documents
# NOTE(review): the splitter used there is RecursiveCharacterTextSplitter, whose
# default length measure is characters, so both values below are character
# counts rather than word counts - confirm against the splitter documentation.
CHUNK_SIZE = 1000  # maximum chunk length
OVERLAP = 10  # overlap between consecutive chunks

# split the sentence-level documents into chunks and report how many we got
tokenized_docs = split_text_documents(corpus, CHUNK_SIZE, OVERLAP)
print(f"Tokenized {len(tokenized_docs)} documents...")

# drop the intermediate buffers; only tokenized_docs is used from here on
del raw_loader, corpus, metadata

In [None]:
# now call the embedding model and upload data to the vector database
SIMILARITY_FUNCTION = "cosine"
COLLECTION_NAME = "rag_demo"
# set before s_transformer() builds the embedding function below
# NOTE(review): presumably this redirects the Hugging Face model cache - confirm
os.environ['HF_HOME'] = '/tmp/huggingface/hub/'

# create the Chroma client and the collection used by the following cells
# NOTE(review): Client() is called without connection settings, so per the
# Chroma documentation this is an in-process client rather than a connection
# to a separately running server - confirm this is what is intended.
try:
    vector_store = Client()
    chroma_collection = vector_store.get_or_create_collection(COLLECTION_NAME,
                                                              metadata={"hnsw:space": SIMILARITY_FUNCTION},
                                                              embedding_function=s_transformer())
except Exception as e:
    print(f"Caught Exception: {e}")
    # The following cells use `chroma_collection` unconditionally; swallowing
    # the error here would only defer the failure to a less helpful NameError.
    raise

In [None]:
# embed data and push vectors to the database
# Documents are added in batches: every add() call runs the embedding function
# on what it receives, so one call per document wastes most of the time on
# per-call overhead. The batch size is kept small to stay well below any
# per-request limit of the vector store.
INGEST_BATCH_SIZE = 256

# `if tokenized_docs` also covers the case where the splitter returned None
if tokenized_docs:
    with tqdm(total=len(tokenized_docs), ascii=True, desc="Ingesting...") as progress:
        for start in range(0, len(tokenized_docs), INGEST_BATCH_SIZE):
            batch = tokenized_docs[start:start + INGEST_BATCH_SIZE]
            chroma_collection.add(ids=[str(uuid.uuid1()) for _ in batch],
                                  documents=[doc.page_content for doc in batch],
                                  metadatas=[doc.metadata for doc in batch])
            # advance the bar by documents, not by batches
            progress.update(len(batch))

In [None]:
# report how many objects the collection holds ...
print(f"Objects stored in the collection {chroma_collection.name}: {chroma_collection.count()}")

# ... then run a sample similarity search; the two best matches are shown as the cell output
chroma_collection.query(
    n_results=2,
    query_texts=["why openshift is better?"],
)