## Import Packages, Environment Variable

In [2]:
import openai
from openai.embeddings_utils import get_embedding, get_embeddings
import os
from dotenv import load_dotenv

# Use the PyPDF2 library to read a PDF file
#from pypdf import PdfReader
from tqdm import tqdm

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
#from sentence_transformers import CrossEncoder
load_dotenv()

False

In [3]:
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [4]:
ENGINE = 'text-embedding-ada-002'

In [5]:
embedded_text = get_embedding('I love to be vectorized', engine=ENGINE)

RetryError: RetryError[<Future at 0x27e1357ee50 state=finished raised AuthenticationError>]

In [8]:
len(embedded_text)

1536

## Read PDF files

In [4]:
# Open the PDF file in read-binary mode
with open('../data/Introduction to Kubernetes.pdf', 'rb') as file:

    # Create a PDF reader object
    reader = PdfReader(file)

    # Initialize an empty string to hold the text
    intro_to_kube = ''

    # Loop through each page in the PDF file
    for page in tqdm(reader.pages):
       
        # Extract the text from the page
        text = page.extract_text()

        # Find the starting point of the text we want to extract
        # In this case, we are extracting text starting from the string ' ]'
        # intro_to_kube += '\n\n' + text[text.find(' ]')+2:]
        intro_to_kube += '\n\n' + text

# Strip any leading or trailing whitespace from the resulting string
intro_to_kube = intro_to_kube.strip()
print(len(intro_to_kube))

  5%|▍         | 2/41 [00:00<00:02, 16.80it/s]

100%|██████████| 41/41 [00:01<00:00, 27.40it/s]

46150





In [5]:
import re

pattern = r'(?<=\S)\n(?=\S)'
replacement = ' '
intro_to_kube = re.sub(pattern, replacement, intro_to_kube)
print(len(intro_to_kube))

46150


In [13]:
# Importing the tiktoken library
import tiktoken

# Initializing a tokenizer for the 'cl100k_base' model
# This tokenizer is designed to work with the 'ada-002' embedding model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Using the tokenizer to encode the text 'hey there'
# The resulting output is a list of integers representing the encoded text
# This is the input format required for embedding using the 'ada-002' model
tokenizer.encode('hey there')

[36661, 1070]

In [14]:
tokenizer.encode('nnaemeka here')

[77, 3458, 336, 53413, 1618]

## Chunking Method (redundant)

In [9]:
import re
# Function to split the text into chunks of a maximum number of tokens. Inspired by OpenAI
def overlapping_chunks(text, max_tokens = 500, overlapping_factor = 5):
    '''
    max_tokens: tokens we want per chunk
    overlapping_factor: number of sentences to start each chunk with that overlaps with the previous chunk
    '''

    # Split the text using punctuation
    sentences = re.split(r'[.?!]', text)

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    
    chunks, tokens_so_far, chunk = [], 0, []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater 
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of 
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks

In [16]:
from urllib.request import urlopen

#

# A textbook about insects
text = urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt').read().decode()

In [17]:
split = overlapping_chunks(text, overlapping_factor=0)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'non-overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

non-overlapping chunking approach has 17 documents with average length 476.7 tokens


In [18]:
split = overlapping_chunks(text)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

overlapping chunking approach has 24 documents with average length 477.4 tokens


## Set up Vector Database

In [25]:
import chromadb
from datetime import datetime
import hashlib
from chromadb.utils import embedding_functions

In [26]:
COLLECTION_NAME = "semantic-search"

In [27]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-mpnet-base-cos-v1")

In [28]:
client = chromadb.PersistentClient(path="/tmp/semantic")

In [29]:
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
    embedding_function=sentence_transformer_ef
    )


In [30]:
def my_hash(s):
    # Return the MD5 hash of the input string as a hexadecimal string
    return hashlib.md5(s.encode()).hexdigest()

my_hash('I love to hash it')

'ae76cc4dfd345ecaeea9b8ba0d5c3437'

In [31]:
def prepare_for_chroma(texts, engine=None):
    now = datetime.utcnow()

    if engine:
        embeddings = get_embeddings(texts, engine=ENGINE)
        return {
        'ids':[my_hash(text) for text in texts],
        'documents': [text for text in texts],
        'embeddings': [embedding for embedding in embeddings],
        'metadata': [dict(head=text[0], date_uploaded=str(now)) for text in texts]
    }

    return {
        'ids':[my_hash(text) for text in texts],
        'documents': [text for text in texts],
        'metadata': [dict(head=text[0], date_uploaded=str(now)) for text in texts]
    }

In [32]:
texts = ['hi']

In [33]:
response =  prepare_for_chroma(texts, engine=ENGINE)

In [34]:
response

{'ids': ['49f68a5c8493ec2c0bf489821c21fc3b'],
 'documents': ['hi'],
 'metadata': [{'head': 'h', 'date_uploaded': '2023-10-20 13:24:57.880216'}]}

In [35]:
def upload_texts_to_chroma(texts, collection, batch_size=None, show_progress_bar=True, engine=None):
    total_added = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        batch = texts[i : i + batch_size]
        output = prepare_for_chroma(batch, engine=engine)

        if output.get('embeddings', None):
            out = collection.add(
                documents= output['documents'],
                embeddings= output['embeddings'],
                metadatas= output['metadata'],
                ids= output['ids']
                )

        else:
            out = collection.add(
                documents= output['documents'],
                metadatas= output['metadata'],
                ids= output['ids']
                )
        print(out)
        total_added += 1

        return total_added

In [29]:
upload_texts_to_chroma(texts, collection, engine=ENGINE)

  0%|          | 0/1 [00:00<?, ?it/s]


1

In [37]:
def query_from_chroma(query, collection, engine=None, top_k=3):
    if engine:
        query_embedding = get_embedding(query, engine=ENGINE)

        return collection.query(
            query_embeddings=query_embedding,
            n_results=top_k
            )
    return collection.query(
                query_texts=[query],
                n_results=top_k
            )

In [32]:
query_from_chroma('hello', collection)

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


{'ids': [['49f68a5c8493ec2c0bf489821c21fc3b']],
 'distances': [[0.07519697162713412]],
 'metadatas': [[{'date_uploaded': '2023-10-18 09:35:59.505288'}]],
 'embeddings': None,
 'documents': [['hi']]}

In [39]:
def delete_texts_from_chroma(texts, collection):
    hashes = [my_hash(text) for text in texts]

    return collection.delete(
        ids=hashes
    )

In [40]:
# delete text
delete_texts_from_chroma(texts, collection)

In [None]:
#test collection is empty
query_from_chroma('hello', collection)

## Finding Custom Delimiters

#### This method requires hands-on familiarity of documents

In [10]:
# Importing the Counter and re libraries
from collections import Counter
import re

# Find all occurrences of one or more spaces in 'principles_of_ds'
matches = re.findall(r'[\s]{1,}', intro_to_kube)

# The 10 most frequent spaces that occur in the document
most_common_spaces = Counter(matches).most_common(10)

# Print the most common spaces and their frequencies
print(most_common_spaces)

[(' ', 7093), ('\n\n\n', 29), ('\n\n', 11)]


In [None]:
# Only keep documents of at least 50 characters split by a custom delimiter
split = list(filter(lambda x: len(x) > 50, intro_to_kube.split('\n\n\n')))

avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'custom delimiter approach has {len(split)} documents with average length {avg_length:.1f} tokens')

In [89]:
# Process the documents in batches to obtain embeddings for each document
embeddings = None
for s in tqdm(range(0, len(split), 100)):
    if embeddings is None:
        embeddings = np.array(get_embeddings(split[s:s+100], engine=ENGINE))
    else:
        embeddings = np.vstack([embeddings, np.array(get_embeddings(split[s:s+100], engine=ENGINE))])


100%|██████████| 1/1 [00:04<00:00,  4.30s/it]


In [90]:
# Assume you have a list of text embeddings called `embeddings`
# First, compute the cosine similarity matrix between all pairs of embeddings
cosine_sim_matrix = cosine_similarity(embeddings)

# Instantiate the AgglomerativeClustering model
agg_clustering = AgglomerativeClustering(
    n_clusters=None,         # the algorithm will determine the optimal number of clusters based on the data
    distance_threshold=0.1,  # clusters will be formed until all pairwise distances between clusters are greater than 0.1
    metric='precomputed',  # we are providing a precomputed distance matrix (1 - similarity matrix) as input
    linkage='complete'       # form clusters by iteratively merging the smallest clusters based on the maximum distance between their components
)

# Fit the model to the cosine distance matrix (1 - similarity matrix)
agg_clustering.fit(1 - cosine_sim_matrix)

# Get the cluster labels for each embedding
cluster_labels = agg_clustering.labels_

# Print the number of embeddings in each cluster
unique_labels, counts = np.unique(cluster_labels, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f'Cluster {label}: {count} embeddings')

Cluster 0: 2 embeddings
Cluster 1: 2 embeddings
Cluster 2: 2 embeddings
Cluster 3: 1 embeddings
Cluster 4: 1 embeddings
Cluster 5: 1 embeddings
Cluster 6: 1 embeddings
Cluster 7: 1 embeddings
Cluster 8: 1 embeddings
Cluster 9: 1 embeddings
Cluster 10: 1 embeddings
Cluster 11: 1 embeddings
Cluster 12: 1 embeddings
Cluster 13: 1 embeddings
Cluster 14: 1 embeddings
Cluster 15: 1 embeddings
Cluster 16: 1 embeddings
Cluster 17: 1 embeddings
Cluster 18: 1 embeddings
Cluster 19: 1 embeddings
Cluster 20: 1 embeddings
Cluster 21: 1 embeddings
Cluster 22: 1 embeddings
Cluster 23: 1 embeddings
Cluster 24: 1 embeddings
Cluster 25: 1 embeddings
Cluster 26: 1 embeddings
Cluster 27: 1 embeddings
Cluster 28: 1 embeddings
Cluster 29: 1 embeddings
Cluster 30: 1 embeddings
Cluster 31: 1 embeddings
Cluster 32: 1 embeddings
Cluster 33: 1 embeddings
Cluster 34: 1 embeddings
Cluster 35: 1 embeddings
Cluster 36: 1 embeddings
Cluster 37: 1 embeddings


In [91]:
pruned_documents = []
for _label, count in zip(unique_labels, counts):
    pruned_documents.append('\n\n'.join([text for text, label in zip(split, cluster_labels) if label == _label]))

    
avg_length = sum([len(tokenizer.encode(t)) for t in pruned_documents]) / len(pruned_documents)
print(f'Our pruning approach has {len(pruned_documents)} documents with average length {avg_length:.1f} tokens')

Our pruning approach has 38 documents with average length 238.2 tokens


In [92]:
print(pruned_documents[0])


Minikube Installation Binary download or Debian Package Mac OS 1. Verify virtualization for MacOS. VMX in the output indicates enabled virtualization 2. Install the VirtualBox hypervisor for MacOs 3. Download and install the .dmg package.


Minikube Installation Install using brew package manager Or Binary Download Windows Installation 1. Verify the virtualization support on your Windows system (multiple output lines ending with 'Yes' indicate supported virtualization)


## Query Vector Database with OpenAI embeddings

In [98]:
upload_texts_to_chroma(pruned_documents, collection, batch_size=128, engine=ENGINE)

  0%|          | 0/1 [00:01<?, ?it/s]


1

In [101]:
query = "How do I setup Kubernetes?"

results_from_chroma = query_from_chroma(query, collection, top_k=5)

In [102]:
results_from_chroma

{'ids': [['19551b58f60e0ded85d29c635ca4e5f1',
   'e6f6650f9b61b98de63bd87f092d153d',
   'b46e395aaf05e8f1e8e74bd574a01f97',
   '2ac07e37162a9b0a64cdf6b4a5ae9927',
   'fc8a457e7b543faad654c3d50ba94b24']],
 'distances': [[0.14350228015213295,
   0.15193963205075378,
   0.152135480726164,
   0.15850031407387533,
   0.16811634666797715]],
 'metadatas': [[{'date_uploaded': '2023-10-18 11:35:06.903148', 'head': 'R'},
   {'date_uploaded': '2023-10-18 11:35:06.903148', 'head': 'C'},
   {'date_uploaded': '2023-10-18 11:35:06.903148', 'head': 'O'},
   {'date_uploaded': '2023-10-18 11:35:06.903148', 'head': '\n'},
   {'date_uploaded': '2023-10-18 11:35:06.903148', 'head': '3'}]],
 'embeddings': None,
 'documents': [['R1: Installing Kubernetes Objective ● Define Kubernetes cluster configuration ● Understand infrastructure where kubernetes is installed Configuration There are many different cluster configurations that can be used when installing kubernetes. Some of the configurations are described 

In [78]:
results_from_chroma['ids'][0]

['6c32aa8ec11f8121e8d7c496e6e09dd9',
 '1cac21d57ab91cf61ce0af52bfdc8db6',
 'd14aac94004bca3680e49a6169ff89cc',
 'ccfddfd8ab381ea3711c36a8c411d937',
 'c515c071ec94d678b0aeff2485a9c074']

In [103]:
for i, doc in enumerate(results_from_chroma['documents'][0]):
    print(f"id: {results_from_chroma['ids'][0][i]}, {results_from_chroma['distances'][0][i]}, {doc}\n")

id: 19551b58f60e0ded85d29c635ca4e5f1, 0.14350228015213295, R1: Installing Kubernetes Objective ● Define Kubernetes cluster configuration ● Understand infrastructure where kubernetes is installed Configuration There are many different cluster configurations that can be used when installing kubernetes. Some of the configurations are described below: 1. All in One single node installation: Both master and worker nodes components are installed and running on a single node. This is used during learning and testing, it is not advisable for production. 2. Single master and multi worker: Single master node managing multiple worker nodes. The single master node runs a multi-stacked etcd instance. 3. Single master with single node etcd and multi worker nodes: We have a single master node with an external etcd instance. This master node manages the multiple worker nodes. 4. Multi master and multi worker: This configuration is mainly used for high availability . Multi master nodes are configured w

### Use Crossencoder to re-rank search results

In [None]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
def get_results_from_chroma(query, top_k=3, re_rank=False, verbose=True):
    results_from_chroma = query_from_chroma(query, collection, top_k=5)
    if not results_from_chroma:
        return []
    if verbose:
        print("Query: ", query)
    final_results = []

    if re_rank:
        if verbose:
            print('Document ID (Hash)\t\tRetrieval Score\tCE Score\tText')

        sentence_combinations = [query, results_from_chroma['document']]

        # Compute the similarity scores for these combinations
        similarity_scores = cross_encoder.predict(sentence_combinations)

        # Sort the scores in decreasing order
        sim_scores_argsort = reversed(np.argsort(similarity_scores))

        # Print the scores
        for idx in sim_scores_argsort:
            result_from_chroma = results_from_chroma['document'][idx]
            final_results.append(result_from_chroma)
            if verbose:
                print(f"{results_from_chroma['ids'][idx]}\t{result_from_chroma['distances']['idx']:.2f}\t{similarity_scores[idx]:.2f}\t{results_from_chroma['documents'][idx][:50]}")
        return final_results

    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tText')

    for i, doc in enumerate(results_from_chroma['documents'][0]):
        print(f"id: {results_from_chroma['ids'][0][i]}, {results_from_chroma['distances'][0][i]}, {doc}\n")
    return final_results
