### Install Required Dependencies

In [1]:
# # Install the libraries below if you are running this notebook for the first time.
# !pip install langchain
# !pip install numpy
# !pip install requests
# !pip install tqdm

In [2]:
# imports
import numpy as np, requests, os, json, hashlib
from tqdm.notebook import tqdm

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import parse_url
from requests.packages.urllib3.util.retry import Retry
from requests.compat import urljoin

In [3]:
# Embedder Client
class OzoneEmbedder(object):
    """Ozone Embedder Client Application.

    Thin HTTP client that posts text to the configured endpoint and returns
    the decoded JSON response. It can be used as a context manager, in which
    case requests go through a persistent `requests.Session` that retries
    failed connections:

        with OzoneEmbedder(api_details) as embedder:
            embedder.get_embedding("some text")

    api_details: mapping with the keys "username", "bearer_token" and
        "endpoint".
    """

    # No session exists until connect() is called.
    connection = None

    def __init__(self, api_details) -> None:
        super(OzoneEmbedder, self).__init__()
        self.username = api_details["username"]
        self.bearer_token = api_details["bearer_token"]
        self.endpoint = api_details["endpoint"]
        self.url_details = parse_url(self.endpoint)
        self.max_retries = 3
        self.backoff_factor = 0.3
        self.connection = None

    def connect(self):
        """Open a persistent session whose adapter retries with backoff."""
        retries = Retry(
            total=self.max_retries,
            backoff_factor=self.backoff_factor
        )
        adapter = HTTPAdapter(max_retries=retries)
        scheme = self.url_details.scheme
        self.connection = requests.Session()
        # The mount prefix must include "://". requests picks the longest
        # matching prefix, so a bare "https" loses to the default "https://"
        # adapter and the retry policy would never be applied.
        prefixes = [f"{scheme}://"] if scheme else ["http://", "https://"]
        for prefix in prefixes:
            self.connection.mount(prefix, adapter)

    def close(self):
        """Close the session, if one is open. Safe to call more than once."""
        if self.connection is not None:
            self.connection.close()
            self.connection = None

    def __enter__(self):
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_embedding(self, text, model="siv-sentence-bitnet-pmbv2-wikid-large", timeout=None):
        """
        Request the embedding of `text` from the endpoint.

        text: input text
        model:
            "siv-sentence-bitnet-pmbv2-wikid-large" or,
            "siv-sentence-bitnet-pmbv2-wikid-small" or,
            "sentence-bitnet-pmbv2"
        timeout: optional timeout in seconds forwarded to requests
            (None, the default, waits indefinitely)

        Returns the decoded JSON body of the response.
        Raises requests.HTTPError when the server answers with an error status.
        """

        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {self.bearer_token}",
            "Content-Type": "application/x-www-form-urlencoded",
        }

        data = {
            "input_text": text,
            "embedder_name": model,
        }

        # Use the persistent (retrying) session when one is open; otherwise
        # fall back to a one-shot request so the method also works without
        # connect() / a `with` block.
        post = self.connection.post if self.connection is not None else requests.post
        response = post(
            self.endpoint,
            headers=headers,
            data=data,
            timeout=timeout
        )
        # Fail here with the HTTP status instead of letting callers hit a
        # confusing KeyError on an error payload.
        response.raise_for_status()
        return response.json()
    
class BinaryIndexer:
    """In-memory brute-force index over bit-packed binary embeddings.

    Documents are ranked by Hamming distance between the query and every
    stored embedding; the reported score is (dim - distance) / dim.
    """

    def __init__(self, dim):
        """
        An implementation of binary indexing for numpy.

        This can be used for small scale datasets.
        This is a basic implementation for experimentation and analysis.

        dim: number of bits per embedding (8 * number of packed uint8 columns)
        """

        self.__name__ = "binary_indexer"
        self.dim = dim
        # hamming distance scoring
        self.scorer = lambda dist: (self.dim-dist)/self.dim

        # Start with an empty index so add/search/delete work even when
        # create()/load() has not been called yet.
        packed_width = dim // 8 if isinstance(dim, (int, np.integer)) else 0
        self.docids = []
        self.metas = []
        self.search_base = np.empty((0, packed_width), dtype=np.uint8)

    def create(self, docids, meta, embeddings: np.ndarray):
        """
        Replace the index content with the given documents.

        docids: array of document ids [Nx1] (usually array of strings/integer)
        meta: array of document meta [Nx1] (usually be array of text)
        embeddings: array of embeddings [NxM] (np.ndarray)

        Note: N is number of examples and M is dimension of embedding (M*8 should be same as dimension of index)

        Raises AssertionError when the lengths or the dimension do not match.
        """
        if len(docids) != embeddings.shape[0] or len(meta) != embeddings.shape[0]:
            raise AssertionError("#embeddings doesnt match with #docids or #meta")

        if embeddings.shape[1]*8 != self.dim:
            raise AssertionError("invalid dimension `embedding dim * 8 should be same as index dimension`")

        # Keep private copies: add()/delete() mutate these lists and must not
        # modify the sequences owned by the caller.
        self.docids = list(docids)
        self.metas = list(meta)
        self.search_base = embeddings
        return {"affected_rows":len(docids), "insertion":"successful"}

    def search(self, query_embedding, topn=3):
        """
        Return the `topn` closest documents, best match first.

        query_embedding: embedding of query sentence [1xM]

        Each result is a dict with the keys "docid", "meta" and "score".
        """
        dist_pbit = np.bitwise_xor(self.search_base, query_embedding)
        dist_bit = np.unpackbits(dist_pbit, axis=1)
        # Count differing bits as signed integers so that `dim - dist` is
        # plain integer arithmetic.
        dist = dist_bit.sum(axis=1, dtype=np.int64)
        # A stable sort makes ties come back in insertion order, so results
        # are deterministic.
        topn_indices = np.argsort(dist, kind="stable")[:topn]

        return [
            {
                "docid": self.docids[index],
                "meta": self.metas[index],
                "score": self.scorer(dist[index]),
            }
            for index in topn_indices
        ]

    def delete(self, docid):
        """
        delete by document id

        Removes the first entry whose id equals `docid`; reports a failure
        (without raising) when the id is not indexed.
        """
        try:
            index_to_delete = self.docids.index(docid)
        except ValueError:
            return {"affected_rows":0, "deletion":"failed", "message":"id was never indexed"}

        del self.docids[index_to_delete]
        del self.metas[index_to_delete]
        self.search_base = np.delete(self.search_base, index_to_delete, axis=0)
        return {"affected_rows":1, "deletion":"successful"}

    def add(self, docid, meta, embedding):
        """
        insert document

        embedding: a single packed embedding, shape [M] or [1xM]

        Raises ValueError when the embedding is not exactly one row of the
        index width; the index is left unchanged in that case.
        """
        row = np.atleast_2d(np.asarray(embedding))
        expected_shape = (1, self.search_base.shape[1])
        if row.shape != expected_shape:
            raise ValueError(
                f"embedding shape {row.shape} does not match expected {expected_shape}"
            )

        # Extend the matrix first and keep its dtype stable; the id/meta lists
        # are only touched once the embedding is known to be stored.
        row = row.astype(self.search_base.dtype, copy=False)
        self.search_base = np.vstack((self.search_base, row))
        self.docids.append(docid)
        self.metas.append(meta)
        return {"affected_rows":1, "insertion":"successful"}

    def save(self, index_path):
        """
        Write dim, docids, metas and the embedding matrix to `index_path`
        with np.savez (NumPy appends ".npz" to a string path lacking it).
        """
        np.savez(index_path, dim=self.dim, docids=self.docids, metas=self.metas, search_base=self.search_base)

    def load(self, index_path):
        """Replace the index content with an archive written by save()."""
        with np.load(index_path) as npfl:
            # Stored as a 0-d array; convert back to a plain int.
            self.dim = int(npfl['dim'])
            self.metas = npfl['metas'].tolist()
            self.docids = npfl['docids'].tolist()
            self.search_base = npfl['search_base']
        

### Load credential information (JSON file whose path is given by the `OZAI_API_CREDENTIALS` environment variable)

In [4]:
# OZAI_API_CREDENTIALS must hold the path of a JSON file providing the
# "username", "bearer_token" and "endpoint" keys read by OzoneEmbedder.
credential_path = os.environ.get('OZAI_API_CREDENTIALS')
if not credential_path:
    # Fail with an actionable message instead of the TypeError that
    # open(None) would raise.
    raise EnvironmentError(
        "Environment variable OZAI_API_CREDENTIALS is not set; "
        "it must point to the JSON credentials file."
    )

with open(credential_path) as fp:
    credential = json.load(fp)

### Read text document

In [5]:
# Location of the plain-text document that will be chunked and indexed.
text_path = "./sample.txt"

### Preprocess text document using langchain

In [6]:
# Read the text file and cut it into chunks.
# Note: the document handler can be swapped depending on the input format
# (more options: https://python.langchain.com/docs/modules/data_connection/document_loaders/)

# read the raw document
text_loader = TextLoader(text_path)
documents = text_loader.load()

# chunk it; as the recorded output shows, chunks can end up longer than chunk_size
text_splitter = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=10,
)
docs = text_splitter.split_documents(documents)

Created a chunk of size 370, which is longer than the specified 200
Created a chunk of size 255, which is longer than the specified 200
Created a chunk of size 487, which is longer than the specified 200
Created a chunk of size 461, which is longer than the specified 200
Created a chunk of size 629, which is longer than the specified 200
Created a chunk of size 526, which is longer than the specified 200
Created a chunk of size 545, which is longer than the specified 200
Created a chunk of size 503, which is longer than the specified 200
Created a chunk of size 258, which is longer than the specified 200
Created a chunk of size 214, which is longer than the specified 200
Created a chunk of size 352, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 430, which is longer than the specified 200
Created a chunk of size 394, which is longer than the specified 200
Created a chunk of size 257, which is longer tha

### Encoding documents 

In [7]:
# Encode the documents
with OzoneEmbedder(credential) as ozone_embedder:
    # One request per chunk; the first row of each response's 'embedding' is kept.
    document_vectors = [
        ozone_embedder.get_embedding(chunk.page_content)['embedding'][0]
        for chunk in tqdm(docs)
    ]
    encoded_documents = np.asarray(document_vectors).astype('uint8')

  0%|          | 0/38 [00:00<?, ?it/s]

### Create index using BinaryIndexer

In [8]:
# The encoded documents are bit-packed into bytes ('uint8');
# make sure the whole array fits in your RAM.
print(f"embedding size: {encoded_documents.shape}")

# Every uint8 column carries 8 bits, so the binary dimension is 8x the packed width.
dimension = 8 * encoded_documents.shape[1]

ndocs = len(encoded_documents)

# The chunk text doubles as the meta; its MD5 hex digest is the document id.
metas = [chunk.page_content for chunk in docs]
docids = [hashlib.md5(chunk_text.encode()).hexdigest() for chunk_text in metas]

# Build the binary index
index = BinaryIndexer(dimension)

# Load all packed vectors into the index
# Note: avoid inserting the same data twice
index.create(docids, metas, encoded_documents)


embedding size: (38, 300)


{'affected_rows': 38, 'insertion': 'successful'}

### Save and load Binary Index

In [9]:
# # persist the index (dim, docids, metas and packed embeddings) to disk
# index.save("./index.npz")

# # reload the index from disk into a fresh BinaryIndexer
# index = BinaryIndexer(dimension)
# index.load("./index.npz")

### Insert data into Binary Index

In [10]:
# Insert one new document. The docid is the MD5 hex digest of the text.
text = "this new text to insert"
meta = text
docid = hashlib.md5(meta.encode(encoding='utf-8')).hexdigest()

# Skip the insert when this docid is already indexed, so re-running the cell
# does not add a duplicate entry (and does not call the API needlessly).
if docid not in index.docids:
    with OzoneEmbedder(credential) as ozone_embedder:
        encoded_embedding = np.asarray(ozone_embedder.get_embedding(text)['embedding']).astype('uint8')
    index.add(docid, meta, encoded_embedding)

### Query Example

In [11]:

# Embed a query and look up its nearest documents in the index
# query = "what was the U. S. Bill of Rights"
query = "this is new text to insert"
with OzoneEmbedder(credential) as ozone_embedder:
    encoded_query = ozone_embedder.get_embedding(query)['embedding']

    # The index expects the query as a packed uint8 vector
    xq = np.asarray(encoded_query).astype(np.uint8)
    # Keep the 5 closest documents
    result = index.search(xq, topn=5)

    for rank, hit in enumerate(result):
        print(f"""
        Query: {query}\n
        --------------------------
        Closest [{rank}], DocID [{hit["docid"]}]: score: [{hit["score"]}]\n
        Text: {hit["meta"]}
        
        xxxxxxx
        """)


        Query: this is new text to insert

        --------------------------
        Closest [0], DocID [a90f57c5ee0586f67c7e7803a5a00435]: score: [0.9008333333333334]

        Text: this new text to insert
        
        xxxxxxx
        

        Query: this is new text to insert

        --------------------------
        Closest [1], DocID [44f47da8668684bdfaa674f57e040b52]: score: [0.7683333333333333]

        Text: [*]  The etext, when displayed, is clearly readable, and
          does *not* contain characters other than those
          intended by the author of the work, although tilde
          (~), asterisk (*) and underline (_) characters may
          be used to convey punctuation intended by the
          author, and additional characters may be used to
          indicate hypertext links; OR
        
        xxxxxxx
        

        Query: this is new text to insert

        --------------------------
        Closest [2], DocID [e61a668b8f3ccaa05c9da12d345d50d4]: score:

### Delete document by ID

In [12]:
# `docid` still holds the MD5 id computed in the insert cell, so this removes
# that document; the returned dict reports whether a row was deleted.
index.delete(docid)

{'affected_rows': 1, 'deletion': 'successful'}