### Install Required Dependencies

In [1]:
# # install below libraries if don't if you are trying for the first time.
# !pip install langchain
# !pip install numpy
# !pip install faiss-cpu
# !pip install requests
# !pip install tqdm

In [2]:
# imports
import numpy as np, faiss, sqlite3, requests, os, json
from tqdm.notebook import tqdm

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import parse_url
from requests.packages.urllib3.util.retry import Retry
from requests.compat import urljoin

In [3]:
# Embedder Client
class OzoneEmbedder(object):
    """Ozone Embedder Client Application"""
    def __init__(self, api_details) -> None:
        super(OzoneEmbedder, self).__init__()
        self.username = api_details["username"]
        self.bearer_token = api_details["bearer_token"]
        self.endpoint = api_details["endpoint"]
        self.url_details = parse_url(self.endpoint)
        self.max_retries = 3
        self.backoff_factor = 0.3

    def connect(self):
        # creating persistent connection
        retries = Retry(
            total=self.max_retries,
            backoff_factor=self.backoff_factor
        )
        adapter = HTTPAdapter(max_retries=retries)
        scheme = self.url_details.scheme
        self.connection = requests.Session()
        self.connection.mount(scheme, adapter)

    def close(self):
        self.connection.close()
    
    def __enter__(self):
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_embedding(self, text, model="siv-sentence-bitnet-pmbv2-wikid-large"):
        """
        text: input text
        model: 
            "siv-sentence-bitnet-pmbv2-wikid-large" or,
            "siv-sentence-bitnet-pmbv2-wikid-small" or,
            "sentence-bitnet-pmbv2"
        """
        
        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {self.bearer_token}",
            "Content-Type": "application/x-www-form-urlencoded",
        }

        data = {
            "input_text": text,
            "embedder_name": model,
        }

        response = requests.post(
            self.endpoint,
            headers=headers, 
            data=data
        )
        return response.json()
    
class DocumentDatabase:
    def __init__(self, db_file):
        self.db_file = db_file

    def _create_table(self):
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS documents
                              (id INTEGER PRIMARY KEY AUTOINCREMENT,
                               content TEXT,
                               UNIQUE(id) ON CONFLICT IGNORE)''')
        self.cursor.execute('CREATE INDEX IF NOT EXISTS idx_id ON documents (id)')

    def __enter__(self):
        self.conn = sqlite3.connect(self.db_file)
        self.cursor = self.conn.cursor()
        self._create_table()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.conn.close()
        
    def insert_document(self, document):
        self.cursor.execute("INSERT INTO documents (content) VALUES (?)", (document,))
        self.conn.commit()

    def select_documents(self, query):
        self.cursor.execute(query)
        return self.cursor.fetchall()

### Load credential information from environment variable

In [4]:
with open(os.environ.get('OZAI_API_CREDENTIALS')) as fp:
    credential = json.load(fp)

### Read text document

In [5]:
text_path="./sample.txt"

### Preprocess text document using langchain

In [6]:
# load text document and split by chunk size
# Note: Document handler can be changed based on usage (check more options https://python.langchain.com/docs/modules/data_connection/document_loaders/)

# load
text_loader = TextLoader(text_path)
documents = text_loader.load()

# split document
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=10)
docs = text_splitter.split_documents(documents)

Created a chunk of size 370, which is longer than the specified 200
Created a chunk of size 255, which is longer than the specified 200
Created a chunk of size 487, which is longer than the specified 200
Created a chunk of size 461, which is longer than the specified 200
Created a chunk of size 629, which is longer than the specified 200
Created a chunk of size 526, which is longer than the specified 200
Created a chunk of size 545, which is longer than the specified 200
Created a chunk of size 503, which is longer than the specified 200
Created a chunk of size 258, which is longer than the specified 200
Created a chunk of size 214, which is longer than the specified 200
Created a chunk of size 352, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 430, which is longer than the specified 200
Created a chunk of size 394, which is longer than the specified 200
Created a chunk of size 257, which is longer tha

### Encoding documents 

In [7]:
# Encode the documents
with OzoneEmbedder(credential) as ozone_embedder:
    encoded_documents = np.asarray([ozone_embedder.get_embedding(d.page_content)['embedding'][0] for d in tqdm(docs)]).astype('uint8')

  0%|          | 0/38 [00:00<?, ?it/s]

### Create index using Faiss

In [8]:
# Encoded documents are packed bit ('unit8')
# make sure it fits to your RAM
print(f"embedding size: {encoded_documents.shape}")

# Actual embedding dimension would be 8 times as data is uint8

dimension = encoded_documents.shape[1] * 8  # Dimension of the binary vectors

# Create faiss binary index
index = faiss.IndexBinaryFlat(dimension)

# Add the binary vectors to the index
# Note: avoid duplicate data insert
index.add(encoded_documents)

# persist data and text
faiss.write_index_binary(index, 'index.dat')

embedding size: (38, 300)


### Storing document in sqlite

In [9]:
with DocumentDatabase('index.db') as conn:
    
    # Insert the documents into the database
    for doc in tqdm(docs):
        conn.insert_document(doc.page_content)
    

  0%|          | 0/38 [00:00<?, ?it/s]

### Query Example

In [10]:

# Perform a search on the index
query = "what was the U. S. Bill of Rights"

with OzoneEmbedder(credential) as ozone_embedder, DocumentDatabase('index.db') as conn:
    encoded_query = ozone_embedder.get_embedding(query)['embedding']

    # Convert the query vector to a uint8 binary vector
    xq = np.asarray(encoded_query).astype('uint8')
    D, I = index.search(xq, k=5)  # Retrieve top 5 most similar documents

    selected_data = [conn.select_documents(f"select * from documents where id={i};")[0] for i in I[0]]
    for i, s in enumerate(selected_data):
        print(f"""
        Query: {query}\n
        --------------------------
        Closest [{i}], DocID [{s[0]}]:\n
        Text: {s[1]}
        
        xxxxxxx
        """)


        Query: what was the U. S. Bill of Rights

        --------------------------
        Closest [0], DocID [36]:

        Text: VIII

Excessive bail shall not be required nor excessive fines imposed,
nor cruel and unusual punishments inflicted.


IX
        
        xxxxxxx
        

        Query: what was the U. S. Bill of Rights

        --------------------------
        Closest [1], DocID [24]:

        Text: ***

These original Project Gutenberg Etexts will be compiled into a file
containing them all, in order to improve the content ratios of Etext
to header material.

***
        
        xxxxxxx
        

        Query: what was the U. S. Bill of Rights

        --------------------------
        Closest [2], DocID [21]:

        Text: WHAT IF YOU *WANT* TO SEND MONEY EVEN IF YOU DON'T HAVE TO?
The Project gratefully accepts contributions in money, time,
scanning machines, OCR software, public domain etexts, royalty
free copyright licenses, and every other sort of contrib