### Install Required Dependencies

In [None]:
# # Install the libraries below if you are running this notebook for the first time.
# !pip install langchain
# !pip install numpy
# !pip install faiss-cpu
# !pip install requests
# !pip install tqdm
# !pip install ozonetel-ai

In [None]:
# imports
import numpy as np, faiss, sqlite3, requests, os, json
from tqdm.notebook import tqdm

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

In [None]:
class DocumentDatabase:
    """Context-managed SQLite store holding one text row per document.

    The connection is opened in ``__enter__`` and closed in ``__exit__``, so
    the query/insert methods are only usable inside a ``with`` block::

        with DocumentDatabase("index.db") as db:
            row_id = db.insert_document("some text")
            rows = db.select_documents(
                "select * from documents where id=?", (row_id,))

    Rows live in the ``documents`` table as ``(id, content)``; ``id`` is an
    AUTOINCREMENT integer primary key.
    """

    def __init__(self, db_file):
        """Remember the database path; no connection is opened yet.

        Args:
            db_file: Path handed to ``sqlite3.connect`` on ``__enter__``.
        """
        self.db_file = db_file
        # Both are populated by __enter__; None means "never opened".
        self.conn = None
        self.cursor = None

    def _create_table(self):
        """Create the ``documents`` table and its id index if missing."""
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS documents
                              (id INTEGER PRIMARY KEY AUTOINCREMENT,
                               content TEXT,
                               UNIQUE(id) ON CONFLICT IGNORE)''')
        self.cursor.execute('CREATE INDEX IF NOT EXISTS idx_id ON documents (id)')

    def __enter__(self):
        """Open the connection, make sure the schema exists, return ``self``."""
        self.conn = sqlite3.connect(self.db_file)
        try:
            self.cursor = self.conn.cursor()
            self._create_table()
        except Exception:
            # __exit__ is not called when __enter__ raises, so close here
            # to avoid leaking the connection.
            self.conn.close()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection; exceptions from the block are not suppressed."""
        self.conn.close()

    def insert_document(self, document):
        """Insert one document and commit immediately.

        Args:
            document: Text stored in the ``content`` column.

        Returns:
            The ``id`` SQLite assigned to the new row (``cursor.lastrowid``).
        """
        self.cursor.execute("INSERT INTO documents (content) VALUES (?)", (document,))
        self.conn.commit()
        return self.cursor.lastrowid

    def select_documents(self, query, params=()):
        """Run a SQL query and return every resulting row.

        Args:
            query: SQL text, optionally containing ``?`` placeholders.
            params: Values bound to the placeholders. Prefer this over
                formatting values into ``query``. Defaults to no parameters.

        Returns:
            A list of row tuples as produced by ``cursor.fetchall()``.
        """
        self.cursor.execute(query, params)
        return self.cursor.fetchall()

### Define credentials in an environment variable

In [None]:
import os

# Register the credentials file path under OZAI_API_CREDENTIALS
# (presumably read by the ozoneai client -- confirm against its docs).
os.environ.update({"OZAI_API_CREDENTIALS": "./cred.json"})

### Read text document

In [None]:
# Path of the text file that the cells below load and index.
text_path = "./sample.txt"

### Preprocess text document using langchain

In [None]:
# Read the text file into langchain documents, then cut it into chunks.
# Note: the loader can be swapped for another one depending on the input type
# (more options: https://python.langchain.com/docs/modules/data_connection/document_loaders/)

# load
text_loader = TextLoader(text_path)
documents = text_loader.load()

# split document; the options are passed unchanged to the splitter
splitter_options = {"chunk_size": 500, "chunk_overlap": 10}
text_splitter = CharacterTextSplitter(**splitter_options)
docs = text_splitter.split_documents(documents)

### Encode documents

In [None]:
from ozoneai.embeddings import list_models

In [None]:
# Display what ozoneai's `list_models` reports (presumably the names accepted
# as `model=` by `binarize` in the cells below -- confirm against its docs).
list_models()

In [None]:
# Alternative encoder / binarization-model pairing, kept for reference:
# endcoder_modelid = "paraphrase-multilingual-mpnet-base-v2" 
# model = "siv-sentence-bitnet-pmbv2-wikid-large"

# Encoder id and binarization model used by the encoding and query cells.
# NOTE: the spelling `endcoder_modelid` matches the keyword argument that
# BinarizeSentenceEmbedding is called with below, so it is kept as is.
endcoder_modelid, model = "BAAI/bge-m3", "sieve-bge-m3-en-aug-v1"

In [None]:
# Encode the documents

# `BinarizeSentenceEmbedding` is provided by the `ozoneai.embeddings` module.
from ozoneai.embeddings import BinarizeSentenceEmbedding

batch_size = 20
# Each batch of chunk texts is encoded, then passed to `binarize` to obtain the
# binarized embeddings for those texts.
# Supported model encoders are `sentence-transformers/paraphrase-multilingual-mpnet-base-v2` and `BAAI/bge-m3`.
# If the models are stored in a local directory, a path such as
# `/path/to/paraphrase-multilingual-mpnet-base-v2` or `/path/to/bge-m3` can be used instead.
with BinarizeSentenceEmbedding(endcoder_modelid=endcoder_modelid) as embedder:

    ndocs = len(docs)
    batch_starts = range(0, ndocs, batch_size)
    encoded_documents = []
    for start in tqdm(batch_starts):
        # Slicing stops at the end of the list, so the final batch may be short.
        texts = [chunk.page_content for chunk in docs[start:start + batch_size]]
        emb = embedder.encode(texts)
        # max limit 20 vectors per request
        emb_binarized = embedder.binarize(emb, model=model)
        encoded_documents.append(emb_binarized.embedding)

# Join the per-batch arrays into one array covering all documents.
encoded_documents = np.concatenate(encoded_documents)

### Create index using Faiss

In [None]:
# rm -rvf index*

In [None]:
# The encoded documents are bit-packed ('uint8') arrays;
# make sure the whole array fits in your RAM.
print(f"embedding size: {encoded_documents.shape}")

# Every uint8 value carries 8 bits, so the binary vector dimension is
# 8 times the number of columns.
bytes_per_vector = encoded_documents.shape[1]
dimension = 8 * bytes_per_vector  # Dimension of the binary vectors

# Build a flat binary Faiss index of that dimension and add the vectors.
# Note: avoid inserting the same data twice.
index = faiss.IndexBinaryFlat(dimension)
index.add(encoded_documents)

# Persist the index to disk.
faiss.write_index_binary(index, 'index.dat')

### Store documents in SQLite

In [None]:
# Write every chunk's text to the SQLite database, in the same order as `docs`.
page_texts = [doc.page_content for doc in docs]
with DocumentDatabase('index.db') as conn:
    for text in tqdm(page_texts):
        conn.insert_document(text)
    

### Query Example

In [None]:
# Perform a search on the index
query = "What is positional embeddings?"

with BinarizeSentenceEmbedding(
    endcoder_modelid=endcoder_modelid) as embedder, DocumentDatabase('index.db') as conn:

    # Encode and binarize the query the same way the documents were encoded.
    emb = embedder.encode(query)
    emb_binarized = embedder.binarize(emb, model=model)

    encoded_query = emb_binarized.embedding

    D, I = index.search(encoded_query, k=5)  # Retrieve top 5 most similar documents

    selected_data = []
    for position in I[0]:
        # Faiss pads the result with -1 when the index holds fewer than k vectors.
        if position < 0:
            continue
        # Faiss positions are 0-based, while the AUTOINCREMENT ids in the
        # documents table start at 1, so vector N corresponds to row id N + 1.
        # This assumes the table was filled exactly once, in the same order as
        # the index.
        doc_id = int(position) + 1
        # doc_id is a plain int, so formatting it into the SQL text is safe.
        rows = conn.select_documents(f"select * from documents where id={doc_id};")
        if rows:
            selected_data.append(rows[0])

    for rank, row in enumerate(selected_data):
        print(f"""
        Query: {query}\n
        --------------------------
        Nearest [{rank}], DocID [{row[0]}]:\n
        Text: {row[1]}
        
        xxxxxxx
        """)