### Install Required Dependencies

In [1]:
# Install the libraries below if you are running this notebook for the first time.
# !pip install langchain
# !pip install numpy
# !pip install pymilvus
# !pip install requests
# !pip install tqdm

### Make sure milvus service is running

#### Procedure to configure milvus
- Download the Docker Compose file as `docker-compose.yml` using `wget https://github.com/milvus-io/milvus/releases/download/v2.2.10/milvus-standalone-docker-compose.yml -O docker-compose.yml`
- To start the service, run `docker-compose up -d`
- To stop the service, run `docker-compose down`
- To check active containers, run `docker ps -a`


In [2]:
# imports
import numpy as np, faiss, sqlite3, requests, os, json
from tqdm.notebook import tqdm
import hashlib

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import parse_url
from requests.packages.urllib3.util.retry import Retry
from requests.compat import urljoin

# import milvus modules
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

In [3]:
# Embedder Client
class OzoneEmbedder(object):
    """Client for the Ozone embedding HTTP endpoint.

    Intended to be used as a context manager: entering opens a
    ``requests.Session`` configured with a retry policy, and leaving
    closes it again.

    Parameters
    ----------
    api_details : mapping
        Must provide the keys ``"username"``, ``"bearer_token"`` and
        ``"endpoint"`` (the full URL that embedding requests are posted to).
    """
    def __init__(self, api_details) -> None:
        super(OzoneEmbedder, self).__init__()
        self.username = api_details["username"]
        self.bearer_token = api_details["bearer_token"]
        self.endpoint = api_details["endpoint"]
        self.url_details = parse_url(self.endpoint)
        self.max_retries = 3
        self.backoff_factor = 0.3
        # No session until connect() is called; get_embedding() falls back to
        # a one-off request in that case.
        self.connection = None

    def connect(self):
        """Open a persistent session whose adapter carries the retry policy."""
        # Release any session left over from an earlier connect() call.
        self.close()

        retries = Retry(
            total=self.max_retries,
            backoff_factor=self.backoff_factor
        )
        adapter = HTTPAdapter(max_retries=retries)
        scheme = self.url_details.scheme

        # Mount on the full "<scheme>://" prefix. A Session already has default
        # adapters on "http://" and "https://", and the longest matching prefix
        # is chosen, so mounting on the bare scheme would never be selected.
        prefixes = [f"{scheme}://"] if scheme else ["http://", "https://"]

        self.connection = requests.Session()
        for prefix in prefixes:
            self.connection.mount(prefix, adapter)

    def close(self):
        """Close the session if one is open; safe to call more than once."""
        session = getattr(self, "connection", None)
        if session is not None:
            session.close()
        self.connection = None

    def __enter__(self):
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_embedding(self, text, model="siv-sentence-bitnet-pmbv2-wikid-large"):
        """
        Request the embedding of ``text`` from the endpoint.

        text: input text
        model:
            "siv-sentence-bitnet-pmbv2-wikid-large" or,
            "siv-sentence-bitnet-pmbv2-wikid-small" or,
            "sentence-bitnet-pmbv2"

        Returns the decoded JSON body of the response.
        Raises ``requests.HTTPError`` when the endpoint answers with an HTTP
        error status, instead of handing an error payload back to the caller.
        """

        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {self.bearer_token}",
            "Content-Type": "application/x-www-form-urlencoded",
        }

        data = {
            "input_text": text,
            "embedder_name": model,
        }

        # Go through the persistent session (and therefore its retry adapter)
        # when one is open; otherwise issue a standalone request so the method
        # still works without connect().
        session = getattr(self, "connection", None)
        sender = session if session is not None else requests

        response = sender.post(
            self.endpoint,
            headers=headers,
            data=data
        )
        response.raise_for_status()
        return response.json()


### Load credential information from environment variable

In [4]:
# The OZAI_API_CREDENTIALS environment variable holds the path of a JSON file
# with the API details ("username", "bearer_token", "endpoint").
credentials_path = os.environ.get('OZAI_API_CREDENTIALS')
if not credentials_path:
    # Fail with an actionable message instead of the TypeError raised by open(None).
    raise EnvironmentError(
        "Environment variable OZAI_API_CREDENTIALS is not set; "
        "it must contain the path to the JSON credentials file."
    )

with open(credentials_path) as fp:
    credential = json.load(fp)

### Read text document

In [5]:
# Location of the plain-text document to be chunked and indexed
text_path = "./sample.txt"

### Preprocess text document using langchain

In [6]:
# Read the text file into langchain documents, then cut them into chunks.
# Note: the loader can be swapped for another document type
# (more options: https://python.langchain.com/docs/modules/data_connection/document_loaders/)

# chunking configuration
text_splitter = CharacterTextSplitter(chunk_overlap=10, chunk_size=200)

# load
text_loader = TextLoader(text_path)
documents = text_loader.load()

# split document
docs = text_splitter.split_documents(documents)

Created a chunk of size 370, which is longer than the specified 200
Created a chunk of size 255, which is longer than the specified 200
Created a chunk of size 487, which is longer than the specified 200
Created a chunk of size 461, which is longer than the specified 200
Created a chunk of size 629, which is longer than the specified 200
Created a chunk of size 526, which is longer than the specified 200
Created a chunk of size 545, which is longer than the specified 200
Created a chunk of size 503, which is longer than the specified 200
Created a chunk of size 258, which is longer than the specified 200
Created a chunk of size 214, which is longer than the specified 200
Created a chunk of size 352, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 430, which is longer than the specified 200
Created a chunk of size 394, which is longer than the specified 200
Created a chunk of size 257, which is longer tha

### Encoding documents 

In [7]:
# Embed every chunk through the Ozone API and stack the vectors into one uint8 array
with OzoneEmbedder(credential) as ozone_embedder:
    chunk_vectors = []
    for chunk in tqdm(docs):
        reply = ozone_embedder.get_embedding(chunk.page_content)
        # keep the first entry of the returned 'embedding' list for this chunk
        chunk_vectors.append(reply['embedding'][0])
    encoded_documents = np.asarray(chunk_vectors).astype('uint8')

  0%|          | 0/38 [00:00<?, ?it/s]

### Connect to Milvus

In [8]:
# Register the "default" pymilvus connection against the local Milvus service
milvus_server = {"host": 'localhost', "port": '19530'}
connections.connect(alias="default", **milvus_server)

### Create a collection to index data

In [9]:
# Single place for every collection setting used when indexing and searching
collection_details = dict(
    name="search_demo",
    description="Search and Retrieval Demo",
    partition_name="search_app",
    index_field="embeddings",
    output_field=["docid", "texts"],
    fields=["docid", "texts", "embeddings"],
    index_params={
        "metric_type": "HAMMING",
        "index_type": "BIN_FLAT",
        "params": {"nlist": 10},
    },
    search_params={"metric_type": "HAMMING", "offset": 0},
)

In [10]:
# Encoded documents are bit-packed ('uint8') and are stored as binary vectors

# Each uint8 value carries 8 bits, so the binary-vector dimension is 8x the byte width
dimension = encoded_documents.shape[1] * 8  # Dimension of the binary vectors
print(f"embedding size: {encoded_documents.shape}, index dimension: {dimension}")

# define Field Schemas
docid = FieldSchema(
    name="docid",
    dtype=DataType.VARCHAR,
    max_length=64,
    is_primary=True
)

texts = FieldSchema(
    name="texts",
    dtype=DataType.VARCHAR,
    max_length=1000 # change based on requirement
)

embeddings = FieldSchema(
    name="embeddings",
    dtype=DataType.BINARY_VECTOR,
    dim=dimension
)

# define field order (data must be inserted in this same order)
fields = [docid, texts, embeddings]

# create schema
schema = CollectionSchema(
  fields=fields,
  description=collection_details["description"]
)

# Create or get an existing collection.
collection = Collection(collection_details["name"], schema=schema)
# collection = Collection(collection_name)      # Get an existing collection.

# create the partition if it doesn't exist
if not collection.has_partition(collection_details["partition_name"]):
    collection.create_partition(collection_details["partition_name"])

# create index
# Note: only vector fields can be indexed
if not collection.has_index():
    collection.create_index(
        field_name=collection_details["index_field"],
        index_params=collection_details["index_params"]
    )

    # Look the progress up by the configured collection name (not a hard-coded
    # one) and print it: a bare expression inside an `if` is not displayed.
    print(utility.index_building_progress(collection_details["name"]))

embedding size: (38, 300), index dimension: 2400


### Update or insert data

In [11]:
# Build the column-wise payload for Milvus
# Note : 1) Columns must be in the same sequence as the schema fields
#        2) Every column must contain the same number of entries

# docid is the md5 hex digest of the chunk text; this can be changed per use case
doc_ids = [hashlib.md5(chunk.page_content.encode()).hexdigest() for chunk in docs]
doc_texts = [chunk.page_content for chunk in docs]
# each uint8 row becomes the raw bytes of one binary vector
doc_vectors = [bytes(row.tolist()) for row in encoded_documents]

insert_data = [doc_ids, doc_texts, doc_vectors]

# NOTE(review): the original comment said insert also updates existing rows;
# confirm whether re-running this cell creates duplicate docids.
insert_ack = collection.insert(insert_data)
collection.flush()  # persist the inserted data
print(insert_ack)

(insert count: 38, delete count: 0, upsert count: 0, timestamp: 442492135574863876, success count: 38, err count: 0)


### Search Query

In [12]:
# Searches run against in-memory data, so get a handle on the collection and load it first
query_collection = Collection(collection_details["name"])
query_collection.load()

# Query text: the first chunk of the document is reused as the query
# query = "what was the U. S. Bill of Rights"
query = docs[0].page_content

with OzoneEmbedder(credential) as ozone_embedder:
    # embed the query and turn each returned vector into raw bytes
    encoded_query = ozone_embedder.get_embedding(query)['embedding']
    query_rows = np.asarray(encoded_query).astype('uint8')
    query_embedding = [bytes(row) for row in query_rows]

    results = query_collection.search(
        data=query_embedding,
        anns_field=collection_details["index_field"],
        param=collection_details["search_params"],
        limit=5,
        expr=None,
        # names of the fields to retrieve with each hit
        output_fields=collection_details["output_field"],
        consistency_level="Strong",
    )

    # only one query vector was sent, so results[0] holds its hits
    top_hits = results[0]
    ids_list = top_hits.ids
    ed_distance = top_hits.distances

    result_dict_list = []

    for index, ids in enumerate(ids_list):
        hit = top_hits[index]
        entity = hit.entity
        result_dict = dict(
            ids=ids,
            score=hit.score,
            distance=hit.distance,
            docid=entity.get('docid'),
            texts=entity.get('texts'),
        )
        print(f"""
            Query: {query}\n
            --------------------------
            Closest [{index}], DocID [{result_dict['docid']}]:\n
            Text: {result_dict['texts']}

            xxxxxxx
            """)
        result_dict_list.append(result_dict)


            Query: **Welcome To The World of Free Plain Vanilla Electronic Texts**

**Etexts Readable By Both Humans and By Computers, Since 1971**

*These Etexts Prepared By Hundreds of Volunteers and Donations*

            --------------------------
            Closest [0], DocID [4cb5885f975ad37c3b062f44b9493f76]:

            Text: **Welcome To The World of Free Plain Vanilla Electronic Texts**

**Etexts Readable By Both Humans and By Computers, Since 1971**

*These Etexts Prepared By Hundreds of Volunteers and Donations*

            xxxxxxx
            

            Query: **Welcome To The World of Free Plain Vanilla Electronic Texts**

**Etexts Readable By Both Humans and By Computers, Since 1971**

*These Etexts Prepared By Hundreds of Volunteers and Donations*

            --------------------------
            Closest [1], DocID [e61a668b8f3ccaa05c9da12d345d50d4]:

            Text: [*]  The etext may be readily converted by the reader at
          no expense into plain ASC

### Deletion of collection

In [13]:
# Optional clean-up steps are left commented out; uncomment the one you need.

# # Drop the partition:
# collection.release()
# collection.drop_partition(collection_details['partition_name'])

# # Drop the whole collection:
# utility.drop_collection(collection_details["name"])


# Delete specific entities by their primary key (docid)
expr = 'docid in ["4cb5885f975ad37c3b062f44b9493f76"]'
collection.delete(expr=expr)

(insert count: 0, delete count: 1, upsert count: 0, timestamp: 442492141590806529, success count: 0, err count: 0)