In [1]:
# Standard library
import time

# Numerical / tabular helpers
import numpy as np
import pandas as pd

# Milvus client objects used throughout the notebook.
# Parenthesised so the name list can span several lines.
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
    utility,
)

In [2]:
# Banner template: the message is left-aligned and padded to a width of 30
# between "=== " and " ===", with a blank line before and after.
fmt = "\n=== {:30} ===\n"
# Latency report template: the value is rendered with four decimal places
# followed by an "s" suffix.
search_latency_fmt = "search latency = {:.4f}s"

**Milvus Connection**

In [1]:
# Register the "default" connection alias against the Milvus server,
# then print a banner confirming the call returned.
connections.connect(alias="default", host="dev_gpu", port="19530")
print(fmt.format("connected to Milvus"))

In [4]:
# Check for the collection under the name this notebook actually creates it
# with in the "Create collection" cell, so the answer refers to the same
# collection that is inserted into and searched later.
collection_name = "question_answer"
has = utility.has_collection(collection_name)
print(f"Does collection {collection_name} exist in Milvus: {has}")

Does collection paraphrase_milvus exist in Milvus: False


In [5]:
# Read the question/answer table from disk and preview its first three rows.
data = pd.read_csv(filepath_or_buffer="data/question_answer.csv")
data.head(n=3)

Unnamed: 0,id,question,answer
0,0,Is Disability Insurance Required By Law?,Not generally. There are five states that requ...
1,1,Can Creditors Take Life Insurance After ...,If the person who passed away was the one with...
2,2,Does Travelers Insurance Have Renters Ins...,One of the insurance carriers I represent is T...


In [6]:
# (number of rows, number of columns) of the loaded table
data.shape

(1000, 3)

**Create collection**

In [7]:
# Dimensionality of the embedding vectors stored in the collection.
dim = 768

# Column layout of the collection: integer primary key, embedding vector,
# and the raw question / answer strings returned with each search hit.
fields = [
    # Primary key supplied by the notebook (auto_id is off). No max_length
    # here: that setting belongs to the VARCHAR fields below.
    FieldSchema(name='id', dtype=DataType.INT64, description='ids', is_primary=True, auto_id=False),
    FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='embedding vectors', dim=dim),
    FieldSchema(name="q_text", dtype=DataType.VARCHAR, description="question data", max_length=5000),
    FieldSchema(name="a_text", dtype=DataType.VARCHAR, description="answer data", max_length=5000),
]

schema = CollectionSchema(fields=fields, description='get the answer')

# NOTE(review): if a "question_answer" collection already exists on the server
# with a different schema, this call is expected to reject it; drop the old
# collection first. TODO confirm against the pymilvus version in use.
paraphrase_milvus = Collection(name="question_answer", schema=schema)

print("Collection is created")

Collection is created


**Embeddings**

In [8]:
from sentence_transformers import SentenceTransformer

# Local directory holding the sentence-embedding checkpoint.
model_path = "./paraphrase-multilingual-mpnet-base-v2/"

# Instantiate the encoder from that directory, pinned to the CPU.
model = SentenceTransformer(model_name_or_path=model_path, device="cpu")

In [9]:
# Encode the first 800 questions and print the shape of the resulting array.
question = data["question"].to_numpy()[:800]
embeddings = model.encode(question)

print(embeddings.shape)

(800, 768)


In [9]:
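# paraphrase_milvus.describe()  # optional: inspect the collection (the handle is `paraphrase_milvus`; no `collection` variable is defined)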

In [10]:
# Take the same first-800 slice of each column so ids, questions and answers
# line up row-for-row with the embeddings computed for those questions.
ids, question_data, answer_data = (
    np.array(data[column])[:800] for column in ("id", "question", "answer")
)

# Columns are listed in the same order as the schema fields:
# id, embedding, q_text, a_text.
entities = [
    ids,
    embeddings.tolist(),
    question_data.tolist(),
    answer_data.tolist(),
]

In [11]:
# Lay the four entity columns side by side in a DataFrame purely for a visual
# sanity check before inserting; the frame itself is not sent to Milvus.
preview_columns = ("ids", "embeddings", "q_text", "a_text")
df = pd.DataFrame(dict(zip(preview_columns, entities)))
df.head(n=3)

Unnamed: 0,ids,embeddings,q_text,a_text
0,0,"[-0.042642127722501755, 0.24229967594146729, -...",Is Disability Insurance Required By Law?,Not generally. There are five states that requ...
1,1,"[0.011983036063611507, 0.27371856570243835, -0...",Can Creditors Take Life Insurance After ...,If the person who passed away was the one with...
2,2,"[-0.05496260151267052, 0.2613506615161896, -0....",Does Travelers Insurance Have Renters Ins...,One of the insurance carriers I represent is T...


In [12]:
# (number of rows, number of columns) of the preview frame
df.shape

(800, 4)

In [13]:
# Write the prepared columns into the collection; flush() is called before
# the success message is printed.
insert_result = paraphrase_milvus.insert(data=entities)
paraphrase_milvus.flush()

print("Data is inserted into Collection")

Data is inserted into Collection


**Create Index**

In [14]:
# Index settings for the vector field: IVF_FLAT, L2 metric, nlist=128.
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}

# Build the index on the "embedding" field with those settings.
paraphrase_milvus.create_index(field_name="embedding", index_params=index)

print("Index is created")

Index is created


In [15]:
# Print a banner, then call load() on the collection ahead of the search cells.
print(fmt.format("Start loading"))
paraphrase_milvus.load()


=== Start loading                  ===



**Search**

In [16]:
# Section banner for the similarity-search cells that follow.
print(fmt.format("Start searching based on vector similarity"))


=== Start searching based on vector similarity ===



In [17]:
# Use the question at row 950 as the query; it lies beyond the first 800 rows
# that were embedded and inserted into the collection.
test_question = data["question"].iloc[950]
print("test data :", test_question)

# Encode the single query string and report the vector's shape, dtype and type.
test_emb = model.encode(test_question)
print(test_emb.shape, test_emb.dtype, type(test_emb))

test data : Is  Car  Insurance  Federal  Law?
(768,) float32 <class 'numpy.ndarray'>


In [18]:
# Query-time settings passed to search(): L2 metric with nprobe=10.
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}
# Alternative kept for reference (inner-product metric):
# search_params={"metric_type": "IP", "params": {}}

In [19]:
# Time a single top-3 similarity search for the test-question embedding.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, so the reported latency is not affected by wall-clock adjustments.
start_time = time.perf_counter()

result = paraphrase_milvus.search(
    data=test_emb.reshape(1, -1),  # reshape the 1-D vector into a single-row batch
    anns_field="embedding",
    param=search_params,
    limit=3,
    output_fields=["q_text", "a_text"],
)

end_time = time.perf_counter()

# Print every hit together with the stored question and answer text.
for hits in result:
    for hit in hits:
        print(f"hit: {hit}, \n question text is : {hit.entity.get('q_text')}, \n answer text is : {hit.entity.get('a_text')} \n ")
print(search_latency_fmt.format(end_time - start_time))

hit: (distance: 2.2892966270446777, id: 132), 
 question text is : Is  Auto  Insurance  A  Federal  Requirement?, 
 answer text is : Auto Insurance is not a requirement mandated by Federal law, but instead is required by most if not all states as a condition of registering a car for use on public access roads and highways. Property and Casualty insurance, including car insurance is regulated by state authority as opposed to the Federal Government. 
 
hit: (distance: 3.5432300567626953, id: 364), 
 question text is : What  Auto  Insurance  Is  Required  By  Law?, 
 answer text is : If you live in a state that requires auto insurance typically the coverages that would be required are bodily injury, property damage, and uninsured motorist at minimum. Regulations are different from state to state. The amounts and types of coverage can vary greatly. Contact a local agent to know what the coverage requirements are for the state that you reside in. 
 
hit: (distance: 4.268867492675781, id: 72

In [20]:
# Query text for the second search. It is stored under `query_text` so the
# builtin `input()` is not shadowed for the rest of the kernel session.
query_text = "Is  Disability  Insurance  Required  By  Law?"
emb = model.encode(query_text)

In [21]:
# Time a single top-3 similarity search for the second query embedding.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, so the reported latency is not affected by wall-clock adjustments.
start_time = time.perf_counter()

result = paraphrase_milvus.search(
    data=emb.reshape(1, -1),  # reshape the 1-D vector into a single-row batch
    anns_field="embedding",
    param=search_params,
    limit=3,
    output_fields=["q_text", "a_text"],
)

end_time = time.perf_counter()

# Print every hit together with the stored question and answer text.
for hits in result:
    for hit in hits:
        print(f"hit: {hit}, \n question text is : {hit.entity.get('q_text')}, \n answer text is : {hit.entity.get('a_text')} \n ")
print(search_latency_fmt.format(end_time - start_time))

hit: (distance: 1.6388407558104445e-12, id: 0), 
 question text is : Is  Disability  Insurance  Required  By  Law?, 
 answer text is : Not generally. There are five states that require most all employers carry short term disability insurance on their employees. These states are: California, Hawaii, New Jersey, New York, and Rhode Island. Besides this mandatory short term disability law, there is no other legislative imperative for someone to purchase or be covered by disability insurance. 
 
hit: (distance: 1.5215604305267334, id: 235), 
 question text is : How  Necessary  Is  Disability  Insurance?, 
 answer text is : Disability insurance is paycheck insurance. How important is your paycheck? Very important! Disability insurance is an intricate part of defensive financial planning. Most working Americans have 90 days of case reserves for emergencies. So disability insurance can generate tax free benefits for time when you can't work because of sickness and/or injury. 
 
hit: (distance