In [None]:
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Dense Retrieval using Milvus

- Understand the Python Milvus client (`pymilvus`)
- Map dense sentence embeddings to a Milvus collection
- Compute evaluation metrics
- Other uses of Milvus

## Goals

In [1]:
!ls

00_data_fetch_bq.ipynb		 Untitled.ipynb
00_data_fetch_spark.ipynb	 __pycache__
01_b_setup.ipynb		 ann_benchmark_recall.ipynb
01_data_cleanup.ipynb		 faiss_document_store.db
01_data_subset.ipynb		 metrics_utils.py
02_retrieval_dense_milvus.ipynb  old
02_retrieval_sparse.ipynb	 test_setup.ipynb


## Imports

In [2]:
import datetime
import pickle
import uuid
import datetime
import numpy as np
import time
import pandas as pd
import tqdm
import torch
import metrics_utils

ModuleNotFoundError: No module named 'numpy'

In [None]:
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)
import pymilvus

from sentence_transformers import SentenceTransformer


In [None]:
pd.options.display.max_colwidth = 500 # increase column width

## Data

In [None]:
path_posts = "gs://np-training-tmp/stackoverflow/final_subset/posts.parquet"
path_posts_related = "gs://np-training-tmp/stackoverflow/final_subset/related_posts.parquet"

In [None]:
collection_name = "stackoverflow"

In [None]:
# def get_model():
#     model = SentenceTransformer('flax-sentence-embeddings/stackoverflow_mpnet-base')
#     return model
    
# def get_milvus_connection(collection_name):
#     milvus_client = Collection(collection_name)
#     return milvus_client

## Model

In [None]:
model = SentenceTransformer('flax-sentence-embeddings/stackoverflow_mpnet-base')


In [None]:
print ( list(model.children()) )

In [None]:
text = "Replace me by any question / answer you'd like."
text_embbedding = model.encode(text)

In [None]:
text_embbedding.shape


model.

In [None]:
?model.encode

In [None]:
dim = model.get_sentence_embedding_dimension()


In [None]:
df = pd.read_parquet(path_posts)
df['Tags']  = df['Tags'].apply(lambda x: " ".join( x.tolist()))

In [None]:
len(df)

In [None]:
connections.connect("default", host="localhost", port="19530")


In [None]:
utility.list_collections()

In [None]:
if collection_name in utility.list_collections():
    utility.drop_collection(collection_name)

In [None]:
?Collection

In [None]:
# Collection schema: scalar post fields plus one float-vector field.
# The order of this list is the field order of the collection; the data frame
# is later re-ordered to these field names before it is inserted.
fields = [
    # Primary key is supplied by the data itself (auto_id=False).
    FieldSchema(name="Id", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="AcceptedAnswerId", dtype=DataType.INT64),
    FieldSchema(name="Title", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="QuestionBody", dtype=DataType.VARCHAR, max_length=50_000),
    FieldSchema(name="Tags", dtype=DataType.VARCHAR, max_length=5000),
    FieldSchema(name="ViewCount", dtype=DataType.INT64),
    FieldSchema(name="AnswerCount", dtype=DataType.INT64),
    FieldSchema(name="CommentCount", dtype=DataType.INT64),
    FieldSchema(name="Score", dtype=DataType.INT64),
    FieldSchema(name="AnswerId", dtype=DataType.INT64),
    FieldSchema(name="AcceptedAnswerBody", dtype=DataType.VARCHAR, max_length=50_000),
    # Vector width is `dim`, the sentence-embedding dimension reported by the model.
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim) ,
    
    #FieldSchema(name="CreationDate", dtype=DataType.VARCHAR),

]

schema = CollectionSchema(fields, "collection containing stackoverflow")

# Bind the schema to `collection_name`; an earlier cell drops any existing
# collection with that name, so this starts from an empty collection.
stackoverflow_milvus = Collection(collection_name, schema, consistency_level="Strong")

In [None]:
schema

In [None]:
fields = [f.name for f in schema.fields]

In [None]:
fields

In [None]:

# Missing answer ids become the -1 sentinel. Cast to an explicit 64-bit integer:
# plain `int` can resolve to a 32-bit dtype on some platforms, while the Milvus
# schema declares these fields as INT64.
answer_id_cols = ['AcceptedAnswerId', 'AnswerId']
df[answer_id_cols] = df[answer_id_cols].fillna(-1).astype('int64')

# Missing counters are treated as zero (same INT64 requirement as above).
cols = ['ViewCount', 'AnswerCount', 'CommentCount', 'Score']
df[cols] = df[cols].fillna(0).astype('int64')

# Missing accepted-answer bodies become empty strings so the VARCHAR field
# never receives NaN.
df['AcceptedAnswerBody'] = df['AcceptedAnswerBody'].fillna("")



In [None]:
df.head()

In [None]:
len(df)

In [None]:
df_subset = df.head(5_000_000).copy()

In [None]:
df_subset['Title'].tolist();

In [None]:
?model.encode

In [None]:
embeddings = model.encode(df_subset['Title'].head(1000).tolist() , show_progress_bar=True)

In [None]:
# embeddings = np.random.random((len(df_subset),dim))

#embeddings = model.encode(df_subset['Title'], show_progress_bar=True)

if torch.cuda.is_available():
    embeddings = model.encode(df_subset['Title'].tolist() , show_progress_bar=True)
    df_subset['embedding'] = embeddings.tolist()
    df_subset.to_parquet(path_posts.replace(".parquet", "_with_embedding.parquet") , index=False)


In [None]:
df_subset = pd.read_parquet( path_posts.replace(".parquet", "_with_embedding.parquet") )

In [None]:
df_subset.head()

In [None]:
df_subset = df_subset [fields]

In [None]:
df_subset.iloc[0].to_dict();

In [None]:
df_subset.dtypes

In [None]:
# Insert every row of df_subset (its columns were re-ordered to the schema's field names above).
# NOTE(review): the whole frame goes in a single insert call — for a large frame this
# may need to be chunked; confirm against the Milvus request-size limits.
insert_result = stackoverflow_milvus.insert( df_subset  )



In [None]:
insert_result

In [None]:
stackoverflow_milvus.num_entities

In [None]:
stackoverflow_milvus.indexes

https://milvus.io/docs/index.md

In [None]:
# index = {
#     "index_type": "IVF_FLAT",
#     "metric_type": "L2",
#     "params": {"nlist": 128},
# }

index = {
    "index_type": "FLAT",
    "metric_type": "L2",
    "params": {}
}



In [None]:
stackoverflow_milvus.create_index("embedding", index)

In [None]:
?stackoverflow_milvus.create_index

In [None]:
stackoverflow_milvus.indexes

In [None]:
stackoverflow_milvus.load()


In [None]:
!ls

In [None]:
vectors_to_search = list(df_subset.iloc[0:1]['embedding'])

# search_params = {
#     "metric_type": "L2",
#     "params": {"nprobe": 10},
# }


search_params = {
    "metric_type": "L2",
    #"params": {"nprobe": 128}
    
}



In [None]:
len(vectors_to_search) , len(vectors_to_search[0])

In [None]:
?stackoverflow_milvus.search;

In [None]:
?time.time

In [None]:
# Time a single top-3 search. perf_counter() is a monotonic, high-resolution clock
# intended for measuring short intervals; time.time() follows the wall clock and can
# jump if the system time is adjusted.
start_time = time.perf_counter()
result = stackoverflow_milvus.search(
    data=vectors_to_search,
    anns_field="embedding",
    param=search_params,
    limit=3,
    output_fields=["Id"],
)
end_time = time.perf_counter()

# Elapsed time of the search call, in seconds.
print(end_time - start_time)

In [None]:
for hits in result:
    for hit in hits:
        print(f"hit: {hit}, score:{hit.score} id: {hit.entity.get('Id')} , data:{hit.entity._row_data} ")

In [None]:
hit.score

In [None]:
def format_resp(hits, row):
    """Turn the hits returned for one query into flat relevance records.

    Parameters
    ----------
    hits : iterable
        Search hits for a single query. Each hit must expose ``score`` and an
        ``entity`` whose ``get`` returns the ``'Id'`` and ``'Title'`` fields.
    row : mapping
        The query row; ``'PostTitle'``, ``'PostId'`` and ``'RelatedPostIds'``
        are read from it.

    Returns
    -------
    list of dict
        One record per hit with the keys ``query``, ``query_id``, ``doc_id``,
        ``is_relevant``, ``score`` and ``doc_title`` (in that order). A hit is
        marked relevant when its id is contained in ``row['RelatedPostIds']``.
    """
    query_text = row['PostTitle']

    def to_record(hit):
        # Ids are normalised to plain ints before the membership check.
        matched_id = int(hit.entity.get('Id'))
        return {
            'query': query_text,
            'query_id': row['PostId'],
            'doc_id': matched_id,
            'is_relevant': matched_id in row['RelatedPostIds'],
            'score': hit.score,
            'doc_title': hit.entity.get('Title'),
        }

    return [to_record(hit) for hit in hits]

# def fetch_as_relevancy_eval(row,search_params, num_hits=10):
    
    
#     vectors_to_search = [model.encode( row['PostTitle'])]

    
#     result = stackoverflow_milvus.search(data=vectors_to_search, anns_field="embedding", param=search_params, limit=num_hits
#                                      , output_fields=["Id","Title"]
                                    
#                                     )
 
    
#     payload = format_resp(result, row)
    
#     return pd.DataFrame(payload)
    

# def evaluate_relevancy_hits(df,search_params,num_hits=10):
    
#     payload = []
#     for index, row in df.iterrows():

#         payload_query = fetch_as_relevancy_eval(row,search_params)
        
#         payload.extend(payload_query.to_dict(orient='records') )

    
#     #return pd.DataFrame.from_records(payload)
#     return pd.DataFrame(payload)


# def evaluate_relevancy_hits2_old(df,search_params,num_hits=20):
    
    
#     res = df.parallel_apply(fetch_as_relevancy_eval,num_hits=num_hits,search_params=search_params, axis = 1)

#     return res
    
    
def evaluate_relevancy_hits(df,search_params,num_hits=20, batch_size=10):
    """Search the collection for every query in ``df`` and collect labelled hits.

    The frame is processed in slices of ``batch_size`` rows. For each slice the
    ``PostTitle`` values are encoded with the notebook-level ``model``, the
    resulting vectors are searched against the ``embedding`` field of
    ``stackoverflow_milvus``, and every hit list is flattened with
    ``format_resp`` together with the row it belongs to.

    Parameters
    ----------
    df : pandas.DataFrame
        Query rows; ``PostTitle`` is encoded and each full row is passed to
        ``format_resp``.
    search_params : dict
        Passed through unchanged as the ``param`` argument of the search.
    num_hits : int, default 20
        Number of hits requested per query (``limit``).
    batch_size : int, default 10
        Number of queries encoded and searched per round trip.

    Returns
    -------
    pandas.DataFrame
        One row per (query, hit) record produced by ``format_resp``.
    """
    print(f"Encoding {len(df)} vectors")

    records = []
    for start in tqdm.trange(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]

        query_vectors = model.encode(list(batch['PostTitle']))
        batch_hits = stackoverflow_milvus.search(
            data=query_vectors,
            anns_field="embedding",
            param=search_params,
            limit=num_hits,
            output_fields=["Id", "Title"],
        )

        # Results come back in query order, so they pair up with the batch rows.
        for hits, row in zip(batch_hits, batch.to_dict(orient='records')):
            records.extend(format_resp(hits, row))

    print("formatted response")

    return pd.DataFrame(records)

In [None]:
pdf_related = pd.read_parquet(path_posts_related)

In [None]:
pdf_related

In [None]:
search_params = {
    "metric_type": "L2",
    #"params": {"nprobe": 128}
    
}

In [None]:
vectors_to_search = model.encode( list( pdf_related.iloc[0:5]['PostTitle']) )

    

In [None]:
result = stackoverflow_milvus.search(data=vectors_to_search, anns_field="embedding", param=search_params, limit=20
                                     , output_fields=["Id","Title"]
                                    )

In [None]:
result

In [None]:
payload_all = []

for hits , row in zip( result, pdf_related.iloc[0:5].to_dict(orient='records') ):
    payload = format_resp(hits, row)
    payload_all.extend(payload)

df_res = pd.DataFrame(payload_all)


In [None]:
df_res

In [None]:
len(pdf_related)


In [None]:
df_res = evaluate_relevancy_hits(pdf_related.iloc[0:50] , search_params=search_params)

In [None]:
df_res = evaluate_relevancy_hits(pdf_related , search_params=search_params)

In [None]:

df_agg_res  = df_res.groupby(['query_id'], as_index=False).apply (lambda x: pd.Series(metrics_utils.all_metrics(x['is_relevant'])))


In [None]:
df_agg_res

In [None]:
df_agg_res.drop(columns='query_id').agg(np.mean)

In [None]:
connections.disconnect('default')

In [None]:
connections.list_connections()

### Speeding up

In [None]:
connections.connect("default", host="localhost", port="19530")


In [None]:
index_ivf_flat = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}


In [None]:
# The collection is still loaded and already carries the FLAT index built earlier.
# Milvus keeps one index per vector field and does not allow it to be changed while
# the collection is loaded, so release the collection and drop the old index before
# building IVF_FLAT.
stackoverflow_milvus.release()
if stackoverflow_milvus.has_index():
    stackoverflow_milvus.drop_index()
stackoverflow_milvus.create_index("embedding", index_ivf_flat)

In [None]:
resp=stackoverflow_milvus.indexes[0]

In [None]:
resp.to_dict()

In [None]:
stackoverflow_milvus.load()


In [None]:
# Search-time parameters used with the IVF_FLAT index.
search_params_ivf_flat = {
    "metric_type": "L2",
    # NOTE(review): nprobe (10000) is far larger than the nlist=128 that index_ivf_flat
    # is built with — presumably this probes every cluster and so behaves like an
    # exhaustive scan; confirm against the Milvus IVF_FLAT docs and lower it to
    # measure an actual speed/recall trade-off.
    "params": {"nprobe": 10000}
    
}

In [None]:
df_res = evaluate_relevancy_hits(pdf_related , search_params=search_params_ivf_flat)

In [None]:
# df_res now holds the IVF_FLAT hits, so the per-query metrics must be recomputed
# first; otherwise this cell would display the stale aggregates of the FLAT run
# (and those stale values would be what gets saved afterwards).
df_agg_res = df_res.groupby(['query_id'], as_index=False).apply(
    lambda x: pd.Series(metrics_utils.all_metrics(x['is_relevant']))
)
df_agg_res.drop(columns='query_id').agg(np.mean)

In [None]:
df_agg_res.to_parquet("../tmp/df_agg_res__faiss.parquet", index=False)
df_agg_res.head()

In [None]:
df_res.to_parquet("../tmp/df_res__faiss.parquet", index=False)
df_res.head()

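Can't add new fields to a collection once it has been created.    
The order of fields matters: inserted data must follow the field order of the schema.    
Field size matters: `VARCHAR` values have to fit within the field's `max_length`.    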

**Can vectors with duplicate primary keys be inserted into Milvus?**    
Yes. Milvus does not check if vector primary keys are duplicates.


**When vectors with duplicate primary keys are inserted, does Milvus treat it as an update operation?**    
No. Milvus does not currently support update operations and does not check if entity primary keys are duplicates. You are responsible for ensuring entity primary keys are unique, and if they aren't Milvus may contain multiple entities with duplicate primary keys.

If this occurs, which data copy will return when queried remains an unknown behavior. This limitation will be fixed in future releases.

https://milvus.io/docs/product_faq.md#Can-vectors-with-duplicate-primary-keys-be-inserted-into-Milvus

In [None]:
# reference

https://github.com/milvus-io/pymilvus/blob/master/examples/hello_milvus.ipynb


https://milvus.io/tools/sizing/