In [1]:
# !pip install -U pandas pinecone-client sentence-transformers


# Dense Retriever

This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.



The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised contrastive learning objective. We used the pretrained nreimers/MiniLM-L6-H384-uncased model and fine-tuned in on a 1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.




[Doc](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

In [2]:
from sentence_transformers import SentenceTransformer

import datasets
import rich
from IPython.display import Image, JSON
from IPython.core.display import HTML
import numpy as np

from transformers import AutoTokenizer
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)
import pymilvus

In [3]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

In [4]:

# Initialize retriever with SentenceTransformer model 
model = SentenceTransformer(model_name)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

## Tokenizer

In [5]:
text = "What does 'super' do in Python? - difference between super().__init__() and explicit superclass __init__()"
#text = "meN shoes running ran"

resp = model.encode(text, output_value=None)

rich.print(resp)

In [6]:
resp['token_embeddings'].shape # for each token, we have embedding


torch.Size([38, 384])

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [8]:
tokenizer(text)


{'input_ids': [101, 2054, 2515, 1005, 3565, 1005, 2079, 1999, 18750, 1029, 1011, 4489, 2090, 3565, 1006, 1007, 1012, 1035, 1035, 1999, 4183, 1035, 1035, 1006, 1007, 1998, 13216, 3565, 26266, 1035, 1035, 1999, 4183, 1035, 1035, 1006, 1007, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
tokens = tokenizer.tokenize(text) 
rich.print ( tokens)

In [10]:
input_ids= tokenizer.convert_tokens_to_ids(tokens)
input_ids

[2054,
 2515,
 1005,
 3565,
 1005,
 2079,
 1999,
 18750,
 1029,
 1011,
 4489,
 2090,
 3565,
 1006,
 1007,
 1012,
 1035,
 1035,
 1999,
 4183,
 1035,
 1035,
 1006,
 1007,
 1998,
 13216,
 3565,
 26266,
 1035,
 1035,
 1999,
 4183,
 1035,
 1035,
 1006,
 1007]

In [11]:
decoded_string = tokenizer.decode(input_ids)
decoded_string

"what does'super'do in python? - difference between super ( ). _ _ init _ _ ( ) and explicit superclass _ _ init _ _ ( )"

## Dset

In [12]:
dset = datasets.load_from_disk("../data/processed")

In [13]:
dset = dset.map(
    lambda example: {'embedding':model.encode(example['description_final'], device='cpu', normalize_embeddings=True)}, batched=True, batch_size=32)


  0%|          | 0/782 [00:00<?, ?ba/s]

In [14]:
dset

Dataset({
    features: ['photo_id', 'photo_url', 'photo_image_url', 'photo_submitted_at', 'photo_featured', 'photo_width', 'photo_height', 'photo_aspect_ratio', 'photo_description', 'photographer_username', 'photographer_first_name', 'photographer_last_name', 'exif_camera_make', 'exif_camera_model', 'exif_iso', 'exif_aperture_value', 'exif_focal_length', 'exif_exposure_time', 'photo_location_name', 'photo_location_latitude', 'photo_location_longitude', 'photo_location_country', 'photo_location_city', 'stats_views', 'stats_downloads', 'ai_description', 'ai_primary_landmark_name', 'ai_primary_landmark_latitude', 'ai_primary_landmark_longitude', 'ai_primary_landmark_confidence', 'blur_hash', 'description_final', 'image', 'embedding'],
    num_rows: 24995
})

In [15]:
rich.print ( dset[0] )

In [16]:
collection_name = "unsplash"

## Milvus

In [17]:
connections.connect("default", host="localhost", port="19530")


Milvus `collections` is the same as Elastic Search concept of `indexes` / table.

Each collection is meant for a seperate use case. 

In [18]:
utility.list_collections()

['unsplash']

In [19]:
if collection_name in utility.list_collections():
    utility.drop_collection(collection_name)

In [20]:
?Collection

[0;31mInit signature:[0m [0mCollection[0m[0;34m([0m[0mname[0m[0;34m,[0m [0mschema[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0musing[0m[0;34m=[0m[0;34m'default'[0m[0;34m,[0m [0mshards_num[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      This is a class corresponding to collection in milvus. 
[0;31mInit docstring:[0m
Constructs a collection by name, schema and other parameters.
Connection information is contained in kwargs.

:param name: the name of collection
:type name: str

:param schema: the schema of collection
:type schema: class `schema.CollectionSchema`

:param using: Milvus link of create collection
:type using: str

:param shards_num: How wide to scale collection. Corresponds to how many active datanodes
                can be used on insert.
:type shards_num: int

:param kwargs:
    * *consistency_level* (``str/int``) --
    Which consistency level to use when searching 

unlike Elastic Search , Milvus requires us to specify the document schema beforehand.   

Currently Milvus stores the metadata for a document in MySql, hence some of the data type names

In [21]:
dset[0]

{'photo_id': 'XMyPniM9LF0',
 'photo_url': 'https://unsplash.com/photos/XMyPniM9LF0',
 'photo_image_url': 'https://images.unsplash.com/uploads/14119492946973137ce46/f1f2ebf3',
 'photo_submitted_at': '2014-09-29 00:08:38.594364',
 'photo_featured': 't',
 'photo_width': 4272,
 'photo_height': 2848,
 'photo_aspect_ratio': 1.5,
 'photo_description': 'Woman exploring a forest',
 'photographer_username': 'michellespencer77',
 'photographer_first_name': 'Michelle',
 'photographer_last_name': 'Spencer',
 'exif_camera_make': 'Canon',
 'exif_camera_model': 'Canon EOS REBEL T3',
 'exif_iso': 400.0,
 'exif_aperture_value': '1.8',
 'exif_focal_length': '50.0',
 'exif_exposure_time': '1/100',
 'photo_location_name': None,
 'photo_location_latitude': None,
 'photo_location_longitude': None,
 'photo_location_country': None,
 'photo_location_city': None,
 'stats_views': 2375421,
 'stats_downloads': 6967,
 'ai_description': 'woman walking in the middle of forest',
 'ai_primary_landmark_name': None,
 'ai_

In [23]:
dim = len(dset[0]['embedding'])
dim

384

In [24]:
fields = [
    FieldSchema(name="photo_id", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=512),
    FieldSchema(name="photo_url", dtype=DataType.VARCHAR, max_length=60000),
    FieldSchema(name="photo_image_url", dtype=DataType.VARCHAR, max_length=60000),
    FieldSchema(name="photographer_username", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="photographer_first_name", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="photographer_last_name", dtype=DataType.VARCHAR, max_length=500),

    FieldSchema(name="stats_views", dtype=DataType.INT64),
    FieldSchema(name="stats_downloads", dtype=DataType.INT64),
    FieldSchema(name="description_final", dtype=DataType.VARCHAR, max_length=50_000),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim) ,
]



schema = CollectionSchema(fields, "collection containing unsplash data")

unsplash_milvus = Collection(collection_name, schema, consistency_level="Strong")

In [25]:
schema

{
  auto_id: False
  description: collection containing unsplash data
  fields: [{
    name: photo_id
    description: 
    type: 21
    params: {'max_length': 512}
    is_primary: True
    auto_id: False
  }, {
    name: photo_url
    description: 
    type: 21
    params: {'max_length': 60000}
  }, {
    name: photo_image_url
    description: 
    type: 21
    params: {'max_length': 60000}
  }, {
    name: photographer_username
    description: 
    type: 21
    params: {'max_length': 500}
  }, {
    name: photographer_first_name
    description: 
    type: 21
    params: {'max_length': 500}
  }, {
    name: photographer_last_name
    description: 
    type: 21
    params: {'max_length': 500}
  }, {
    name: stats_views
    description: 
    type: 5
  }, {
    name: stats_downloads
    description: 
    type: 5
  }, {
    name: description_final
    description: 
    type: 21
    params: {'max_length': 50000}
  }, {
    name: embedding
    description: 
    type: 101
    params: {'d

In [26]:
fields = [f.name for f in schema.fields]

In [27]:
fields

['photo_id',
 'photo_url',
 'photo_image_url',
 'photographer_username',
 'photographer_first_name',
 'photographer_last_name',
 'stats_views',
 'stats_downloads',
 'description_final',
 'embedding']

replace Nan or NA columns with a default value

In [28]:
df = dset.to_pandas()[fields]

df['photographer_last_name'] = df['photographer_last_name'].fillna("").astype(str) 

In [29]:
df.head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photographer_username,photographer_first_name,photographer_last_name,stats_views,stats_downloads,description_final,embedding
0,XMyPniM9LF0,https://unsplash.com/photos/XMyPniM9LF0,https://images.unsplash.com/uploads/1411949294...,michellespencer77,Michelle,Spencer,2375421,6967,Woman exploring a forest,"[0.08303035, -0.06755101, 0.016757922, 0.09100..."
1,rDLBArZUl1c,https://unsplash.com/photos/rDLBArZUl1c,https://images.unsplash.com/photo-141633941111...,ugmonk,Jeff,Sheldon,13784815,82141,Succulents in a terrarium,"[0.097154394, 0.047049347, -0.032927733, 0.002..."
2,cNDGZ2sQ3Bo,https://unsplash.com/photos/cNDGZ2sQ3Bo,https://images.unsplash.com/photo-142014251503...,johnprice,John,Price,1302461,3428,Rural winter mountainside,"[-0.014408475, 0.03351519, 0.0059956918, 0.120..."
3,iuZ_D1eoq9k,https://unsplash.com/photos/iuZ_D1eoq9k,https://images.unsplash.com/photo-141487280988...,krisatomic,Kris,Atomic,2890238,33704,Poppy seeds and flowers,"[-0.058240242, 0.053993173, 0.02657202, 0.0003..."
4,BeD3vjQ8SI0,https://unsplash.com/photos/BeD3vjQ8SI0,https://images.unsplash.com/photo-141700759404...,jonaseriksson,Jonas,Eriksson,8704860,49662,Silhouette near dark trees,"[0.034429714, 0.034026287, 0.023224471, 0.0585..."


In [30]:
df.dtypes

photo_id                   object
photo_url                  object
photo_image_url            object
photographer_username      object
photographer_first_name    object
photographer_last_name     object
stats_views                 int64
stats_downloads             int64
description_final          object
embedding                  object
dtype: object

## Embedding Insertion

In [31]:
insert_result = unsplash_milvus.insert( df  )



In [32]:
insert_result

(insert count: 24995, delete count: 0, upsert count: 0, timestamp: 437288611590701057, success count: 24995, err count: 0)

In [33]:
unsplash_milvus.num_entities

24995

In [34]:
unsplash_milvus.indexes

[]

the embeddings are inserted but no index is created 

Milvus supports several indexes / ANN

https://milvus.io/docs/index.md

In [35]:
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "IP",
    "params": {"nlist": 20},
}

# n_list = number of clusters to create

# index = {
#     "index_type": "FLAT",
#     "metric_type": "L2",
#     "params": {}
# }



In [36]:
?unsplash_milvus.create_index

[0;31mSignature:[0m
[0munsplash_milvus[0m[0;34m.[0m[0mcreate_index[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfield_name[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex_params[0m[0;34m=[0m[0;34m{[0m[0;34m}[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtimeout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mpymilvus[0m[0;34m.[0m[0morm[0m[0;34m.[0m[0mindex[0m[0;34m.[0m[0mIndex[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates index for a specified field. Return Index Object.

:param field_name: The name of the field to create an index for.
:type  field_name: str

:param index_params: The indexing parameters.
:type  index_params: dict

:param timeout: An optional duration of time in seconds to allow for the RPC. When timeout
                is set to None, client waits until server response or error occur
:type  timeout: float

:param 

create the index

In [37]:
unsplash_milvus.create_index("embedding", index)

Status(code=0, message='')

In [38]:
unsplash_milvus.indexes

[<pymilvus.orm.index.Index at 0x7f06f5fb6cd0>]

load the index into memory

In [39]:
unsplash_milvus.load()


## Embedding Retrieval

In [40]:
search_params = {
    "metric_type": "IP",
    "params": {"nprobe": 1}
    
}
# n_probe = number of clusters to search [1 , n_list]

In [41]:
vectors_to_search = list(df.iloc[0:1]['embedding'])







In [42]:
df.iloc[0].to_dict()['description_final']

'Woman exploring a forest'

In [43]:
len(vectors_to_search) , len(vectors_to_search[0])

(1, 384)

In [44]:
%%time
result = unsplash_milvus.search(data=vectors_to_search, anns_field="embedding", param=search_params, limit=3
                                     , output_fields=["photo_id","description_final"]
                                    
                                    )


CPU times: user 10.6 ms, sys: 84 µs, total: 10.6 ms
Wall time: 345 ms


In [45]:
for hits in result:
    for hit in hits:
        print(f"hit: {hit}, score:{hit.score} id: {hit.entity.get('photo_id')} , data:{hit.entity._row_data} ")

hit: (distance: 1.0, id: XMyPniM9LF0), score:1.0 id: XMyPniM9LF0 , data:{'photo_id': 'XMyPniM9LF0', 'description_final': 'Woman exploring a forest'} 
hit: (distance: 0.789671003818512, id: RKNE63GLNAo), score:0.789671003818512 id: RKNE63GLNAo , data:{'photo_id': 'RKNE63GLNAo', 'description_final': 'woman walking around forest'} 
hit: (distance: 0.762397050857544, id: wPwXGGG2HyI), score:0.762397050857544 id: wPwXGGG2HyI , data:{'photo_id': 'wPwXGGG2HyI', 'description_final': 'Woman in a shirt in a forest'} 


just like ES, we get the id , score , and the metadata when inderted

In [46]:
def find_results(query:str , k =5):
    """
    Prints K nearest neighbors similar to the input query.
    Parameters
    ------------
    query: str
        Input query.
    k: int
        Nearest neighbors to fetch.
    """
    
    # Generate the embeddings for the query.
    query_features = model.encode(query, normalize_embeddings=True)
    query_features =np.expand_dims(query_features, axis=0)
    
    
    
    # Perform the dot product between query embeddings and image embeddings 
    #doc_scores = query_features @ corpus.T
    
    result = unsplash_milvus.search(data=query_features, anns_field="embedding", param=search_params, limit=3
                                     , output_fields=["photo_id","description_final","photographer_username","photographer_first_name","photographer_last_name","photo_image_url"]
                                    
                                    )
    

    

    display(HTML(f"<h4>Query: {query} </h4>"))
    
    # Iterate over the top k results
    for hits in result:
        for hit in hits:
            
            photo_data = hit.entity._row_data
            
            print(photo_data)
            display(Image(url=photo_data["photo_image_url"] + "?w=200"))
            
            # Display the attribution text
            display(HTML(f"""
                         Photo title: {photo_data["description_final"]}   <br/>
                         Photo by <a href="https://unsplash.com/@{photo_data["photographer_username"]}?utm_source=SearchWorkshop&utm_medium=referral">{photo_data["photographer_first_name"]} {photo_data["photographer_last_name"]}</a> on <a href="https://unsplash.com/?utm_source=SearchWorkshop&utm_medium=referral">Unsplash</a> <br/>
                         Distance: {hit.score}
                         """
                                            ))
            print()
        
            


In [47]:
find_results( "Two dogs playing in the snow")

{'photo_id': 'FAcSe7SjDUU', 'description_final': 'brown and black dogs running on snow', 'photographer_username': 'lukavovk', 'photographer_first_name': 'Luka', 'photographer_last_name': 'Vovk', 'photo_image_url': 'https://images.unsplash.com/photo-1577366761509-937637f02454'}



{'photo_id': 'AVUX8QXnj4Y', 'description_final': 'tan dog playing on snow', 'photographer_username': 'hitterphoto', 'photographer_first_name': 'Hitter', 'photographer_last_name': 'Rudolf', 'photo_image_url': 'https://images.unsplash.com/photo-1546717689-5955401cd6b2'}



{'photo_id': 'QtxgNsmJQSs', 'description_final': 'white and black dog on snow field', 'photographer_username': 'tadekl', 'photographer_first_name': 'Tadeusz', 'photographer_last_name': 'Lakota', 'photo_image_url': 'https://images.unsplash.com/photo-1547494912-c69d3ad40e7f'}





In [48]:
find_results( "Two dogs playing in the snow")

{'photographer_last_name': 'Vovk', 'photo_image_url': 'https://images.unsplash.com/photo-1577366761509-937637f02454', 'photo_id': 'FAcSe7SjDUU', 'description_final': 'brown and black dogs running on snow', 'photographer_username': 'lukavovk', 'photographer_first_name': 'Luka'}



{'photographer_last_name': 'Rudolf', 'photo_image_url': 'https://images.unsplash.com/photo-1546717689-5955401cd6b2', 'photo_id': 'AVUX8QXnj4Y', 'description_final': 'tan dog playing on snow', 'photographer_username': 'hitterphoto', 'photographer_first_name': 'Hitter'}



{'photographer_last_name': 'Lakota', 'photo_image_url': 'https://images.unsplash.com/photo-1547494912-c69d3ad40e7f', 'photo_id': 'QtxgNsmJQSs', 'description_final': 'white and black dog on snow field', 'photographer_username': 'tadekl', 'photographer_first_name': 'Tadeusz'}





In [49]:
find_results( "boy and girl on a beach")

{'photographer_first_name': 'Daria', 'photographer_last_name': 'Nepriakhina', 'photo_image_url': 'https://images.unsplash.com/uploads/14122598645355eb0b65d/d1524764', 'photo_id': 'pV87YnElHow', 'description_final': 'children enjoying the beach', 'photographer_username': 'epicantus'}



{'photographer_first_name': 'Scott', 'photographer_last_name': 'Webb', 'photo_image_url': 'https://images.unsplash.com/photo-1432105214010-ae5e45b2cebb', 'photo_id': 'rl7mUDEUmVE', 'description_final': 'Couple on the beach', 'photographer_username': 'scottwebb'}



{'photographer_first_name': 'Anton', 'photographer_last_name': 'Lammert', 'photo_image_url': 'https://images.unsplash.com/photo-1572986349976-5b54a901b36b', 'photo_id': 'DtHchyQtyZ8', 'description_final': 'people at beach', 'photographer_username': 'anton_lammert'}





In [50]:
find_results( "image of a man in a desert")

{'photographer_first_name': 'Jeremy', 'photographer_last_name': 'Bishop', 'photo_image_url': 'https://images.unsplash.com/photo-1543182791-a5e80144e4cc', 'photo_id': 'ovXu5yiEVWc', 'description_final': 'person on desert', 'photographer_username': 'jeremybishop'}



{'photographer_first_name': 'David', 'photographer_last_name': 'Monje', 'photo_image_url': 'https://images.unsplash.com/photo-1551286663-102d2d654aa0', 'photo_id': 'eUSfRwB_qY8', 'description_final': 'man standing at desert', 'photographer_username': 'davidmonje'}



{'photographer_first_name': 'David', 'photographer_last_name': 'Billings', 'photo_image_url': 'https://images.unsplash.com/photo-1567713002938-648baf71966e', 'photo_id': 'p8BM14LpLF0', 'description_final': 'silhouette of person standing on desert', 'photographer_username': 'dav_billings'}





## timing

In [51]:
%%timeit
search_query = "Two dogs playing in the snow"
k=5
text_features = model.encode(search_query)


result = unsplash_milvus.search(data=[text_features], anns_field="embedding", param=search_params, limit=k
                                     , output_fields=["photo_id","description_final"]
                                    
                                    )

200 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
