In [1]:
# !pip install -U pandas pinecone-client sentence-transformers


# Dense Retriever

`all-MiniLM-L6-v2` is a sentence-transformers model: it maps sentences and paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.



The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised contrastive learning objective. We used the pretrained nreimers/MiniLM-L6-H384-uncased model and fine-tuned it on a dataset of 1B sentence pairs. We use a contrastive learning objective: given a sentence from the pair, the model should predict which one, out of a set of randomly sampled other sentences, was actually paired with it in our dataset.




[Doc](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

In [2]:
from sentence_transformers import SentenceTransformer

import datasets
import rich
from IPython.display import Image, JSON
from IPython.core.display import HTML
import numpy as np

from transformers import AutoTokenizer


In [3]:
# Identifier of the sentence-embedding checkpoint; reused below to load both
# the SentenceTransformer model and its matching tokenizer.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

In [4]:

# Initialize the retriever: load the SentenceTransformer checkpoint named by `model_name`.
model = SentenceTransformer(model_name)
# Bare expression so the notebook displays the model's module stack.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

## Tokenizer

In [5]:
# Sample input reused by the tokenizer cells below.
text = "What does 'super' do in Python? - difference between super().__init__() and explicit superclass __init__()"
# Alternative sample input, currently disabled:
#text = "meN shoes running ran"

# output_value=None is passed so the result can be indexed by key; the next
# cell reads its 'token_embeddings' entry.
resp = model.encode(text, output_value=None)

rich.print(resp)

In [6]:
resp['token_embeddings'].shape # one embedding row per token: (num_tokens, embedding_dim)


torch.Size([38, 384])

In [7]:
# Load the tokenizer for the same checkpoint so tokenization can be inspected
# step by step, independently of `model.encode`.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [8]:
# Calling the tokenizer directly yields input_ids, token_type_ids and
# attention_mask for `text`.
tokenizer(text)


{'input_ids': [101, 2054, 2515, 1005, 3565, 1005, 2079, 1999, 18750, 1029, 1011, 4489, 2090, 3565, 1006, 1007, 1012, 1035, 1035, 1999, 4183, 1035, 1035, 1006, 1007, 1998, 13216, 3565, 26266, 1035, 1035, 1999, 4183, 1035, 1035, 1006, 1007, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# Split `text` into its token strings and print them.
tokens = tokenizer.tokenize(text) 
rich.print ( tokens)

In [10]:
# Map each token string to its vocabulary id. Unlike `tokenizer(text)`, this
# list does not carry the extra leading 101 / trailing 102 ids.
input_ids= tokenizer.convert_tokens_to_ids(tokens)
input_ids

[2054,
 2515,
 1005,
 3565,
 1005,
 2079,
 1999,
 18750,
 1029,
 1011,
 4489,
 2090,
 3565,
 1006,
 1007,
 1012,
 1035,
 1035,
 1999,
 4183,
 1035,
 1035,
 1006,
 1007,
 1998,
 13216,
 3565,
 26266,
 1035,
 1035,
 1999,
 4183,
 1035,
 1035,
 1006,
 1007]

In [11]:
# Turn the ids back into text. The round trip is lossy: the result is
# lower-cased and spacing around punctuation differs from the original `text`.
decoded_string = tokenizer.decode(input_ids)
decoded_string

"what does'super'do in python? - difference between super ( ). _ _ init _ _ ( ) and explicit superclass _ _ init _ _ ( )"

## Dataset

In [12]:
# Load the preprocessed dataset previously saved to disk; the path is relative
# to the notebook's working directory.
dset = datasets.load_from_disk("../data/processed")

In [13]:
# Display the dataset summary (feature names and row count).
dset

Dataset({
    features: ['photo_id', 'photo_url', 'photo_image_url', 'photo_submitted_at', 'photo_featured', 'photo_width', 'photo_height', 'photo_aspect_ratio', 'photo_description', 'photographer_username', 'photographer_first_name', 'photographer_last_name', 'exif_camera_make', 'exif_camera_model', 'exif_iso', 'exif_aperture_value', 'exif_focal_length', 'exif_exposure_time', 'photo_location_name', 'photo_location_latitude', 'photo_location_longitude', 'photo_location_country', 'photo_location_city', 'stats_views', 'stats_downloads', 'ai_description', 'ai_primary_landmark_name', 'ai_primary_landmark_latitude', 'ai_primary_landmark_longitude', 'ai_primary_landmark_confidence', 'blur_hash', 'description_final', 'image'],
    num_rows: 24995
})

In [14]:
# Pretty-print the first record to see what a single row looks like.
rich.print ( dset[0] )

In [15]:
# Embed every `description_final` value once up front; `find_results` scores
# each query against this precomputed `corpus`.
corpus = model.encode(dset['description_final'])

In [19]:
def find_results(query: str, k: int = 5, verbose: bool = True):
    """Display the ``k`` photos whose descriptions best match ``query``.

    The query is embedded with the notebook-level ``model`` and scored against
    every row of the precomputed ``corpus`` with a dot product. The ``k``
    highest-scoring rows of ``dset`` are rendered (thumbnail, description,
    attribution and score), best match first.

    Relies on the globals ``model``, ``corpus`` and ``dset`` defined in
    earlier cells.

    Args:
        query: Free-text search string.
        k: Number of results to show. ``0`` shows only the query header.
        verbose: When true (the default), pretty-print the intermediate
            values (query embedding, all scores, selected indices) first.

    Raises:
        ValueError: If ``k`` is negative.
    """
    import html  # local import so this cell does not depend on an edit to the imports cell

    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    def _esc(value) -> str:
        """HTML-escape a value for interpolation; a missing value renders as empty text."""
        return "" if value is None else html.escape(str(value), quote=True)

    query_features = model.encode(query)
    # One score per corpus row; a higher score means a closer match.
    doc_scores = query_features @ corpus.T

    # Sort descending, then truncate. Slicing the ascending order with [-k:]
    # would return the *entire* corpus for k == 0, since [-0:] is the full array.
    top_items = doc_scores.argsort()[::-1][:k]

    if verbose:
        debug_info = {
            "query_original": query,
            "query_processed": query_features,
            "doc_scores": doc_scores,
            "top_items": top_items,
        }
        rich.print(debug_info)

    # The query is user-supplied text, so escape it before embedding it in markup.
    display(HTML(f"<h1>Query: {_esc(query)} </h1>"))

    if len(top_items) == 0:
        return

    # `dset.select` yields rows in the order of `top_items`, so zipping pairs
    # every row with its own corpus index.
    for doc_idx, photo_data in zip(top_items, dset.select(top_items)):
        # Display the photo
        display(Image(url=photo_data["photo_image_url"] + "?w=200"))

        title = _esc(photo_data["description_final"])
        username = _esc(photo_data["photographer_username"])
        # Join only the name parts that are present, so a missing first or
        # last name does not show up as the literal text "None".
        name_parts = (
            photo_data["photographer_first_name"],
            photo_data["photographer_last_name"],
        )
        photographer = _esc(" ".join(str(part) for part in name_parts if part))

        # Display the attribution text. The value shown is a similarity
        # (higher is better), not a distance.
        display(HTML(f"""
                     Photo title: {title}   <br/>
                     Photo by <a href="https://unsplash.com/@{username}?utm_source=NaturalLanguageImageSearch&utm_medium=referral">{photographer}</a> on <a href="https://unsplash.com/?utm_source=SearchWorkshop&utm_medium=referral">Unsplash</a> <br/>
                     Similarity score: {doc_scores[doc_idx]}
                     """
                                        ))
        print()

In [20]:
# Example query: animals and a weather/scene cue.
find_results( "Two dogs playing in the snow")
















In [21]:
# Example query: two people and a location.
find_results( "boy and girl on a beach")
















In [22]:
# Example query phrased as a request ("image of ...") rather than a caption.
find_results( "image of a man in a desert")
















## Timing

In [23]:
%%timeit
search_query = "Two dogs playing in the snow"
k=5
text_features = model.encode(search_query)
doc_scores = text_features @ corpus.T

top_items = doc_scores.argsort()[-k:][::-1]


44.6 ms ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
