## Part 1 - Basic sentence transformer inference and similarity

In [1]:
from sentence_transformers import SentenceTransformer, util as STutil
model = SentenceTransformer('all-MiniLM-L6-v2')

In [2]:
# Sample sentences from the example at https://sbert.net
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]
# Encode the whole list in a single call; later cells index the result per sentence.
sentence_embeddings = model.encode(sentences)

In [3]:
sentence_embeddings[0]

array([-1.37173478e-02, -4.28515673e-02, -1.56286266e-02,  1.40537238e-02,
        3.95537391e-02,  1.21796295e-01,  2.94333752e-02, -3.17524038e-02,
        3.54959406e-02, -7.93139935e-02,  1.75878108e-02, -4.04369719e-02,
        4.97259200e-02,  2.54912376e-02, -7.18700215e-02,  8.14968199e-02,
        1.47071329e-03,  4.79626954e-02, -4.50335927e-02, -9.92174819e-02,
       -2.81769559e-02,  6.45045862e-02,  4.44670282e-02, -4.76217009e-02,
       -3.52952406e-02,  4.38671447e-02, -5.28565906e-02,  4.33059118e-04,
        1.01921484e-01,  1.64072327e-02,  3.26996855e-02, -3.45986448e-02,
        1.21339513e-02,  7.94870630e-02,  4.58346773e-03,  1.57778282e-02,
       -9.68209002e-03,  2.87625771e-02, -5.05806282e-02, -1.55793820e-02,
       -2.87907012e-02, -9.62279644e-03,  3.15556414e-02,  2.27348916e-02,
        8.71449262e-02, -3.85027826e-02, -8.84718522e-02, -8.75500124e-03,
       -2.12343112e-02,  2.08923593e-02, -9.02078077e-02, -5.25732674e-02,
       -1.05639147e-02,  

In [4]:
sentence_embeddings[1]

array([ 5.64525127e-02,  5.50024174e-02,  3.13796103e-02,  3.39484885e-02,
       -3.54247168e-02,  8.34668055e-02,  9.88800526e-02,  7.27545470e-03,
       -6.68655150e-03, -7.65809417e-03,  7.93738365e-02,  7.39683921e-04,
        1.49291735e-02, -1.51046747e-02,  3.67674604e-02,  4.78743427e-02,
       -4.81969751e-02, -3.76052558e-02, -4.60278504e-02, -8.89815986e-02,
        1.20228149e-01,  1.30663276e-01, -3.73936482e-02,  2.47853715e-03,
        2.55825766e-03,  7.25814849e-02, -6.80436492e-02, -5.24696112e-02,
        4.90234382e-02,  2.99563147e-02, -5.84429502e-02, -2.02262979e-02,
        2.08822265e-02,  9.76691917e-02,  3.52390707e-02,  3.91141288e-02,
        1.05667831e-02,  1.56234833e-03, -1.30822696e-02,  8.52899719e-03,
       -4.84093698e-03, -2.03766543e-02, -2.71801036e-02,  2.83307731e-02,
        3.66017632e-02,  2.51276121e-02, -9.90861952e-02,  1.15626622e-02,
       -3.60380560e-02, -7.23784044e-02, -1.12670086e-01,  1.12942066e-02,
       -3.86397764e-02,  

In [5]:
#Use cosine similarity to compare the first two embeddings and get a score
similarities = STutil.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1])

In [6]:
similarities

tensor([[0.5381]])

## Part 2 - Inference of a small dataset

In [7]:
from tqdm.notebook import tqdm
from datasets import Dataset,load_dataset

In [8]:
!free -h

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
               total        used        free      shared  buff/cache   available
Mem:           7.7Gi       2.6Gi       367Mi       316Mi       4.7Gi       4.4Gi
Swap:          1.0Gi        17Mi       1.0Gi


In [9]:
#See the model card here: https://huggingface.co/intfloat/e5-small-v2
model = SentenceTransformer('intfloat/e5-small-v2')

In [11]:
#Should result in about 100MB less RAM available
!free -h

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
               total        used        free      shared  buff/cache   available
Mem:           7.7Gi       2.7Gi       299Mi       316Mi       4.7Gi       4.3Gi
Swap:          1.0Gi        17Mi       1.0Gi


In [78]:
#The E5 models expect a 'query: ' or 'passage: ' prefix (note the space after the colon)
def get_embeddings(texts,prefix="passage:"):
    """Encode one or more texts with the module-level E5 ``model``.

    Args:
        texts: A single string, or an iterable of strings (list, tuple,
            dataset column, ...).
        prefix: E5 role marker, ``'query:'`` or ``'passage:'``, with or
            without trailing whitespace. It is normalized so that exactly one
            space separates the marker from the text. An empty prefix leaves
            the texts untouched.

    Returns:
        The result of ``model.encode`` for the prefixed texts: one embedding
        per input text.
    """
    # A bare string is one text, not a sequence of characters to iterate over.
    # Every other iterable is used as-is instead of being wrapped in a list.
    if isinstance(texts, str):
        texts = [texts]
    # The E5 model card formats inputs as "query: ..." / "passage: ...";
    # gluing the marker straight onto the first word changes the tokenization.
    prefix = prefix.strip()
    if prefix:
        prefix += " "
    prefixed = [prefix + text for text in texts]
    embeddings = model.encode(prefixed,show_progress_bar=True)
    return embeddings

In [14]:
test_e5 = get_embeddings(["Hello world"])
print(test_e5.shape)
test_e5

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(1, 384)


array([[-0.01651159,  0.05317227,  0.02218877,  0.01459493, -0.03386985,
         0.01365773,  0.06086577, -0.06649579,  0.02530954,  0.04438521,
         0.04585038,  0.01183973, -0.02752812,  0.05420253,  0.02308486,
        -0.03905389,  0.02286719,  0.04260659, -0.11509677,  0.00169173,
         0.0842922 , -0.0500995 , -0.02057447, -0.03929692, -0.05684805,
        -0.02806323,  0.02312   ,  0.02836978, -0.03971665, -0.11180957,
        -0.05623088, -0.0114374 ,  0.05634441, -0.01455612,  0.06406952,
        -0.06155059,  0.02405358,  0.04834818,  0.0012839 ,  0.07727719,
        -0.01154466, -0.04606187,  0.07619117, -0.0493258 , -0.0108081 ,
        -0.0296545 ,  0.00302119, -0.04783588,  0.08394358, -0.02380241,
        -0.00654026,  0.00967073,  0.03202047,  0.06201934,  0.03467854,
         0.07422528,  0.03135679,  0.06370142,  0.01781506,  0.03473587,
        -0.00904139,  0.01034076, -0.13161916,  0.1011786 ,  0.09905522,
         0.09953163,  0.00154796, -0.00053407, -0.0

### We use part of the CC_News dataset

In [15]:
# Load the first 50000 examples of the 'cc_news' dataset from Hugging Face
dataset = load_dataset("cc_news",split='train[:50000]')

In [16]:
dataset

Dataset({
    features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url'],
    num_rows: 50000
})

In [17]:
# Preview only the first few titles; displaying the whole column would
# materialize and print all 50000 strings into the notebook output.
dataset[:20]['title']

['Daughter Duo is Dancing in The Same Company',
 'New York City Ballet Announces Interim Leadership Team',
 'Watch Pennsylvania Ballet & Boston Ballet Face Off for the Super Bowl',
 'dance shoes',
 'Rebecca Krohn on Her Retirement from New York City Ballet',
 "Roy Kaiser to Become Nevada Ballet Theatre's New Artistic Director",
 "What It's Like Inside NYCB After Peter Martins",
 'Nutcracker Secrets and Surprises',
 'Inside the Beijing Dance Academy',
 'dance shoes',
 'Isabella Boylston and James Whiteside Get Hilariously Candid',
 'Ballet Performances This Week',
 'Guillaume Côté on NBoC\'s "Frame by Frame"',
 'Broadway\'s "Carousel" Stars Some Familiar Ballet Faces',
 'The Joffrey Presents Ekman\'s "Midsummer Night\'s Dream"',
 'Wonderfully Simple Graphic Design Software',
 'Bay Community News',
 '25 Year Old Molests Child Under 12',
 'How to install the Trac project management tool on Ubuntu 16.04',
 'New Amazon class certifies cloud pros in securing data on AWS',
 'How to set up a G

In [18]:
title_embeddings = get_embeddings(dataset['title'])

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

In [25]:
import sys
sys.getsizeof(title_embeddings)
#50000 embeddings at 384 dims each is how much in RAM?

76800128

In [19]:
import pickle
# Write the title embeddings to disk as a pickle, using the newest protocol available.
with open('cc_news_title_embeddings_50000.pkl', mode='wb') as pickle_file:
    pickle.dump(title_embeddings, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
#NOTE that 50k embeddings of 384 dims each uses about 74MB pickled disk space
!ls -lah cc_news_title_embeddings_50000.pkl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
-rw-r--r-- 1 root users 74M Sep 14 18:22 cc_news_title_embeddings_50000.pkl


## Part 3 - Brute-force nearest neighbor calculation

In [86]:
import numpy as np
def knn(query,k=5):
    """Return the titles of the ``k`` news items most similar to ``query``.

    Brute-force search: the query embedding is scored against every row of
    the module-level ``title_embeddings`` with cosine similarity.

    Args:
        query: Search text; it is embedded with the ``'query:'`` prefix. If a
            list of several queries is passed, only the first one is used.
        k: Maximum number of titles to return. Values <= 0 yield an empty
            list; values larger than the number of embeddings are clamped.

    Returns:
        A list of title strings from ``dataset``, most similar first.
    """
    if k <= 0:
        return []
    query_embedding = get_embeddings(query,prefix="query:")
    # One row of scores per query, one column per title embedding.
    cosine_scores = STutil.pytorch_cos_sim(query_embedding, title_embeddings)
    k = min(k, cosine_scores.shape[1])
    # topk returns the k best scores already ordered best-first, so there is
    # no need to argsort every score and reverse a list of 0-d tensors.
    top_k_indices = cosine_scores[0].topk(k).indices.tolist()
    # Row-first indexing reads only the k matching rows; dataset['title'][i]
    # would rebuild the entire title column for every single lookup.
    most_similar = [dataset[i]['title'] for i in top_k_indices]
    return most_similar

In [92]:
knn("housing market")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[tensor(14962), tensor(25296), tensor(27162), tensor(15319), tensor(13053)]


['Feb. home prices soar 6.3 pct in a fierce competition to buy - WAFB 9 News Baton Rouge, Louisiana News, Weather, Sports',
 'Given strong Tampa Bay housing market, is it better to rent or buy?',
 'Prices and homes listed up, sales down in Metro Vancouver real estate market',
 'US sales of new homes shot up 4 percent in March - WAFB 9 News Baton Rouge, Louisiana News, Weather, Sports',
 'After the bubble burst: How homebuying fared in Huntington Beach, Fountain Valley, Garden Grove, Westminster']