# Lab 2 - Basic sentence transformer inference and similarity

In [None]:
from sentence_transformers import SentenceTransformer, util as STutil
# Load the 'all-MiniLM-L6-v2' sentence-embedding model used in the first part of the lab
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Example sentences taken from https://sbert.net
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]
# Encode the whole list in a single call; the result is indexed per sentence.
sentence_embeddings = model.encode(sentences)

In [None]:
# Display the embedding of the first example sentence
sentence_embeddings[0]

In [None]:
# Display the embedding of the second example sentence
sentence_embeddings[1]

In [None]:
#Use cosine similarity to compare the first two embeddings and get a score
# (a higher score means the two sentences are more similar)
similarities = STutil.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1])

In [None]:
# Display the cosine similarity score computed in the previous cell
similarities

## Inference of a small dataset

In [None]:
from tqdm.notebook import tqdm
from datasets import Dataset,load_dataset

In [None]:
# Check memory usage before loading the next model (compare with the check after loading)
!free -h

In [None]:
#See the model card here: https://huggingface.co/intfloat/e5-small-v2
# NOTE: this rebinds the global `model`, replacing the 'all-MiniLM-L6-v2' model
# loaded earlier; every later cell that uses `model` now uses e5-small-v2.
model = SentenceTransformer('intfloat/e5-small-v2')

In [None]:
# Compared with the `free -h` output before loading e5-small-v2,
# this should show about 100MB less RAM available
!free -h

In [None]:
#The E5 models expect 'query: ' and 'passage: ' prefixes
def get_embeddings(texts, prefix="passage: "):
    """Encode one or more texts with the global ``model`` after adding a prefix.

    Args:
        texts: A single string, or an iterable of strings (list, tuple,
            dataset column, ...). A bare string is treated as one text.
        prefix: Text prepended to every input before encoding. The E5 models
            expect either 'query: ' or 'passage: '.

    Returns:
        The result of ``model.encode`` on the prefixed texts.
    """
    # Only a bare string needs wrapping. Testing for `list` instead would wrap
    # any other collection (tuple, dataset column object) as a single item and
    # then fail on `prefix + text`.
    if isinstance(texts, str):
        texts = [texts]
    prefixed = [prefix + text for text in texts]
    embeddings = model.encode(prefixed, show_progress_bar=True)
    return embeddings

In [None]:
# Smoke test: embed a single text with the default 'passage: ' prefix
test_e5 = get_embeddings(["Hello world"])
# Shape of the result; presumably (1, 384) for one text with this model - TODO confirm
print(test_e5.shape)
# Display the raw embedding values
test_e5

### We use part of the CC_News dataset

In [None]:
# Load the first 50000 examples of the 'cc_news' train split from Hugging Face
dataset = load_dataset("cc_news",split='train[:50000]')

In [None]:
# Display a summary of the loaded dataset
dataset

In [None]:
# Display the 'title' column; these titles are the texts embedded in the next cell
dataset['title']

In [None]:
# Embed every title with the default 'passage: ' prefix; knn() searches over these
title_embeddings = get_embeddings(dataset['title'])

In [None]:
import sys
# Size in bytes of the embeddings object as reported by Python
# NOTE(review): sys.getsizeof is shallow in general; if title_embeddings is a
# NumPy array, `title_embeddings.nbytes` is a more direct measure - confirm.
sys.getsizeof(title_embeddings)
#50000 embeddings at 384 dims each is how much in RAM?
# Hint: 50000 * 384 * bytes-per-value (about 73 MiB if the values are 4-byte floats - TODO confirm dtype)

In [None]:
import pickle

# Write the title embeddings to a pickle file using the highest available protocol.
with open('cc_news_title_embeddings_50000.pkl', mode='wb') as out_file:
    pickle.dump(title_embeddings, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#NOTE: 50k embeddings of 384 dims each use about 74MB of disk space when pickled
!ls -lah cc_news_title_embeddings_50000.pkl

### brute-force nearest neighbor calculation

In [None]:
import numpy as np
def knn(query, k=5):
    """Return the dataset titles most similar to ``query`` (brute-force search).

    The query is embedded with the 'query: ' prefix and compared by cosine
    similarity against every entry of the global ``title_embeddings``.

    Args:
        query: The search text. If several texts are passed, only the
            neighbours of the first one are returned.
        k: Number of titles to return (applied as a ``[:k]`` slice).

    Returns:
        A list of titles from ``dataset['title']``, most similar first.
    """
    query_embedding = get_embeddings(query, prefix="query: ")
    cosine_scores = STutil.pytorch_cos_sim(query_embedding, title_embeddings)
    # Row 0 holds the scores of the (first) query against all titles. Sort it
    # in descending order directly on the tensor instead of sorting ascending,
    # converting every index to a Python list and reversing it.
    ranked = cosine_scores[0].argsort(descending=True)
    top_k_indices = ranked[:k].tolist()
    # Look up the title column once; indexing dataset['title'] inside the loop
    # would re-evaluate the column access for every result.
    titles = dataset['title']
    most_similar = [titles[i] for i in top_k_indices]
    return most_similar

In [None]:
# Titles most similar to the query "housing market" (default k=5)
knn("housing market")

In [None]:
# A closely related phrasing; compare with the "housing market" results
knn("property market")

In [None]:
# Titles most similar to the query "ballet dancing changes" (default k=5)
knn("ballet dancing changes")

In [None]:
# Titles most similar to the query "climate change" (default k=5)
knn("climate change")

In [None]:
# A longer, more specific query; compare with the "climate change" results
knn("global warming in the united states")

In [None]:
# Titles most similar to the lower-cased name query "taylor swift" (default k=5)
knn("taylor swift")