In [88]:
import json
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

## 1. Load models to test out

In [89]:
# Encoder used for the queries in the cosine-similarity experiment (section 4b).
cosine_similarity_model = SentenceTransformer(
    'sentence-transformers/msmarco-distilbert-cos-v5'
)

In [90]:
# Encoder used for the queries in the inner-product experiment (section 4a).
dotprod_model = SentenceTransformer(
    'sentence-transformers/msmarco-bert-base-dot-v5'
)

## 2. Load data

In [91]:
# Load the preprocessed book records. The encoding is pinned to UTF-8 so the
# file is decoded the same way on every platform instead of relying on the
# locale default, which can fail or garble non-ASCII characters.
with open('./data/processed_books.json', encoding='utf-8') as f:
    data = json.load(f)

In [92]:
# Pull out the 'Text' field of every record; this is what gets embedded.
texts = [record['Text'] for record in data]

## 3. Generate embeddings for all models

In [93]:
# Candidate sentence-transformers checkpoints to generate embeddings for.
models = [
    "msmarco-MiniLM-L6-cos-v5",
    "msmarco-MiniLM-L12-cos-v5",
    "msmarco-distilbert-base-tas-b",
    "msmarco-distilbert-dot-v5",
    "msmarco-distilbert-cos-v5",
    "msmarco-bert-base-dot-v5",
]

In [94]:
# Encode the corpus once per candidate model and persist each embedding
# matrix so the search experiments below can reload it from disk.
EMBEDDINGS_DIR = Path('./embeddings')
# Create the output directory up front; otherwise the first save fails only
# after a full encoding pass has already been paid for.
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

for model in models:
    loaded_model = SentenceTransformer(f'sentence-transformers/{model}')
    loaded_embeddings = loaded_model.encode(texts, show_progress_bar=True)
    # np.save accepts a path directly and takes care of opening/closing the file.
    np.save(EMBEDDINGS_DIR / f'{model}_emb.npy', loaded_embeddings)
    # Drop the reference so this model can be released before the next one loads.
    del loaded_model

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

## 4. Try out semantic search over the embeddings

In [95]:
# Reload the saved embedding matrices for the two models compared below.
cosine_similarity_embeddings = np.load('./embeddings/msmarco-distilbert-cos-v5_emb.npy')
dotprod_embeddings = np.load('./embeddings/msmarco-bert-base-dot-v5_emb.npy')

# Search results are row ids that are used to index `data`, so embedding files
# left over from a different dataset would silently map hits to the wrong books.
assert len(cosine_similarity_embeddings) == len(data), "cosine embeddings are out of sync with data"
assert len(dotprod_embeddings) == len(data), "dot-product embeddings are out of sync with data"

### 4a. Inner product model

In [96]:
# Flat inner-product index. The dimensionality is taken from the embeddings
# themselves rather than hard-coded, so the cell keeps working if a model
# with a different output size is swapped in.
dotprod_index = faiss.IndexFlatIP(dotprod_embeddings.shape[1])

In [97]:
# Empty the index before adding so that re-running this cell does not append
# the same vectors a second time (duplicate hits, ids beyond len(data)).
dotprod_index.reset()
dotprod_index.add(dotprod_embeddings)

In [98]:
# Number of vectors currently stored in the index; should equal the number of embedded texts.
dotprod_index.ntotal

78

In [99]:
# Sanity check: query the index with the first five stored vectors and
# return the top-5 (scores, ids) for each of them.
dotprod_index.search(dotprod_embeddings[:5], 5)

(array([[201.04308, 188.15239, 184.08752, 183.69452, 183.38145],
        [209.02307, 194.02136, 192.75854, 191.89632, 191.85345],
        [207.8888 , 191.42604, 191.14378, 191.03983, 190.74652],
        [204.66745, 192.45135, 191.84619, 191.73785, 190.88144],
        [208.35567, 192.5751 , 191.80557, 191.03018, 190.52061]],
       dtype=float32),
 array([[ 0, 38, 52, 66, 46],
        [ 1, 74, 65, 48, 25],
        [ 2, 46, 34, 65, 48],
        [ 3, 65, 48, 71, 54],
        [ 4, 46, 11, 29, 34]]))

In [100]:
# Free-text query for the inner-product search in the next cell.
query = "Book about adventure in the jungle"

In [101]:
# Embed the query with the dot-product model and list the five stored books
# with the highest inner-product score.
query_embed = dotprod_model.encode([query])
query_dists, query_nnids = dotprod_index.search(query_embed, 5)
# The loop variable is `book_id`, not `id`: a top-level loop variable stays in
# the kernel namespace and would shadow the builtin id() for every later cell.
for book_id in query_nnids[0]:
    # faiss fills missing results with -1; data[-1] would silently print the
    # last book instead of signalling "no hit".
    if book_id < 0:
        continue
    print(data[book_id]['Name'], data[book_id]['Author'])

The Jungle Book Rudyard Kipling
Tarzan and the Lost Empire Edgar Rice Burroughs
The Adventures of Sherlock Holmes Arthur Conan Doyle
The Adventure of Tom Sawyer Mark Twain
Life on the Mississippi Mark Twain


In [102]:
# Row ids of the hits for the query; these are the positions used to index `data` above.
query_nnids[0]

array([67, 24, 44, 47, 51])

### 4b. Cosine similarity model

In [103]:
# Flat inner-product index for the cosine model. The dimensionality is read
# from the embeddings rather than hard-coded, so a model with a different
# output size does not require editing this cell.
cossim_index = faiss.IndexFlatIP(cosine_similarity_embeddings.shape[1])

In [104]:
# Shape check before indexing: one row per embedded text, one column per embedding dimension.
cosine_similarity_embeddings.shape

(78, 768)

In [105]:
# L2 norm of every stored embedding. When these are all (approximately) 1,
# the inner product computed by the index equals cosine similarity.
np.linalg.norm(cosine_similarity_embeddings, axis=1)

array([1.        , 1.        , 1.        , 0.9999999 , 0.99999994,
       1.        , 0.99999994, 1.        , 1.0000001 , 1.        ,
       0.99999994, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.99999994, 1.        , 1.        ,
       1.        , 1.        , 0.99999994, 0.99999994, 0.99999994,
       1.        , 0.99999994, 0.9999999 , 1.        , 1.        ,
       1.        , 0.99999994, 1.        , 1.        , 1.0000001 ,
       0.9999999 , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.0000001 ,
       1.0000001 , 0.99999994, 1.        , 1.        , 1.        ,
       0.9999999 , 1.        , 0.99999994, 0.99999994, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.99999994, 0.99999994, 1.        , 0.99999994, 1.        ,
       1.        , 1.        , 1.        , 0.99999994, 1.        ,
       1.        , 1.        , 0.99999994, 1.        , 1.     

In [106]:
# Empty the index before adding so that re-running this cell does not append
# the same vectors a second time (duplicate hits, ids beyond len(data)).
cossim_index.reset()
cossim_index.add(cosine_similarity_embeddings)

In [107]:
# Free-text query for the cosine-similarity search in the next cell.
query = "Book about poor kid"

In [108]:
# Embed the query with the cosine model and list the five most similar books
# together with their similarity scores.
query_embed = cosine_similarity_model.encode([query])
# Actually scale the query to unit length: the index computes inner products,
# which are cosine similarities only when both sides are unit vectors. This
# no longer relies on the model happening to emit normalized output.
query_embed_normalized = query_embed / np.linalg.norm(query_embed, axis=1, keepdims=True)
query_dists, query_nnids = cossim_index.search(query_embed_normalized, 5)
# Scores and ids are walked in parallel. The loop variable is `book_id`, not
# `id`, so the builtin id() is not shadowed in the kernel namespace.
for score, book_id in zip(query_dists[0], query_nnids[0]):
    # faiss fills missing results with -1; skip them instead of printing data[-1].
    if book_id < 0:
        continue
    print(data[book_id]['Name'], "|", data[book_id]['Author'],"|", score)

A Modest Proposal | Jonathan Swift | 0.328462
Grimm's Fairy Tales | Jacob Grimm and Wilhelm Grimm | 0.31709605
Little Women | Louisa May Alcott | 0.3064351
Winnie-the-Pooh | A. A. Milne | 0.2992
Notre-Dame de Paris | Victor Hugo | 0.26384926
