In [3]:
from sentence_transformers import SentenceTransformer
import json
import numpy as np
import faiss

## 1. Load models to test out

In [4]:
# Checkpoint used for the cosine-similarity experiment (section 4b).
COSINE_MODEL_NAME = 'sentence-transformers/msmarco-distilbert-cos-v5'
cosine_similarity_model = SentenceTransformer(COSINE_MODEL_NAME)

Downloading (…)66baf/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)de5b566baf/README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

Downloading (…)5b566baf/config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)66baf/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

Downloading (…)de5b566baf/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b566baf/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return torch._C._cuda_getDeviceCount() > 0


In [5]:
# Checkpoint used for the inner-product (dot-product) experiment (section 4a).
DOTPROD_MODEL_NAME = 'sentence-transformers/msmarco-distilbert-base-tas-b'
dotprod_model = SentenceTransformer(DOTPROD_MODEL_NAME)

## 2. Load data

In [6]:
# Load the preprocessed book records. Later cells read the 'Text', 'Name' and
# 'Author' keys of each record.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so decoding
# does not depend on the platform's locale default.
with open('processed_books.json', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
# Collect the raw text of every book, in the same order as `data`, so that
# embedding row i corresponds to data[i].
texts = [book['Text'] for book in data]

## 3. Generate embeddings

In [8]:
# Embed every book text with the cosine-similarity model (progress bar shown).
cosine_similarity_embeddings = cosine_similarity_model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Embed every book text with the dot-product model (progress bar shown).
dotprod_embeddings = dotprod_model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Display both embedding matrix shapes side by side; they should agree on the
# number of rows (one per text).
tuple(matrix.shape for matrix in (cosine_similarity_embeddings, dotprod_embeddings))

((78, 768), (78, 768))

In [11]:
# Persist both embedding matrices so they can be reloaded without re-encoding.
for emb_path, emb_matrix in (
    ('msmarco-distilbert-cos-v5_emb.npy', cosine_similarity_embeddings),
    ('msmarco-distilbert-base-tas-b_emb.npy', dotprod_embeddings),
):
    with open(emb_path, 'wb') as f:
        np.save(f, emb_matrix)

## 4. Try out semantic search over the embeddings

In [None]:
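# Optional shortcut: uncomment to reload the embeddings saved in section 3
# instead of re-encoding the texts.
# cosine_similarity_embeddings = np.load('msmarco-distilbert-cos-v5_emb.npy')
# dotprod_embeddings = np.load('msmarco-distilbert-base-tas-b_emb.npy')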

### 4a. Inner product model

In [12]:
# Exact (brute-force) inner-product index. The dimensionality is taken from the
# embedding matrix rather than hard-coded, so switching to a model with a
# different output size does not require editing this cell.
dotprod_index = faiss.IndexFlatIP(dotprod_embeddings.shape[1])

In [13]:
# Clear the index before adding so this cell is idempotent: re-running it must
# not append a second copy of the embeddings, because returned ids are used
# directly as positions in `data`.
dotprod_index.reset()
dotprod_index.add(dotprod_embeddings)

In [14]:
# Number of vectors currently stored in the index; should match the number of
# embedded texts.
dotprod_index.ntotal

78

In [15]:
# Self-query sanity check: search with the first five stored embeddings and
# display the (scores, ids) pair. Each row's top hit should be its own id.
sample_queries = dotprod_embeddings[:5]
dotprod_index.search(sample_queries, 5)

(array([[131.4838  , 110.01494 , 108.75418 , 107.001305, 105.67543 ],
        [138.61632 , 113.564575, 110.47829 , 109.51257 , 108.82703 ],
        [143.60857 , 119.47454 , 115.846535, 115.78643 , 114.877655],
        [131.46785 , 112.02876 , 109.30338 , 109.0805  , 108.019295],
        [136.34137 , 111.84899 , 109.53629 , 108.93631 , 108.492874]],
       dtype=float32),
 array([[ 0, 17, 62, 38, 58],
        [ 1, 71, 31, 63, 58],
        [ 2, 65,  7, 16, 46],
        [ 3, 71, 44, 34, 65],
        [ 4, 17, 46, 11, 29]]))

In [16]:
# Free-text query to run against the inner-product index.
query = "Book about adventure in the jungle"

In [17]:
# Embed the query with the dot-product model and retrieve its 5 nearest books.
query_embed = dotprod_model.encode([query])
query_dists, query_nnids = dotprod_index.search(query_embed, 5)
# `book_idx` rather than `id`: a top-level loop variable named `id` would keep
# shadowing the builtin `id()` in every later cell of the notebook.
for book_idx in query_nnids[0]:
    if book_idx < 0:
        # faiss pads the id array with -1 when fewer than k neighbours exist;
        # without this guard data[-1] would silently print the last book.
        continue
    print(data[book_idx]['Name'], data[book_idx]['Author'])

The Jungle Book Rudyard Kipling
The Wonderful Wizard of Oz L. Frank Baum
Tarzan and the Lost Empire Edgar Rice Burroughs
Treasure Island Robert Louis Stevenson
A Journey to the Centre of the Earth Jules Verne


In [18]:
# Raw ids of the hits for the single query; these are positions into `data`.
query_nnids[0]

array([67, 69, 24, 45, 32])

### 4b. Cosine similarity model

In [19]:
# Exact inner-product index used for cosine similarity: on unit-length vectors
# the inner product equals the cosine. The dimensionality is derived from the
# embedding matrix instead of being hard-coded.
cossim_index = faiss.IndexFlatIP(cosine_similarity_embeddings.shape[1])

In [20]:
# Re-check the embedding matrix shape before it is added to the index.
cosine_similarity_embeddings.shape

(78, 768)

In [21]:
# Per-row Euclidean (L2) length of each embedding. Values of ~1.0 confirm the
# vectors are already unit-normalised, so inner product == cosine similarity.
np.linalg.norm(cosine_similarity_embeddings, ord=2, axis=1)

array([1.        , 1.        , 1.        , 0.9999999 , 0.99999994,
       1.        , 0.99999994, 1.        , 1.0000001 , 1.        ,
       0.99999994, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.99999994, 1.        , 1.        ,
       1.        , 1.        , 0.99999994, 0.99999994, 0.99999994,
       1.        , 0.99999994, 0.9999999 , 1.        , 1.        ,
       1.        , 0.99999994, 1.        , 1.        , 1.0000001 ,
       0.9999999 , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.0000001 ,
       1.0000001 , 0.99999994, 1.        , 1.        , 1.        ,
       0.9999999 , 1.        , 0.99999994, 0.99999994, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.99999994, 0.99999994, 1.        , 0.99999994, 1.        ,
       1.        , 1.        , 1.        , 0.99999994, 1.        ,
       1.        , 1.        , 0.99999994, 1.        , 1.     

In [22]:
# Clear the index before adding so this cell is idempotent: re-running it must
# not append a second copy of the embeddings, because returned ids are used
# directly as positions in `data`.
cossim_index.reset()
cossim_index.add(cosine_similarity_embeddings)

In [23]:
# Free-text query to run against the cosine-similarity index.
query = "Book about poor kid"

In [26]:
# Embed the query with the cosine-similarity model.
query_embed = cosine_similarity_model.encode([query])
# Actually L2-normalise the query (the original only aliased `query_embed`).
# Inner-product scores equal cosine similarity only for unit-length vectors, so
# normalising here keeps the scores valid even if the model is swapped for one
# that does not normalise its output. The floor avoids division by zero.
query_norms = np.linalg.norm(query_embed, axis=1, keepdims=True)
query_embed_normalized = query_embed / np.maximum(query_norms, 1e-12)
query_dists, query_nnids = cossim_index.search(query_embed_normalized, 5)
# Walk ids and scores together; `book_idx` avoids shadowing the builtin `id`.
for book_idx, score in zip(query_nnids[0], query_dists[0]):
    if book_idx < 0:
        # faiss pads the id array with -1 when fewer than k neighbours exist.
        continue
    book = data[book_idx]
    print(book['Name'], "|", book['Author'], "|", score)

A Modest Proposal | Jonathan Swift | 0.328462
Grimm's Fairy Tales | Jacob Grimm and Wilhelm Grimm | 0.31709605
Little Women | Louisa May Alcott | 0.3064351
Winnie-the-Pooh | A. A. Milne | 0.2992
Notre-Dame de Paris | Victor Hugo | 0.26384926
