In [1]:
import gradio as gr
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

In [2]:
cosine_similarity_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4', device='cpu')
cosine_similarity_embeddings = np.load('msmarco-distilbert-base-v4_emb.npy')
cosine_similarity_embeddings_normalized = cosine_similarity_embeddings / np.linalg.norm(cosine_similarity_embeddings, axis=1)[:, None]
cosine_similarity_index = faiss.IndexFlatIP(768)
cosine_similarity_index.add(cosine_similarity_embeddings_normalized)

In [3]:
dotprod_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b', device='cpu')
dotprod_embeddings = np.load('msmarco-distilbert-base-tas-b_emb.npy')
dotprod_index = faiss.IndexFlatIP(768)
dotprod_index.add(dotprod_embeddings)

In [4]:
with open('processed_books_metadata.json') as f:
    metadata = json.load(f)

### Test out dot prod model

In [16]:
dotprod_topone_acc = 0
dotprod_topfive_acc = 0
dotprod_topten_acc = 0
for idx, book in enumerate(metadata):
    query = book["Description"]
    query_embedding = dotprod_model.encode([query])
    D, I = dotprod_index.search(query_embedding, 10)
    I = list(I[0])
    if I[0] == idx:
        dotprod_topone_acc += 1
    if idx in I[:5]:
        dotprod_topfive_acc += 1
    if idx in I:
        dotprod_topten_acc += 1
dotprod_topone_acc /= len(metadata)
dotprod_topfive_acc /= len(metadata)
dotprod_topten_acc /= len(metadata)
print(f"Top one accuracy: {dotprod_topone_acc}, top five accuracy: {dotprod_topfive_acc}, top ten accuracy: {dotprod_topten_acc}")

Top one accuracy: 0.7435897435897436, top five accuracy: 0.9230769230769231, top ten accuracy: 0.9743589743589743


### Test out cosine similarity model

In [17]:
cossim_topone_acc = 0
cossim_topfive_acc = 0
cossim_topten_acc = 0
for idx, book in enumerate(metadata):
    query = book["Description"]
    query_embedding = cosine_similarity_model.encode([query])
    query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1)[:, None]
    D, I = dotprod_index.search(query_embedding, 10)
    I = list(I[0])
    if I[0] == idx:
        cossim_topone_acc += 1
    if idx in I[:5]:
        cossim_topfive_acc += 1
    if idx in I:
        cossim_topten_acc += 1
cossim_topone_acc /= len(metadata)
cossim_topfive_acc /= len(metadata)
cossim_topten_acc /= len(metadata)
print(f"Top one accuracy: {cossim_topone_acc}, top five accuracy: {cossim_topfive_acc}, top ten accuracy: {cossim_topten_acc}")

Top one accuracy: 0.5384615384615384, top five accuracy: 0.8717948717948718, top ten accuracy: 0.9102564102564102
