In [None]:
import pathlib
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# Model cache lives in a "models" folder next to the notebook's working directory.
MODEL_DIR = pathlib.Path.cwd().parent / "models"

In [None]:
# Define the device to use, using a CUDA GPU if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained tokenizer and model.
# Both are given the same cache_dir so the tokenizer files and the model
# weights are stored (and later re-read) from MODEL_DIR rather than the
# model silently falling back to the default Hugging Face cache.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', cache_dir=MODEL_DIR)
model = AutoModel.from_pretrained('bert-base-uncased', cache_dir=MODEL_DIR).to(device)

In [None]:
# Shakespeare's Sonnet 18, one list entry per verse line.
SONNET18 = """\
Shall I compare thee to a summer’s day?
Thou art more lovely and more temperate:
Rough winds do shake the darling buds of May,
And summer’s lease hath all too short a date;
Sometime too hot the eye of heaven shines,
And often is his gold complexion dimm'd;
And every fair from fair sometime declines,
By chance or nature’s changing course untrimm'd;
But thy eternal summer shall not fade,
Nor lose possession of that fair thou ow’st;
Nor shall death brag thou wander’st in his shade,
When in eternal lines to time thou grow’st:
So long as men can breathe or eyes can see,
So long lives this, and this gives life to thee.
""".splitlines()

# Shakespeare's Sonnet 30, one list entry per verse line.
SONNET30 = """\
When to the sessions of sweet silent thought
I summon up remembrance of things past,
I sigh the lack of many a thing I sought,
And with old woes new wail my dear time’s waste:
Then can I drown an eye, unused to flow,
For precious friends hid in death’s dateless night,
And weep afresh love’s long since cancell’d woe,
And moan the expense of many a vanish’d sight:
Then can I grieve at grievances foregone,
And heavily from woe to woe tell o’er
The sad account of fore-bemoaned moan,
Which I new pay as if not paid before.
But if the while I think on thee, dear friend,
All losses are restor’d and sorrows end.
""".splitlines()

In [None]:
# Corpus to index: every line of both sonnets, lower-cased, Sonnet 18 first.
sentences = [line.lower() for line in SONNET18 + SONNET30]

In [None]:
def encode(strs):
    """Embed a batch of strings with the module-level tokenizer and model.

    Args:
        strs: A list of strings to embed. They are tokenized together,
            padded to the longest item in the batch and truncated by the
            tokenizer if needed.

    Returns:
        A C-contiguous float32 NumPy array with one row per input string,
        holding the final hidden state of the first token of each sequence
        (the [CLS] position for BERT-style tokenizers).
    """
    encoded_input = tokenizer(strs, padding=True, truncation=True, return_tensors="pt")
    # Move every input tensor to the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: disabling autograd avoids building a graph and keeps
    # the activations from being retained in memory.
    with torch.no_grad():
        model_output = model(**encoded_input)
    first_token_state = model_output.last_hidden_state[:, 0, :]
    # The slice above is a strided view of the full hidden-state tensor;
    # FAISS expects C-contiguous float32 input, so normalise the layout here.
    return np.ascontiguousarray(first_token_state.cpu().numpy(), dtype=np.float32)

In [None]:
# Embed the whole corpus in a single batch.
sentence_embeddings = encode(sentences)
# Bare last expression so the notebook displays the embedding matrix's shape.
sentence_embeddings.shape

In [None]:
# d is the size of the second axis of the embedding matrix, i.e. the
# dimensionality of each sentence vector.
d = sentence_embeddings.shape[1]
# Flat FAISS index over d-dimensional vectors, compared by L2 distance.
index = faiss.IndexFlatL2(d)

In [None]:
# Empty the index before adding so that re-running this cell is idempotent;
# without the reset every re-run would insert the same vectors again and
# searches would return duplicate hits.
index.reset()
index.add(sentence_embeddings)

In [None]:
# Number of nearest neighbours to retrieve for the query.
k = 5
xq = encode(["remembering the past"])
# D holds the distances and I the row indices of the k nearest stored vectors,
# one row per query.
D, I = index.search(xq, k)
print(D, I)
for i in I[0]:
    # FAISS pads with -1 when fewer than k results exist; indexing
    # sentences[-1] would silently print the last line as a bogus match.
    if i < 0:
        continue
    print(i, sentences[i])