In [None]:
import argparse
import datetime
import json
import pathlib
import PyPDF2
import PyPDF2.errors
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# Project layout. The root is the parent of the current working directory,
# and all data lives beneath <root>/data.
ROOT_DIR = pathlib.Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath("data")
DOCUMENT_DIR = DATA_DIR.joinpath("docs")
TEXT_DIR = DATA_DIR.joinpath("texts")

# <root>/models holds the pre-trained models and our search index.
# It can be rebuilt from the text files.
MODEL_DIR = ROOT_DIR.joinpath("models")

In [None]:
# Define the device to use, using a CUDA GPU if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Name of the pre-trained checkpoint, shared so that the tokenizer and the
# model are always loaded from the same one.
MODEL_NAME = 'bert-base-uncased'

# Load the pre-trained tokenizer and model. Both use MODEL_DIR as their
# download cache, so the model weights are stored next to the tokenizer
# files rather than in the library's default cache location.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=MODEL_DIR)
model = AutoModel.from_pretrained(MODEL_NAME, cache_dir=MODEL_DIR).to(device)


In [None]:
# Corpus files to index, resolved against TEXT_DIR.
text_files = [TEXT_DIR / name for name in ('idris2.txt', 'jshutt.txt')]

In [None]:
# Read each corpus file fully into memory as UTF-8; texts[i] is the content
# of text_files[i].
texts = [path.read_text(encoding='utf-8') for path in text_files]

In [None]:
def get_embeddings(text_list):
    """Embed a batch of strings with the module-level tokenizer and model.

    The texts are tokenized together (with padding and truncation enabled),
    moved to ``device``, and passed through ``model``. Each text is then
    represented by the hidden state at sequence position 0.

    Args:
        text_list: List of strings to embed.

    Returns:
        A NumPy array held on the CPU with one row per input text, taken
        from ``last_hidden_state[:, 0, :]``.

    Note:
        ``truncation=True`` means that text beyond the tokenizer's length
        limit does not contribute to the embedding, so long documents are
        represented only by their beginning.
    """
    encoded_input = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt")
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: disable autograd so no computation graph is built or
    # kept alive for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :].detach().cpu().numpy()


In [None]:
# Embed the whole corpus in one batch; row i corresponds to texts[i].
embeddings = get_embeddings(text_list=texts)

In [None]:
# Build an inner-product FAISS index sized to the embedding width and load
# every document vector into it.
# NOTE(review): the embeddings are not L2-normalised anywhere visible here, so
# the scores are raw inner products rather than cosine similarities — confirm
# that this is intended.
index = faiss.IndexFlatIP(embeddings.shape[-1])
# presumably a no-op for a flat index — TODO confirm before removing
index.train(embeddings)
index.add(embeddings)

In [None]:
def search(query, index, k=5):
    """Look up the ``k`` nearest index entries for a single query string.

    The query is embedded with ``get_embeddings`` (as a one-element batch)
    and handed to ``index.search``.

    Args:
        query: The query text.
        index: An object exposing ``search(vectors, k)``.
        k: How many results to request from the index.

    Returns:
        A ``(scores, indices)`` pair exactly as produced by ``index.search``.
    """
    hit_scores, hit_ids = index.search(get_embeddings([query]), k)
    return hit_scores, hit_ids

def print_search(query, index, k=5):
    """Run ``search`` for ``query`` and print the raw result plus a preview per hit.

    The first printed line is the raw ``(scores, indices)`` pair. After that,
    one line is printed per non-negative index in the first result row: the
    index, the matching entry of ``text_files``, and the first 100 characters
    of the matching entry of ``texts``.

    Args:
        query: The query text.
        index: The index to search; passed straight through to ``search``.
        k: Number of results to request. Defaults to 5, matching ``search``.

    Returns:
        None. Results are written to standard output.
    """
    scores, indices = search(query, index, k=k)
    print(scores, indices)
    for doc_id in indices[0]:
        # Negative ids are skipped; FAISS presumably uses -1 to pad the result
        # when fewer than k entries are available.
        if doc_id >= 0:
            print(doc_id, text_files[doc_id], texts[doc_id][:100])

In [None]:
# Sample query about Racket and S-expressions.
print_search(query="Can Racket code be written as S-expressions?", index=index)

In [None]:
# Sample query in verse; presumably a control unrelated to the corpus — confirm.
print_search(query="Shall I compare thee to a summer's day", index=index)