For a given document, find other documents of varying similarity.

In [None]:
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np


# do not truncate cell contents or rows when displaying dataframes
pd.set_option('max_colwidth', None)
pd.set_option('display.max_rows', None)

# load train.pkl
train_path = Path.cwd().parent.joinpath('data/processed/train.pkl')
train = pd.read_pickle(train_path)

# load test_unlabelled.pkl
test_path = Path.cwd().parent.joinpath('data/interim/test_unlabelled.pkl')
test = pd.read_pickle(test_path)

# concatenate train and test data
# (the original row labels are kept; only positional lists are used below)
fulldata = pd.concat([train, test])

# load our fine-tuned BigBird-CT with in-batch negatives model which has been trained on the full dataset
model_fulldata_bigbird_ct_path = Path.cwd().parent.joinpath('models/FULLDATA_bigbird-ct')
# SentenceTransformer documents its first argument as a string (model name or
# path), so pass the path as str rather than as a pathlib.Path object
model = SentenceTransformer(str(model_fulldata_bigbird_ct_path))

# positional lists: index k in `sentences` and `codes` refers to the same row
sentences = fulldata['Concatenated'].tolist()
codes = fulldata['ModuleCode'].tolist()

# get document embeddings for every module in the combined train + test data
embeddings = model.encode(sentences,
                          batch_size = 16,
                          show_progress_bar = True)

Batches:   0%|          | 0/269 [00:00<?, ?it/s]

Attention type 'block_sparse' is not possible if sequence_length: 696 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


In [None]:
# find the cosine similarity matrix for the embeddings
cos_sim = util.cos_sim(embeddings, embeddings)

# add all pairs (i <= j) to a list, with their cosine similarity score,
# including self-similarities
# NOTE: the outer loop has to visit every row; stopping one row early would
# leave out the last document's self-similarity pair
all_sentence_combinations = []
n_documents = len(cos_sim)
for i in range(n_documents):
    for j in range(i, n_documents):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

In [None]:
def most_similar_embeddings(document_id, every_n):
    '''
    Get the documents of varying similarity to some specified document

    Every pair in the module-level ``all_sentence_combinations`` list that
    involves ``document_id`` is collected, sorted by descending cosine
    similarity and then sampled by keeping every ``every_n``-th pair, starting
    with the highest-scoring one.
    The first document displayed is normally that in question, since its
    self-similarity pair is expected to carry the highest score.

    Parameters
    ----------
    document_id : int
        Position of the document of interest in ``sentences`` / ``codes``.
    every_n : int
        Step used to sample the sorted pairs. It is used as a slice step, so
        it must be non-zero.

    Returns
    -------
    pandas.DataFrame
        One row per sampled pair with the columns 'ModuleCode', 'Document'
        and 'Cosine Similarity', describing the *other* document of the pair.
    '''
    # get all pairs that feature the document embedding of interest
    similarity_pairs = [
        (score, i, j)
        for score, i, j in all_sentence_combinations
        if document_id in (i, j)
    ]
    # sort the list by descending cosine similarity
    similarity_pairs.sort(key=lambda pair: pair[0], reverse=True)
    # get every nth similarity pair
    sampled_pairs = similarity_pairs[::every_n]

    # collect the rows first and build the dataframe once; growing a dataframe
    # row by row re-allocates it on every insertion
    rows = []
    for score, i, j in sampled_pairs:
        # report the member of the pair that is not the document of interest
        # (for the self-pair both members are the document itself)
        other = i if j == document_id else j
        rows.append([codes[other], sentences[other], float(score)])

    return pd.DataFrame(rows, columns=['ModuleCode', 'Document', 'Cosine Similarity'])

# sample every 10th pair (by descending similarity) involving the document at position 3200
most_similar = most_similar_embeddings(3200, 10)

# bare expression so the notebook renders the dataframe as the cell output
most_similar

Output hidden; open in https://colab.research.google.com to view.

The first document listed in the output is the query document itself, i.e. the one that all other documents are being compared to.