In [2]:
import os
from pathlib import Path
import obsidiantools.api as otools
# two level above the current directory
wkd = Path(os.getcwd()).parent.parent.parent
vault = otools.Vault(wkd).connect().gather()

In [3]:
# Define a list with sentences (1k - 100k sentences)
corpus = []

for k, v in vault.text_index.items():
    corpus.append(f"File:\n{k}\n\nContent:\n{v}")

In [6]:
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

# Define your sentence transformer model using CLS pooling
model_name = 'bert-base-uncased'
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_name = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
# or reuse a pretrained model
# model_name = './output/tsdae-model'
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

Downloading: 100%|██████████| 612/612 [00:00<00:00, 524kB/s]
Downloading: 100%|██████████| 86.7M/86.7M [00:01<00:00, 53.4MB/s]
Downloading: 100%|██████████| 383/383 [00:00<00:00, 111kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 919kB/s] 
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 1.12MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 35.9kB/s]


In [7]:
import torch
import re

# regex to grab the file title from content (File:\n(blabla))
regex = re.compile(r"File:\n(.*)")

def search(query: str):
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    top_k = min(5, len(corpus))
    query_embedding = model.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(
            regex.findall(corpus[idx])[0],
            "(Score: {:.4f})".format(score))

In [8]:
queries = [
    "monkeys",
    "bias",
    "multiverse",
    "reinforcement learning"
]
for query in queries:
    search(query)
    print("\n---------------------------------------------------------------\n")

Query: monkeys

Top 5 most similar sentences in corpus:
Data privacy insecurities emerge from the monkey brain (Score: 0.8625)
Animal instinct (Score: 0.8350)
Monkey Brain (Score: 0.8334)
Homo Sapiens (Score: 0.8074)
Chocolate and the monkey brain (Score: 0.8055)

---------------------------------------------------------------

Query: bias

Top 5 most similar sentences in corpus:
Cognitive bias learned by AI (Score: 0.8516)
Overcoming Bias  Standard Biases (Score: 0.8488)
Bias (Score: 0.8453)
Survivorship Bias (Score: 0.8419)
Cognitive biases (Score: 0.8331)

---------------------------------------------------------------

Query: multiverse

Top 5 most similar sentences in corpus:
Quantum computers are Quantum multiverses (Score: 0.8417)
The multiverse (Score: 0.8367)
Quantum level three multiverse (Score: 0.8271)
We are unaware of aliens because they live in the level two unobservable multiverse (Score: 0.8193)
Fine tuned Universe (Score: 0.8138)

-------------------------------------

In [41]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/louisbeaumont/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/louisbeaumont/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/louisbeaumont/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/louisbeaumont/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/louisbeaumont/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_da

True

In [9]:
from sentence_transformers import evaluation
sentences1 = ['Ant is the most successful species on Earth', 'Cooperating most of the time while punishing un-cooperating behaviours, with some forgiveness, is a very efficient strategy in a multiplayer game', 'There is light in the world, and it is us!']
sentences2 = ['As life emerged from the primeval soup several billion ago, the molecules that caused themselves to be replicated at the expense of others became more numerous', 'Sharks, after eating a lot of fish have dirty teeth, they move to a place in the ocean to have their teeth cleaned by small fish, they could easily close their jaws when the fish is in it but if they do so, the others small fishes won\'t be likely to cooperate with this shark in the future', 'dog food']
scores = [0.6, 0.8, 0.2]

evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

In [10]:
# Create the special denoising dataset that adds noise on-the-fly
train_dataset = datasets.DenoisingAutoEncoderDataset(corpus)

# DataLoader to batch your data
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Use the denoising auto-encoder loss
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

# Call the fit method
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=500
)

model.save(f'output/{model_name}-obsidian')

When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at sentence-transformers/multi-qa-MiniLM-L6-cos-v1 and are newly initialized: ['encoder.layer.0.crossattention.self.query.weight', 'encoder.layer.1.crossattention.output.dense.weight', 'encoder.layer.3.crossattention.self.value.weight', 'encoder.layer.1.crossattention.output.LayerNorm.weight', 'encoder.layer.5.crossattention.self.value.bias', 'encoder.layer.1.crossattention.output.dense.bias', 'encoder.layer.4.crossattention.self.key.bias', 'encoder.layer.1.crossattention.self.value.weight', 'cls.predictions.transform.LayerNorm.weight', 'encoder.layer.0.crossattention.self.key.weight', 'encoder.layer.4.crossattention.self.value.weight', 'encoder.layer.5.crossattention.output.LayerNorm.bias', 'encoder.layer.2.crossattention.self.key.bias', 'encoder.layer.4.crossattention.self.value.bias', 'encoder.layer.0.crossattention.output.dense.bias

In [43]:
search("monkeys")

Query: monkeys

Top 5 most similar sentences in corpus:
Monkey Brain (Score: 0.7949)
The fall of evolution (Score: 0.7936)
The Victory Of Man At Nature's Game (Score: 0.7871)
Data privacy insecurities emerge from the monkey brain (Score: 0.7849)
Goal (Score: 0.7814)
