In [2]:
import os
from pathlib import Path
import obsidiantools.api as otools
# Vault root: three directory levels above the notebook's working directory
# (one `.parent` per level).
wkd = Path.cwd().parent.parent.parent

# Connect to the vault and gather its contents; later cells read the note
# texts from `vault.text_index`.
vault = otools.Vault(wkd).connect().gather()

In [3]:
# Build the corpus: one entry per item of `vault.text_index`, formatted as a
# "File:" header (the index key) followed by a "Content:" section (the value).
# The search cell later recovers the title from the "File:" header.
corpus = [
    f"File:\n{name}\n\nContent:\n{text}"
    for name, text in vault.text_index.items()
]

In [None]:
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

# Base checkpoint for the encoder. Other candidates that were tried here:
#   'bert-base-uncased'
#   'sentence-transformers/all-MiniLM-L6-v2'
#   './output/tsdae-model'  (reuse a previously trained model)
model_name = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'

# Sentence transformer = transformer encoder + CLS pooling.
word_embedding_model = models.Transformer(model_name)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, pooling_mode='cls')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Embed every corpus entry once up front; `search` compares queries against
# these embeddings.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

In [7]:
import torch
import re

# Captures the note title: the rest of the line that follows the "File:"
# marker written at the start of every corpus entry.
regex = re.compile(r"File:\n(.*)")


def search(query: str, top_k: int = 5) -> None:
    """Print the corpus entries whose embeddings are closest to ``query``.

    The query is encoded with the notebook-level ``model`` and compared with
    the notebook-level ``corpus_embeddings`` using cosine similarity. Results
    are printed as ``<title> (Score: <cosine>)``, best match first.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to print. It is capped at
            ``len(corpus)`` so ``torch.topk`` is never asked for more hits
            than there are entries.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    print("Query:", query)

    limit = min(top_k, len(corpus))
    if limit == 0:
        # Nothing to rank; avoid calling the model on an empty corpus.
        print("\nCorpus is empty, nothing to search.")
        return

    query_embedding = model.encode(query, convert_to_tensor=True)

    # cos_sim returns a (queries x corpus) matrix; row 0 is our single query.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_scores, top_indices = torch.topk(cos_scores, k=limit)

    # Report the number of results actually shown, not a hard-coded 5.
    print(f"\nTop {limit} most similar sentences in corpus:")

    for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
        entry = corpus[idx]
        match = regex.search(entry)
        # Fall back to the raw entry if it lacks the "File:" header.
        title = match.group(1) if match else entry
        print(title, "(Score: {:.4f})".format(score))

In [None]:
# Sample queries used to eyeball search quality.
queries = ["monkeys", "bias", "multiverse", "reinforcement learning"]

# Run each query in turn, printing a divider after every result list.
for query in queries:
    search(query)
    print("\n---------------------------------------------------------------\n")

In [None]:
import nltk
# Fetch only the tokenizer data instead of 'all', which downloads every NLTK
# corpus and model. NOTE(review): the denoising dataset used for training is
# presumed to need only the Punkt tokenizer -- confirm against the installed
# sentence-transformers version. 'punkt_tab' is the resource name newer NLTK
# releases look up; older releases use 'punkt'.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [9]:
from sentence_transformers import evaluation
# Hand-labelled evaluation pairs: (first sentence, second sentence, expected
# similarity score). Keeping each pair and its score together makes the
# labels easier to audit than three parallel lists.
similarity_pairs = [
    (
        'Ant is the most successful species on Earth',
        'As life emerged from the primeval soup several billion ago, the molecules that caused themselves to be replicated at the expense of others became more numerous',
        0.6,
    ),
    (
        'Cooperating most of the time while punishing un-cooperating behaviours, with some forgiveness, is a very efficient strategy in a multiplayer game',
        'Sharks, after eating a lot of fish have dirty teeth, they move to a place in the ocean to have their teeth cleaned by small fish, they could easily close their jaws when the fish is in it but if they do so, the others small fishes won\'t be likely to cooperate with this shark in the future',
        0.8,
    ),
    (
        'There is light in the world, and it is us!',
        'dog food',
        0.2,
    ),
]

# Unzip into the three parallel lists the evaluator expects.
sentences1, sentences2, scores = (list(column) for column in zip(*similarity_pairs))

evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

In [None]:
# Denoising auto-encoder training on the vault corpus.
# The dataset wrapper adds noise to each text on the fly; batches of 8 are
# drawn in shuffled order.
train_dataset = datasets.DenoisingAutoEncoderDataset(corpus)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Denoising auto-encoder loss; the decoder starts from the same checkpoint as
# the encoder and shares its weights.
train_loss = losses.DenoisingAutoEncoderLoss(
    model,
    decoder_name_or_path=model_name,
    tie_encoder_decoder=True,
)

# Train for two epochs with a constant learning rate and no weight decay,
# running the evaluator every 500 steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    evaluation_steps=500,
    epochs=2,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    weight_decay=0,
    show_progress_bar=True,
)

# NOTE: when `model_name` contains '/', this path has nested directories
# under `output/`.
model.save(f'output/{model_name}-obsidian')

In [None]:
# Publish under the bare model name (the text after the last '/') with an
# "-obsidian" suffix.
hub_repo_name = model_name.rsplit('/', 1)[-1] + "-obsidian"
model.save_to_hub(hub_repo_name, exist_ok=True)

In [None]:
# `model.fit` updates the model in place, so embeddings computed before
# training no longer match the encoder that `search` uses for the query.
# Re-encode the corpus with the fine-tuned model before searching again.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
search("monkeys")