In [1]:
!pip install sentence-transformers datasets tensorboardX peft

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Training 

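This notebook fine-tunes a Sentence Transformers model on German Natural Questions pairs. It is based on the Sentence Transformers training overview: https://sbert.net/docs/sentence_transformer/training_overview.html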

In [4]:
import logging
import random

import numpy
import torch

from datasets import Dataset, load_dataset

from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerModelCardData,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss, CachedMultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Timestamped INFO-level logging for the library messages emitted below.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Seed Python's, PyTorch's and NumPy's global RNGs with the same value.
for seed_rng in (random.seed, torch.manual_seed, numpy.random.seed):
    seed_rng(12)

In [13]:
# Experiment switches - adjust as needed.
# use_prompts: when True, prompt prefixes are defined and passed to the training arguments.
use_prompts: bool = False
# include_prompts_in_pooling: forwarded to model.set_pooling_include_prompt(...).
include_prompts_in_pooling: bool = False

In [None]:
# Identifier of the pretrained checkpoint to fine-tune; its last path component
# is also used to build the run name.
base_model_name: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"


In [14]:
# Metadata attached to the model for its model card.
card_data = SentenceTransformerModelCardData(
    language="de",
    license="apache-2.0",
    model_name=f"{base_model_name} trained on german Natural Questions pairs",
)

# Load the pretrained checkpoint, then cast the model to bfloat16.
# NOTE(review): the weights themselves are cast to bfloat16 here and the training
# arguments also set bf16=True - confirm that training directly on bf16 weights
# (rather than keeping fp32 weights with bf16 mixed precision) is intended.
model = SentenceTransformer(base_model_name, model_card_data=card_data)
model = model.to(torch.bfloat16)

2025-05-14 09:52:11 - Use pytorch device_name: mps
2025-05-14 09:52:11 - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-mpnet-base-v2


In [15]:
# Forward the include_prompts_in_pooling switch to the model's pooling setting.
model.set_pooling_include_prompt(include_prompt=include_prompts_in_pooling)

In [None]:
#from peft import LoraModel, LoraConfig, TaskType

In [17]:
#peft_config = LoraConfig(
#    task_type= TaskType.FEATURE_EXTRACTION,
#    inference_mode=False,
#    r=64,
#    lora_alpha=128,
#    lora_dropout=0.1,
#)
#model.add_adapter(peft_config)

# 2. (Optional) Define prompts
# The prompt strings and `prompts` are always bound, so later cells can reference
# them regardless of the `use_prompts` switch; `prompts` is None when prompts are
# disabled. The dict keys are the dataset column names ("query" / "answer").
query_prompt = "query: "
corpus_prompt = "document: "
prompts = (
    {
        "query": query_prompt,
        "answer": corpus_prompt,
    }
    if use_prompts
    else None
)

In [18]:
# 3. Load the German Natural Questions pairs and split them into train / eval.
# The English columns are dropped first so the German ones can take over the
# plain "query" / "answer" names; 10% of the rows are held out with a fixed seed.
natural_questions_german = (
    load_dataset("oliverguhr/natural-questions-german", split="train")
    .remove_columns(["answer", "query"])  # drop the English-language columns
    .rename_column("query_de", "query")
    .rename_column("answer_de", "answer")
    .train_test_split(test_size=0.1, seed=12)
)

train_dataset: Dataset = natural_questions_german["train"]
eval_dataset: Dataset = natural_questions_german["test"]

In [19]:
# Display the training split's summary (feature names and row count).
train_dataset

Dataset({
    features: ['query', 'answer'],
    num_rows: 90207
})

In [21]:
# 4. Define a loss function
# CachedMultipleNegativesRankingLoss(model, mini_batch_size=32) was tried here but
# does not work with mps, so the plain MultipleNegativesRankingLoss is used.
loss = MultipleNegativesRankingLoss(model=model)

In [22]:
# 5. (Optional) Specify training arguments
# The run name encodes the base model (last path component of its identifier)
# and the two prompt-related switches; it also names the output directory.
run_name = "german-nq-" + base_model_name.rsplit("/", 1)[-1]
run_name += "-prompts" if use_prompts else ""
run_name += "" if include_prompts_in_pooling else "-exclude-pooling-prompts"

args = SentenceTransformerTrainingArguments(
    # Where checkpoints are written (required).
    output_dir=f"models/{run_name}",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=4e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    # Precision: bf16 enabled, fp16 disabled.
    bf16=True,
    fp16=False,
    # MultipleNegativesRankingLoss benefits from batches without duplicate samples.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Prompts are only passed on when the switch is enabled.
    prompts=prompts if use_prompts else None,
    seed=12,
    # Evaluation and checkpointing; step values below 1 are interpreted as a
    # fraction of the total number of training steps.
    eval_strategy="steps",
    eval_steps=0.2,
    save_strategy="steps",
    save_steps=0.2,
    save_total_limit=2,
    # Logging / experiment tracking.
    logging_first_step=True,
    logging_steps=5,
    report_to="tensorboard",
    run_name=run_name,
)

In [23]:
# 7. Create a trainer & train
# No separate evaluator is attached; only eval_dataset is supplied for evaluation.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=loss,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()



Step,Training Loss,Validation Loss


KeyboardInterrupt: 