# Fine tune an Open Source Embedding Model

https://colab.research.google.com/drive/1VVfR3i07b7OsDsRLHm0ADvPdNqA2RgmI

# Check Setup

In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

# Report GPU availability. The flag is printed unconditionally so that a
# CPU-only runtime is reported explicitly instead of producing no output.
flg_cuda_is_available = torch.cuda.is_available()
print(f"CUDA Available: {flg_cuda_is_available}")
if flg_cuda_is_available:
    # Device index 0 is the first visible GPU.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Smoke test: confirm that a small tokenizer/model pair can be loaded through
# transformers. The same checkpoint id is used for both objects.
setup_check_model_id = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(setup_check_model_id)
model = AutoModel.from_pretrained(setup_check_model_id)

print("Setup validated successfully!")

CUDA Available: True
GPU: Tesla T4


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Setup validated successfully!


In [None]:
import transformers
import sentence_transformers

# Record the installed library versions next to the results they produced.
# Each line is printed as `<package>.__version__='<version>'`.
for lib in (transformers, sentence_transformers):
    print(f"{lib.__name__}.__version__={lib.__version__!r}")

transformers.__version__='4.55.4'
sentence_transformers.__version__='5.1.0'


# Load Model

[Sentence Transformers Pretrained Models](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html)

[bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) is a small yet performant embedding model.
* Only English
* Dimension: 384
* Context: 512

In [None]:
from sentence_transformers import SentenceTransformer

# Baseline: the off-the-shelf embedding model, identified by its Hub id.
# `model_id` is reused later when building the model that gets fine-tuned.
model_id = "BAAI/bge-small-en-v1.5"
model_bsl = SentenceTransformer(model_name_or_path=model_id)
print("Model loaded successfully!")

Model loaded successfully!


# Load Dataset

[philschmid/finanical-rag-embedding-dataset](https://huggingface.co/datasets/philschmid/finanical-rag-embedding-dataset)

In [None]:
from datasets import load_dataset, DatasetDict

# Download the question/context dataset used in this notebook; replace the id
# with your own dataset if needed. The bare last expression displays the
# available splits and columns.
dataset = load_dataset(path="philschmid/finanical-rag-embedding-dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'context'],
        num_rows: 7000
    })
})

In [None]:
# Peek at the first training record to see the data structure
first_train_row = dataset["train"][0]
print(first_train_row)

{'question': 'What area did NVIDIA initially focus on before expanding to other computationally intensive fields?', 'context': 'Since our original focus on PC graphics, we have expanded to several other large and important computationally intensive fields.'}


# How does the IR evaluator work?

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Toy example for the evaluator: two queries, three documents, and the
# ground-truth mapping from each query id to the ids of its relevant documents.
queries = {
    "query1": "What is quantum mechanics?",
    "query2": "Who was the first Roman Emperor?",
}
docs = {
    "doc1": "Quantum mechanics is the branch of physics that studies the sub-atomic behaviour",
    # doc2 is an unrelated distractor: it answers neither query.
    "doc2": "The sun rises in the east.",
    "doc3": "Augustus (formerly named Octavious, who changed his name after Julius Caesar after his death), is considered the one who ended the Roman Republic and started the empyre",
}
# query2 asks about the first Roman Emperor, which is answered by doc3
# (Augustus), not by the distractor doc2.
relevant_docs = {
    "query1": ["doc1"],
    "query2": ["doc3"],
}

# Initialize evaluator
evaluator_dummy = InformationRetrievalEvaluator(queries, docs, relevant_docs)

# Evaluate the model; the returned metrics dict is displayed as the cell output
model_bsl.evaluate(evaluator_dummy)



{'cosine_accuracy@1': 0.5,
 'cosine_accuracy@3': 1.0,
 'cosine_accuracy@5': 1.0,
 'cosine_accuracy@10': 1.0,
 'cosine_precision@1': 0.5,
 'cosine_precision@3': 0.3333333333333333,
 'cosine_precision@5': 0.2,
 'cosine_precision@10': 0.1,
 'cosine_recall@1': 0.5,
 'cosine_recall@3': 1.0,
 'cosine_recall@5': 1.0,
 'cosine_recall@10': 1.0,
 'cosine_ndcg@10': 0.8154648767857288,
 'cosine_mrr@10': 0.75,
 'cosine_map@100': 0.75}

# Data Preparation

In [None]:
# Start from the raw training split and tag every row with a sequential
# integer id (0 .. len-1); later cells use it as both query id and document id.
ds = dataset["train"]
ds = ds.add_column("id", range(len(ds)))

# Two-stage split with a fixed seed:
#   1) 90% train / 10% held out
#   2) the held-out 10% is halved into validation and test
ds_train_test = ds.train_test_split(test_size=0.1, seed=42)
ds_valid_test = ds_train_test["test"].train_test_split(test_size=0.5, seed=42)

# Collect the three splits in a single DatasetDict
ds_split = DatasetDict(
    {
        "train": ds_train_test["train"],
        "test": ds_valid_test["test"],
        "valid": ds_valid_test["train"],
    }
)



In [None]:
# Display the train / test / valid splits built in the previous cell
ds_split

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'id'],
        num_rows: 6300
    })
    test: Dataset({
        features: ['question', 'context', 'id'],
        num_rows: 350
    })
    valid: Dataset({
        features: ['question', 'context', 'id'],
        num_rows: 350
    })
})

# Baseline evaluation

Check the model's performance before fine-tuning (FT).

In [None]:
ds_test = ds_split["test"]

# Corpus: every context of the full dataset (all splits), keyed by row id.
corpus = {row["id"]: row["context"] for row in ds}

# Queries: only the test questions, keyed by the same row id.
queries = {row["id"]: row["question"] for row in ds_test}

# Each query has exactly one relevant document: the context from its own row
# (query id => list of relevant corpus ids).
relevant_docs = {q_id: [q_id] for q_id in queries}

# Initialize evaluator
evaluator_test = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

# Score the untouched baseline model on the test queries
evals_test_bsl = model_bsl.evaluate(evaluator=evaluator_test)

print(f"Baseline model evals: {model_id}")
evals_test_bsl

Baseline model evals: BAAI/bge-small-en-v1.5


{'cosine_accuracy@1': 0.5942857142857143,
 'cosine_accuracy@3': 0.7371428571428571,
 'cosine_accuracy@5': 0.8,
 'cosine_accuracy@10': 0.8457142857142858,
 'cosine_precision@1': 0.5942857142857143,
 'cosine_precision@3': 0.24571428571428572,
 'cosine_precision@5': 0.15999999999999998,
 'cosine_precision@10': 0.08457142857142856,
 'cosine_recall@1': 0.5942857142857143,
 'cosine_recall@3': 0.7371428571428571,
 'cosine_recall@5': 0.8,
 'cosine_recall@10': 0.8457142857142858,
 'cosine_ndcg@10': 0.7212104601552022,
 'cosine_mrr@10': 0.681048752834467,
 'cosine_map@100': 0.6858932357463235}

# Training:

[MultipleNegativesRankingLoss](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss)

* Train and valid: For training process
* Test: Benchmark against baseline

In [None]:
from sentence_transformers import SentenceTransformerTrainer, losses, SentenceTransformerModelCardData
from sentence_transformers import SentenceTransformerTrainingArguments


# Metadata written into the generated model card
card_data = SentenceTransformerModelCardData(
    language="en",
    license="apache-2.0",
    model_name="BGE small v1.5 Financial",
)

# Separate instance of the base checkpoint that will be fine-tuned.
# "sdpa" selects PyTorch's native scaled_dot_product_attention implementation.
model = SentenceTransformer(
    model_id,
    model_kwargs={"attn_implementation": "sdpa"},
    model_card_data=card_data,
)

# Loss over (anchor, positive) pairs
train_loss = losses.MultipleNegativesRankingLoss(model=model)


# Training dataset
# MultipleNegativesRankingLoss reads its inputs by column POSITION, not by
# column name: the first column is treated as the anchor and the second as the
# positive. Select the question first so that the column order matches the
# (anchor, positive) roles; otherwise questions and contexts are swapped and
# sentence-transformers emits a column-order warning.
ds_train = (ds_split['train']
            .select_columns(["question", "context"])
            .rename_column("question", "anchor")
            .rename_column("context", "positive")
            )

# Guard against a silent regression of the column order.
assert ds_train.column_names == ["anchor", "positive"], ds_train.column_names

# Evaluator used during training, built from the validation split only.
# NOTE: unlike the test evaluator, the corpus here contains just the validation
# contexts rather than every context in the dataset, so validation and test
# scores are computed against different corpus sizes.
ds_valid = ds_split["valid"]

corpus = {row["id"]: row["context"] for row in ds_valid}
queries = {row["id"]: row["question"] for row in ds_valid}
# One relevant document per query: the context from the same row
relevant_docs = {q_id: [q_id] for q_id in queries}

evaluator_valid = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

# Training configuration, grouped by concern
args = SentenceTransformerTrainingArguments(
    # --- output / logging ---
    output_dir="bge-small-financial",  # output directory, also the Hugging Face model ID
    report_to="none",
    logging_steps=10,                  # log every 10 steps
    # --- batching ---
    num_train_epochs=1,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=16,    # 32 x 16 = global batch size of 512
    per_device_eval_batch_size=16,
    # --- optimization ---
    learning_rate=2e-5,
    lr_scheduler_type="cosine",        # cosine learning-rate schedule
    warmup_ratio=0.1,
    optim="adamw_torch_fused",         # fused AdamW optimizer
    # --- evaluation / checkpointing ---
    eval_strategy="epoch",             # evaluate after each epoch
    save_strategy="epoch",             # save after each epoch
    save_total_limit=3,                # keep at most 3 checkpoints
    load_best_model_at_end=True,       # reload the best checkpoint when training ends
    metric_for_best_model="eval_cosine_recall@1",
)


# Wire the model, data, loss and validation evaluator into the trainer
trainer = SentenceTransformerTrainer(
    model=model,
    loss=train_loss,
    train_dataset=ds_train,
    evaluator=evaluator_valid,
    args=args,
)

# Run fine-tuning and measure its wall-clock duration
tm_start = time.perf_counter()
trainer.train()
tm_end = time.perf_counter()
tm_process = tm_end - tm_start
print(f"Training time: {tm_process:.2f}s")

# Persist the trained model (load_best_model_at_end=True is set in `args`)
trainer.save_model()

# Reload the saved model from the output directory, on GPU when available
fine_tuned_model = SentenceTransformer(
    model_name_or_path=args.output_dir,
    device="cuda" if torch.cuda.is_available() else "cpu",
)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Epoch,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
1,0.3218,No log,0.885714,0.957143,0.974286,0.982857,0.885714,0.319048,0.194857,0.098286,0.885714,0.957143,0.974286,0.982857,0.938499,0.923773,0.924779


Training time: 63.00s


In [None]:
# Inspect the model-card metadata attached to the model reloaded from disk
fine_tuned_model.model_card_data

SentenceTransformerModelCardData(language=[], license=None, model_name=None, model_id=None, train_datasets=[], eval_datasets=[], task_name='semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more', tags=['sentence-transformers', 'sentence-similarity', 'feature-extraction', 'dense'], local_files_only=False, generate_widget_examples=True, base_model=None, base_model_revision=None, non_default_hyperparameters={}, all_hyperparameters={}, eval_results_dict={}, training_logs=[], widget=[], predict_example=None, label_example_list=[], code_carbon_callback=None, citations={}, best_model_step=None, first_save=True, widget_step=-1, pipeline_tag='sentence-similarity', library_name='sentence-transformers', version={'python': '3.12.11', 'sentence_transformers': '5.1.0', 'transformers': '4.55.4', 'torch': '2.8.0+cu126', 'accelerate': '1.10.1', 'datasets': '4.0.0', 'tokenizers': '0.21.4'})

# Benchmark

In [None]:
# Score the fine-tuned model with the same test evaluator as the baseline,
# so both result sets are directly comparable.
evals_test_ft = fine_tuned_model.evaluate(evaluator=evaluator_test)

# Note: `model_id` is the id of the base checkpoint that was fine-tuned
print(f"FT model evals: {model_id}")
evals_test_ft

FT model evals: BAAI/bge-small-en-v1.5


{'cosine_accuracy@1': 0.6542857142857142,
 'cosine_accuracy@3': 0.7971428571428572,
 'cosine_accuracy@5': 0.8457142857142858,
 'cosine_accuracy@10': 0.8914285714285715,
 'cosine_precision@1': 0.6542857142857142,
 'cosine_precision@3': 0.2657142857142857,
 'cosine_precision@5': 0.16914285714285712,
 'cosine_precision@10': 0.08914285714285713,
 'cosine_recall@1': 0.6542857142857142,
 'cosine_recall@3': 0.7971428571428572,
 'cosine_recall@5': 0.8457142857142858,
 'cosine_recall@10': 0.8914285714285715,
 'cosine_ndcg@10': 0.7718514612605226,
 'cosine_mrr@10': 0.7335589569161,
 'cosine_map@100': 0.7368102729714265}

# Conclusions. Use FT when:
* Need a small but performant model
* Specific vocabulary or slang
* Easy to build a dataset
* No need to frequently update knowledge base

# Proprietary Models
Some providers do not yet offer an embedding fine-tuning service.
For example, GCP offers one on the Vertex AI platform: [Tune text embeddings](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings)

# References:
* [Why, When and How to Fine-Tune a Custom Embedding Model](https://weaviate.io/blog/fine-tune-embedding-model)
* [Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0](https://pytorch.org/blog/out-of-the-box-acceleration/)