# Install Necessary Packages

In [None]:
!pip install transformers[torch]

In [None]:
!pip install -U sentence-transformers

In [None]:
!pip install datasets

In [None]:
# References:
# - Sentence Transformers training overview: https://sbert.net/docs/sentence_transformer/training_overview.html
# - BAAI/bge-large-en model card: https://huggingface.co/BAAI/bge-large-en


In [None]:
from sentence_transformers import SentenceTransformer
import torch


  from tqdm.autonotebook import tqdm, trange


# Load Dataset

In [None]:

from datasets import load_dataset

# Load the "triplet" configuration of the sentence-transformers/all-nli dataset.
dataset = load_dataset(path="sentence-transformers/all-nli", name="triplet")

# Train on the first 5000 rows only, to keep the fine-tuning run short.
train_dataset = dataset["train"].select(range(5000))

# The dev and test splits are used in full for evaluation.
eval_dataset, test_dataset = dataset["dev"], dataset["test"]


More information about the dataset format is available here: https://sbert.net/docs/sentence_transformer/training_overview.html#dataset


In [None]:
# Display the training subset summary (feature names and row count).
train_dataset

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 5000
})

In [None]:
# Render the training triplets as a pandas DataFrame for inspection.
# Columns:
#   anchor   - the original sentence or query.
#   positive - a correct or relevant response to the anchor.
#   negative - an incorrect or irrelevant response to the anchor.
train_dataset.to_pandas()

Unnamed: 0,anchor,positive,negative
0,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.","A person is at a diner, ordering an omelette."
1,Children smiling and waving at camera,There are children present,The kids are frowning
2,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,The boy skates down the sidewalk.
3,Two blond women are hugging one another.,There are women showing affection.,The women are sleeping.
4,"A few people in a restaurant setting, one of t...",The diners are at a restaurant.,The people are sitting at desks in school.
...,...,...,...
4995,The people are outside.,People on ATVs and dirt bikes are traveling al...,A woman in a pink shirt is handing a bag to th...
4996,The people are outside.,People on ATVs and dirt bikes are traveling al...,A small group of adult males enjoy a conversat...
4997,The people are outside.,People on ATVs and dirt bikes are traveling al...,Two guys and one girl are sitting at a table i...
4998,The people are outside.,People on ATVs and dirt bikes are traveling al...,People sitting on black chairs on a bus.


# Load Model

In [None]:
# Load the pretrained embedding model that will be fine-tuned.
#
# Popular embedding models:
#   https://huggingface.co/nomic-ai/nomic-embed-text-v1
#   https://huggingface.co/BAAI/bge-large-en
base_model_id = "BAAI/bge-large-en"
model = SentenceTransformer(base_model_id)



# Setting up Training Arguments

In [None]:
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers



In [None]:
# Loss function used for fine-tuning; it is bound to the model being trained.
loss = MultipleNegativesRankingLoss(model=model)


In [None]:
# Training configuration passed to SentenceTransformerTrainer.
# NOTE(review): output_dir / run_name still say "mpnet-base" although this notebook
# loads BAAI/bge-large-en — consider renaming them to match the model.
# NOTE(review): in the recorded run the validation loss rises at every evaluation
# (0.58 -> 0.80 -> 1.13); consider a lower learning rate or load_best_model_at_end=True.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/mpnet-base-all-nli-triplet",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    # FP16 mixed precision is only enabled when a CUDA device is present, so the
    # notebook still runs (in full precision) on CPU-only runtimes.
    # Set to False if you get an error that your GPU can't run on FP16.
    fp16=torch.cuda.is_available(),
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="mpnet-base-all-nli-triplet",  # Will be used in W&B if `wandb` is installed
)

# Train

In [None]:
# Wire the model, training arguments, loss and data splits into the trainer.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=loss,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
100,0.6799,0.576596
200,0.1644,0.797214
300,0.3899,1.128223


Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

TrainOutput(global_step=313, training_loss=0.39497141016367526, metrics={'train_runtime': 288.6541, 'train_samples_per_second': 17.322, 'train_steps_per_second': 1.084, 'total_flos': 0.0, 'train_loss': 0.39497141016367526, 'epoch': 1.0})

# Test: Is the Model Any Good?

In [None]:
from sentence_transformers.evaluation import TripletEvaluator

# Collect the three triplet columns of the held-out test split.
triplet_columns = {
    "anchors": test_dataset["anchor"],
    "positives": test_dataset["positive"],
    "negatives": test_dataset["negative"],
}
test_evaluator = TripletEvaluator(name="all-nli-test", **triplet_columns)

# Calling the evaluator on the model returns the metrics shown as the cell output.
test_evaluator(model)


{'all-nli-test_cosine_accuracy': 0.8836435164170071,
 'all-nli-test_dot_accuracy': 0.11635648358299289,
 'all-nli-test_manhattan_accuracy': 0.8821304282039643,
 'all-nli-test_euclidean_accuracy': 0.8836435164170071,
 'all-nli-test_max_accuracy': 0.8836435164170071}

In [None]:

# Save the trained model to a local directory.
# NOTE(review): the "mpnet-base" names below do not match the BAAI/bge-large-en
# model loaded earlier in this notebook — consider renaming.
final_model_dir = "models/mpnet-base-all-nli-triplet/final"
model.save_pretrained(final_model_dir)

# (Optional) Push the trained model to the Hugging Face Hub.
# NOTE(review): presumably requires being logged in to the Hub — confirm before running.
hub_repo_id = "mpnet-base-all-nli-triplet"
model.push_to_hub(hub_repo_id)