In [None]:
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from transformers import pipeline
import torch
import mlflow
import os
import warnings

# SECURITY: credentials must never live in the notebook. The W&B API key has to
# be supplied through the environment (shell export, .env loader, secrets
# manager) before the kernel starts. The key that used to be hard-coded here is
# exposed in version history and should be revoked in the W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    # RuntimeWarning so the UserWarning filter installed in this cell never
    # suppresses it on re-runs.
    warnings.warn(
        "WANDB_API_KEY is not set; Weights & Biases logging may prompt for "
        "login or fail. Export the key in your environment instead of "
        "hard-coding it in the notebook.",
        RuntimeWarning,
    )

# Disable tokenizers' internal parallelism (avoids its fork warning when
# DataLoader worker processes are used).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Keep sentence-transformers' model cache in the current working directory.
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./"

# Hide UserWarning noise emitted by the libraries used in this notebook.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [3]:
# Locations of the triplet CSV files (these are file paths, despite the
# `_dir` suffix in the variable names).
train_csv_dir = "./datasets/train.csv"
test_csv_dir = "./datasets/test.csv"
val_csv_dir = "./datasets/val.csv"

# Fixed seed so the random subsamples below are identical on every run;
# without it the training data and all reported scores change per execution.
SPLIT_SEED = 42


def _load_triplet_subsample(csv_path, fraction, seed=SPLIT_SEED):
    """Load a CSV file and return a reproducible random subsample of its rows.

    Args:
        csv_path: Path of the CSV file to read.
        fraction: Share of rows to keep. It is forwarded as ``test_size`` to
            ``Dataset.train_test_split`` and the resulting "test" part is kept.
        seed: Seed for the shuffling performed by ``train_test_split``.

    Returns:
        The sampled ``Dataset`` with the "category" column removed.
    """
    subsample = Dataset.from_csv(csv_path).train_test_split(
        test_size=fraction, seed=seed
    )["test"]
    return subsample.remove_columns("category")


# Only small fractions are kept: 1% of train, 5% of test, 5% of validation.
# NOTE(review): training on just 1% of train.csv looks like a quick local-run
# setting -- confirm this is intentional before a full training run.
train_dataset = _load_triplet_subsample(train_csv_dir, 0.01)
test_dataset = _load_triplet_subsample(test_csv_dir, 0.05)
eval_dataset = _load_triplet_subsample(val_csv_dir, 0.05)

# Display the three datasets as the cell output.
train_dataset, test_dataset, eval_dataset

(Dataset({
     features: ['anchor', 'positive', 'negative'],
     num_rows: 1374
 }),
 Dataset({
     features: ['anchor', 'positive', 'negative'],
     num_rows: 1718
 }),
 Dataset({
     features: ['anchor', 'positive', 'negative'],
     num_rows: 955
 }))

In [4]:
# Identifier reused for the output directory, evaluator names and MLflow run names.
model_name = "all-mpnet-base-v2"

# Checkpoint directory to load from (spelled with underscores, unlike model_name).
checkpoint_dir = "./trained_models/all_mpnet_base_v2"
model = SentenceTransformer(checkpoint_dir, device=DEVICE)

# Ranking loss bound to the model that will be fine-tuned.
loss = MultipleNegativesRankingLoss(model=model)

In [5]:
# FP16 mixed precision is meant for CUDA GPUs. DEVICE falls back to CPU when no
# GPU is present, so derive the flag instead of hard-coding True (on CPU-only
# machines fp16 is rejected or unsupported, depending on library versions).
use_fp16 = torch.cuda.is_available()

# os.cpu_count() may return None; fall back to 0 (load data in the main process).
num_loader_workers = os.cpu_count() or 0

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"./trained_models/{model_name}_local",
    # Optional training parameters:
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
    fp16=use_fp16,  # Only enabled with CUDA; force False if the GPU can't handle FP16
    bf16=False,  # Set to True if GPU supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicates
    dataloader_num_workers=num_loader_workers,
    dataloader_pin_memory=True,
    # Optional tracking/debugging parameters:
    eval_strategy="epoch",
    save_total_limit=2,
    save_strategy="epoch",
    # Keep the checkpoint with the lowest validation loss at the end of training.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_steps=100,
    run_name=f"{model_name}-train",
)

In [6]:
# Evaluator over the held-out test triplets; it is called both before and
# after fine-tuning so the two scores are directly comparable.
test_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    name=f"{model_name}-test",
)

# Point MLflow at the tracking server and experiment *before* opening the
# first run. Previously this only happened in a later cell, so on a fresh
# kernel the baseline run was written to MLflow's default store/experiment
# instead of next to the training and after-train runs. Both calls are
# idempotent, so repeating them later in the notebook is harmless.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("Embedding model train")

# Baseline score of the model before any fine-tuning in this notebook.
baseline_metrics = test_evaluator(model)
# The evaluator returns a dict of metrics; the first entry is the one logged.
score = list(baseline_metrics.values())[0]
with mlflow.start_run(run_name=f"{model_name}-test-eval-before-train"):
    mlflow.log_metric("triplet_evaluation_score", score)
    mlflow.log_param("model_name", model_name)

In [7]:
# Triplet columns of the validation split, keyed by the evaluator's argument names.
dev_triplets = {
    "anchors": eval_dataset["anchor"],
    "positives": eval_dataset["positive"],
    "negatives": eval_dataset["negative"],
}

# Evaluator handed to the trainer for the validation split.
dev_evaluator = TripletEvaluator(name=f"{model_name}-dev", **dev_triplets)

In [8]:
# Assemble the trainer from the pieces built above: training arguments,
# model and loss, the train/validation splits, and the validation evaluator.
trainer = SentenceTransformerTrainer(
    args=args,
    model=model,
    loss=loss,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=dev_evaluator,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [9]:
# MLflow tracking server and experiment that receive this notebook's runs.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000/"
MLFLOW_EXPERIMENT_NAME = "Embedding model train"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Last expression of the cell: the returned Experiment is shown as cell output.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/394427933678411736', creation_time=1745252608808, experiment_id='394427933678411736', last_update_time=1745252608808, lifecycle_stage='active', name='Embedding model train', tags={}>

In [10]:
# Run the fine-tuning inside a dedicated, named MLflow run.
training_run_name = f"{model_name}-training"
with mlflow.start_run(run_name=training_run_name) as run:
    trainer.train()

Epoch,Training Loss,Validation Loss,All-mpnet-base-v2-dev Cosine Accuracy
1,0.139,0.180979,0.993717
2,0.1229,0.194858,0.993717
3,0.0963,0.174957,0.995812
4,0.0645,0.188421,0.993717
5,0.0636,0.182382,0.993717


🏃 View run all-mpnet-base-v2-training at: http://127.0.0.1:5000/#/experiments/394427933678411736/runs/a67fab04b4a5413793c6a10da104524d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/394427933678411736


In [11]:
# Re-score the same held-out test triplets now that training has finished.
post_train_metrics = test_evaluator(model)
metric_values = list(post_train_metrics.values())
score = metric_values[0]  # first metric reported by the evaluator

# Log the result as its own MLflow run.
after_train_run_name = f"{model_name}-test-eval-after-train"
with mlflow.start_run(run_name=after_train_run_name):
    mlflow.log_metric("triplet_evaluation_score", score)
    mlflow.log_param("model_name", model_name)

🏃 View run all-mpnet-base-v2-test-eval-after-train at: http://127.0.0.1:5000/#/experiments/394427933678411736/runs/81ebcb1b946641529508521cb6fadc96
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/394427933678411736


In [None]:
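# Optional: uncomment to export the trained model to a standalone directory.
# Because the training arguments set load_best_model_at_end=True, the model held
# by the trainer after training is the checkpoint with the lowest eval_loss.
# trainer.save_model("./best_model")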