In [1]:
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
import torch
import os

# Credentials must never be committed with the notebook. The W&B API key is
# expected to be supplied through the WANDB_API_KEY environment variable
# (for example exported from a secrets manager before this cell runs).
# NOTE(review): a key was previously hardcoded here; it should be revoked.
if not os.environ.get("WANDB_API_KEY"):
    # No credential available: log W&B runs locally instead of prompting for a
    # login in the middle of training. An explicit WANDB_MODE is respected.
    os.environ.setdefault("WANDB_MODE", "offline")

# Value read by sentence-transformers for its model cache location.
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./"

In [2]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [3]:
# Locations of the train / test / validation CSV files.
train_csv_dir = "/kaggle/input/startech-product-train-dataset/train.csv"
test_csv_dir = "/kaggle/input/startech-product-train-dataset/test.csv"
val_csv_dir = "/kaggle/input/startech-product-train-dataset/val.csv"

# Load each CSV into a Dataset, in the order train -> test -> validation.
train_dataset, test_dataset, eval_dataset = (
    Dataset.from_csv(csv_path)
    for csv_path in (train_csv_dir, test_csv_dir, val_csv_dir)
)

# Display the three datasets as the cell output.
train_dataset, test_dataset, eval_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

(Dataset({
     features: ['anchor', 'positive', 'negative'],
     num_rows: 17592
 }),
 Dataset({
     features: ['anchor', 'positive', 'negative'],
     num_rows: 4398
 }),
 Dataset({
     features: ['anchor', 'positive', 'negative'],
     num_rows: 2444
 }))

In [5]:
# Checkpoint to fine-tune; the name is reused below for output paths and run names.
model_name = "all-mpnet-base-v2"

# Load the pretrained checkpoint onto the selected device.
model = SentenceTransformer(
    model_name_or_path=f"sentence-transformers/{model_name}",
    device=DEVICE,
)

# Training objective, bound to the model being fine-tuned.
loss = MultipleNegativesRankingLoss(model=model)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Training configuration for the fine-tuning run.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"./trained_models/{model_name}",
    # Optional training parameters:
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    # Only request FP16 mixed precision when CUDA is available, so this cell
    # agrees with the CPU fallback used when selecting DEVICE.
    fp16=torch.cuda.is_available(),
    bf16=False,  # Set to True if GPU supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicates
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 0 workers (data loading in the main process).
    dataloader_num_workers=os.cpu_count() or 0,
    dataloader_pin_memory=True,
    # Optional tracking/debugging parameters:
    eval_strategy="epoch",
    save_total_limit=2,
    save_strategy="epoch",  # same cadence as eval_strategy
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    logging_steps=100,
    run_name=f"{model_name}-train",
)

In [7]:
# Triplet evaluator over the test split (anchors, positives, negatives in order).
test_evaluator = TripletEvaluator(
    test_dataset["anchor"],
    test_dataset["positive"],
    test_dataset["negative"],
    name=f"{model_name}-test",
)

# Evaluate the model before any training has run in this notebook.
test_evaluator(model)

{'all-mpnet-base-v2-test_cosine_accuracy': 0.9351978171896317}

In [8]:
# Triplet evaluator over the validation split; passed to the trainer below.
dev_evaluator = TripletEvaluator(
    eval_dataset["anchor"],
    eval_dataset["positive"],
    eval_dataset["negative"],
    name=f"{model_name}-dev",
)

In [None]:
# Wire the model, loss, datasets and dev-set evaluator into the trainer.
trainer = SentenceTransformerTrainer(
    # What is trained and how:
    model=model,
    args=args,
    loss=loss,
    # Data used for optimisation and for evaluation:
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=dev_evaluator,
)

# Start fine-tuning; as the last expression, the return value is the cell output.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mani-atikur99[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


In [None]:
# Re-run the test-split triplet evaluator on the same `model` object that was
# handed to the trainer, for comparison with the score from the earlier call
# made before training.
test_evaluator(model)

In [None]:
# Write the trainer's current model to a fixed directory.
trainer.save_model(output_dir="./best_model")