In [2]:
import logging
import random
import traceback
from datetime import datetime
import os

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import losses
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import BatchSamplers, SentenceTransformerTrainingArguments
from sklearn.model_selection import train_test_split
from transformers import TrainerCallback

# Configure root logging: timestamped messages at INFO level
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Pretrained checkpoint that is fine-tuned below
model_name = "paraphrase-multilingual-miniLM-L12-V2"
model = SentenceTransformer(model_name)

# Training hyperparameters
num_train_epochs = 3
batch_size = 32

# Every run writes into its own timestamped output directory
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"output/training_multiple_negatives_ranking_loss-{run_timestamp}"

################### Load and Split Dataset ##################
full_pair_class_dataset = load_dataset("yahyaabd/statictable-triplets-all", split="train")

# One fixed seed drives both the sampling and the train/validation split, so
# re-running the notebook on a fresh kernel reproduces the same subsets.
seed = 42

# Sampling: fraction of the dataset to keep (1.0 = every row, in shuffled order)
sample_fraction = 1.0
sample_size = int(sample_fraction * len(full_pair_class_dataset))
# A dedicated seeded RNG makes the sample reproducible without touching the
# global `random` state.
sample_indices = random.Random(seed).sample(range(len(full_pair_class_dataset)), sample_size)
sampled_dataset = full_pair_class_dataset.select(sample_indices)

# Split into train and validation (90% train, 10% validation)
train_indices, val_indices = train_test_split(
    range(len(sampled_dataset)),
    test_size=0.1,
    random_state=seed,
)
train_dataset = sampled_dataset.select(train_indices)
val_dataset = sampled_dataset.select(val_indices)

# Log dataset sizes
logging.info(f"Original dataset size: {len(full_pair_class_dataset)}")
logging.info(f"Sampled dataset size: {len(sampled_dataset)}")
logging.info(f"Train dataset size: {len(train_dataset)}")
logging.info(f"Validation dataset size: {len(val_dataset)}")

# Define the loss function
train_loss = losses.MultipleNegativesRankingLoss(model=model)

################### Development Evaluators ##################
new_ir_corpus = load_dataset("yahyaabd/bps-statictable", split="corpus")
new_ir_queries = load_dataset("yahyaabd/bps-statictable", split="queries")
new_ir_relevant_docs_data = load_dataset("yahyaabd/bps-statictable-qrels", split="validation")

# The qrels ids below are cast to str, so the corpus and query ids are cast
# the same way; otherwise the evaluator could fail to match any query to its
# relevant documents if the datasets store ids in another type.
new_ir_corpus = {
    str(doc_id): title
    for doc_id, title in zip(new_ir_corpus["id"], new_ir_corpus["title"])
}
new_ir_queries = {
    str(query_id): text
    for query_id, text in zip(new_ir_queries["id"], new_ir_queries["text"])
}

# Map each query id to its relevant corpus ids (with their scores); only
# judgements with a positive score count as relevant.
new_ir_relevant_docs = {}
for entry in new_ir_relevant_docs_data:
    qid = str(entry["query-id"])
    cid = str(entry["corpus-id"])
    score = float(entry["score"])
    if score > 0:
        new_ir_relevant_docs.setdefault(qid, {})[cid] = score

# Debug logging
logging.info(f"Number of queries with relevant docs: {len(new_ir_relevant_docs)}")
logging.info(f"Sample relevant_docs: {dict(list(new_ir_relevant_docs.items())[:3])}")

# The same cut-offs are used for every retrieval metric
k_values = [1, 3, 5, 10]
ir_evaluator = InformationRetrievalEvaluator(
    queries=new_ir_queries,
    corpus=new_ir_corpus,
    relevant_docs=new_ir_relevant_docs,
    name="bps-statictable-ir",
    mrr_at_k=k_values,
    accuracy_at_k=k_values,
    precision_recall_at_k=k_values,
    ndcg_at_k=k_values,
    map_at_k=k_values,
    show_progress_bar=True,
    write_csv=True,
)

# Evaluate model without training (baseline for the fine-tuned results)
logging.info("Evaluate model without training")
ir_evaluator(model, epoch=0, steps=0)

# Define training arguments
# fp16 mixed precision needs a CUDA GPU; the recorded run of this cell reports
# device "cpu", so only enable it when the model actually sits on a CUDA device.
use_fp16 = model.device.type == "cuda"

args = SentenceTransformerTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    fp16=use_fp16,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=2,
    logging_steps=20,
    run_name="allstats-search-mini-v1-mnrl-run",
    eval_on_start=True,
    load_best_model_at_end=True,
    dataloader_num_workers=2,
    save_on_each_node=True,
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Select the metric used to pick the best model
    # metric_for_best_model="bps-statictable-ir_cosine_mrr@1",  # Switch the primary metric to MRR@1
    # greater_is_better=True,  # Higher MRR@1 is better
)

# Callback for monitoring MRR@1 (optional)
class MRRCallback(TrainerCallback):
    """Trainer callback that logs the IR evaluator's cosine MRR@1 on each evaluation."""

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Log the MRR@1 entry of ``metrics`` with the current global step.

        Logs ``None`` as the value when the metric key is absent from ``metrics``.
        """
        metric_key = "eval_bps-statictable-ir_cosine_mrr@1"
        score = metrics.get(metric_key)
        logging.info(f"MRR@1 at step {state.global_step}: {score}")

# Create trainer & start training
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=train_loss,
    evaluator=ir_evaluator,
    # Register the MRR@1 monitor; without this the callback class is never used
    callbacks=[MRRCallback()],
)

# Train; on failure, log the error with its traceback and re-raise so that the
# save step below never stores an untrained or partially trained model as "final".
try:
    trainer.train()
except Exception as e:
    logging.exception(f"Error during training: {str(e)}")
    raise
finally:
    # Finish Weights & Biases (if used), whether or not training succeeded
    try:
        import wandb
        wandb.finish()
    except ImportError:
        logging.info("W&B not installed, skipping wandb.finish()")

# Save the trained model locally (only reached when training completed)
final_output_dir = f"{output_dir}/final"
os.makedirs(final_output_dir, exist_ok=True)
model.save(final_output_dir)

# # Save to Hugging Face Hub
# model_name_short = model_name.split("/")[-1] if "/" in model_name else model_name
# try:
#     model.push_to_hub(f"{model_name_short}-mnrl-2")
# except Exception as e:
#     logging.error(
#         f"Error uploading model to the Hugging Face Hub: {str(e)}\n"
#         f"To upload manually, run `huggingface-cli login`, then load the model with "
#         f"`model = SentenceTransformer('{final_output_dir}')` and save it with "
#         f"`model.push_to_hub('{model_name_short}-mnrl-2')`."
#     )


2025-06-21 17:46:20 - Use pytorch device_name: cpu
2025-06-21 17:46:20 - Load pretrained SentenceTransformer: paraphrase-multilingual-miniLM-L12-V2


: 