In [None]:
pip install sentence-transformers

In [3]:
from typing import Iterable, Dict, Any, Optional
import torch
from torch import nn, Tensor
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.losses import MultipleNegativesRankingLoss

In [4]:
class CustomMultipleNegativesRankingLoss(nn.Module):
    """Weighted mix of MultipleNegativesRankingLoss terms over (query, context, answer) inputs.

    ``loss_option`` selects which pairings contribute:

    * ``"qc"``       -- ``loss(query, context)``
    * ``"qc-qa"``    -- ``alpha * loss_qc + (1 - alpha) * loss_qa``
    * ``"qc-qa-ac"`` -- ``alpha * loss_qc + beta * loss_qa + (1 - alpha - beta) * loss_ac``

    For ``"qc-qa-ac"`` the weight of the answer/context term is ``1 - alpha - beta``;
    choose ``alpha + beta <= 1`` to keep that weight non-negative.

    Args:
        model: SentenceTransformer passed to the wrapped MultipleNegativesRankingLoss.
        scale: Scale factor forwarded to MultipleNegativesRankingLoss.
        similarity_fct: Similarity function forwarded to MultipleNegativesRankingLoss.
        alpha: Weight of the query/context term (ignored for ``"qc"``).
        beta: Weight of the query/answer term (only used by ``"qc-qa-ac"``).
        loss_option: One of ``VALID_LOSS_OPTIONS``.

    Raises:
        ValueError: If ``loss_option`` is not one of ``VALID_LOSS_OPTIONS``.
    """

    VALID_LOSS_OPTIONS = ("qc", "qc-qa", "qc-qa-ac")

    def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_fct=util.cos_sim,
                 alpha: float = 0.5, beta: float = 0.5, loss_option: str = "qc") -> None:
        super(CustomMultipleNegativesRankingLoss, self).__init__()
        # Fail fast: an unknown option used to fall through to a constant zero loss,
        # which trains "successfully" while never updating the model.
        if loss_option not in self.VALID_LOSS_OPTIONS:
            raise ValueError(
                f"Unknown loss_option {loss_option!r}; expected one of {self.VALID_LOSS_OPTIONS}"
            )
        self.model = model
        self.scale = scale
        self.similarity_fct = similarity_fct
        self.alpha = alpha
        self.beta = beta
        self.loss_option = loss_option
        self.mnr_loss = MultipleNegativesRankingLoss(model, scale, similarity_fct)

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Optional[Tensor] = None) -> Tensor:
        """Compute the combined loss for one batch.

        Args:
            sentence_features: Tokenized inputs ordered as (query, context[, answer]).
                The answer entry is required for ``"qc-qa"`` and ``"qc-qa-ac"``.
            labels: Passed through unchanged to the wrapped loss.

        Returns:
            The scalar loss selected by ``self.loss_option``.

        Raises:
            ValueError: If ``self.loss_option`` is invalid or too few inputs are supplied.
        """
        # Re-check here because loss_option is a plain attribute and may be reassigned after __init__.
        if self.loss_option not in self.VALID_LOSS_OPTIONS:
            raise ValueError(
                f"Unknown loss_option {self.loss_option!r}; expected one of {self.VALID_LOSS_OPTIONS}"
            )

        # The annotation only promises an Iterable, so materialise it before indexing.
        features = list(sentence_features)
        required = 2 if self.loss_option == "qc" else 3
        if len(features) < required:
            raise ValueError(
                f"loss_option {self.loss_option!r} needs {required} sentence feature sets, got {len(features)}"
            )

        query, context = features[0], features[1]
        # NOTE(review): each self.mnr_loss call presumably re-encodes its inputs, so the
        # multi-term options run the encoder several times per batch -- confirm the cost is acceptable.
        loss_qc = self.mnr_loss([query, context], labels)
        if self.loss_option == "qc":
            return loss_qc

        answer = features[2]
        loss_qa = self.mnr_loss([query, answer], labels)
        if self.loss_option == "qc-qa":
            return self.alpha * loss_qc + (1 - self.alpha) * loss_qa

        # Remaining option: "qc-qa-ac".
        loss_ac = self.mnr_loss([answer, context], labels)
        return self.alpha * loss_qc + self.beta * loss_qa + (1 - self.alpha - self.beta) * loss_ac

    def get_config_dict(self) -> Dict[str, Any]:
        """Return the hyper-parameters of this loss as a plain dictionary."""
        # Callables such as functools.partial objects have no __name__; fall back to the type name.
        similarity_name = getattr(self.similarity_fct, "__name__", type(self.similarity_fct).__name__)
        return {
            "scale": self.scale,
            "similarity_fct": similarity_name,
            "alpha": self.alpha,
            "beta": self.beta,
            "loss_option": self.loss_option
        }

In [5]:
import json
import torch
from datasets import Dataset
from sentence_transformers.datasets import NoDuplicatesDataLoader
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler
from sentence_transformers.evaluation import InformationRetrievalEvaluator
import logging
import os
import gc
import numpy as np

In [None]:
# Environment switches for CUDA, TensorFlow logging and Weights & Biases,
# applied in one place before any training starts.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "CUDA_LAUNCH_BLOCKING": "1",
    'TF_CPP_MIN_LOG_LEVEL': '3',
    "TORCH_USE_CUDA_DSA": "1",
    "WANDB_DISABLED": "true",
})

# Route INFO-level log records through sentence-transformers' LoggingHandler.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

# Default batch size used by fine_tune.
BATCH_SIZE = 8

In [7]:
def _load_json(path):
    """Read and return the UTF-8 JSON document stored at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _dump_json(obj, path):
    """Write ``obj`` to ``path`` as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        # default=float only kicks in for values json cannot encode natively
        # (e.g. NumPy scalars, should an evaluator return them).
        json.dump(obj, f, ensure_ascii=False, indent=4, default=float)


def _build_ir_evaluator(dataset, name):
    """Create an InformationRetrievalEvaluator over ``dataset`` with the shared metric cut-offs."""
    return InformationRetrievalEvaluator(
        dataset['queries'],
        dataset['corpus'],
        dataset['relevant_docs'],
        accuracy_at_k=[1, 3, 5, 10, 15, 20],
        precision_recall_at_k=[1, 3, 5, 10, 15, 20],
        mrr_at_k=[1, 3, 5, 10, 15],
        ndcg_at_k=[1, 5, 10, 15, 20],
        map_at_k=[20, 100],
        name=name,
    )


def fine_tune(model_name, train_dataset_file, val_dataset_file, output_dir, alpha, beta, learning_rate=1e-5, batch_size=BATCH_SIZE, epochs=3, save_steps=1, test_dataset_file=None):
    """Fine-tune a SentenceTransformer with the "qc-qa-ac" loss and record per-epoch IR metrics.

    The training file must provide ``queries``, ``relevant_docs``, ``corpus`` and ``answers``;
    each query is paired with its first relevant document and its answer. After every epoch the
    validation (and, when available, test) evaluator is run and the accumulated metrics are
    written to ``validation_metrics.json`` / ``test_metrics.json`` inside a results directory
    named after the model and hyper-parameters.

    Args:
        model_name: Model identifier or path passed to ``SentenceTransformer``.
        train_dataset_file: Path to the training JSON file.
        val_dataset_file: Path to the validation JSON file.
        output_dir: Base directory under which the results directory is created.
        alpha: Weight of the query/context loss term.
        beta: Weight of the query/answer loss term.
        learning_rate: Optimizer learning rate.
        batch_size: Training batch size.
        epochs: Number of epochs (one ``model.fit`` call per epoch).
        save_steps: Currently unused; checkpoint saving is disabled (see commented block below).
        test_dataset_file: Path to the test JSON file. When omitted, a module-level
            ``test_dataset_file`` variable is used if one exists (the previous implicit
            behaviour); if neither is available, test evaluation is skipped.

    Returns:
        None. Errors are logged rather than raised.
    """
    # Clear any existing GPU cache before starting
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Bound up-front so the cleanup in `finally` can always release them.
    model = None
    train_loss = None

    try:
        model = SentenceTransformer(model_name, trust_remote_code=True)
        # model.tokenizer.model_max_length = 256
        if torch.cuda.is_available():
            model = model.to(torch.device("cuda"))

        train_dataset = _load_json(train_dataset_file)

        train_examples = []
        for query_id, query in train_dataset['queries'].items():
            # Only the first relevant document of each query is used as the positive context.
            doc_id = train_dataset['relevant_docs'][query_id][0]
            context = train_dataset['corpus'][doc_id]
            complete_answer = train_dataset['answers'][query_id]
            train_examples.append(InputExample(texts=[query, context, complete_answer]))

        train_dataloader = NoDuplicatesDataLoader(train_examples, batch_size=batch_size)
        train_loss = CustomMultipleNegativesRankingLoss(model=model, alpha=alpha, beta=beta, loss_option="qc-qa-ac")

        val_evaluator = _build_ir_evaluator(_load_json(val_dataset_file), 'validation')

        if test_dataset_file is None:
            # Backward compatibility: this function used to read a notebook-level
            # global named `test_dataset_file` instead of taking it as an argument.
            test_dataset_file = globals().get("test_dataset_file")

        if test_dataset_file is None:
            logging.warning("No test_dataset_file provided; skipping test evaluation.")
            test_evaluator = None
        else:
            test_evaluator = _build_ir_evaluator(_load_json(test_dataset_file), 'test')

        # NOTE(review): fit() is called once per epoch below, so presumably every call starts a
        # fresh optimizer/scheduler and the warm-up restarts each epoch, although warmup_steps is
        # sized from the total number of steps -- confirm this is intended.
        warmup_steps = int(len(train_dataloader) * epochs * 0.1)
        metrics = []
        test_metrics = []

        # Create results directory
        results_dir = os.path.join(output_dir, f"{model_name.replace('/', '_')}_alpha_{alpha}_beta_{beta}_lr_{learning_rate}")
        os.makedirs(results_dir, exist_ok=True)
        metrics_file = os.path.join(results_dir, "validation_metrics.json")
        test_metrics_file = os.path.join(results_dir, "test_metrics.json")

        for epoch in range(1, epochs + 1):
            logging.info(f"Epoch {epoch}/{epochs}")

            try:
                model.fit(
                    train_objectives=[(train_dataloader, train_loss)],
                    evaluator=val_evaluator,
                    epochs=1,
                    warmup_steps=warmup_steps,
                    optimizer_params={'lr': learning_rate},  # Add the learning rate here
                    show_progress_bar=True
                )

                # Evaluate and save metrics
                metrics.append(val_evaluator(model))
                if test_evaluator is not None:
                    test_metrics.append(test_evaluator(model))

                # Save metrics after each epoch (files are rewritten with the full history)
                _dump_json(metrics, metrics_file)
                if test_evaluator is not None:
                    _dump_json(test_metrics, test_metrics_file)
                # if epoch % save_steps == 0:
                #     epoch_dir = os.path.join(results_dir, f"epoch_{epoch}")
                #     os.makedirs(epoch_dir, exist_ok=True)
                #     logging.info(f"Saving model to {epoch_dir}")
                #     model.save(epoch_dir)

            except RuntimeError as e:
                # logging.exception keeps the traceback, which plain logging.error dropped.
                logging.exception(f"RuntimeError during training: {e}")
                break

    except Exception as e:
        logging.exception(f"Error in fine-tuning: {e}")

    finally:
        # Drop every local reference to the model (the loss holds one too) BEFORE collecting
        # and emptying the CUDA cache; doing it in the opposite order cannot release its memory.
        del model, train_loss
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
if __name__ == "__main__":

    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Checkpoints to fine-tune. Previously tried candidates are kept commented out.
    models = [
        # "keepitreal/vietnamese-sbert",
        # "VoVanPhuc/sup-SimCSE-VietNamese-phobert-base",
        # "bkai-foundation-models/vietnamese-bi-encoder",
        # "hmthanh/VietnamLegalText-SBERT",
        # "BAAI/bge-base-en-v1.5",
        # "BAAI/bge-small-en-v1.5",
        # "colbert-ir/colbertv2.0",
        # "FPTAI/vibert-base-cased",
        # "vinai/phobert-large",
        # "vinai/phobert-base",
        # "dangvantuan/vietnamese-document-embedding"
        # "/kaggle/input/vilegalbert/pytorch/default/1/ViLegalBERT"
        # "huyydangg/DEk21_hcmute_embedding"
        # "jaeyong2/gte-multilingual-base-Viet-embedding"
        "Alibaba-NLP/gte-multilingual-base"
    ]

    # Loss-weight grid (alphas and betas). Earlier sweeps:
    # alphas = [0.5]; betas = [0.2, 0.3, 0.4, 0.5]
    # alphas = [0.5]; betas = [0.2, 0.3, 0.4]
    alphas = [0.2]
    betas = [0.2]

    # Dataset locations. fine_tune picks up test_dataset_file from the notebook's
    # global namespace, so it must stay defined here even though it is not passed below.
    train_dataset_file = '/kaggle/input/utehy-qca/train.json'
    val_dataset_file = '/kaggle/input/utehy-qca/val.json'
    test_dataset_file = '/kaggle/input/utehy-qca/test.json'
    output_base_dir = 'results/Alibaba'

    # Alternative dataset / output configuration:
    # train_dataset_file = '/kaggle/input/vilaw-256/ViBIDLAW_train_short.json'
    # val_dataset_file = '/kaggle/input/vilaw-256/ViBIDLAW_val_short.json'
    # test_dataset_file = '/kaggle/input/vilaw-256/ViBIDLAW_test_short.json'
    # # output_base_dir = 'results/dvt-vnse-doc-embedding'
    # output_base_dir = 'results/phobert-base'

    # Walk the full model x alpha x beta grid in the same order as three nested loops.
    grid = ((m, a, b) for m in models for a in alphas for b in betas)
    for model_name, alpha, beta in grid:
        # The (0.5, 0.5) combination is deliberately left out of the sweep.
        if alpha == 0.5 and beta == 0.5:
            continue

        run_label = f"model: {model_name}, alpha: {alpha}, beta: {beta}"
        print(f"Starting fine-tune for {run_label}")
        try:
            fine_tune(
                model_name=model_name,
                train_dataset_file=train_dataset_file,
                val_dataset_file=val_dataset_file,
                output_dir=output_base_dir,
                alpha=alpha,
                beta=beta,
                save_steps=1
            )
            print(f"Completed fine-tune for {run_label}")

            # Additional GPU memory management between iterations
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()

        except Exception as e:
            print(f"Error during fine-tune for {run_label}")
            print(f"Error details: {e}")

Starting fine-tune for model: Alibaba-NLP/gte-multilingual-base, alpha: 0.2, beta: 0.2


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to n

Step,Training Loss


In [None]:
import os
import zipfile

def zipdir(path, ziph):
    """Add every file found under ``path`` (recursively) to the open zip handle ``ziph``.

    Entries are stored under their path relative to ``path``, so the archive
    does not carry the original leading directories.
    """
    # Walk through all files in the directory and its subdirectories.
    for folder, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            source = os.path.join(folder, filename)
            # Store the file in the zip without keeping the root path.
            ziph.write(source, os.path.relpath(source, path))

# Archive one experiment's results folder so it can be downloaded.
results_root = r'/kaggle/working/results'
zip_file_name = "dvt-vnse-doc-embedding-T-only.zip"
# NOTE(review): the training cell writes to 'results/Alibaba'; confirm this
# directory name is the one that should be archived.
results_to_zip = os.path.join(results_root, "dvt-vnse-doc-embedding-T-only")

# os.walk yields nothing for a missing directory, which would silently create an empty archive.
if not os.path.isdir(results_to_zip):
    raise FileNotFoundError(f"Results directory not found: {results_to_zip}")

# Write the archive via an absolute path instead of os.chdir, so the notebook's working
# directory (and therefore relative output paths used by other cells) stays untouched.
with zipfile.ZipFile(os.path.join(results_root, zip_file_name), 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir(results_to_zip, zipf)
