In [1]:
!nvidia-smi

Tue Oct 22 13:08:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A40-24Q                 On  |   00000000:00:10.0 Off |                  N/A |
| N/A   N/A    P0             N/A /  N/A  |     670MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Import libraries

In [2]:
from sentence_transformers import (SentenceTransformer, models, 
                                   SentenceTransformerTrainingArguments,
                                   SentenceTransformerTrainer)
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.util import dot_score

from sentence_transformers.losses import MultipleNegativesSymmetricRankingLoss
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import load_from_disk
import torch

import pandas as pd
import numpy as np

from utils import create_data_for_evaluator

  from tqdm.autonotebook import tqdm, trange
2024-10-22 13:08:30.132508: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-22 13:08:30.174096: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-22 13:08:30.174126: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-22 13:08:30.175140: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-22 13:08:30.181948

A custom SentenceTransformer model is configured by defining and configuring several components, including a word embedding model, a grouping model, a normalization layer, and an adapter module. The word embedding model is initialized with a pre-trained transformer model from Sentence Transformers, with specific settings for a maximum sequence length and case-sensitivity. The pooling model is configured to use token averaging for pooling, with other pooling modes disabled. The normalization layer is defined to standardize the embeddings.

In [3]:
# Carga del modelo base de embeddings de palabras Sentence Transformer
model_name = "sentence-transformers/all-MiniLM-L6-v2"

base_model = SentenceTransformer(
    model_name,                  # Nombre del modelo preentrenado de Sentence Transformers
    device="cuda",               # Utilizar GPU para entrenar (dispositivo CUDA)
    model_kwargs={"torch_dtype": "float16"}  # Configurar el modelo para usar FP16 (precisión reducida) para optimizar la memoria
)

# Mostrar la arquitectura del modelo cargado
base_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [4]:
# Crear la capa de embeddings de palabras usando el mismo modelo preentrenado
word_embedding_model = models.Transformer(
    model_name_or_path=model_name,  # Usar el mismo modelo preentrenado (sentence-transformers/all-MiniLM-L6-v2)
    max_seq_length=512,  # Aumentar el tamaño máximo de secuencia de 256 a 512 para capturar más contexto
    do_lower_case=False,  # No convertir las palabras a minúsculas, mantener la capitalización original
)

# Definir los parámetros del modelo de pooling
pooling_model = models.Pooling(
    word_embedding_dimension=512,  # Dimensión de los embeddings (incrementada a 512 para el nuevo modelo)
    pooling_mode_cls_token=False,  # No usar el token [CLS] para el pooling
    pooling_mode_mean_tokens=True,  # Usar el promedio de los tokens como estrategia de pooling
    pooling_mode_max_tokens=False,  # No usar el token máximo para el pooling
    pooling_mode_mean_sqrt_len_tokens=False,  # No usar la raíz cuadrada de la longitud de los tokens para el pooling
    pooling_mode_weightedmean_tokens=False,  # No usar un promedio ponderado de los tokens
    pooling_mode_lasttoken=False,  # No usar el último token para el pooling
    include_prompt=True  # Incluir el "prompt" en el proceso de pooling
)

# Definir una capa de normalización para el modelo personalizado
normalize = models.Normalize()


# Definir el modelo personalizado de Sentence Transformer que incluye:
# - word_embedding_model: La capa de embeddings de palabras con el modelo MiniLM
# - pooling_model: El modelo de pooling personalizado
# - normalize: La capa de normalización para mantener la estabilidad del espacio de los embeddings
custom_domain_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, normalize],  # Definir las capas del modelo
    device="cuda"  # Usar GPU para entrenar
)

# Mostrar la arquitectura del modelo personalizado
custom_domain_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 512, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

Load the training and evaluation datasets for question answering tasks from the respective pickled files stored in the 'data' directory.

In [5]:
qa_train = load_from_disk('./data/train_dataset')
qa_eval = load_from_disk('./data/eval_dataset')
qa_test = load_from_disk('./data/test_dataset')

Create training examples using the question-answer pairs from the dataset `qa`, where each example consists of a question (`qa[0]`) and its corresponding answer (`qa[1]`).

In [6]:
print("Training lenght: ", len(qa_train))
print("Validation lenght: ", len(qa_eval))
print("Test lenght: ", len(qa_test))

Training lenght:  29547
Validation lenght:  3677
Test lenght:  3666


Prepares and configures the training and evaluation process for a custom SentenceTransformer model. Initially, a training data set is created by generating a list of `InputExample` instances, where each instance consists of a pair of texts (question and answer). This data set is then loaded into a "DataLoader", which shuffles the data at each epoch and sets the batch size to 256.

The training loss is defined using "MultipleNegativesSymmetricRankingLoss", which is suitable for information retrieval tasks involving positive text pairs. An evaluator is configured using "InformationRetrievalEvaluator", which evaluates the performance of the model on a set of queries and corpora, with the main scoring function specified as "dot_score".

In [7]:
eval_dataset_evaluator = create_data_for_evaluator(qa_eval)
test_dataset_evaluator = create_data_for_evaluator(qa_test)

In [8]:
dev_evaluator = InformationRetrievalEvaluator(
        queries=eval_dataset_evaluator['queries'],
        corpus=eval_dataset_evaluator['corpus'],
        relevant_docs=eval_dataset_evaluator['relevant_docs'],
        name='qa_eval', 
        map_at_k=[10],
        accuracy_at_k = [10],
        precision_recall_at_k = [10],
        score_functions={'dot_score':dot_score}
    )

test_evaluator = InformationRetrievalEvaluator(
        queries=test_dataset_evaluator['queries'],
        corpus=test_dataset_evaluator['corpus'],
        relevant_docs=test_dataset_evaluator['relevant_docs'],
        name='qa_test', 
        map_at_k=[10],
        accuracy_at_k = [10],
        precision_recall_at_k = [10],
        score_functions={'dot_score':dot_score}
    )

In [9]:
## Base model evaluation

results = dev_evaluator(base_model)

results #'qa_eval_dot_score_recall@10': 0.7040112386417982,

{'qa_eval_dot_score_accuracy@10': 0.78012912482066,
 'qa_eval_dot_score_precision@10': 0.08353658536585365,
 'qa_eval_dot_score_recall@10': 0.7040112386417982,
 'qa_eval_dot_score_ndcg@10': 0.5753032738384262,
 'qa_eval_dot_score_mrr@10': 0.571331215412994,
 'qa_eval_dot_score_map@10': 0.5153894240623078}

In [10]:
# Definición de la función de pérdida usando MultipleNegativesSymmetricRankingLoss
# Esta pérdida está diseñada para el entrenamiento de modelos de recuperación de información donde 
# se tienen pares de preguntas y respuestas, y se desea maximizar la similitud entre ellos.
# 'similarity_fct=dot_score' especifica que se utiliza el producto punto (dot product) como función de similitud
loss = MultipleNegativesSymmetricRankingLoss(custom_domain_model,
                                             similarity_fct=dot_score)

# Configuración de los parámetros de entrenamiento mediante SentenceTransformerTrainingArguments
args = SentenceTransformerTrainingArguments(
    output_dir="./results/domain_adaptation_model",  # Directorio donde se guardarán los resultados y checkpoints del modelo
    num_train_epochs=30,  # Número de épocas para el entrenamiento
    per_device_train_batch_size=64,  # Tamaño del batch para entrenamiento por dispositivo
    gradient_accumulation_steps=4,  # Acumulación de gradientes para simular un batch de tamaño más grande (4 acumulaciones para tener un batch equivalente a 512)
    per_device_eval_batch_size=512,  # Tamaño del batch para evaluación
    learning_rate=2e-5,  # Tasa de aprendizaje inicial
    warmup_ratio=0.1,  # Proporción de pasos de calentamiento (10% de los pasos de entrenamiento)
    bf16=True,  # Utiliza el formato bfloat16 para reducir el uso de memoria durante el entrenamiento en hardware compatible
    gradient_checkpointing=False,  # Checkpointing de gradientes para reducir el uso de memoria al costo de mayor cómputo
    optim="adamw_torch_fused",  # Optimizador AdamW con fusión de operaciones, más eficiente en hardware compatible
    lr_scheduler_type="cosine",  # Planificador de tasa de aprendizaje con decaimiento cosenoidal
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # Utiliza un muestreo de batches que garantiza que no haya duplicados
    eval_strategy="epoch",  # Realiza evaluación al final de cada época
    save_strategy="epoch",  # Guarda un checkpoint al final de cada época
    save_total_limit=1,  # Mantiene un límite de 3 checkpoints, eliminando los más antiguos
    logging_steps=20,  # Registra los resultados del entrenamiento cada 25 pasos
    metric_for_best_model="qa_eval_dot_score_map@10",  # Métrica clave utilizada para determinar el mejor modelo (MAP@10 en este caso)
    greater_is_better=True,  # Indica que un valor mayor de la métrica es mejor (se utiliza para seleccionar el mejor modelo)
    load_best_model_at_end=True,  # Carga automáticamente el mejor modelo al final del entrenamiento
)

# Creación del entrenador con los parámetros definidos
trainer = SentenceTransformerTrainer(
    model=custom_domain_model,  # Modelo que se va a entrenar (previamente definido)
    args=args,  # Argumentos de entrenamiento definidos previamente
    train_dataset=qa_train.select_columns(["anchor", "positive"]),  # Dataset de entrenamiento, solo las columnas "anchor" y "positive"
    loss=loss,  # Función de pérdida definida para el entrenamiento
    evaluator=dev_evaluator,  # Evaluador que se encargará de evaluar el rendimiento en cada época
)

# Inicia el proceso de entrenamiento
trainer.train()

# Guarda el modelo final entrenado
trainer.save_model()


Epoch,Training Loss,Validation Loss,Qa Eval Dot Score Accuracy@10,Qa Eval Dot Score Precision@10,Qa Eval Dot Score Recall@10,Qa Eval Dot Score Ndcg@10,Qa Eval Dot Score Mrr@10,Qa Eval Dot Score Map@10
1,1.4661,No log,0.841105,0.090782,0.761059,0.64096,0.646249,0.581098
2,0.8484,No log,0.857963,0.092647,0.776345,0.650177,0.653459,0.588439
3,1.1933,No log,0.867288,0.093651,0.785031,0.659421,0.661695,0.597852
4,0.9168,No log,0.87769,0.095301,0.797555,0.670376,0.672401,0.608339
5,0.4589,No log,0.882353,0.095696,0.800753,0.677022,0.68028,0.615857
6,0.9753,No log,0.879842,0.095481,0.798511,0.67709,0.681544,0.616409
7,0.6086,No log,0.883429,0.096162,0.80275,0.683876,0.68919,0.623771
8,0.2529,No log,0.882712,0.096055,0.801058,0.680863,0.684904,0.6207
9,0.7615,No log,0.886298,0.096593,0.803796,0.682142,0.685545,0.621221
10,0.4085,No log,0.888092,0.096987,0.805978,0.686176,0.690557,0.625906


                                                                                                                                                                             

The number of training epochs is set to 30 and the warm-up steps are calculated as 10% of the total training steps, determined by the length of the DataLoader and the number of epochs. This setup ensures that the model is properly prepared and evaluated during training.

## Evaluating the base model & the fine tunned model

In [11]:
custom_domain_model = SentenceTransformer('./results/domain_adaptation_model',
                                          device="cuda",
                                          model_kwargs={"torch_dtype": "float16"}
                                          )

In [12]:
dev_evaluator(custom_domain_model)

{'qa_eval_dot_score_accuracy@10': 0.8884505021520803,
 'qa_eval_dot_score_precision@10': 0.09695121951219511,
 'qa_eval_dot_score_recall@10': 0.8057687709230033,
 'qa_eval_dot_score_ndcg@10': 0.6853291294400377,
 'qa_eval_dot_score_mrr@10': 0.6894385803101717,
 'qa_eval_dot_score_map@10': 0.6248782812583028}

Evaluate the Mean Average Precision (MAP) at k=10 for both the base and custom domain models using the evaluator, and print the results for comparison.

In [13]:
eva_base_model = test_evaluator(base_model, output_path='results/base_model/')
print("Base model: ", eva_base_model)

eva_custom_model = test_evaluator(custom_domain_model, output_path='results/custom_model/')
print("Custom model: ", eva_custom_model)

Base model:  {'qa_test_dot_score_accuracy@10': 0.7674084709260589, 'qa_test_dot_score_precision@10': 0.08137114142139268, 'qa_test_dot_score_recall@10': 0.691947834410146, 'qa_test_dot_score_ndcg@10': 0.5668118816420239, 'qa_test_dot_score_mrr@10': 0.5607966134527511, 'qa_test_dot_score_map@10': 0.5094408170861867}
Custom model:  {'qa_test_dot_score_accuracy@10': 0.8822684852835606, 'qa_test_dot_score_precision@10': 0.09659009332376167, 'qa_test_dot_score_recall@10': 0.8053541517109355, 'qa_test_dot_score_ndcg@10': 0.6812722645623415, 'qa_test_dot_score_mrr@10': 0.6827608849234824, 'qa_test_dot_score_map@10': 0.620703337169597}


### Comparing QA

In [14]:
# Asumiendo que los embeddings están normalizados
question1 = "In the context of selling Direct Long-Term Insurance to Retail Clients, can you identify the rule that mandates insurers and insurance intermediaries to ensure that the insurance products are suitable for their clients?"
answer1 =  "An Insurer or an Insurance Intermediary must comply with the suitability requirement set out in Rule ‎3.4 when conducting any Insurance or Insurance Intermediation Business with or for a Retail Client in respect of Direct Long-Term Insurance."

question2 = 'Under what circumstances, as outlined in Rule ‎12.3.2, is a Fund Manager of a Domestic Fund not mandated to engage the services of an Eligible Custodian?'
answer2 =  'A Fund Manager of a Domestic Fund is not required to appoint an Eligible Custodian for the Fund pursuant to Rule ‎12.3.2 where it meets the requirements in either (2) and (3), or (4).'


emb_q1 = custom_domain_model.encode(question1)  # el embedding está normalizado
emb_q2 = custom_domain_model.encode(question2)  # el embedding está normalizado
ans_1 = custom_domain_model.encode(answer1)
ans_2 = custom_domain_model.encode(answer2)


print("q1", ans_1 @ emb_q1,"(answer1) --", ans_2 @ emb_q1, "(answer2)")
print("q2", ans_1 @ emb_q2, "(answer1) --", ans_2 @ emb_q2, "(answer2)")


print("------ Base Model ------")

emb_q1 = base_model.encode(question1)  # el embedding está normalizado
emb_q2 = base_model.encode(question2)  # el embedding está normalizado
ans_1 = base_model.encode(answer1)
ans_2 = base_model.encode(answer2)


print("q1", ans_1 @ emb_q1,"(answer1) --", ans_2 @ emb_q1, "(answer2)")
print("q2", ans_1 @ emb_q2, "(answer1) --", ans_2 @ emb_q2, "(answer2)")


q1 0.681 (answer1) -- 0.09 (answer2)
q2 0.2454 (answer1) -- 0.8154 (answer2)
------ Base Model ------
q1 0.732 (answer1) -- 0.1403 (answer2)
q2 0.3333 (answer1) -- 0.876 (answer2)


### The custom model mantain original capabilities

Encodes sample text inputs, including the title of an article, author names, and various concepts, using both the custom domain model and the base model. Also, the dot product between the coded vectors is calculated to measure the similarity between different pairs of concepts and between the paper and a concept. Print the similarity scores for each comparison to see the differences. 

In [15]:
paper = "Composable Lightweight Processors"

concept1 = "shark"
concept2 = "ocean"
concept3 = "strawberry"

In [16]:
custom_paper = custom_domain_model.encode(paper)

custom_concept1 = custom_domain_model.encode(concept1)
custom_concept2 = custom_domain_model.encode(concept2)
custom_concept3 = custom_domain_model.encode(concept3)

# Imprimir los resultados y explicaciones
print("------ Fine Tunned Model ------")
print(f"Producto punto entre dos conceptos (shark y ocean): {np.dot(custom_concept1, custom_concept2)}")
print(f"Producto punto entre dos conceptos (shark y strawberry): {np.dot(custom_concept1, custom_concept3)}")
print(f"Producto punto entre el documento y un concepto (ocean): {np.dot(custom_paper, custom_concept2)}")

------ Fine Tunned Model ------
Producto punto entre dos conceptos (shark y ocean): 0.53515625
Producto punto entre dos conceptos (shark y strawberry): 0.290771484375
Producto punto entre el documento y un concepto (ocean): -0.030242919921875


In [17]:
base_paper = base_model.encode(paper)

base_concept1 = base_model.encode(concept1)
base_concept2 = base_model.encode(concept2)
base_concept3 = base_model.encode(concept3)  

# Imprimir los resultados y explicaciones
print("------ Base Model ------")
print(f"Producto punto entre dos conceptos (shark y ocean): {np.dot(base_concept1, base_concept2)}")
print(f"Producto punto entre dos conceptos (shark y strawberry): {np.dot(base_concept1, base_concept3)}")
print(f"Producto punto entre el documento y un concepto (ocean): {np.dot(base_paper, base_concept2)}")

------ Base Model ------
Producto punto entre dos conceptos (shark y ocean): 0.552734375
Producto punto entre dos conceptos (shark y strawberry): 0.274169921875
Producto punto entre el documento y un concepto (ocean): -0.051483154296875
