In [1]:
# Add the parent directory to the module search path.
# NOTE(review): presumably this is what makes the later
# `from eval.dataset import RAGDataset` resolvable — confirm against the repo layout.
import sys
sys.path.append("..")


In [2]:
import datasets
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Step 1: instantiate the pretrained checkpoint that will be fine-tuned below.
model = SentenceTransformer("BAAI/bge-small-zh-v1.5")

In [10]:
from eval.dataset import RAGDataset

# Instruction text passed as `query_instruction_for_retrieval`; the same value
# is used for both the training and the validation split.
query_instruction = "为这个句子生成表示以用于检索相关文章："

dataset = RAGDataset.from_file(
    "../data/airbench_qa_healthcare_zh_synthesis_hard_negative.json"
)

# Training split, built with negative_num=15.
train_dataset, train_dataset_keys = dataset.get_train_dataset(
    split="train",
    negative_num=15,
    query_instruction_for_retrieval=query_instruction,
)

# Validation split, built with negative_num=1.
eval_dataset, eval_dataset_keys = dataset.get_train_dataset(
    split="val",
    negative_num=1,
    query_instruction_for_retrieval=query_instruction,
)

No negative docs found for query_id: q-595, skip this query
No negative docs found for query_id: q-1119, skip this query
No negative docs found for query_id: q-1517, skip this query
No negative docs found for query_id: q-1572, skip this query


In [6]:
# Print the first two training records to sanity-check their structure.
for record in train_dataset[:2]:
    print(record)


{'query': '为这个句子生成表示以用于检索相关文章：脂溢性皮炎主要发生在哪些区域？', 'pos': '脂溢性皮炎，也称为脂溢性湿疹，是一种在皮脂腺中发生的慢性丘疹性鳞状炎症性皮肤病。这种疾病在成人和新生儿中很常见，并且经常发生在富含皮脂腺的区域，例如头部，躯干等。病因这种疾病的病因尚不完全清楚。脂溢性皮炎的发病机制可能与脂溢性，微生物，神经递质异常，生理和气候因素，营养缺乏和药物的作用有关。近年来，已经强调了糠P孢子菌与脂溢性皮炎之间的关系，并且认为卵形糠P孢子菌在脂溢性皮炎的发病机理中起重要作用。此外，精神因素，饮食习惯，B族维生素缺乏症和酒精成瘾也可能对疾病的发生和发展产生一定的影响。全身性和难治性脂溢性皮炎可能是HIV感染的重要皮肤症状。根据典型的临床症状和体征，诊断并不困难。应确定以下疾病：1.头部和面部的牛皮癣伤害散落成片状，边界清晰，鳞片很粗，触感粗糙，头发不脱落，短发聚集成束，严重的伤害可以连成大块，延伸到前发际，侵入额头几厘米。刮擦鳞片具有薄膜现象（即刮掉鳞片，下面有红色光泽薄膜）和出血现象（即在轻微刮擦薄膜时可能出现小的出血点）。胶片现象和出血现象是牛皮癣病变的重要特征。2.玫瑰糠疹它通常发生在颈部，躯干和四肢的近端部分，呈椭圆形黄斑，中心略带黄色，边缘略微凸起。它呈浅红色，上面覆盖着白色糠状鳞片。最初，病变是单个病变，称为母斑;母斑逐渐变大，直径为2-5厘米或更大。有时，2-3个母斑可以同时出现。1-2个月后，一个接一个出现较小的红斑，发生在躯干。皮疹的长轴与皮纹一致。一般来说，它可以在4-6周内自发消退而不会复发。3.体癣病变边缘抬高并变窄。边界清晰，并且有一个圆形损伤，中央愈合延伸到周围区域。瘙痒症是显而易见的，患者通常有手足癣和足癣的病史。4.红斑性天疱疮：它主要分布在面部，颈部，胸部和背部中间。从面部对称性红斑，面部鳞屑和结痂开始，颈部和胸部背部出现红斑，破裂后形成结痂，尼氏征阳性。', 'neg_0': '脂溢性皮炎，也称为脂溢性湿疹，通常生长在分泌皮脂腺的区域，例如头部，面部，后颈部和躯干。它是慢性炎症的皮肤病。成人和新生儿经常患有这种皮肤病。在头皮上生长时，毛囊上常有红色斑块，上面有一层油腻的银色或绒面革，有不同程度的瘙痒。如果它生长在脸上，颈部后部通常从头部扩散，有黄红色或油腻的白色鳞片和绒面革和瘙痒。这应该及时处理。用

In [7]:
# Training objective: a MultipleNegativesRankingLoss constructed around the
# model that is being fine-tuned; it is handed to the trainer further down.
loss = MultipleNegativesRankingLoss(model)

In [12]:
args = SentenceTransformerTrainingArguments(
    # Required: directory the trainer writes its outputs to.
    output_dir="../checkpoint/bge-small-zh-v1.5-sft-airbench",
    # --- Schedule and batch sizing ---
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # 32 per step * 2 accumulated steps = 64 samples per optimizer update
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # --- Optimisation ---
    optim="adamw_torch_fused",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    seed=42,
    # --- Mixed precision ---
    fp16=False,  # leave False if the GPU raises an error when running FP16
    bf16=True,  # only enable on a GPU that supports BF16
    # --- Evaluation, logging and checkpoint cadence (all step-based) ---
    eval_strategy="steps",
    eval_steps=20,
    logging_steps=20,
    save_strategy="steps",
    save_steps=100000,  # very large interval; the model is saved explicitly after training
    save_total_limit=1,
)

In [13]:
import torch
from typing import Any

# Compatibility shim between transformers and sentence_transformers.
# NOTE(review): presumably the installed transformers Trainer calls
# `compute_loss` with a `num_items_in_batch` argument that the installed
# sentence_transformers trainer does not accept — confirm against the pinned
# versions, and drop this subclass once they agree.
class CustomSentenceTransformerTrainer(SentenceTransformerTrainer):
    """SentenceTransformerTrainer whose `compute_loss` tolerates `num_items_in_batch`."""

    def compute_loss(
        self,
        model: SentenceTransformer,
        inputs: dict[str, torch.Tensor | Any],
        return_outputs: bool = False,
        num_items_in_batch: int | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, Any]]:
        """Delegate to the parent `compute_loss`, discarding `num_items_in_batch`.

        Args:
            model: Model passed through unchanged to the parent implementation.
            inputs: Batch passed through unchanged to the parent implementation.
            return_outputs: Passed through unchanged to the parent implementation.
            num_items_in_batch: Accepted so callers may supply it, but
                intentionally not forwarded to the parent.

        Returns:
            Whatever the parent `compute_loss` returns for the three forwarded
            arguments.
        """
        # Only the three positional arguments are forwarded; the extra
        # argument is dropped on purpose.
        return super().compute_loss(model, inputs, return_outputs)
       
    
# Turn the in-memory example lists into `datasets.Dataset` objects, keeping
# only the columns named by the key list that accompanies each split.
train_split = datasets.Dataset.from_list(train_dataset).select_columns(train_dataset_keys)
eval_split = datasets.Dataset.from_list(eval_dataset).select_columns(eval_dataset_keys)

# Wire model, arguments, loss and both splits into the patched trainer.
trainer = CustomSentenceTransformerTrainer(
    model=model,
    args=args,
    loss=loss,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run fine-tuning; as the last expression its return value is shown as the cell output.
trainer.train()

  super().__init__(


Step,Training Loss,Validation Loss
20,5.9736,0.660329
40,5.1356,0.649673
60,5.1576,0.657064
80,5.2499,0.667316
100,4.9397,0.605523
120,4.72,0.599599
140,5.2255,0.608033


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=153, training_loss=5.205962486516416, metrics={'train_runtime': 1346.4257, 'train_samples_per_second': 7.286, 'train_steps_per_second': 0.114, 'total_flos': 0.0, 'train_loss': 5.205962486516416, 'epoch': 0.996742671009772})

In [14]:
# Persist the fine-tuned model. The target is the same directory that was
# given as `output_dir` in the training arguments.
model.save("../checkpoint/bge-small-zh-v1.5-sft-airbench")