In [1]:
# !pip install sentence_transformers -U

**Import thư viện**

In [2]:
import os
from sentence_transformers import SentenceTransformer, CrossEncoder
import pandas as pd
from sentence_transformers import models, losses, evaluation
import json
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers.evaluation import RerankingEvaluator

# Debugging aid: with CUDA_LAUNCH_BLOCKING=1 kernel launches run synchronously,
# so a device-side error is reported at the call that triggered it.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

**Download model**

In [3]:
# model_name = 'vinai/phobert-base-v2'
# word_embedding_model = models.Transformer(model_name, max_seq_length=216)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [4]:
# Checkpoint identifier of the pretrained bi-encoder that gets fine-tuned below.
model_name = "bkai-foundation-models/vietnamese-bi-encoder"
model = SentenceTransformer(model_name_or_path=model_name)

In [9]:
# JSON dataset locations, relative to the notebook's working directory.
TRAIN_DATASET_FPATH = "data/train.json"           # training records
VAL_DATASET_FPATH = "data/test_embedding.json"    # records used for evaluation

In [10]:
# The files are only read, so open them read-only ('r+' would additionally
# require write permission). The encoding is pinned to UTF-8 so non-ASCII text
# decodes the same way regardless of the platform's default encoding.
with open(TRAIN_DATASET_FPATH, 'r', encoding='utf-8') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r', encoding='utf-8') as f:
    val_dataset = json.load(f)

**Train dataset**

In [11]:
dataset = train_dataset

# Build the DataFrame once and take both columns from it, instead of
# constructing an identical frame for each column.
train_df = pd.DataFrame(dataset)
queries = list(train_df['question'])
relevant_docs = list(train_df['chunk'])

In [13]:
# One training example per (question, chunk) pair, each labelled 1.
examples = [
    InputExample(texts=[question, passage], label=1)
    for question, passage in zip(queries, relevant_docs)
]

**Test dataset**

In [15]:
# Sort the validation rows by chunk id and then by question; reset_index()
# keeps the previous index as an extra column and renumbers the rows from 0.
val_frame = pd.DataFrame(val_dataset)
test = val_frame.sort_values(by=['label_chunk_id', 'question']).reset_index()

# Distinct questions, in the order they first appear in the sorted frame.
questions_test = test['question'].unique()

In [16]:
# Build one reranking sample per question: passages with label == 1 are the
# positives, every other passage of that question is a negative.
#
# A single groupby pass replaces re-filtering the whole frame for every
# question. sort=False keeps the groups in first-appearance order (the same
# order as test['question'].unique()) and rows keep their order inside a group.
# Rows whose question is missing are skipped by groupby.
samples = []
for ques, group in test.groupby('question', sort=False):
    is_positive = group['label'] == 1
    samples.append({
        'query': ques,
        'positive': group.loc[is_positive, 'passage'].tolist(),
        'negative': group.loc[~is_positive, 'passage'].tolist(),
    })

**Fine-tune**

In [17]:
# Reranking evaluator over the validation samples, with the MRR cutoff set to 1.
evaluator = RerankingEvaluator(
    samples=samples,
    mrr_at_k=1,
)

In [18]:
# Training hyperparameters.
EPOCHS = 5        # number of passes over the training examples
BATCH_SIZE = 4    # examples per DataLoader batch

In [19]:
# Shuffled mini-batches over the training examples.
loader = DataLoader(examples, shuffle=True, batch_size=BATCH_SIZE)

In [20]:
# The training examples are (question, relevant chunk) pairs, i.e. the
# (anchor, positive) format MultipleNegativesRankingLoss is designed for: the
# chunks of the other questions in the batch serve as negatives.
# ContrastiveTensionLossInBatchNegatives is an unsupervised objective intended
# for pairs of identical sentences and trains two separate copies of the
# encoder, so it does not fit this supervised question/chunk data.
train_loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
steps_per_epoch = len(loader)

# Warm-up covers 10% of all training steps (steps per epoch * epochs).
warmup_steps = int(steps_per_epoch * EPOCHS * 0.1)
# The evaluator is invoked every 10% of an epoch's steps.
eval_every = int(steps_per_epoch * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    evaluation_steps=eval_every,
    warmup_steps=warmup_steps,
    output_path='BKAI/',
    use_amp=True,
    show_progress_bar=True,
)

In [34]:
# Persist the current in-memory model weights to the 'BKAI' directory.
# NOTE(review): model.fit() was also given output_path='BKAI/'; if it stored a
# checkpoint there, this call writes into the same directory — confirm intended.
model.save(path='BKAI')