In [None]:
!git clone https://github.com/mymusise/sentence-transformers-tf.git
%cd sentence-transformers-tf
!pip install -q sentence-transformers datasets

# Initialize model

In [None]:
from sentence_transformers_tf import TFSentenceTransformer

# Build the TF sentence-embedding model from the checkpoint named
# "sentence-transformers/stsb-xlm-r-multilingual". The resulting object is reused
# by the preprocessing, fine-tuning and evaluation cells below.
tfstmodel = TFSentenceTransformer("sentence-transformers/stsb-xlm-r-multilingual")

# Get AFQMC dataset

In [None]:
from datasets import load_dataset

# Load the "afqmc" configuration of the "clue" dataset; later cells read its
# 'train' and 'validation' splits.
dataset = load_dataset("clue", "afqmc")
# Bare last expression so the notebook displays the loaded object.
dataset

# Preprocess

In [None]:
def encode(examples):
  """Tokenize both sentences of an example into fixed-length id sequences.

  ``examples['sentence1']`` and ``examples['sentence2']`` are each passed through
  the model's tokenizer, padded/truncated to exactly 32 tokens and returned as
  TensorFlow tensors. Only the first row of each ``input_ids`` tensor is kept.

  Returns:
    dict with ``"input_ids"`` (ids of sentence 1) and ``"target_ids"`` (ids of
    sentence 2).
  """
  tokenize = tfstmodel.model.tokenizer
  # Identical tokenizer settings for both sentences so the two id sequences align in length.
  tokenizer_kwargs = dict(return_tensors="tf", max_length=32, padding="max_length", truncation=True)

  def first_row_ids(text):
    # The tokenizer output carries a leading batch axis; [0] selects its first row.
    return tokenize(text, **tokenizer_kwargs)['input_ids'][0]

  return {
    "input_ids": first_row_ids(examples['sentence1']),
    "target_ids": first_row_ids(examples['sentence2']),
  }

# Batch size for the TensorFlow training dataset built below.
bs = 48
# Apply encode() across the training split and drop the raw columns it no longer needs;
# the "label" column is kept because it is used as the training target.
train_inputs = dataset['train'].map(encode, remove_columns=["idx", "sentence1", "sentence2"])
# Convert to a shuffled, batched TF dataset: the two id sequences are the features
# and "label" is the target.
train_inputs = train_inputs.to_tf_dataset(columns=["input_ids", "target_ids"], label_cols=["label"], batch_size=bs, shuffle=True)

# Finetune

In [None]:
import tensorflow as tf

# Adam with a learning rate of 2e-5; the beta and epsilon values are spelled out explicitly.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# NOTE(review): no loss is passed to compile() — presumably TFSentenceTransformer
# defines its own training loss; confirm against the library.
tfstmodel.compile(optimizer=optimizer)

# Train for 5 epochs over the batched training dataset.
tfstmodel.fit(train_inputs, epochs=5)

# Evaluation

In [None]:
from sklearn.metrics import precision_recall_fscore_support, precision_score, f1_score
from scipy.spatial.distance import cosine
from tqdm import tqdm

# Evaluate on the validation split: embed both sentences of every pair, predict
# label 1 when their cosine similarity exceeds SIM_THRESHOLD (else 0), and score
# the predictions against the dataset labels.
SIM_THRESHOLD = 0.5  # decision boundary applied to the cosine similarity

eva_input = []  # not populated in this cell; kept so any later cell referencing it still works
eva_y = []
eva_y_pred = []

test_set = dataset['validation']

# Encode each column in one call instead of sentence by sentence.
vec1s = tfstmodel.encode(test_set['sentence1'])
vec2s = tfstmodel.encode(test_set['sentence2'])
label = test_set['label']
# zip() has no length, so pass `total` explicitly; without it tqdm can only show
# a running count, not a percentage or ETA.
for v1, v2, l in tqdm(zip(vec1s, vec2s, label), total=len(label)):
    # scipy's cosine() is a distance (1 - similarity), so convert it back.
    dis = cosine(v1, v2)
    sim = 1 - dis
    eva_y_pred.append(1 if sim > SIM_THRESHOLD else 0)
    eva_y.append(l)

precision = precision_score(eva_y, eva_y_pred)
f1 = f1_score(eva_y, eva_y_pred)
print(f"{precision=}, {f1=}")