In [1]:
!uv pip install -q mteb
!uv pip install -q sentence-transformers
!uv pip install -q pandas
!uv pip install -q rich

### Post-train Research

In this notebook, some inference tests and benchmarking of the fine-tuned and original models are performed. 

One of the benchmarks is STS-17 in the following configurations: en-en, en-ua (machine translated), ua-ua (machine translated). 

The machine translation of the STS-17 benchmark to the Ukrainian language was performed in this [notebook](../dataset_translation.ipynb).

#### Loading the models 

In [2]:
from sentence_transformers import SentenceTransformer 

# original 
xlm_roberta = SentenceTransformer('FacebookAI/xlm-roberta-base')
xlm_roberta

No sentence-transformers model found with name FacebookAI/xlm-roberta-base. Creating a new one with mean pooling.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [3]:
# fine-tuned 
xlm_roberta_ua_distilled = SentenceTransformer('panalexeu/xlm-roberta-ua-distilled')
xlm_roberta_ua_distilled

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [4]:
# assure that the models have identical configurations 
assert xlm_roberta[0].get_config_dict() == xlm_roberta_ua_distilled[0].get_config_dict()
assert xlm_roberta[1].get_config_dict() == xlm_roberta_ua_distilled[1].get_config_dict()

In [5]:
# teacher model
multi_qa_mpnet = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
multi_qa_mpnet

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

#### STS-17 Evaluation

In [6]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator 

# STS Benchmark (Semantic Textual Similarity Benchmark)
en_en_dataset = load_dataset('mteb/sts17-crosslingual-sts', 'en-en', split='test')
en_ua_dataset = load_dataset('csv', data_files='../datasets/sts17-en-ua-gpt-4o.csv', split='train') # when loading from csv by default train split is assigned
ua_ua_dataset = load_dataset('csv', data_files='../datasets/sts17-ua-ua-gpt-4o.csv', split='train')  # when loading from csv by default train split is assigned

# From documentation: Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation in comparison to the gold standard labels. 
en_en_eval = EmbeddingSimilarityEvaluator(
    sentences1=en_en_dataset['sentence1'],
    sentences2=en_en_dataset['sentence2'],
    scores=[score / 5.0 for score in en_en_dataset['score']],  # normalizing to score from to 1
    show_progress_bar=False,
    name='sts17-en-en',
    batch_size=16
)

en_ua_eval = EmbeddingSimilarityEvaluator(
    sentences1=en_ua_dataset['sentence1'],
    sentences2=en_ua_dataset['sentence2'],
    scores=[score / 5.0 for score in en_ua_dataset['score']],
    show_progress_bar=False,
    name='sts17-en-ua',
    batch_size=16
)

ua_ua_eval = EmbeddingSimilarityEvaluator(
    sentences1=ua_ua_dataset['sentence1'],
    sentences2=ua_ua_dataset['sentence2'],
    scores=[score / 5.0 for score in ua_ua_dataset['score']],
    show_progress_bar=False,
    name='sts17-ua-ua',
    batch_size=16
)

# Composing evaluators in one chain! 
evaluator = SequentialEvaluator([en_en_eval, en_ua_eval, ua_ua_eval]) 

Using the latest cached version of the dataset since mteb/sts17-crosslingual-sts couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'en-en' at /home/oleksii/.cache/huggingface/datasets/mteb___sts17-crosslingual-sts/en-en/0.0.0/faeb762787bd10488a50c8b5be4a3b82e411949c (last modified on Wed Apr  9 19:03:01 2025).


**Original** 

In [7]:
from rich import print 
from sentence_transformers import SentenceTransformerTrainer 

res = SentenceTransformerTrainer(
    model=xlm_roberta,
    evaluator=evaluator
).evaluate()

print(res)

**Fine-tuned** 

In [8]:
import pandas as pd 
from sentence_transformers import SentenceTransformerTrainer 

res = SentenceTransformerTrainer(
    model=xlm_roberta_ua_distilled,
    evaluator=evaluator
).evaluate()

print(res)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


**multi-qa-mpnet-base-dot-v1** 

In [9]:
import pandas as pd 
from sentence_transformers import SentenceTransformerTrainer 

res = SentenceTransformerTrainer(
    model=multi_qa_mpnet,
    evaluator=evaluator
).evaluate()

print(res)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Spearman scores table: 

| model                                | en-en | en-ua    | ua-ua    | 
| ------------------------------------ | ----- | -------- | -------- |  
| multi-qa-mpnet-base-dot-v1           | 75.8  | 12.9     | 62.3     |
| XLM-RoBERTa                          | 52.2  | 13.5     | 41.5     |
| xlm-roberta-ua-distilled*            | 73.1  | **62.0** | **64.5** |

#### MTEB

In [None]:
import mteb
tasks = mteb.get_tasks(tasks=['WebFAQRetrieval'], languages=['ukr'])  # mteb.get_tasks(task_types=['Retrieval'], languages=['ukr'])
evaluation = mteb.MTEB(tasks=tasks)
tasks

**Original model**

In [None]:
evaluation.run(xlm_roberta, encode_kwargs={"batch_size": 32})

**Fine-tuned** 

In [None]:
evaluation.run(xlm_roberta_ua_distilled, encode_kwargs={"batch_size": 32})

**all-MiniLM-L6-v2** 

In [None]:
evaluation.run(multi_qa_mpnet, encode_kwargs={"batch_size": 32})

#### Minimal test on quotes

In [None]:
quotes = [
    "An idiot admires complexity, a genius admires simplicity.",          # English
    "Ідіот захоплюється складністю, геній — простотою.",                  # Ukrainian
]

quotes_antonyms = [
    "Hello, World!",
    "Прощавай, Місяць."
]

# to check zero-shot crosslingual transfer effect 
quotes_extended = quotes + [
    "Идиот восхищается сложностью, гений — простотой.",                   # Russian
    "Ідыёт захапляецца складанасцю, геній — прастатой.",                  # Belarusian
    "Идиот се възхищава на сложността, гений — на простотата."            # Bulgarian
]
# P.S. I love Terry Davis.

**Close quotes**

In [None]:
embeds = xlm_roberta.encode(quotes)
xlm_roberta.similarity(embeds, embeds)

In [None]:
embeds = xlm_roberta_ua_distilled.encode(quotes)
xlm_roberta_ua_distilled.similarity(embeds, embeds)

**Antonyms**

In [None]:
embeds = xlm_roberta.encode(quotes_antonyms)
xlm_roberta.similarity(embeds, embeds)

In [None]:
embeds = xlm_roberta_ua_distilled.encode(quotes_antonyms)
xlm_roberta_ua_distilled.similarity(embeds, embeds) 

**Zero-shot cross-lingual transfer**

In [None]:
embeds = xlm_roberta.encode(quotes_extended)
xlm_roberta.similarity(embeds, embeds)

In [None]:
embeds = xlm_roberta_ua_distilled.encode(quotes_extended)
xlm_roberta_ua_distilled.similarity(embeds, embeds)