In [1]:
# Install the third-party packages this notebook imports (-q keeps the install log quiet).
# NOTE(review): `datasets` is imported further down but is not installed here — presumably it
# arrives as a transitive dependency of the packages below; confirm, and consider pinning versions
# so the recorded benchmark numbers stay reproducible.
!uv pip install -q mteb
!uv pip install -q sentence-transformers
!uv pip install -q pandas
!uv pip install -q rich

### Post-train Research

This notebook runs inference tests and benchmarks on the original and fine-tuned models.

One of the benchmarks is STS-17 in the following configurations: en-en, en-ua (machine translated), ua-ua (machine translated). 

The machine translation of the STS-17 benchmark to the Ukrainian language was performed in this [notebook](../dataset_translation.ipynb).

#### Loading the models 

In [2]:
from sentence_transformers import SentenceTransformer 

# Original (not fine-tuned) checkpoint, used as the baseline in the comparisons below.
# The hub repo has no sentence-transformers config, so the library wraps it with mean pooling
# (see the log message and the module printout this cell produces).
xlm_roberta = SentenceTransformer('FacebookAI/xlm-roberta-base')
xlm_roberta

No sentence-transformers model found with name FacebookAI/xlm-roberta-base. Creating a new one with mean pooling.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [3]:
# Fine-tuned checkpoint under evaluation; the bare last line displays its module layout
# so it can be compared by eye with the original model's printout.
xlm_roberta_ua_distilled = SentenceTransformer('panalexeu/xlm-roberta-ua-distilled')
xlm_roberta_ua_distilled

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [4]:
# Sanity check: the original and fine-tuned checkpoints must expose identical settings for
# both pipeline modules (index 0 = Transformer, index 1 = Pooling, as shown in the printouts).
assert all(
    xlm_roberta[module_idx].get_config_dict() == xlm_roberta_ua_distilled[module_idx].get_config_dict()
    for module_idx in (0, 1)
)

In [5]:
# Popular monolingual embedding model with minimal memory consumption, loaded as an
# additional reference point. Per the printout below it differs from the two XLM-R models:
# BertModel backbone, max_seq_length of 256, 384-dimensional embeddings and a final Normalize() module.
all_mini_l6 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
all_mini_l6

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

#### STS-17 Evaluation

In [6]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator


def make_sts_evaluator(dataset, name, batch_size=16):
    """Build an ``EmbeddingSimilarityEvaluator`` for one STS-17 language configuration.

    From documentation: the evaluator scores a model based on the similarity of the
    embeddings by calculating the Spearman and Pearson rank correlation in comparison
    to the gold standard labels.

    Args:
        dataset: Dataset exposing ``sentence1``, ``sentence2`` and ``score`` columns.
        name: Name given to the evaluator (e.g. ``'sts17-en-en'``).
        batch_size: Batch size forwarded to the evaluator.

    Returns:
        The configured ``EmbeddingSimilarityEvaluator``.
    """
    return EmbeddingSimilarityEvaluator(
        sentences1=dataset['sentence1'],
        sentences2=dataset['sentence2'],
        # Normalizing the gold scores to the range from 0 to 1
        # (assumes labels are on the 0-5 STS scale — TODO confirm for the translated CSV files).
        scores=[score / 5.0 for score in dataset['score']],
        show_progress_bar=False,
        name=name,
        batch_size=batch_size,
    )


# STS Benchmark (Semantic Textual Similarity Benchmark)
en_en_dataset = load_dataset('mteb/sts17-crosslingual-sts', 'en-en', split='test')
# When loading from csv, the data is assigned to the train split by default.
en_ua_dataset = load_dataset('csv', data_files='../datasets/sts17-en-ua-gpt-4o.csv', split='train')
ua_ua_dataset = load_dataset('csv', data_files='../datasets/sts17-ua-ua-gpt-4o.csv', split='train')

# One evaluator per language configuration, all built with identical settings.
en_en_eval = make_sts_evaluator(en_en_dataset, 'sts17-en-en')
en_ua_eval = make_sts_evaluator(en_ua_dataset, 'sts17-en-ua')
ua_ua_eval = make_sts_evaluator(ua_ua_dataset, 'sts17-ua-ua')

# Composing evaluators in one chain!
evaluator = SequentialEvaluator([en_en_eval, en_ua_eval, ua_ua_eval])

**Original** 

In [7]:
from rich import print
from sentence_transformers import SentenceTransformerTrainer

# The trainer serves only as an evaluation harness here: it receives the original model
# and the chained STS-17 evaluator, and nothing else.
trainer = SentenceTransformerTrainer(model=xlm_roberta, evaluator=evaluator)
res = trainer.evaluate()

# `print` is rich's drop-in replacement imported above.
print(res)

**Fine-tuned** 

In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformerTrainer

# The trainer serves only as an evaluation harness here: it receives the fine-tuned model
# and the chained STS-17 evaluator, and nothing else.
trainer = SentenceTransformerTrainer(model=xlm_roberta_ua_distilled, evaluator=evaluator)
res = trainer.evaluate()

print(res)

**all-MiniLM-L6-v2** 

In [9]:
import pandas as pd
from sentence_transformers import SentenceTransformerTrainer

# The trainer serves only as an evaluation harness here: it receives all-MiniLM-L6-v2
# and the chained STS-17 evaluator, and nothing else.
trainer = SentenceTransformerTrainer(model=all_mini_l6, evaluator=evaluator)
res = trainer.evaluate()

print(res)

#### MTEB

In [10]:
import mteb
# Select only the WebFAQRetrieval task, limited to the 'ukr' language.
# Broader alternative (every retrieval task available for 'ukr'):
#   mteb.get_tasks(task_types=['Retrieval'], languages=['ukr'])
tasks = mteb.get_tasks(tasks=['WebFAQRetrieval'], languages=['ukr'])
# Shared runner reused by the per-model cells below.
evaluation = mteb.MTEB(tasks=tasks)
tasks

MTEBTasks(WebFAQRetrieval(name='WebFAQRetrieval', languages=['ukr']),)

**Original model**

In [11]:
# Run the selected MTEB task(s) on the original model, encoding in batches of 32.
evaluation.run(xlm_roberta, encode_kwargs={"batch_size": 32})

Batches:   0%|          | 0/215 [00:00<?, ?it/s]

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

Batches:   0%|          | 0/579 [00:00<?, ?it/s]

[TaskResult(task_name=WebFAQRetrieval, scores=...)]

**Fine-tuned** 

In [12]:
# Run the selected MTEB task(s) on the fine-tuned model, encoding in batches of 32.
evaluation.run(xlm_roberta_ua_distilled, encode_kwargs={"batch_size": 32})

Batches:   0%|          | 0/215 [00:00<?, ?it/s]

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

Batches:   0%|          | 0/579 [00:00<?, ?it/s]

[TaskResult(task_name=WebFAQRetrieval, scores=...)]

**all-MiniLM-L6-v2** 

In [21]:
# Run the selected MTEB task(s) on all-MiniLM-L6-v2, encoding in batches of 32.
# NOTE(review): the recorded output shows no encoding progress bars for this run, unlike the
# two runs above — possibly results were loaded from a cache; confirm before comparing numbers.
evaluation.run(all_mini_l6, encode_kwargs={"batch_size": 32})

[TaskResult(task_name=WebFAQRetrieval, scores=...)]

#### Minimal test on quotes

In [22]:
# Parallel pair: the same aphorism in English (index 0) and Ukrainian (index 1).
quotes = [
    "An idiot admires complexity, a genius admires simplicity.",
    "Ідіот захоплюється складністю, геній — простотою.",
]

# Contrasting pair: an English greeting and a Ukrainian farewell addressed to different things.
quotes_antonyms = ["Hello, World!", "Прощавай, Місяць."]

# To check the zero-shot cross-lingual transfer effect, the parallel pair is extended
# with the same aphorism in three more languages.
quotes_extended = [
    *quotes,
    "Идиот восхищается сложностью, гений — простотой.",          # Russian
    "Ідыёт захапляецца складанасцю, геній — прастатой.",         # Belarusian
    "Идиот се възхищава на сложността, гений — на простотата.",  # Bulgarian
]
# P.S. I love Terry Davis.

**Close quotes**

In [23]:
# Original model: embed the English/Ukrainian pair and display the pairwise similarity matrix;
# the off-diagonal entry is the cross-lingual score.
embeds = xlm_roberta.encode(quotes)
xlm_roberta.similarity(embeds, embeds)

tensor([[1.0000, 0.9967],
        [0.9967, 1.0000]])

In [24]:
# Fine-tuned model: same English/Ukrainian pair, same pairwise similarity matrix.
embeds = xlm_roberta_ua_distilled.encode(quotes)
xlm_roberta_ua_distilled.similarity(embeds, embeds)

tensor([[1.0000, 0.9446],
        [0.9446, 1.0000]])

**Antonyms**

In [25]:
# Original model on the contrasting pair. In the recorded run the score stays close to the
# one obtained for the parallel quotes, i.e. the raw similarities barely separate the two cases.
embeds = xlm_roberta.encode(quotes_antonyms)
xlm_roberta.similarity(embeds, embeds)

tensor([[1.0000, 0.9867],
        [0.9867, 1.0000]])

In [26]:
# Fine-tuned model on the contrasting pair; compare the off-diagonal score with the one
# it produced for the parallel quotes.
embeds = xlm_roberta_ua_distilled.encode(quotes_antonyms)
xlm_roberta_ua_distilled.similarity(embeds, embeds) 

tensor([[1.0000, 0.7311],
        [0.7311, 1.0000]])

**Zero-shot cross-lingual transfer**

In [27]:
# Original model on all five translations of the aphorism
# (row/column order: English, Ukrainian, Russian, Belarusian, Bulgarian).
embeds = xlm_roberta.encode(quotes_extended)
xlm_roberta.similarity(embeds, embeds)

tensor([[1.0000, 0.9967, 0.9970, 0.9958, 0.9964],
        [0.9967, 1.0000, 0.9983, 0.9971, 0.9978],
        [0.9970, 0.9983, 1.0000, 0.9970, 0.9987],
        [0.9958, 0.9971, 0.9970, 1.0000, 0.9964],
        [0.9964, 0.9978, 0.9987, 0.9964, 1.0000]])

In [28]:
# Fine-tuned model on all five translations of the aphorism
# (row/column order: English, Ukrainian, Russian, Belarusian, Bulgarian).
embeds = xlm_roberta_ua_distilled.encode(quotes_extended)
xlm_roberta_ua_distilled.similarity(embeds, embeds)

tensor([[1.0000, 0.9446, 0.9264, 0.9150, 0.9338],
        [0.9446, 1.0000, 0.9342, 0.9505, 0.9434],
        [0.9264, 0.9342, 1.0000, 0.9301, 0.9913],
        [0.9150, 0.9505, 0.9301, 1.0000, 0.9309],
        [0.9338, 0.9434, 0.9913, 0.9309, 1.0000]])