### Post-training research

Let's compare the intermediate results of the fine-tuned and distilled version of the embedding model with those of the original, non-fine-tuned version.

#### Loading the models 

In [1]:
from sentence_transformers import SentenceTransformer 

# Baseline for the comparison: the original, non-fine-tuned checkpoint.
# It ships without a sentence-transformers config, so the library wraps it
# with a mean-pooling layer (see the message printed below).
xlm_roberta = SentenceTransformer(model_name_or_path='FacebookAI/xlm-roberta-base')
xlm_roberta  # display the resulting module stack

No sentence-transformers model found with name FacebookAI/xlm-roberta-base. Creating a new one with mean pooling.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [2]:
# The distilled and fine-tuned model that is compared against the baseline.
xlm_roberta_ua_distilled = SentenceTransformer(model_name_or_path='panalexeu/xlm-roberta-ua-distilled')
xlm_roberta_ua_distilled  # display the resulting module stack

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [3]:
# Sanity check: module 0 (Transformer) and module 1 (Pooling) must be configured
# identically in both models, so the comparison below is like-for-like.
assert all(
    xlm_roberta[idx].get_config_dict() == xlm_roberta_ua_distilled[idx].get_config_dict()
    for idx in (0, 1)
)

#### Evaluation on STS

In [4]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator 

# STS Benchmark (Semantic Textual Similarity Benchmark)
en_en_dataset = load_dataset('mteb/sts17-crosslingual-sts', 'en-en', split='test')
# When loading from CSV, the `train` split is assigned by default.
en_ua_dataset = load_dataset('csv', data_files='../datasets/sts17-en-ua-gpt-4o.csv', split='train')
ua_ua_dataset = load_dataset('csv', data_files='../datasets/sts17-ua-ua-gpt-4o.csv', split='train')

# Gold scores are divided by this value to normalize them to the range from 0 to 1
# (assumes the 0-5 STS annotation scale -- TODO confirm for the local CSV files).
STS_MAX_SCORE = 5.0
EVAL_BATCH_SIZE = 16


def build_sts_evaluator(dataset, name, batch_size=EVAL_BATCH_SIZE):
    """Create an EmbeddingSimilarityEvaluator for one STS sentence-pair dataset.

    From documentation: Evaluate a model based on the similarity of the embeddings
    by calculating the Spearman and Pearson rank correlation in comparison to the
    gold standard labels.

    Args:
        dataset: Dataset exposing `sentence1`, `sentence2` and `score` columns.
        name: Evaluator name; it appears as the prefix of the reported metric keys.
        batch_size: Batch size passed to the evaluator.

    Returns:
        The configured EmbeddingSimilarityEvaluator (progress bar disabled).
    """
    return EmbeddingSimilarityEvaluator(
        sentences1=dataset['sentence1'],
        sentences2=dataset['sentence2'],
        scores=[score / STS_MAX_SCORE for score in dataset['score']],
        show_progress_bar=False,
        name=name,
        batch_size=batch_size,
    )


en_en_eval = build_sts_evaluator(en_en_dataset, 'sts17-en-en')
en_ua_eval = build_sts_evaluator(en_ua_dataset, 'sts17-en-ua')
ua_ua_eval = build_sts_evaluator(ua_ua_dataset, 'sts17-ua-ua')

# Composing evaluators in one chain!
# Order matters: in the reported results `eval_sequential_score` equals the
# Spearman score of the last evaluator in the list (UA-UA).
evaluator = SequentialEvaluator([en_en_eval, en_ua_eval, ua_ua_eval])

**Original** 

In [5]:
from sentence_transformers import SentenceTransformerTrainer 

# Score the baseline (non-fine-tuned) model with the chained STS evaluators.
# The trainer is used only as a convenient runner for `evaluate()`; nothing is trained here.
SentenceTransformerTrainer(model=xlm_roberta, evaluator=evaluator).evaluate()



{'eval_model_preparation_time': 0.0016,
 'eval_sts17-en-en_pearson_cosine': 0.361103392778967,
 'eval_sts17-en-en_spearman_cosine': 0.5217129934278466,
 'eval_sts17-en-ua_pearson_cosine': 0.12805519661342843,
 'eval_sts17-en-ua_spearman_cosine': 0.13454840278964744,
 'eval_sts17-ua-ua_pearson_cosine': 0.3024388738322073,
 'eval_sts17-ua-ua_spearman_cosine': 0.4151725913805182,
 'eval_sequential_score': 0.4151725913805182,
 'eval_runtime': 18.3665,
 'eval_samples_per_second': 0.0,
 'eval_steps_per_second': 0.0}

**Distilled and fine-tuned** 

In [6]:
from sentence_transformers import SentenceTransformerTrainer 

# Score the distilled and fine-tuned model with the same chained STS evaluators.
# The trainer is used only as a convenient runner for `evaluate()`; nothing is trained here.
SentenceTransformerTrainer(model=xlm_roberta_ua_distilled, evaluator=evaluator).evaluate()



{'eval_model_preparation_time': 0.0022,
 'eval_sts17-en-en_pearson_cosine': 0.6784819768189478,
 'eval_sts17-en-en_spearman_cosine': 0.7308493185913256,
 'eval_sts17-en-ua_pearson_cosine': 0.5925553784137351,
 'eval_sts17-en-ua_spearman_cosine': 0.6197606373137193,
 'eval_sts17-ua-ua_pearson_cosine': 0.6158998686613163,
 'eval_sts17-ua-ua_spearman_cosine': 0.6445750755380512,
 'eval_sequential_score': 0.6445750755380512,
 'eval_runtime': 15.4817,
 'eval_samples_per_second': 0.0,
 'eval_steps_per_second': 0.0}

#### Simple small test on quotes

In [7]:
# One aphorism in two languages: used to check that translations of the same
# sentence end up close together in the embedding space.
quotes = [
    "An idiot admires complexity, a genius admires simplicity.",  # English
    "Ідіот захоплюється складністю, геній — простотою.",  # Ukrainian
]

# A deliberately mismatched pair ("Hello, World!" vs. Ukrainian for "Farewell, Moon.").
quotes_antonyms = ["Hello, World!", "Прощавай, Місяць."]

# to check zero-shot crosslingual transfer effect
quotes_extended = [
    *quotes,
    "Идиот восхищается сложностью, гений — простотой.",  # Russian
    "Ідыёт захапляецца складанасцю, геній — прастатой.",  # Belarusian
    "Идиот се възхищава на сложността, гений — на простотата.",  # Bulgarian
]

**Close quotes**

In [8]:
# Baseline model: pairwise similarity matrix for the English/Ukrainian quote pair.
embeds = xlm_roberta.encode(quotes)
xlm_roberta.similarity(embeds, embeds)

tensor([[1.0000, 0.9967],
        [0.9967, 1.0000]])

In [9]:
# Distilled model: pairwise similarity matrix for the same English/Ukrainian quote pair.
embeds = xlm_roberta_ua_distilled.encode(quotes)
xlm_roberta_ua_distilled.similarity(embeds, embeds)

tensor([[1.0000, 0.9446],
        [0.9446, 1.0000]])

**Antonyms** (sentences with opposite meaning, so a lower similarity is better here)

In [10]:
# Baseline model: pairwise similarity matrix for the mismatched sentence pair.
embeds = xlm_roberta.encode(quotes_antonyms)
xlm_roberta.similarity(embeds, embeds)

tensor([[1.0000, 0.9867],
        [0.9867, 1.0000]])

In [11]:
# Distilled model: pairwise similarity matrix for the mismatched sentence pair.
embeds = xlm_roberta_ua_distilled.encode(quotes_antonyms)
xlm_roberta_ua_distilled.similarity(embeds, embeds)

tensor([[1.0000, 0.7311],
        [0.7311, 1.0000]])

**Zero-shot cross-lingual transfer**

In [12]:
# Baseline model: 5x5 similarity matrix over the quote in English, Ukrainian,
# Russian, Belarusian and Bulgarian (row/column order follows `quotes_extended`).
embeds = xlm_roberta.encode(quotes_extended)
xlm_roberta.similarity(embeds, embeds)

tensor([[1.0000, 0.9967, 0.9970, 0.9958, 0.9964],
        [0.9967, 1.0000, 0.9983, 0.9971, 0.9978],
        [0.9970, 0.9983, 1.0000, 0.9970, 0.9987],
        [0.9958, 0.9971, 0.9970, 1.0000, 0.9964],
        [0.9964, 0.9978, 0.9987, 0.9964, 1.0000]])

In [13]:
# Distilled model: 5x5 similarity matrix over the quote in English, Ukrainian,
# Russian, Belarusian and Bulgarian (row/column order follows `quotes_extended`).
embeds = xlm_roberta_ua_distilled.encode(quotes_extended)
xlm_roberta_ua_distilled.similarity(embeds, embeds)

tensor([[1.0000, 0.9446, 0.9264, 0.9150, 0.9338],
        [0.9446, 1.0000, 0.9342, 0.9505, 0.9434],
        [0.9264, 0.9342, 1.0000, 0.9301, 0.9913],
        [0.9150, 0.9505, 0.9301, 1.0000, 0.9309],
        [0.9338, 0.9434, 0.9913, 0.9309, 1.0000]])