#### Evaluating the teacher and student models before distillation training

In [1]:
# Install the notebook's dependencies with uv; -q keeps the installer output quiet.
# Only sentence-transformers is passed -U (upgrade); accelerate is constrained to >=0.26.0.
!uv pip install -qU sentence-transformers
!uv pip install -q transformers
!uv pip install -q datasets 
!uv pip install -q ipywidgets
!uv pip install -q pandas 
!uv pip install -q 'accelerate>=0.26.0'

#### Loading datasets

In [2]:
from datasets import load_dataset

# English-English sentence pairs: the 'en-en' configuration, 'test' split.
en_en_dataset = load_dataset(
    path='mteb/sts17-crosslingual-sts',
    name='en-en',
    split='test',
)
en_en_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'score', 'lang'],
    num_rows: 250
})

In [3]:
# Switch the dataset's output format to pandas (in place) so that slicing
# returns a DataFrame; the full slice then materializes every row.
en_en_dataset.set_format(type='pandas')
en_en_df = en_en_dataset[:]
# Show column names, non-null counts and dtypes as a quick sanity check.
en_en_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sentence1  250 non-null    object 
 1   sentence2  250 non-null    object 
 2   score      250 non-null    float64
 3   lang       250 non-null    object 
dtypes: float64(1), object(3)
memory usage: 7.9+ KB


In [9]:
# Preview the first rows of the English-English pairs.
en_en_df.head() 

Unnamed: 0,sentence1,sentence2,score,lang
0,A person is on a baseball team.,A person is playing basketball on a team.,2.4,en-en
1,Our current vehicles will be in museums when e...,The car needs to some work,0.2,en-en
2,A woman supervisor is instructing the male wor...,A woman is working as a nurse.,1.0,en-en
3,A bike is next to a couple women.,A child next to a bike.,2.0,en-en
4,The group is eating while taking in a breathta...,A group of people take a look at an unusual tree.,2.2,en-en


In [10]:
# English-Ukrainian pairs come from a local CSV file. When loading from CSV,
# all rows are assigned to the default 'train' split.
en_ua_dataset = load_dataset(
    'csv',
    data_files='./datasets/sts17-en-ua-gpt-4o.csv',
    split='train',
)
# Pandas output format: the full slice below yields a DataFrame.
en_ua_dataset.set_format(type='pandas')
en_ua_df = en_ua_dataset[:]
en_ua_df.head()

Unnamed: 0,score,sentence1,sentence2,lang
0,2.4,A person is on a baseball team.,Одна людина грає в баскетбол у команді.,en-ua
1,0.2,Our current vehicles will be in museums when e...,Автомобіль потребує ремонту.,en-ua
2,1.0,A woman supervisor is instructing the male wor...,Жінка працює медсестрою.,en-ua
3,2.0,A bike is next to a couple women.,Дитина поруч із велосипедом.,en-ua
4,2.2,The group is eating while taking in a breathta...,Група людей оглядає незвичне дерево.,en-ua


In [11]:
# Ukrainian-Ukrainian pairs come from a local CSV file. When loading from CSV,
# all rows are assigned to the default 'train' split.
ua_ua_dataset = load_dataset(
    'csv',
    data_files='./datasets/sts17-ua-ua-gpt-4o.csv',
    split='train',
)
# Pandas output format: the full slice below yields a DataFrame.
ua_ua_dataset.set_format(type='pandas')
ua_ua_df = ua_ua_dataset[:]
ua_ua_df.head()

Unnamed: 0,score,sentence1,sentence2,lang
0,2.4,Людина є членом бейсбольної команди.,Одна людина грає в баскетбол у команді.,ua-ua
1,0.2,"Наші теперішні автомобілі будуть в музеях, кол...",Автомобіль потребує ремонту.,ua-ua
2,1.0,Жінка-керівник керує чоловіками-працівниками.,Жінка працює медсестрою.,ua-ua
3,2.0,Велосипед стоїть поруч із кількома жінками.,Дитина поруч із велосипедом.,ua-ua
4,2.2,"Група їсть, насолоджуючись захоплюючим краєвидом.",Група людей оглядає незвичне дерево.,ua-ua


#### Defining evaluator 

In [12]:
from sentence_transformers.evaluation import SequentialEvaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Divisor used to rescale the gold scores before evaluation.
# NOTE(review): assumes gold scores are on a 0-5 scale, so this maps them to [0, 1] - confirm.
MAX_STS_SCORE = 5.0


def build_sts_evaluator(df, name):
    """Create an embedding-similarity evaluator for one language pair.

    Args:
        df: DataFrame with 'sentence1', 'sentence2' and 'score' columns.
        name: Evaluator name; it appears in the reported metric keys.

    Returns:
        An EmbeddingSimilarityEvaluator over the rows of ``df``.
    """
    return EmbeddingSimilarityEvaluator(
        # Plain Python lists rather than pandas Series: positional access then
        # cannot be confused with label-based lookup on the DataFrame index.
        sentences1=df['sentence1'].tolist(),
        sentences2=df['sentence2'].tolist(),
        scores=[score / MAX_STS_SCORE for score in df['score']],
        show_progress_bar=False,
        name=name,
    )


en_en_eval = build_sts_evaluator(en_en_df, 'sts17-en-en')
en_ua_eval = build_sts_evaluator(en_ua_df, 'sts17-en-ua')
ua_ua_eval = build_sts_evaluator(ua_ua_df, 'sts17-ua-ua')

# Run the three language-pair evaluators one after another.
evaluator = SequentialEvaluator([en_en_eval, en_ua_eval, ua_ua_eval])

In [14]:
import torch

# Seed the global RNG. torch.manual_seed covers the CPU generator and all CUDA
# devices, whereas torch.cuda.manual_seed only seeds the current CUDA device.
# The trailing semicolon hides the returned Generator from the cell output.
torch.manual_seed(0);

#### Student model 

In [15]:
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Pooling, Transformer

# Student: XLM-RoBERTa base, with the transformer weights requested in float16.
student_model_id = 'FacebookAI/xlm-roberta-base'
transformer_module = Transformer(
    student_model_id,
    model_args={'torch_dtype': torch.float16},
)

# Sentence embedding = mean over token embeddings (CLS pooling disabled).
pooling_module = Pooling(
    word_embedding_dimension=transformer_module.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
)

# Chain transformer -> pooling into one sentence encoder and move it to the GPU.
student_model = SentenceTransformer(modules=[transformer_module, pooling_module])
student_model.to('cuda')
student_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [16]:
# Check the dtype of the underlying transformer (module 0 of the student pipeline).
student_model[0].auto_model.dtype 

torch.float16

#### Teacher model 

In [17]:
from sentence_transformers import SentenceTransformer

# Teacher: multi-qa-mpnet-base-dot-v1, with the transformer weights requested in float16.
teacher_model_id = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
transformer_module = Transformer(
    teacher_model_id,
    model_args={'torch_dtype': torch.float16},
)

# Sentence embedding = the CLS token embedding (mean pooling disabled).
pooling_module = Pooling(
    word_embedding_dimension=transformer_module.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=False,
    pooling_mode_cls_token=True,
)

# Chain transformer -> pooling into one sentence encoder and move it to the GPU.
teacher_model = SentenceTransformer(modules=[transformer_module, pooling_module])
teacher_model.to('cuda')
teacher_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [18]:
# Check the dtype of the underlying transformer (module 0 of the teacher pipeline).
teacher_model[0].auto_model.dtype 

torch.float16

In [26]:
# Model under evaluation. This is a manual toggle: assign teacher_model instead
# to evaluate the teacher, then re-run the evaluation cell.
model = student_model

#### Evaluation!

In [27]:
from sentence_transformers import SentenceTransformerTrainer

# The trainer is created only to run the evaluator on the selected model;
# no training is started in this cell.
trainer = SentenceTransformerTrainer(model=model, evaluator=evaluator)
trainer.evaluate()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_model_preparation_time': 0.0019,
 'eval_sts17-en-en_pearson_cosine': 0.3607233813347164,
 'eval_sts17-en-en_spearman_cosine': 0.5223426349785057,
 'eval_sts17-en-ua_pearson_cosine': 0.12850498333708454,
 'eval_sts17-en-ua_spearman_cosine': 0.13417707572131005,
 'eval_sts17-ua-ua_pearson_cosine': 0.3034736859893272,
 'eval_sts17-ua-ua_spearman_cosine': 0.41535823542678046,
 'eval_sequential_score': 0.41535823542678046,
 'eval_runtime': 0.7673,
 'eval_samples_per_second': 0.0,
 'eval_steps_per_second': 0.0}

#### Results 

Performance is measured as the Spearman correlation (reported ×100) between the cosine similarity of the two sentence embeddings and the gold similarity score, for different model configurations.

| Model                                                | En - En | En - Ua | Ua - Ua | 
| ---------------------------------------------------- | ------- | ------- | ------- |
| XLM-RoBERTa (mean pooling, float 32)                 |  52.2   | -       | -       |
| **XLM-RoBERTa (mean pooling, float 16)**             |  52.2   | 13.4    | 41.5    |
| XLM-RoBERTa (cls token, float 32)                    |  5.8    | -       | -       |
| multi-qa-mpnet-base-dot-v1 (cls token, float 32)     |  76.8   | -       | -       |
| **multi-qa-mpnet-base-dot-v1 (cls token, float 16)** |  75.8   | 12.9    | 62.3    |
| multi-qa-mpnet-base-dot-v1 (mean pooling, float 32)  |  76.0   | -       | -       |
 

#### Conclusions

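Loading the models in lower (`fp16`) precision has a negligible effect on the results: XLM-RoBERTa is unchanged (52.2), while `multi-qa-mpnet-base-dot-v1` drops by one point on `en-en` (76.8 → 75.8).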

`multi-qa-mpnet-base-dot-v1` (a monolingual model) achieves better performance than XLM-RoBERTa on the STS17 (Semantic Textual Similarity) `en-en` pairs.

Changing the pooling strategy for XLM-RoBERTa (initially loaded with mean pooling) to CLS token results in a significant performance decrease.

In contrast, changing the pooling strategy for `multi-qa-mpnet-base-dot-v1` to mean (initially loaded with CLS) results in only a minor decrease.
