In [1]:
!uv pip install -qU sentence-transformers
!uv pip install -q transformers
!uv pip install -q datasets 
!uv pip install -q ipywidgets
!uv pip install -q pandas 
!uv pip install -q accelerate 

### Research

In the provided notebook, research will be conducted on the performance and methods for fine-tuning a local, open-source embedding model to achieve better performance in the Ukrainian language.

### Multilingual Distillation

We will start by researching the approach proposed in the following [research paper](https://arxiv.org/pdf/2004.09813). 

### Dataset

In [2]:
from datasets import load_dataset

dataset = load_dataset('sentence-transformers/parallel-sentences-talks', 'en-uk')
dataset 

DatasetDict({
    dev: Dataset({
        features: ['english', 'non_english'],
        num_rows: 993
    })
    train: Dataset({
        features: ['english', 'non_english'],
        num_rows: 201883
    })
})

In [3]:
train_dataset = dataset['train']
train_dataset.set_format(type='pandas')
train_df = train_dataset[:]

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201883 entries, 0 to 201882
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   english      201883 non-null  object
 1   non_english  201883 non-null  object
dtypes: object(2)
memory usage: 3.1+ MB


In [5]:
train_df.head()

Unnamed: 0,english,non_english
0,I want you to know that I believe kids will ea...,"Я хочу, щоб ви знали, що я впевнений що діти б..."
1,I want you to know that there are farmers' mar...,"Я хочу, щоб ви знали, що є фермерські ринки як..."
2,"I want you to know that me, my brother and sis...","Я хочу, щоб ви знали, що я, мої брат та сестра..."
3,I try to share this everywhere I go.,"Я намагаюся поділитися цим всюди, куди б я не ..."
4,"Not too long ago, my uncle said that he offere...","Не так давно, мій дядько сказав, що він запроп..."


In [6]:
eval_dataset = dataset['dev']
eval_dataset.set_format(type='pandas')
eval_df = eval_dataset[:]

In [7]:
eval_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993 entries, 0 to 992
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   english      993 non-null    object
 1   non_english  993 non-null    object
dtypes: object(2)
memory usage: 15.6+ KB


In [8]:
eval_df.head()

Unnamed: 0,english,non_english
0,"Thank you so much, Chris.","Дуже дякую, Кріс!"
1,And it's truly a great honor to have the oppor...,"Справді, для мене це велика честь мати можливі..."
2,"I have been blown away by this conference, and...","Я в захваті від цієї конференції, і я хочу под..."
3,"And I say that sincerely, partly because (Mock...","І, щиро кажучи, частково тому що – (Схлипує) –..."
4,(Laughter) Put yourselves in my position.,(Сміх) Поставте себе на моє місце!


### Load teacher and student model 

In [28]:
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer, Pooling 

teacher_model_id = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
transformer_module = Transformer(teacher_model_id, model_args=dict(torch_dtype=torch.float16))
pooling_module = Pooling(
    word_embedding_dimension=transformer_module.get_word_embedding_dimension(),
    pooling_mode_cls_token=True,
    pooling_mode_mean_tokens=False
)
teacher_model = SentenceTransformer(modules=[transformer_module, pooling_module])
teacher_model.to('cuda')
teacher_model 

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [29]:
teacher_model[0].auto_model.dtype 

torch.float16

In [30]:
import torch 
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer, Pooling 

student_model_id = 'FacebookAI/xlm-roberta-base'
transformer_module = Transformer(student_model_id, model_args=dict(torch_dtype=torch.float16))
pooling_module = Pooling(
    word_embedding_dimension=transformer_module.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True
)
student_model = SentenceTransformer(modules=[transformer_module, pooling_module])
student_model.to('cuda')
student_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [31]:
student_model[0].auto_model.dtype 

torch.float16

In [32]:
# assert that transformer config matched for student and teacher model
assert student_model[0].get_config_dict() == teacher_model[0].get_config_dict()

### Dtaset Preprocessing

In [14]:
def prepare_dataset(batch):
    return {
        'english': batch['english'],
        'non_english': batch['non_english'],
        'label': teacher_model.encode(batch['english'], padding=True, truncation=True, show_progress_bar=False)
   } 

In [15]:
dataset_labeled = train_dataset.map(prepare_dataset, batched=True, batch_size=50000)

In [16]:
dataset_labeled

Dataset({
    features: ['english', 'non_english', 'label'],
    num_rows: 201883
})

In [17]:
print(len(dataset_labeled['label']))
dataset_labeled['label'][0][:5]

201883


array([ 0.1409  , -0.014275, -0.282   ,  0.1771  ,  0.4404  ],
      dtype=float16)

### Defining loss function and evaluators

In [33]:
from sentence_transformers.losses import MSELoss

# Computes loss between computed sentence embeddings by student ('english' and 'non_english') and target computed by teacher ('label') 
mse_loss = MSELoss(model=student_model)

In [34]:
from sentence_transformers.evaluation import MSEEvaluator, EmbeddingSimilarityEvaluator, SequentialEvaluator 

# From documentation: The MSE is computed between ||teacher.encode(source_sentences) - student.encode(target_sentences)||.
mse_eval = MSEEvaluator(
    source_sentences=eval_dataset['english'],
    target_sentences=eval_dataset['non_english'],
    name='mse-en-ua',
    teacher_model=teacher_model,
    batch_size=8
)

# STS Benchmark (Semantic Textual Similarity Benchmark)
en_en_dataset = load_dataset('mteb/sts17-crosslingual-sts', 'en-en', split='test')
en_ua_dataset = load_dataset('csv', data_files='./datasets/sts17-en-ua-gpt-4o.csv', split='train') # when loading from csv by default train split is assigned
ua_ua_dataset = load_dataset('csv', data_files='./datasets/sts17-ua-ua-gpt-4o.csv', split='train')  # when loading from csv by default train split is assigned

# From documentation: Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation in comparison to the gold standard labels. 
en_en_eval = EmbeddingSimilarityEvaluator(
    sentences1=en_en_dataset['sentence1'],
    sentences2=en_en_dataset['sentence2'],
    scores=[score / 5.0 for score in en_en_dataset['score']],  # normalizing to score from to 1
    show_progress_bar=False,
    name='sts17-en-en'
)

en_ua_eval = EmbeddingSimilarityEvaluator(
    sentences1=en_ua_dataset['sentence1'],
    sentences2=en_ua_dataset['sentence2'],
    scores=[score / 5.0 for score in en_ua_dataset['score']],
    show_progress_bar=False,
    name='sts17-en-ua'
)

ua_ua_eval = EmbeddingSimilarityEvaluator(
    sentences1=ua_ua_dataset['sentence1'],
    sentences2=ua_ua_dataset['sentence2'],
    scores=[score / 5.0 for score in ua_ua_dataset['score']],
    show_progress_bar=False,
    name='sts17-ua-ua'
)

# Composing evaluators in one chain! 
evaluator = SequentialEvaluator([mse_eval, en_en_eval, en_ua_eval, ua_ua_eval]) 

### Defining training arguments

In [35]:
from sentence_transformers import SentenceTransformerTrainingArguments

train_args = SentenceTransformerTrainingArguments(
    output_dir='./xlm-roberta-ua-distilled',
    fp16=True,
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    eval_steps=4096, 
    eval_strategy='steps',

    save_steps=4096, 
    save_strategy='steps',
    save_total_limit=2,
    
    logging_steps=100 # change to 500 in future 
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Training!

In [36]:
from sentence_transformers import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=student_model,
    args=train_args,
    train_dataset=dataset_labeled,
    eval_dataset=eval_dataset,
    loss=mse_loss,
    evaluator=evaluator 
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [37]:
trainer.train()

RuntimeError: Found dtype Half but expected Float

In [None]:
trainer.evaluate()

### Saving

In [None]:
student_model.save('./xlm-roberta-ua-distilled/final')