In [2]:
!uv pip install -qU sentence-transformers
!uv pip install -q transformers
!uv pip install -q datasets 
!uv pip install -q ipywidgets
!uv pip install -q pandas 
!uv pip install -q accelerate 

#### Research

In the provided notebook, research will be conducted on the performance and methods for fine-tuning a local, open-source embedding model to achieve better performance in the Ukrainian language.

### Multilingual Distillation

We will start by researching the approach proposed in the following [research paper](https://arxiv.org/pdf/2004.09813). 

#### Loading teacher model

In [2]:
from sentence_transformers import SentenceTransformer

teacher_model_id = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
teacher_model = SentenceTransformer(teacher_model_id)
teacher_model.to('cuda')
teacher_model 

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [13]:
teacher_model[0].auto_model.dtype

torch.float32

#### Dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset('sentence-transformers/parallel-sentences-talks', 'en-uk')
dataset 

DatasetDict({
    dev: Dataset({
        features: ['english', 'non_english'],
        num_rows: 993
    })
    train: Dataset({
        features: ['english', 'non_english'],
        num_rows: 201883
    })
})

In [4]:
train_dataset = dataset['train']
train_dataset.set_format(type='pandas')
train_df = train_dataset[:]

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201883 entries, 0 to 201882
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   english      201883 non-null  object
 1   non_english  201883 non-null  object
dtypes: object(2)
memory usage: 3.1+ MB


In [6]:
train_df.head()

Unnamed: 0,english,non_english
0,I want you to know that I believe kids will ea...,"Я хочу, щоб ви знали, що я впевнений що діти б..."
1,I want you to know that there are farmers' mar...,"Я хочу, щоб ви знали, що є фермерські ринки як..."
2,"I want you to know that me, my brother and sis...","Я хочу, щоб ви знали, що я, мої брат та сестра..."
3,I try to share this everywhere I go.,"Я намагаюся поділитися цим всюди, куди б я не ..."
4,"Not too long ago, my uncle said that he offere...","Не так давно, мій дядько сказав, що він запроп..."


In [7]:
eval_dataset = dataset['dev']
eval_dataset.set_format(type='pandas')
eval_df = eval_dataset[:]

In [8]:
eval_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993 entries, 0 to 992
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   english      993 non-null    object
 1   non_english  993 non-null    object
dtypes: object(2)
memory usage: 15.6+ KB


In [9]:
eval_df.head()

Unnamed: 0,english,non_english
0,"Thank you so much, Chris.","Дуже дякую, Кріс!"
1,And it's truly a great honor to have the oppor...,"Справді, для мене це велика честь мати можливі..."
2,"I have been blown away by this conference, and...","Я в захваті від цієї конференції, і я хочу под..."
3,"And I say that sincerely, partly because (Mock...","І, щиро кажучи, частково тому що – (Схлипує) –..."
4,(Laughter) Put yourselves in my position.,(Сміх) Поставте себе на моє місце!


#### Dtaset Preprocessing

In [19]:
res = teacher_model.encode('Thank you so much, Chris.', padding=True, truncation=True) 
print(len(res))
res[:5]


768


array([-0.33192992, -0.30908677, -0.41517597, -0.02691368,  0.0026244 ],
      dtype=float32)

In [20]:
def prepare_dataset(batch):
    return {
        'english': batch['english'],
        'non_english': batch['non_english'],
        'label': teacher_model.encode(batch['english'], padding=True, truncation=True, show_progress_bar=False)
   } 

In [21]:
dataset_labeled = train_dataset.map(prepare_dataset, batched=True, batch_size=30000)

Map:   0%|          | 0/201883 [00:00<?, ? examples/s]

In [24]:
dataset_labeled

Dataset({
    features: ['english', 'non_english', 'label'],
    num_rows: 201883
})

In [25]:
print(len(dataset_labeled['label'][0][:5]))
dataset_labeled['label'][0][:5]

5


array([ 0.14045112, -0.01442104, -0.28194457,  0.17714249,  0.44074324],
      dtype=float32)

#### Loading student model

In [26]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer, Pooling 

student_model_id = 'FacebookAI/xlm-roberta-base'
transformer_module = Transformer(student_model_id)
pooling_module = Pooling(
    word_embedding_dimension=transformer_module.get_word_embedding_dimension(),
    pooling_mode_cls_token=True,
    pooling_mode_mean_tokens=False
)
student_model = SentenceTransformer(modules=[transformer_module, pooling_module])
student_model.to('cuda')
student_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [27]:
student_model[0].auto_model.dtype

torch.float32

In [28]:
# assert that transformer config matched for student and teacher model
assert student_model[0].get_config_dict() == teacher_model[0].get_config_dict()

In [29]:
# assert that pooling config matches for student and teacher model
assert student_model[1].get_config_dict() == teacher_model[1].get_config_dict()

#### Defining global training args

In [35]:
BATCH_SIZE = 64 

#### Defining loss function and evaluation

In [33]:
from sentence_transformers.losses import MSELoss

train_loss = MSELoss(model=student_model)

In [37]:
from sentence_transformers.evaluation import MSEEvaluator 

mse_eval = MSEEvaluator(
    source_sentences=eval_dataset['english'],
    target_sentences=eval_dataset['non_english'],
    name='en-ua-dev',
    teacher_model=teacher_model,
    batch_size=BATCH_SIZE
) 