# Notebook: training and evaluating with the Sentence Transformers library

Training and evaluating sentence-embedding models with the Sentence Transformers library.

Contents:
- Data preparation for each loss function (the column layout each loss expects)
- The batch samplers the library provides as part of data preprocessing


Resources:
- https://sbert.net/docs/package_reference/sentence_transformer/sampler.html
- https://huggingface.co/blog/train-sentence-transformers

In [23]:
import numpy as np 
import pandas as pd 
from datasets import Dataset

from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import BinaryClassificationEvaluator, EmbeddingSimilarityEvaluator

# Name of the pretrained Sentence Transformers checkpoint loaded by default.
model_card = "paraphrase-multilingual-MiniLM-L12-v2"
# Absolute path to the training parquet file on the author's machine.
# NOTE(review): machine-specific path; adjust before running elsewhere.
data_path = "/Users/mimiphan/Projects/wsdm-cup-multilingual-chatbot-arena/train.parquet"

def new_experiment(model_card=model_card, num_samples=2000, random_state=None):
    """Load a fresh model and a random sample of the training data.

    Args:
        model_card: Name or path of the SentenceTransformer checkpoint to load.
        num_samples: Number of rows to draw (without replacement) from the
            parquet file at ``data_path``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            same rows can be drawn again. ``None`` (the default) keeps the
            previous behaviour of a different sample on every call.

    Returns:
        A ``(model, ds)`` tuple: the loaded SentenceTransformer and the sampled
        DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: Raised by pandas when ``num_samples`` exceeds the number
            of rows available in the file.
    """
    model = SentenceTransformer(model_card)
    full_frame = pd.read_parquet(data_path)
    # Drop the original row labels so positional and label indexing agree
    # for every downstream cell.
    ds = full_frame.sample(n=num_samples, random_state=random_state).reset_index(drop=True)
    return model, ds

## Loss Target, Anchor and Negative

**Data Column labels** 
1. `sentence` 
2. `label` 

## Contrastive Loss

**Data Column labels**

1. texts: 2 texts labeled 'sentence1' and 'sentence2'
    - `sentence1`
    - `sentence2`

2. label: either 0 or 1, where 1 pulls the two embeddings closer together and 0 pushes them further apart
    - `label`
    
**Evaluation Tools**
- `EmbeddingSimilarityEvaluator`

In [3]:
# Start a fresh experiment with new_experiment()'s defaults: the default
# checkpoint and a 2000-row random sample of the training parquet file.
model, ds = new_experiment()

In [5]:
# sentence gives the winning text responses only 
ds['sentence1'] = [i['prompt'] + i['response_a'] if i['winner'] == 'model_a' else i['prompt'] + i['response_b'] for _, i in ds.iterrows()] # winning data rows
ds['sentence2'] = [i['prompt'] + i['response_b'] if i['winner'] == 'model_a' else i['prompt'] + i['response_a'] for _, i in ds.iterrows()] # losing data rows
ds['label'] = np.zeros(ds.shape[0], dtype=int)

In [7]:
# Contrastive loss bound to the model being fine-tuned; it consumes the
# sentence1 / sentence2 / label columns prepared for this experiment.
loss = losses.ContrastiveLoss(model)

In [8]:
# Columns the contrastive-loss experiment trains on.
task_column = ['sentence1', 'sentence2', 'label']

# Convert the selected columns to a Hugging Face Dataset, then shuffle and
# split it 70% train / 30% evaluation.
data = Dataset.from_pandas(ds[task_column])
data = data.train_test_split(train_size=0.7, shuffle=True)

train_data = data['train']
eval_data = data['test']

In [9]:
# Display the training split (its features and row count).
train_data

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 1400
})

In [10]:
# Trainer over the train / eval splits using the contrastive loss.
# No training arguments are passed, so the library defaults apply.
trainer = SentenceTransformerTrainer(
    model=model, 
    train_dataset=train_data, 
    eval_dataset=eval_data, 
    loss=loss
)

In [11]:
# Fine-tune the model on train_data; the returned TrainOutput is displayed.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mmimipynb[0m ([33mdeathstar[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers`

Step,Training Loss
500,0.0024


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=525, training_loss=0.0022897982316291226, metrics={'train_runtime': 388.6186, 'train_samples_per_second': 10.808, 'train_steps_per_second': 1.351, 'total_flos': 0.0, 'train_loss': 0.0022897982316291226, 'epoch': 3.0})

In [12]:
# Evaluate on the eval_dataset given to the trainer; returns a metrics dict.
trainer.evaluate()

{'eval_loss': 0.034214526414871216,
 'eval_runtime': 12.3156,
 'eval_samples_per_second': 48.719,
 'eval_steps_per_second': 6.09,
 'epoch': 3.0}

In [16]:
# Run a prediction pass over the training split.
# NOTE(review): this uses train_data rather than eval_data, and the recorded
# output carries only loss / runtime metrics (predictions=None) - confirm
# that measuring the training split is what was intended here.
trainer.predict(train_data)

PredictionOutput(predictions=None, label_ids=None, metrics={'test_loss': 0.030817203223705292, 'test_runtime': 28.7547, 'test_samples_per_second': 48.688, 'test_steps_per_second': 6.086})

In [18]:
# Display the model's module stack.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [19]:
# Evaluator over the evaluation pairs, with the 'label' column as the
# reference scores.
# NOTE(review): the data-prep cell fills 'label' with zeros, so the reference
# scores are constant; Pearson / Spearman correlation against a constant is
# presumably undefined (the pearsonr / spearmanr warning fragments in the
# recorded output point the same way) - confirm and vary the labels.
embedding_eval = EmbeddingSimilarityEvaluator(
    sentences1=eval_data['sentence1'], 
    sentences2=eval_data['sentence2'], 
    scores=eval_data['label'], 
    name='embedding_evaluation',
    show_progress_bar=True
)


In [20]:
# Run the evaluator against the fine-tuned model and keep what it returns.
results = embedding_eval(model)

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

  eval_pearson, _ = pearsonr(labels, scores)
  eval_spearman, _ = spearmanr(labels, scores)


In [None]:
# Imported in this cell because the notebook's import cell only brings in
# BinaryClassificationEvaluator and EmbeddingSimilarityEvaluator.
from sentence_transformers.evaluation import MSEEvaluator

# MSE evaluator over the held-out pairs. eval_data only has the columns
# sentence1 / sentence2 / label, so those are used as source and target;
# the previous "english" / "non_english" keys do not exist in this dataset.
# NOTE(review): the evaluator name "en-fr-dev" is kept as-is, but these
# pairs are winner / loser responses rather than translations - confirm the
# intended source / target pairing and rename if appropriate.
dev_mse = MSEEvaluator(
    source_sentences=eval_data["sentence1"],
    target_sentences=eval_data["sentence2"],
    name="en-fr-dev",
    teacher_model=model,
    batch_size=32,
)
