In [None]:
!pip install --quiet -U sentence_transformers datasets torch pandas

In [None]:
import gdown
import torch
import pandas as pd

from pathlib import Path
from datetime import datetime
from torch.utils.data import DataLoader
from torch.cuda import is_available
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers import InputExample, losses, models

In [None]:
# --- Training hyperparameters ---
train_batch_size = 32
num_epochs = 4
n_trainings = 5       # how many times the fine-tuning loop is repeated
sample_ratio = 0.05   # fraction of the training rows that is kept
augment = True        # when True, paraphrased sentence pairs are added to the training samples

# --- Model / output locations ---
model_checkpoint = 'dbmdz/bert-base-turkish-cased'
model_save_path = Path('model_checkpoint')
current_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

if not augment:
  # Without augmentation the epoch count is doubled -- presumably to keep the
  # number of optimisation steps comparable to the augmented setting (TODO confirm).
  num_epochs *= 2
  results_path = Path(f'/path/to/results')
else:
  results_path = Path(f'/path/to/results')

In [None]:
# Make sure both output directories exist (including parents); existing directories are left alone.
for output_dir in (results_path, model_save_path):
  output_dir.mkdir(parents=True, exist_ok=True)

# Download Model and Dataset

## STS-B

In [None]:
# Directory from which the train/dev/test CSV files are read.
dataset_path = Path('/path/to/dataset')

In [None]:
def _read_split(file_name):
  """Read one CSV split from `dataset_path` and divide its `score` column by 5.

  The division is meant to normalize the score to [0, 1]
  (assumes the raw scores lie in [0, 5] -- TODO confirm against the data).
  """
  split_df = pd.read_csv(dataset_path / file_name)
  split_df['score'] = split_df['score'] / 5
  return split_df


raw_train_df = _read_split('train_file_name.csv')
dev = _read_split('dev_file_name.csv')
test = _read_split('test_file_name.csv')

In [None]:
# Keep a `sample_ratio` fraction of the training rows; the fixed seed selects the same subset on every run.
n_train_rows = int(len(raw_train_df) * sample_ratio)
train = raw_train_df.sample(n=n_train_rows, random_state=42)

In [None]:
# Preview the first rows of the sampled training split.
train.head()

In [None]:
def _pairs_to_examples(frame):
  """Turn every row of `frame` into an InputExample of (sentence1, sentence2) labelled with `score`."""
  return [
    InputExample(texts=[record['sentence1'], record['sentence2']], label=record['score'])
    for _, record in frame.iterrows()
  ]


dev_samples = _pairs_to_examples(dev)
test_samples = _pairs_to_examples(test)
train_samples = _pairs_to_examples(train)

In [None]:
# Add one paraphrased sentence pair per sampled training row, labelled with that row's score.
if augment:
  # `train_samples` holds one example per row of `train`, so its first len(train)
  # items are the original pairs. Slicing before appending keeps this cell
  # idempotent: re-running it does not stack another copy of the paraphrases.
  original_samples = train_samples[:len(train)]
  paraphrase_samples = [
    InputExample(texts=[row['sentence1_paraphrase'], row['sentence2_paraphrase']], label=row['score'])
    for _, row in train.iterrows()
  ]
  train_samples = original_samples + paraphrase_samples

# Train

In [None]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if is_available() else 'cpu')

In [None]:
# Wrap train_samples (a List[InputExample]) in a PyTorch DataLoader; the same loader is reused by every run.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# Warm up over 10% of the total number of training steps (batches per epoch * epochs), rounded to nearest.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1 + .5)

# Repeat the fine-tuning `n_trainings` times, each run starting from `model_checkpoint`.
for i in range(n_trainings):
  dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name=f'berturk-stsb-dev-{i+1}')
  # NOTE(review): unlike the dev name, this name has no hyphen before the run index.
  # Left unchanged because the evaluator name determines the result file names.
  test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name=f'berturk-stsb-test{i+1}')

  # STS-b
  model = SentenceTransformer(model_checkpoint, device=device)
  train_loss = losses.CosineSimilarityLoss(model=model)
  # Save into `model_save_path`, not into a directory named after `model_checkpoint`:
  # writing to a local folder called 'dbmdz/bert-base-turkish-cased' would make the
  # next iteration load the fine-tuned weights from that folder instead of the
  # pretrained checkpoint, so the runs would no longer be independent.
  model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=str(model_save_path))

  # Score the model as it stands after training on dev and test; results are written under results_path.
  dev_evaluator(model, output_path=str(results_path))
  test_evaluator(model, output_path=str(results_path))