# Requirements

In [None]:
!pip install sentence_transformers pandas torch

In [None]:
import pandas as pd
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
import math
import torch
from sentence_transformers.cross_encoder import CrossEncoder

In [None]:
# --- Experiment configuration ---
# Project root; the dataset and the saved model both live beneath it.
root = '/path/to/project'

# Training hyper-parameters: examples per batch, number of epochs, and the
# fraction of all training steps that is spent on learning-rate warm-up.
batch_size, num_epochs, warmup_ratio = 32, 4, 0.1

# Directory the fine-tuned model is written to and later reloaded from.
model_save_path = f'{root}/path/to/saved/model'

# Preprocessing

In [None]:
# Read the dataset CSV located under the project root, then preview its
# first rows as the cell output.
df = pd.read_csv(filepath_or_buffer=f'{root}/path/to/dataset')
df.head()

In [None]:
# Divide every score by 5 (presumably rescaling 0-5 similarity ratings into
# the [0, 1] range -- confirm against the dataset).
# NOTE: re-running this cell divides the scores again.
df['score'] /= 5

In [None]:
def _select_split(frame, split_name):
    """Return the model-input columns of the rows that belong to one split.

    Rows are selected by exact equality on the ``split`` column, so a label
    that merely starts with ``split_name`` is not picked up, and rows with a
    missing label are excluded instead of breaking the boolean mask.

    Args:
        frame: DataFrame with ``split``, ``score``, ``sentence1`` and
            ``sentence2`` columns.
        split_name: Split label to keep, e.g. ``'train'``.

    Returns:
        A DataFrame with the ``score``, ``sentence1`` and ``sentence2``
        columns of the matching rows, re-indexed from 0.
    """
    in_split = frame['split'] == split_name
    selected = frame.loc[in_split, ['score', 'sentence1', 'sentence2']]
    return selected.reset_index(drop=True)


train = _select_split(df, 'train')
dev = _select_split(df, 'dev')
test = _select_split(df, 'test')
train.head()

In [None]:
def _to_input_examples(frame):
    """Build one ``InputExample`` per row of ``frame``.

    Each example pairs the row's ``sentence1`` with its ``sentence2`` and
    carries the row's ``score`` as its label.  The three columns are walked
    in lockstep with ``zip`` rather than ``iterrows``, which avoids creating
    a Series object for every row.

    Args:
        frame: DataFrame with ``sentence1``, ``sentence2`` and ``score``
            columns.

    Returns:
        A list of ``InputExample`` objects in row order.
    """
    return [
        InputExample(texts=[first, second], label=score)
        for first, second, score in zip(
            frame['sentence1'], frame['sentence2'], frame['score']
        )
    ]


dev_samples = _to_input_examples(dev)
test_samples = _to_input_examples(test)
train_samples = _to_input_examples(train)

In [None]:
# Batch the training examples (a List[InputExample]) with a PyTorch
# DataLoader; shuffle=True reshuffles them for every pass over the data.
train_dataloader = DataLoader(
    dataset=train_samples,
    batch_size=batch_size,
    shuffle=True,
)

# Correlation evaluator built from the dev examples; it is handed to the
# training call so performance on the dev split is measured during training.
evaluator = CECorrelationEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
)

# Training

In [None]:
# Warm-up length: `warmup_ratio` of the total number of training steps
# (batches per epoch x epochs), rounded up to a whole step.
warmup_steps = math.ceil(
    warmup_ratio * (num_epochs * len(train_dataloader))
)

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Cross-encoder initialised from the pretrained Turkish BERT checkpoint with
# a single output (num_labels=1).
# The target device is passed to the constructor instead of chaining
# `.to(device)`: `.to()` only exists on sentence-transformers releases where
# CrossEncoder is a torch module, whereas the `device` argument is accepted
# by the constructor in every release.
model = CrossEncoder('dbmdz/bert-base-turkish-cased', num_labels=1, device=str(device))

In [None]:
# Fine-tune the cross-encoder on the training batches for `num_epochs`
# epochs with `warmup_steps` warm-up steps, evaluating with the dev
# evaluator and writing the model to `model_save_path`.
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

# Evaluation

In [None]:
# Reload the model from model_save_path (the output_path given to model.fit)
# so the evaluation runs on what was actually written to disk.
model = CrossEncoder(model_save_path)

In [None]:
# Rebind `evaluator` to a correlation evaluator built from the test examples
# (replacing the dev evaluator used during training).
evaluator = CECorrelationEvaluator.from_input_examples(test_samples, name='sts-test')

In [None]:
# Run the test evaluator on the reloaded model; as the last expression of the
# cell, its return value is shown as the cell output.
evaluator(model)