# Requirements and Initialization

In [None]:
!pip install matplotlib pandas seaborn sentence_transformers torch

In [None]:
import string
import torch

import seaborn as sns
import pandas as pd

from IPython.display import display
from matplotlib import pyplot as plt
from pathlib import Path
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.cross_encoder import CrossEncoder
from torch.cuda import is_available

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Input files are located relative to the parent directory.
root = Path('..')
df_path = root.joinpath('path/to/dataset.csv')
heval_df_path = root.joinpath('path/to/human_annotations.csv')

# The dataset CSV uses its first column as the index; the annotation CSV
# is read with the default integer index.
df = pd.read_csv(df_path, index_col=0)
heval_df = pd.read_csv(heval_df_path)

In [None]:
# Pull the two text columns out once, in row order.
sentences_src = df['src'].tolist()
sentences_tgt = df['tgt'].tolist()

# One [src, tgt] list per dataset row.
sentence_pairs = [[src, tgt] for src, tgt in zip(sentences_src, sentences_tgt)]

# Pre-processing

In [None]:
def clean_extra_space(text):
  """Remove the space placed before '.', '?', '!' and ',' and around a spaced apostrophe.

  The replacements are applied one after another, each on the result of the
  previous one, and the cleaned string is returned.
  """
  replacements = (
    (" .", "."),
    (" ?", "?"),
    (" !", "!"),
    (" ,", ","),
    (" ' ", "'"),
  )
  for spaced, tight in replacements:
    text = text.replace(spaced, tight)
  return text

In [None]:
# Normalise spacing around punctuation in the source sentences.
# NOTE(review): only 'src' is cleaned here; 'tgt' is left untouched — confirm this is intended.
df['src'] = df['src'].map(clean_extra_space)

# Cross-encoder 
The following block calculates scores from a cross-encoder model (berturk-cased + sts) and adds them to `df`.

In [None]:
# Load the trained cross-encoder on the device chosen during initialization
# rather than a hard-coded 'cuda', so the cell also runs on CPU-only machines.
# The device is passed as a string, which is the form the CrossEncoder API documents.
model = CrossEncoder('path/to/trained/crossencoder', device=str(device))

# NOTE(review): sentence_pairs was built before the pre-processing cell cleaned
# df['src'], so the cross-encoder scores the uncleaned text — confirm this is intended.
scores = model.predict(sentence_pairs)
df['berturk'] = scores

# Bi-encoder

In [None]:
def get_similarity(src: str, tgt: str) -> float:
  """Return the cosine similarity between the embeddings of `src` and `tgt`.

  ASCII punctuation (`string.punctuation`) is stripped from both sentences
  before they are encoded with the module-level `model`.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  clean_src = src.translate(strip_punctuation)
  clean_tgt = tgt.translate(strip_punctuation)

  src_embedding = model.encode(clean_src)
  tgt_embedding = model.encode(clean_tgt)

  # cos_sim returns a matrix; a single pair gives a single entry at [0][0].
  similarity = util.cos_sim(src_embedding, tgt_embedding)
  return similarity[0][0].item()

## distiluse-base-multilingual-cased-v2




In [None]:
# Rebind the global `model` that get_similarity reads.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2', device=device)

# Cosine similarity of each (src, tgt) row, in row order.
scores = [get_similarity(src, tgt) for src, tgt in zip(df['src'], df['tgt'])]
df['distiluse'] = scores

## paraphrase-multilingual-MiniLM-L12-v2 

In [None]:
# Rebind the global `model` that get_similarity reads.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device=device)

# Cosine similarity of each (src, tgt) row, in row order.
scores = [get_similarity(src, tgt) for src, tgt in zip(df['src'], df['tgt'])]
df['multilingual-l12'] = scores

## emrecan/bert-base-turkish-cased-mean-nli-stsb-tr

In [None]:
# Rebind the global `model` that get_similarity reads. The device is passed
# explicitly so this model is placed the same way as the other bi-encoders.
model = SentenceTransformer('emrecan/bert-base-turkish-cased-mean-nli-stsb-tr', device=device)

# compute cosine similarity
scores = [get_similarity(row['src'], row['tgt']) for _, row in df.iterrows()]
df['emrecan'] = scores

# save and visualize

In [None]:
# Keep only the rows whose 'human' value is greater than -1 and renumber them.
# NOTE(review): this rebinds `df`; the score columns added to the previous `df`
# are not written to disk in any visible cell — confirm they were saved elsewhere.
df = heval_df.loc[heval_df['human'].gt(-1)].reset_index(drop=True)
df.head()

In [None]:
# Long format: one row per (dataset, human, model) with that model's score.
dfm = (
  df.drop(columns=['src', 'tgt'])
  .melt(id_vars=['dataset', 'human'], var_name='model', value_name='score')
)
dfm.head()

In [None]:
# One scatter panel per model, points coloured by dataset.
g = sns.FacetGrid(data=dfm, col='model', hue='dataset')
g.map(sns.scatterplot, 'human', 'score', alpha=0.5).add_legend()

In [None]:
# One row of scatter panels (one panel per model) for each dataset.
for dataset_name in dfm['dataset'].unique():
  # Select by exact equality: str.match treats the name as a regex anchored
  # only at the start, so it would also pick up datasets whose name merely
  # begins with this one (or misbehave on regex metacharacters).
  subset = dfm[dfm['dataset'] == dataset_name]
  g = sns.FacetGrid(subset, col='model')
  g.map(sns.scatterplot, 'human', 'score', alpha=.5)
  g.add_legend()
  g.fig.suptitle(dataset_name)

In [None]:
# For each dataset, draw score-vs-human box plots (one panel per model)
# and save the figure as 'boxplot-<dataset>.pdf' in the working directory.
for dataset_name in dfm['dataset'].unique():
  # Select by exact equality: str.match treats the name as a regex anchored
  # only at the start, so it would also pick up datasets whose name merely
  # begins with this one (or misbehave on regex metacharacters).
  sns.catplot(
    data=dfm[dfm['dataset'] == dataset_name], x='human', y='score',
    col='model', kind='box', col_wrap=4
  )
  plt.savefig(f'boxplot-{dataset_name}.pdf')
  plt.show()

In [None]:
# For each dataset, print how many rows carry each 'human' value,
# one count per line, ordered by the 'human' value.
for dataset_name in dfm['dataset'].unique():
  print(dataset_name)
  # Select by exact equality: str.match treats the name as a regex anchored
  # only at the start, so it would also count rows of datasets whose name
  # merely begins with this one.
  counts = df[df['dataset'] == dataset_name]['human'].value_counts().sort_index()
  print('\n'.join([f'{num}' for num in counts.values]))
  print('=' * 50)

## Stats

In [None]:
# The first column holds the row labels of describe()'s summary table.
describe_df = pd.DataFrame.from_dict({'col': list(df.describe().index.values)})

# Add one column of score statistics per (model, human value) combination.
for value in sorted(dfm['human'].unique()):
  for model_name in dfm['model'].unique():
    # Exact equality on the model name: str.match is a regex prefix match and
    # would mix in any model whose name merely starts with this one.
    selected = (dfm['model'] == model_name) & (dfm['human'] == value)
    desc = dfm[selected].drop(columns=['human']).describe()
    describe_df[f'{model_name}_{value}'] = list(desc['score'].values)
display(describe_df)

In [None]:
# Pearson correlation of every numeric column with the human rating.
# numeric_only=True is required because `df` also holds text columns, on which
# DataFrame.corr raises in pandas >= 2.0.
# NOTE(review): the [:-1] slice drops the last entry, presumably 'human' itself —
# this relies on 'human' being the last numeric column; confirm.
# The overall result is printed explicitly: as a bare expression in the middle
# of a cell it was computed but never shown.
print('overall correlation')
print(df.corr(method='pearson', numeric_only=True)['human'][:-1])
for dataset_name in df['dataset'].unique():
  print('=' * 50)
  print(f'{dataset_name} correlation')
  # Exact equality instead of str.match, which is a regex prefix match.
  subset = df[df['dataset'] == dataset_name]
  print(subset.corr(method='pearson', numeric_only=True)['human'][:-1])

## Choose boundaries

In [None]:
def print_stats(in_df = dfm, ratio = 0.9, model_name = 'berturk', dataset_name = ''):
  """
  Reports the required minimum score threshold to remove `ratio` of the undesired pairs.

  Rows with `human` below 2 are treated as undesired and rows with `human`
  above 1 as valid. The threshold is the score found at position
  `int(len(undesired) * ratio)` among the undesired rows sorted by ascending
  score; the function then prints the share of valid rows scoring strictly
  above that threshold.

  Parameters:
    in_df: long-format frame with 'dataset', 'model', 'human' and 'score' columns.
    ratio: fraction of the undesired rows the threshold should remove.
    model_name: pattern matched against 'model' with `str.match`
      (a regex anchored at the start of the value).
    dataset_name: pattern matched against 'dataset' with `str.contains`;
      the default '' selects every dataset.

  Returns nothing; the results are printed.
  """
  selected = in_df[(in_df['dataset'].str.contains(dataset_name)) & (in_df['model'].str.match(f'{model_name}'))]
  temp_df = selected.sort_values(by=['score']).reset_index(drop=True)
  bad_df = temp_df[temp_df['human'] < 2]
  good_df = temp_df[temp_df['human'] > 1]

  if bad_df.empty:
    # Nothing to remove, so no threshold can be derived from the undesired rows.
    print('MIN_SCORE: n/a (no undesired pairs in the selection)')
    return

  # int(len * ratio) equals len when ratio is 1.0, which is one past the last
  # row; clamp to the last undesired row (its highest score) instead of raising.
  cutoff = min(int(len(bad_df) * ratio), len(bad_df) - 1)
  min_score = bad_df.iloc[cutoff]['score']
  print(f'MIN_SCORE: {min_score}')

  if good_df.empty:
    # Avoid dividing by zero when the selection holds no valid rows.
    print('Keeping n/a of valid pairs (no valid pairs in the selection)')
    return

  kept_ratio = len(good_df[good_df['score'] > min_score]) / len(good_df)
  print(f'Keeping {kept_ratio} of valid pairs')

In [None]:
model_names = dfm['model'].unique()
dataset_names = dfm['dataset'].unique()

# Thresholds per model across all datasets (print_stats' default dataset_name).
for model_name in model_names:
  print(model_name)
  print_stats(ratio=0.95, model_name=model_name)
  print('=' * 30)

# Thresholds per model, broken down by dataset.
for dataset_name in dataset_names:
  print(dataset_name)
  for model_name in model_names:
    print('-' * 30)
    print(model_name)
    print_stats(ratio=0.95, model_name=model_name, dataset_name=dataset_name)
  print('=' * 50)