In [2]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m153.6/163.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [23]:
import pandas as pd
from scipy.stats import pearsonr
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler
import math

In [31]:
class CustomDataset(Dataset):
    """Sentence-pair dataset backed by a tab-separated file.

    The file is read with pandas and every row containing a missing value is
    dropped. Columns are consumed by position: column 0 is the score and
    columns 1 and 2 are the two sentences.

    Args:
        file_path: Path of the tab-separated file to load.
        scaler: Optional object exposing ``transform``. When given, each score
            is passed through it (wrapped as a 1x1 nested list) before being
            returned. ``transform`` is only called from ``__getitem__``, never
            at construction time.
    """

    def __init__(self, file_path, scaler=None):
        self.data = pd.read_csv(file_path, sep='\t').dropna()
        self.scaler = scaler

    def __len__(self):
        """Return the number of rows left after incomplete rows were dropped."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sentence1, sentence2, score)`` for the row at position ``idx``.

        Raises:
            IndexError: If ``idx`` is out of range. This is also what ends a
                plain ``for`` loop over the dataset.
        """
        row = self.data.iloc[idx]
        # Use explicit positional access: integer keys with ``row[...]`` on a
        # string-labelled Series are deprecated in recent pandas releases.
        score = row.iloc[0]
        sentence1 = str(row.iloc[1])
        sentence2 = str(row.iloc[2])
        # Compare against None so the decision does not depend on the
        # truthiness of whatever scaler object was supplied.
        if self.scaler is not None:
            score = self.scaler.transform([[score]])[0][0]
        return sentence1, sentence2, score

def map_cos_to_zero_one(x):
    """Linearly rescale ``x`` so that -1 maps to 0 and 1 maps to 1."""
    shifted = x + 1
    return shifted / 2

scaler = MinMaxScaler(feature_range=(0, 1))
validation_data = CustomDataset('Data/dev.csv', scaler=scaler)

# The dataset only calls scaler.transform when an item is read, so the scaler
# has to be fitted here, before the first iteration below.
# NOTE(review): this reads the column by the name 'score' while CustomDataset
# reads the score by position 0 -- confirm both refer to the same column.
scores = validation_data.data['score'].values.reshape(-1, 1)
scaler.fit(scores)

model = SentenceTransformer("all-MiniLM-L6-v2")

# Collect every pair first so each side can be encoded in a single batched
# call instead of one model forward pass per sentence.
sentences1 = []
sentences2 = []
labels = []
for sentence1, sentence2, score in validation_data:
    sentences1.append(sentence1)
    sentences2.append(sentence2)
    labels.append(score)

# A correlation needs at least two points; fail early with a clear message
# rather than deep inside the encoder or scipy.
if len(labels) < 2:
    raise ValueError("Need at least two sentence pairs to compute Pearson's correlation")

embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# pairwise_cos_sim yields one similarity per aligned pair (row i against row i),
# which is what the per-pair loop computed before.
pairwise_similarities = util.pairwise_cos_sim(embeddings1, embeddings2).tolist()
cos_similarities = [map_cos_to_zero_one(value) for value in pairwise_similarities]

pearsons_corr, _ = pearsonr(labels, cos_similarities)

print("Pearson's correlation coefficient on validation set:", pearsons_corr)


Pearson's correlation coefficient on validation set: 0.8631423871595579
