In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np
from sklearn.metrics import roc_auc_score
import pandas as pd
from sklearn.metrics import average_precision_score
from sklearn.metrics.pairwise import (
    paired_cosine_distances,
    paired_euclidean_distances,
    paired_manhattan_distances,
)

In [20]:
def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
    """Sweep a decision threshold over the ranked scores and keep the best accuracy.

    Pairs are ranked most-similar first (descending scores when
    ``high_score_more_similar`` is true, ascending otherwise). For every cut
    between two neighbouring ranks, the pairs up to and including the cut
    count as predicted positive (label 1) and the rest as predicted negative.

    Args:
        scores: One similarity/distance value per pair.
        labels: One label per pair; 1 is counted as positive, 0 as negative.
        high_score_more_similar: Whether a larger score means "more similar".

    Returns:
        ``(max_acc, best_threshold)``. The threshold is the midpoint between
        the score at the best cut and the next score in the ranking. When no
        cut yields an accuracy above 0 (for instance with fewer than two
        pairs) the result is ``(0, -1)``.
    """
    assert len(scores) == len(labels)

    ranked = sorted(
        zip(scores, labels),
        key=lambda pair: pair[0],
        reverse=high_score_more_similar,
    )
    total = len(labels)

    max_acc = 0
    best_threshold = -1

    positive_so_far = 0
    remaining_negatives = sum(np.array(labels) == 0)

    # Walk neighbouring ranks together; the final rank has no successor and
    # is therefore never used as a cut.
    for (score, label), (next_score, _) in zip(ranked, ranked[1:]):
        if label == 1:
            positive_so_far += 1
        else:
            remaining_negatives -= 1

        acc = (positive_so_far + remaining_negatives) / total
        if acc > max_acc:
            max_acc, best_threshold = acc, (score + next_score) / 2

    return max_acc, best_threshold
    
def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
    """Sweep a decision threshold over the ranked scores and keep the best F1.

    Pairs are ranked most-similar first. For every cut between two
    neighbouring ranks, the pairs up to and including the cut are treated as
    extracted (predicted positive); precision, recall and F1 are computed
    against the pairs labelled 1.

    Args:
        scores: One similarity/distance value per pair.
        labels: One label per pair; 1 marks a positive pair.
        high_score_more_similar: Whether a larger score means "more similar".

    Returns:
        ``(best_f1, best_precision, best_recall, threshold)``. The threshold
        is the midpoint between the score at the best cut and the next score.
        All four values are 0 when no cut contains a positive pair.
    """
    assert len(scores) == len(labels)

    scores = np.asarray(scores)
    labels = np.asarray(labels)

    ranked = sorted(
        zip(scores, labels),
        key=lambda pair: pair[0],
        reverse=high_score_more_similar,
    )

    best_f1 = best_precision = best_recall = 0
    threshold = 0
    ncorrect = 0
    total_num_duplicates = sum(labels)

    # `nextract` is the number of pairs extracted so far (1-based rank); the
    # final rank has no successor and is never used as a cut.
    neighbours = zip(ranked, ranked[1:])
    for nextract, ((score, label), (next_score, _)) in enumerate(neighbours, start=1):
        if label == 1:
            ncorrect += 1

        if ncorrect <= 0:
            # Nothing correct extracted yet: precision/recall are both zero.
            continue

        precision = ncorrect / nextract
        recall = ncorrect / total_num_duplicates
        f1 = 2 * precision * recall / (precision + recall)
        if f1 > best_f1:
            best_f1, best_precision, best_recall = f1, precision, recall
            threshold = (score + next_score) / 2

    return best_f1, best_precision, best_recall, threshold

In [14]:
def ap_score(scores, labels, high_score_more_similar: bool):
    """Average precision of ``scores`` used as a ranking for ``labels``.

    Args:
        scores: One similarity/distance value per pair (list or array).
        labels: One binary label per pair.
        high_score_more_similar: Whether a larger score means "more similar".
            When false the scores are negated so that the most similar pairs
            receive the largest values before being handed to
            ``average_precision_score``.

    Returns:
        The value returned by ``average_precision_score``.
    """
    sign = 1 if high_score_more_similar else -1
    # Coerce first: on a plain Python list `scores * -1` is sequence
    # repetition and produces an empty list instead of negated values.
    return average_precision_score(labels, np.asarray(scores) * sign)

In [21]:
def _compute_metrics(scores, labels, high_score_more_similar):
    """Collect accuracy, F1 and average-precision results for one score type.

    Runs the accuracy threshold sweep, the F1 threshold sweep and the average
    precision computation (in that order) on the same inputs and gathers the
    results in a single dictionary.
    """
    shared_args = (scores, labels, high_score_more_similar)

    acc, acc_threshold = find_best_acc_and_threshold(*shared_args)
    f1, precision, recall, f1_threshold = find_best_f1_and_threshold(*shared_args)
    ap = ap_score(*shared_args)

    metrics = {}
    metrics["accuracy"] = acc
    metrics["accuracy_threshold"] = acc_threshold
    metrics["f1"] = f1
    metrics["f1_threshold"] = f1_threshold
    metrics["precision"] = precision
    metrics["recall"] = recall
    metrics["ap"] = ap
    return metrics

In [26]:
def compute_metrics(dataset, model):
    """Score every sentence pair with four measures and evaluate each one.

    Args:
        dataset: Iterable of ``(sentence1, sentence2, label)`` triples. Each
            label must be accepted by ``int()``.
        model: Object whose ``encode(list_of_sentences)`` returns one
            embedding per input sentence, in input order.

    Returns:
        Dict with the keys ``"cos_sim"``, ``"manhattan"``, ``"euclidean"`` and
        ``"dot"``; each value is the metrics dict built by
        ``_compute_metrics`` for that measure.

    Raises:
        ValueError: If ``dataset`` contains no rows.
    """
    rows = list(dataset)
    if not rows:
        raise ValueError("dataset is empty: expected (sentence1, sentence2, label) rows")

    sentences1, sentences2, raw_labels = (list(column) for column in zip(*rows))
    labels = np.asarray([int(x) for x in raw_labels])

    # Encode each distinct sentence once. dict.fromkeys keeps first-seen
    # order, so the encode input order is the same on every run (the
    # iteration order of a set of strings can change between interpreter runs).
    sentences = list(dict.fromkeys(sentences1 + sentences2))
    embeddings = np.asarray(model.encode(sentences))
    emb_dict = dict(zip(sentences, embeddings))
    embeddings1 = np.asarray([emb_dict[sent] for sent in sentences1])
    embeddings2 = np.asarray([emb_dict[sent] for sent in sentences2])

    cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
    manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)
    # Kept as per-row np.dot, but returned as an ndarray like the other
    # score vectors so element-wise arithmetic downstream behaves the same.
    dot_scores = np.asarray(
        [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]
    )

    # (result key, scores, whether a higher score means "more similar")
    measures = [
        ("cos_sim", cosine_scores, True),
        ("manhattan", manhattan_distances, False),
        ("euclidean", euclidean_distances, False),
        ("dot", dot_scores, True),
    ]
    return {
        short_name: _compute_metrics(scores, labels, higher_is_more_similar)
        for short_name, scores, higher_is_more_similar in measures
    }

In [10]:
def cal_score(dataset, model):
    """Compute all per-measure metrics and attach the overall main score.

    The main score is the largest Average Precision (AP) found across the
    measures returned by ``compute_metrics``; it is stored in the same dict
    under ``"main_score"``, which is then returned.
    """
    scores = compute_metrics(dataset, model)

    # Collect the AP of every measure before adding the extra key.
    ap_values = [metrics["ap"] for metrics in scores.values()]
    scores["main_score"] = max(ap_values)
    return scores

In [4]:
# Identifier of the checkpoint to evaluate (looks like a Hugging Face Hub
# repo id - confirm). The captured output of this cell reports that no
# sentence-transformers model was found under this name and that a new one
# was created with MEAN pooling.
model_name = 'kornwtp/ConGen-simcse-model-roberta-base-thai'
model = SentenceTransformer(model_name)
# Cap the model's maximum sequence length at 200 (presumably tokens, with
# longer inputs truncated - confirm against the sentence-transformers docs).
model.max_seq_length = 200

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/kornwtp_ConGen-simcse-model-roberta-base-thai. Creating a new one with MEAN pooling.


# Run the evaluation with data_set = 'test' and then 'dev'

In [29]:
# XNLI TSV path template; download from https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip
path = 'XNLI-1.0/xnli.{}.tsv'
data_set = 'test'

df = pd.read_csv(path.format(data_set), sep='\t')

# Binary labels for the Thai rows only: contradiction -> '0', entailment -> '1'.
thai_rows = df['language'] == 'th'
gold_labels = df['gold_label']
df.loc[thai_rows & (gold_labels == 'contradiction'), 'label'] = '0'
df.loc[thai_rows & (gold_labels == 'entailment'), 'label'] = '1'

# Keep the Thai non-neutral pairs as [sentence1, sentence2, label] rows.
thai_pairs = df.loc[thai_rows & (gold_labels != 'neutral'), ['sentence1', 'sentence2', 'label']]
dataset = thai_pairs.values.tolist()

score = cal_score(dataset, model)
print(f"AP score:{score['cos_sim']['ap']:.4f}")


AP score:0.6666


In [30]:
# XNLI TSV path template; download from https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip
path = 'XNLI-1.0/xnli.{}.tsv'
data_set = 'dev'

df = pd.read_csv(path.format(data_set), sep='\t')

# Binary labels for the Thai rows only: contradiction -> '0', entailment -> '1'.
thai_rows = df['language'] == 'th'
gold_labels = df['gold_label']
df.loc[thai_rows & (gold_labels == 'contradiction'), 'label'] = '0'
df.loc[thai_rows & (gold_labels == 'entailment'), 'label'] = '1'

# Keep the Thai non-neutral pairs as [sentence1, sentence2, label] rows.
thai_pairs = df.loc[thai_rows & (gold_labels != 'neutral'), ['sentence1', 'sentence2', 'label']]
dataset = thai_pairs.values.tolist()

score = cal_score(dataset, model)
print(f"AP score:{score['cos_sim']['ap']:.4f}")

AP score:0.6526
