In [1]:
from typing import List, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import CrossEncoder, InputExample
from sklearn.model_selection import train_test_split

In [2]:
def load_data(path: str = "data/preprocessed.feather") -> Tuple[List[InputExample], List[InputExample], List[InputExample]]:
    """Load sentence pairs from a feather file and split them three ways.

    The file must provide the columns ``sen1``, ``sen2`` and ``score``. Each
    row becomes an ``InputExample`` whose texts are the two sentences and
    whose label is ``score / 5``.

    Args:
        path: Location of the feather file to read.

    Returns:
        ``(train, valid, test)`` lists of ``InputExample``. 10% of the rows
        are held out as ``test``; the remaining 90% are split 80/20 into
        ``train`` and ``valid``. Both splits use ``random_state=0``, so the
        partition is the same on every call for the same input file.
    """
    ds = pd.read_feather(path)
    # Rescale into a new Series instead of writing back into the frame.
    # NOTE(review): dividing by 5 presumably maps a 0-5 scale to 0-1 - confirm.
    labels = ds["score"] / 5
    # Row order is preserved, so the seeded splits below stay reproducible.
    data = [InputExample(texts=[s1, s2], label=score)
            for s1, s2, score in zip(ds["sen1"], ds["sen2"], labels)]
    train, test = train_test_split(data, train_size=.9, random_state=0)
    train, valid = train_test_split(train, train_size=.8, random_state=0)
    return train, valid, test

# Only the held-out test split is needed here; train/valid are discarded.
_, _, test = load_data("../data/preprocessed.feather")
# Separate each example into its sentence pair and its gold label.
sentences = [example.texts for example in test]
scores = [example.label for example in test]

In [3]:
# Identifier of the cross-encoder to evaluate.
model_name = "cross-encoder/stsb-roberta-large"
# The model is loaded from a directory under ../models/ named after it.
# NOTE(review): presumably a locally saved (fine-tuned) checkpoint - confirm.
model_path = f"../models/{model_name}/"

model = CrossEncoder(model_path)

In [4]:
# Score every test sentence pair with the loaded cross-encoder.
predictions = model.predict(sentences, convert_to_numpy=True)
# Off-diagonal entry of the 2x2 correlation matrix is the Pearson r
# between the predictions and the gold scores.
pearson = np.corrcoef(predictions, scores)[0, 1]
print(f"Pearson correlation: {pearson: .4f}")

Pearson correlation:  0.9270


In [5]:
# One row per test pair: both sentences, the gold score, the prediction and
# the absolute difference between the two. Note that "mae" holds the
# per-pair absolute error, not an average over the dataset.
df = pd.DataFrame(sentences, columns=["sen1", "sen2"]).assign(
    scores=scores,
    preds=predictions,
    mae=np.abs(predictions - scores),
)

In [6]:
# Order the pairs by absolute error, largest first, so the worst
# predictions are at the top of the frame.
df = df.sort_values(by="mae", ascending=False)
# Show the ten pairs with the largest error.
df.head(10)

Unnamed: 0,sen1,sen2,scores,preds,mae
2,the act of substituting one thing for another,the act of contacting one thing with another.,0.16,0.687839,0.527839
158,geography as defined by its use.,a district that has been developed to serve so...,0.48,0.009411,0.470589
224,"intuitions, a hunch or feeling","maintain (a theory, thoughts, or feelings).",0.28,0.745484,0.465484
358,an agent or entity becomes sufficient enough t...,fill or meet a want or need,0.24,0.663957,0.423957
347,the act of substituting one thing for another,the act of adding one thing to another.,0.28,0.682764,0.402764
285,Cause to move forward with force.,vary or move from a fixed point or course.,0.76,0.363971,0.396029
307,an open motorboat used for transport,a motorboat with an open deck or a half deck.,0.45,0.819588,0.369588
83,"the reduction of the extent of something, e.g,...",change toward something smaller or lower.,0.88,0.516474,0.363526
301,Generalise the structure of an optimal solutio...,Use these optimal solutions to construct an op...,0.2,0.540433,0.340433
0,(cause to) appear suddenly or briefly,break open or apart suddenly and forcefully.,0.08,0.415022,0.335022


In [7]:
# Characters with a special meaning in LaTeX text mode, mapped to their
# escaped form. Applied with str.translate, which substitutes in a single
# pass, so the braces/backslashes in the replacements are never re-escaped.
_LATEX_ESCAPES = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def to_latex(df: pd.DataFrame) -> None:
    """Print the rows of ``df`` as a four-column LaTeX ``tabular``.

    Args:
        df: Frame with the string columns ``sen1`` and ``sen2`` and the
            numeric columns ``scores`` and ``preds``. Any other columns are
            ignored.

    Returns:
        None. The table is written to standard output: a header row, then
        one row per frame row (in frame order), each followed by ``\\hline``.
        Sentences have LaTeX special characters escaped; the two numbers are
        formatted with two decimals.
    """
    # Each entry is one complete table row including its trailing \hline.
    body_rows = [
        f"\t{sen1.translate(_LATEX_ESCAPES)} & {sen2.translate(_LATEX_ESCAPES)}"
        f" & {score: .2f} & {pred: .2f}\\\\\n\t\\hline\n"
        for sen1, sen2, score, pred in zip(df["sen1"], df["sen2"], df["scores"], df["preds"])
    ]
    # Backslashes are written explicitly doubled: sequences such as "\h" or
    # "\e" are invalid string escapes and warn on recent Python versions.
    header = ("\\begin{tabular}{|p{5cm}|p{5cm}|p{1cm}|p{1.2cm}|}\n"
              "    \\hline\n"
              "    Sentence1 & Sentence2 & Score & Prediction \\\\\n"
              "    \\hline\\hline\n")
    footer = "\\end{tabular}\n"
    print(header + "".join(body_rows) + footer)

# Emit the LaTeX table for the first ten rows of df.
to_latex(df.head(10))

\begin{tabular}{|p{5cm}|p{5cm}|p{1cm}|p{1.2cm}|}
    \hline
    Sentence1 & Sentence2 & Score & Prediction \\
    \hline\hline
	the act of substituting one thing for another & the act of contacting one thing with another. &  0.16 &  0.69\\
	\hline
	geography as defined by its use. & a district that has been developed to serve some purpose; &  0.48 &  0.01\\
	\hline
	intuitions, a hunch or feeling & maintain (a theory, thoughts, or feelings). &  0.28 &  0.75\\
	\hline
	an agent or entity becomes sufficient enough to meet a standard. alternatively, an action or occurrence can be thought of as event that fulfills a standard. in this frame, the agent, entity, or event can be conceptualized as being on a scale where a change of value or circumstances brings it in line with the position of the standard on the scale. & fill or meet a want or need &  0.24 &  0.66\\
	\hline
	the act of substituting one thing for another & the act of adding one thing to another. &  0.28 &  0.68\\
	\hline
	Cause 