In [None]:
# !pip install -U sentence-transformers

In [None]:
import math
from datetime import datetime

import pandas as pd
from sentence_transformers import (
    InputExample,
    LoggingHandler,
    SentenceTransformer,
    evaluation,
    losses,
    models,
    util,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [None]:
# Load the named sentence-transformers model. This `model` object is the one the
# contrastive loss is attached to and the one `model.fit(...)` trains later on.
model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

In [None]:
# Load the corpus. Later cells read the columns "partition", "sentence", "verb",
# "y" and "genre" from this frame.
df = pd.read_csv("vuamc.csv")


def preprocess(df):
    """Split the corpus into train, dev and test frames.

    Rows whose ``partition`` is ``"train"`` are divided into train and dev
    (``test_size=0.33``) with a fixed ``random_state`` so the split is
    reproducible. Rows whose ``partition`` is ``"test"`` form the test frame.
    Every returned frame keeps only the ``sentence``, ``verb`` and ``y``
    columns, so all three share one schema.

    Args:
        df: DataFrame with at least the columns ``partition``, ``sentence``,
            ``verb`` and ``y``.

    Returns:
        Tuple ``(train, test, dev)`` -- note that dev comes last.
    """
    columns = ["sentence", "verb", "y"]
    train_rows = df[df["partition"] == "train"]
    test_rows = df[df["partition"] == "test"]
    train_rows, dev_rows = train_test_split(
        train_rows, test_size=0.33, random_state=42
    )
    # dev is projected as well; previously it was the only frame returned with
    # every input column still attached.
    return train_rows[columns], test_rows[columns], dev_rows[columns]

In [None]:
def create_training_sample(train, test, dev):
    """Build the dataloader, loss and dev evaluator used for fine-tuning.

    Each row becomes a ``[sentence, verb]`` text pair labelled with ``y``.
    Training pairs are wrapped in a shuffling ``DataLoader``. Dev pairs feed a
    ``SequentialEvaluator`` that runs a ``BinaryClassificationEvaluator``
    followed by an ``EmbeddingSimilarityEvaluator`` and reports the score of
    the last evaluator in the list.

    Reads the notebook-level globals ``model``, ``train_batch_size``,
    ``distance_metric`` and ``margin``; they must be defined before the call.

    Args:
        train: DataFrame with ``sentence``, ``verb`` and ``y`` columns.
        test: Unused; kept so existing call sites keep working.
        dev: DataFrame with the same three columns as ``train``.

    Returns:
        Tuple ``(train_dataloader, train_loss, dev_evaluator)``.
    """
    # Labels are cast to plain int for both splits so train and dev are
    # encoded the same way.
    train_samples = [
        InputExample(texts=[sentence, verb], label=int(label))
        for sentence, verb, label in zip(
            train["sentence"], train["verb"], train["y"]
        )
    ]
    train_dataloader = DataLoader(
        train_samples, shuffle=True, batch_size=train_batch_size
    )
    # NOTE(review): the loss is attached to the global `model`; confirm that is
    # the model intended for fine-tuning.
    train_loss = losses.OnlineContrastiveLoss(
        model=model, distance_metric=distance_metric, margin=margin
    )

    dev_sentences1 = dev["sentence"].tolist()
    dev_sentences2 = dev["verb"].tolist()
    dev_labels = [int(label) for label in dev["y"]]

    evaluators = [
        evaluation.BinaryClassificationEvaluator(
            dev_sentences1, dev_sentences2, dev_labels
        ),
        evaluation.EmbeddingSimilarityEvaluator(
            dev_sentences1, dev_sentences2, dev_labels
        ),
    ]
    # The main score is whatever the last evaluator in the list returns.
    dev_evaluator = evaluation.SequentialEvaluator(
        evaluators, main_score_function=lambda scores: scores[-1]
    )
    return train_dataloader, train_loss, dev_evaluator

In [None]:
# Where fit() writes its output; the evaluation cell reloads the model from here.
model_save_path = "output/metaphor"
train_batch_size = 16

# Contrastive-loss settings read by create_training_sample().
margin = 0.5
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE

# Transformer backbone (alternative considered: 'xlm-roberta-base').
# NOTE(review): the loss and model.fit() operate on the global `model`, not on
# this backbone -- confirm which one is meant to be fine-tuned.
model_name = "bert-base-uncased"
word_embedding_model = models.Transformer(model_name)

# Split the corpus, then build the training inputs from the splits.
train, test, dev = preprocess(df)
train_dataloader, train_loss, dev_evaluator = create_training_sample(train, test, dev)

In [None]:
num_epochs = 5
# Total training steps = batches per epoch * number of epochs.
warmup_steps = math.ceil(
    len(train_dataloader) * num_epochs * 0.1
)  # 10% of all training steps (not of the data), rounded up

In [None]:
# Fine-tune `model` with the contrastive objective built above. `dev_evaluator`
# is passed as the evaluator with evaluation_steps=1000, and model_save_path is
# passed as the output location that the evaluation cell later loads from.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

In [None]:
all_genre = ["news", "fiction", "academic", "conversation"]
# Re-select the test rows from df: the test frame returned by preprocess()
# keeps only sentence/verb/y and therefore has no "genre" column.
test = df[df["partition"] == "test"]
# Load the saved model once; it is the same for every genre.
model = SentenceTransformer(model_save_path)
for genre in all_genre:
    genre_test = test[test["genre"] == genre]
    if genre_test.empty:
        # Nothing to evaluate for this genre; report it instead of failing.
        print(f"Genre : {genre}\tScore : n/a (no test rows)")
        continue
    test_samples = [
        InputExample(texts=[sentence, verb], label=int(label))
        for sentence, verb, label in zip(
            genre_test["sentence"], genre_test["verb"], genre_test["y"]
        )
    ]
    # The genre is part of the evaluator name so that results written under
    # output_path can be told apart per genre.
    test_evaluator = evaluation.BinaryClassificationEvaluator.from_input_examples(
        test_samples, batch_size=train_batch_size, name=f"metaphor-test-{genre}"
    )
    score = test_evaluator(model, output_path=model_save_path)
    print(f"Genre : {genre}\tScore : {score}")

In [None]:
# Output of the per-genre evaluation cell above, recorded from an earlier run:
# Genre : news	Score : 0.7593295445439239
# Genre : fiction	Score : 0.583352792743637
# Genre : academic	Score : 0.8010891703480427
# Genre : conversation	Score : 0.4309286215122053