In [1]:
import sys
import os
sys.path.append(os.path.abspath("../"))
import simi

import pandas as pd
from sentence_transformers import evaluation, losses, models, InputExample, SentenceTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Token-embedding module, loaded from the model hosted on huggingface.
EMBEDDING = models.Transformer('math-similarity/Bert-MLM_arXiv')
# EMBEDDING = models.Transformer('./bert+re-train_mlm_abstracts_arxiv') # use locally generated model

# Pooling module sized to the embedding dimension reported by EMBEDDING (MEAN pooling).
POOLING = models.Pooling(
    word_embedding_dimension=EMBEDDING.get_word_embedding_dimension(),
)
# POOLING = models.Pooling(EMBEDDING.get_word_embedding_dimension(), pooling_mode="cls") # CLS pooling

# Sentence encoder = transformer embeddings followed by pooling.
MODEL = SentenceTransformer(modules=[EMBEDDING, POOLING])

# Seed passed to the train/eval/test splits.
RANDOM_STATE = 1

2023-05-10 16:26:20.611782: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Some weights of the model checkpoint at ./bert+re-train_mlm_abstracts_arxiv were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you

In [2]:
# Load the triplet dataset; the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    "anchor-arxiv-dataset.csv",
    index_col=0,
)
# TODO: the mathberta tokenizer also recognizes LaTeX within [MATH]...[/MATH]
#       -> convert the LaTeX markup in titles accordingly
# Show a handful of random rows as a sanity check.
df.sample(5)

Unnamed: 0,title,abstract,categories,doi,a,p,n
40067,The Disk-Based Origami Theorem and a Glimpse o...,This paper describes a mechanism by which a ...,"('math.GT',)",,The fibers of $O$ have the cardinality $(n+1)$...,"The knowledge of the map $O$, together with th...",A homotopy equivalence between a hyperbolic 3...
36564,On the gradient of Schwarz symmetrization of f...,Let S be a Sobolev or Orlicz-Sobolev space o...,"('math.AP', 'math.FA')",,We also prove that the rearrangement of any fu...,Let S be a Sobolev or Orlicz-Sobolev space of...,Then there exists a relation between the assoc...
68602,Stochastic Evolution Equation Driven by Teugel...,The paper is concerned with a class of stoch...,"('math.PR',)",,Here Teugels martingales are a family of pairw...,The second is to establish the stochastic maxi...,"It is shown that, up to terms of lower order, ..."
85933,TITAN: A Spatiotemporal Feature Learning Frame...,Critical incident stages identification and ...,"('cs.LG', 'stat.ML')",10.1145/3347146.3359381,We develop an algorithm based on the alternati...,"Second, we propose novel constraints on featur...","However, there are often several plausible inp..."
34556,Irregular and multi--channel sampling of opera...,The classical sampling theorem for bandlimit...,"('math.FA', 'math.CA')",,"Here, we discuss operator sampling versions of...","In multi-channel operator sampling, we obtain ...","This is a corrected version of the paper [1], ..."


In [3]:
# Two-stage split: first hold out the test set (10%), then carve the eval set
# out of the remaining train+eval portion.
X_treval, X_test = train_test_split(
    df,
    random_state=RANDOM_STATE,
    train_size=0.9,
)
X_train, X_eval = train_test_split(
    X_treval,
    random_state=RANDOM_STATE,
    train_size=0.888889,
)
print(f"train: {len(X_train)} eval: {len(X_eval)} test: {len(X_test)}")

train: 73760 eval: 9221 test: 9221


In [4]:
# Evaluation data: one InputExample per row of X_eval, built from the
# columns "a", "p", "n" (presumably anchor / positive / negative).
eval_examples = (
    X_eval
    .reset_index(drop=True)
    .apply(
        lambda row: InputExample(texts=[row["a"], row["p"], row["n"]]),
        axis="columns",
    )
)
# Triplet evaluator over the eval split; it is handed to MODEL.fit() below.
evaluator = evaluation.TripletEvaluator.from_input_examples(eval_examples)

In [5]:
# Re-training inputs: one InputExample per row of X_train, built from the
# columns "a", "p", "n" (presumably anchor / positive / negative).
retrain_examples = (
    X_train
    .reset_index(drop=True)  # positional 0..n-1 index
    .apply(
        lambda row: InputExample(texts=[row["a"], row["p"], row["n"]]),
        axis="columns",
    )
)
# Shuffled mini-batches of 16 examples.
retrain_dataloader = DataLoader(
    dataset=retrain_examples,
    batch_size=16,
    shuffle=True,
)
# Triplet loss on MODEL.
# TODO: triplet_margin is a hyperparameter that still needs to be optimized.
retrain_loss = losses.TripletLoss(
    model=MODEL,
    triplet_margin=5,
)

In [6]:
%%time
# finetune model
MODEL.fit(train_objectives=[(retrain_dataloader, retrain_loss)], evaluator=evaluator, epochs=10,
        output_path="bert+re-train_mlm_abstracts_arxiv+mean-pooling+retrain_anchor_arxiv") 

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

CPU times: user 6h 49min 11s, sys: 1h 48min 44s, total: 8h 37min 55s
Wall time: 5h 1min 40s


In [7]:
%%time
# evaluate Spearman-Pearson-rank-coefficient on test data
spear_ranc = MODEL.evaluate(evaluator)
spear_ranc

CPU times: user 3min 20s, sys: 12 s, total: 3min 32s
Wall time: 54.2 s


0.897516538336406