In [1]:
import simi

import pandas as pd
from sentence_transformers import evaluation, losses, models, InputExample, SentenceTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Seed handed as `random_state` to the train/eval/test splits in this notebook.
RANDOM_STATE = 1

# Pretrained sentence-embedding checkpoint. This single object is what the loss
# functions wrap and what `MODEL.fit(...)` fine-tunes further down.
MODEL = SentenceTransformer("all-mpnet-base-v2")

2024-01-08 11:58:42.595212: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Load the arXiv triplet dataset. The columns `a`, `p`, `n` are the three texts
# later fed (in that order) to the triplet loss.
df1 = pd.read_csv("anchor-arxiv-dataset.csv", index_col=0)
# TODO: the mathberta tokenizer also recognizes LaTeX within [MATH]...[/MATH] -> convert LaTeX markup in titles accordingly
# Seeded preview: without `random_state` the displayed rows change on every
# Restart & Run All, so the saved output could never be reproduced.
df1.sample(5, random_state=RANDOM_STATE)

Unnamed: 0,title,abstract,categories,doi,a,p,n
91518,Robust detection of exotic infectious diseases...,When animals are transported and pass throug...,"('stat.ME', 'math.ST', 'stat.TH')",10.1016/j.ijar.2012.06.020,When animals are transported and pass through...,"Secondly, we explore and compare three decisio...",Comparisons between MRAD method and other clas...
29310,Contact Geometry of Hyperbolic Equations of Ge...,We study the contact geometry of scalar seco...,"('math.DG', 'math-ph', 'math.AP', 'math.MP')",10.3842/SIGMA.2008.058,An enumeration of several submaximally symmetr...,We study the contact geometry of scalar secon...,The classification depends on the causal chara...
35741,Embeddings of Orlicz-Lorentz spaces into $L_1$,"In this article, we show that Orlicz-Lorentz...","('math.FA', 'math.CO')",,Our approach is based on combinatorial averagi...,This includes the embedding of some Lorentz sp...,We give an alternative proof of W. T. Gowers'...
61580,Fell bundles associated to groupoid morphisms,Given a continuous open surjective morphism ...,"('math.OA',)",,"The case $H=X$, a locally compact space, was t...",We conclude that $C^*_r(G)$ is strongly Morita...,It turns out that this relative K-homology car...
54172,A multisymplectic approach to defects in integ...,We introduce the concept of multisymplectic ...,"('math-ph', 'hep-th', 'math.MP', 'nlin.SI')",10.1007/JHEP02(2015)088,"Taking the nonlinear Schr\""odinger (NLS) equat...",It allows us to reinterpret the defect conditi...,We determine various new equivalence pairs for...


In [3]:
# Two-stage split of the triplet data:
#   1. hold out 10% of all rows as the test set;
#   2. keep 0.888889 (~8/9) of the remaining rows for training, the rest for eval.
# The recorded output shows eval and test ending up the same size.
X_treval1, X_test1 = train_test_split(
    df1,
    train_size=0.9,
    random_state=RANDOM_STATE,
)
X_train1, X_eval1 = train_test_split(
    X_treval1,
    train_size=0.888889,
    random_state=RANDOM_STATE,
)
print("train:", len(X_train1), "eval:", len(X_eval1), "test:", len(X_test1))

train: 73760 eval: 9221 test: 9221


In [4]:
# Load the zbMATH title-pair dataset and cast `label` to float in the same
# expression, so `df2` is never observable in a half-prepared state.
# NOTE(review): in the recorded sample, label is 1.0 where MSC_a == MSC_b and
# 0.0 otherwise -- confirm that this is the labelling rule.
df2 = pd.read_csv("class-zbmath-dataset.csv", index_col=0).astype({"label": "float"})
# TODO: the mathberta tokenizer also recognizes LaTeX within [MATH]...[/MATH] -> convert LaTeX markup in titles accordingly
# Seeded preview: without `random_state` the displayed rows change on every
# Restart & Run All, so the saved output could never be reproduced.
df2.sample(5, random_state=RANDOM_STATE)

Unnamed: 0,title_a,MSC_a,MSC2_a,title_b,MSC_b,MSC2_b,label
350089,Boundary-value problems for general elliptic s...,35J55,"('35Q15', '47A53', '30E25')",Fredholm property of a class of boundary-value...,35J55,"('35R20', '47A53', '47F05')",1.0
232641,Geometric realization of \(\gamma \)-vectors o...,05E45,"('52B05',)",Edgewise Cohen-Macaulay connectivity of partia...,05E45,"('06A07', '52B05')",1.0
415781,Lusternik-Schnirelmann category and systolic c...,55M30,"('57T99', '55S30')",On the Lusternik-Schnirelmann category of Lie ...,55M30,"('57M05', '57T99')",1.0
250446,Basic derivations for unitary reflection groups,20H15,"('51F15', '15A72', '16W20')",On the non-negativity of the first coefficient...,20G05,"('20F55',)",0.0
157295,A little more on stabilized \(Q_ 1Q_ 1\) for t...,76M10,"('76D05',)",Efficient preconditioning of the linearized Na...,76M10,"('76D05', '65M55')",1.0


In [5]:
# Same two-stage split as for the triplet data, applied to the title pairs:
#   1. hold out 10% of all rows as the test set;
#   2. keep 0.888889 (~8/9) of the remaining rows for training, the rest for eval.
X_treval2, X_test2 = train_test_split(
    df2,
    train_size=0.9,
    random_state=RANDOM_STATE,
)
X_train2, X_eval2 = train_test_split(
    X_treval2,
    train_size=0.888889,
    random_state=RANDOM_STATE,
)
print("train:", len(X_train2), "eval:", len(X_eval2), "test:", len(X_test2))

train: 351472 eval: 43935 test: 43935


In [6]:
# Re-training objective 1: triplet loss on the arXiv (a, p, n) texts.
def _row_to_triplet(row):
    """Wrap one row's `a`, `p`, `n` texts (in that order) in an unlabeled InputExample."""
    return InputExample(texts=[row["a"], row["p"], row["n"]])


# The index is reset before building the examples -- presumably so that the
# DataLoader's integer lookups hit labels 0..n-1 of the resulting Series.
retrain_examples1 = X_train1.reset_index(drop=True).apply(_row_to_triplet, axis=1)
retrain_dataloader1 = DataLoader(retrain_examples1, shuffle=True, batch_size=16)
# TODO: triplet_margin: hyperparameter to optimize
retrain_loss1 = losses.TripletLoss(model=MODEL, triplet_margin=5)

In [7]:
# Evaluation data: labelled title pairs from the eval split of dataset 2.
def _row_to_eval_pair(row):
    """Wrap one row's two titles and its float `label` in an InputExample."""
    return InputExample(texts=[row["title_a"], row["title_b"]], label=row["label"])


eval_examples = X_eval2.reset_index(drop=True).apply(_row_to_eval_pair, axis=1)
# This evaluator is the one handed to `MODEL.fit(...)` for scoring during training.
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples)

In [8]:
# Re-training objective 2: cosine-similarity loss on labelled zbMATH title pairs.
def _row_to_train_pair(row):
    """Wrap one row's two titles and its float `label` in an InputExample."""
    return InputExample(texts=[row["title_a"], row["title_b"]], label=row["label"])


# The index is reset before building the examples -- presumably so that the
# DataLoader's integer lookups hit labels 0..n-1 of the resulting Series.
retrain_examples2 = X_train2.reset_index(drop=True).apply(_row_to_train_pair, axis=1)
retrain_dataloader2 = DataLoader(retrain_examples2, shuffle=True, batch_size=16)
retrain_loss2 = losses.CosineSimilarityLoss(model=MODEL)

In [None]:
%%time
# finetune model
MODEL.fit(train_objectives=[(retrain_dataloader2, retrain_loss2), (retrain_dataloader1, retrain_loss1)], evaluator=evaluator, epochs=15,
        output_path="sbert+retrain_class_zbmath_anchor_arxiv") 

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4610 [00:00<?, ?it/s]

In [12]:
%%time
# evaluate Spearman-Pearson-rank-coefficient on test data
spear_ranc = MODEL.evaluate(evaluator)
spear_ranc

CPU times: user 10min 7s, sys: 35.4 s, total: 10min 43s
Wall time: 1min 30s


0.597061166377549

In [13]:
# Re-display the score stored in `spear_ranc` by the evaluation cell (no recomputation).
spear_ranc

0.597061166377549