In [1]:
import simi

import pandas as pd
from transformers import set_seed, enable_full_determinism
from sentence_transformers import evaluation, losses, models, InputExample, SentenceTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

RANDOM_STATE = 1

# Seed the global RNGs before anything stochastic runs, so that DataLoader shuffling,
# the unseeded `df.sample(5)` previews and the fine-tuning itself are repeatable under
# "Restart Kernel -> Run All". Done before the model is built so that any weights that
# have to be freshly initialised while loading the checkpoint are covered as well.
set_seed(RANDOM_STATE)
# Stricter alternative to set_seed.
# NOTE(review): presumably left disabled on purpose (it may slow training down or raise on
# operations without a deterministic implementation) -- confirm before enabling.
# enable_full_determinism(RANDOM_STATE)

EMBEDDING = models.Transformer('math-similarity/Bert-MLM_arXiv') # use model hosted on huggingface
# EMBEDDING = models.Transformer('./bert+re-train_mlm_abstracts_arxiv') # use locally generated model
POOLING = models.Pooling(EMBEDDING.get_word_embedding_dimension()) # MEAN pooling
# POOLING = models.Pooling(EMBEDDING.get_word_embedding_dimension(), pooling_mode="cls") # CLS pooling

# Sentence encoder that is fine-tuned below: transformer token embeddings followed by pooling.
MODEL = SentenceTransformer(modules=[EMBEDDING, POOLING])

2023-12-11 19:21:51.830421: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Some weights of the model checkpoint at ./bert+re-train_mlm_abstracts_arxiv were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you

In [2]:
# Triplet dataset: its "a", "p" and "n" columns feed the triplet-loss objective further down.
# TODO: the mathberta tokenizer also recognizes latex within [MATH]...[/MATH] -> convert latex markup in titles accordingly
df1 = pd.read_csv(filepath_or_buffer="anchor-arxiv-dataset.csv", index_col=0)
# Preview five random rows.
df1.sample(n=5)

Unnamed: 0,title,abstract,categories,doi,a,p,n
11395,Isometries of lattices and automorphisms of K3...,The aim of this paper is to give necessary a...,"('math.NT', 'math.AG', 'math.DS')",,The aim of this paper is to give necessary an...,This result has applications applications to a...,"We show that divergence-free jacobian maps, r..."
37861,Green correspondence on centric Mackey functor...,In this paper we give a definition of centri...,"('math.RT', 'math.GR')",,In this paper we give a definition of centric...,As a means to prove the Green correspondence g...,"Because of this, it is very important to study..."
16431,On the supercritical defocusing NLW outside a ...,We study a defocusing semilinear wave equati...,"('math.AP', 'math-ph', 'math.MP')",,We study a defocusing semilinear wave equatio...,We prove that if $p>n+4$ and the initial data ...,These equations are relevant for modelling col...
26952,Densities of currents and complex dynamics,We extend the Dinh-Sibony notion of densitie...,"('math.CV', 'math.DS')",,We extend the Dinh-Sibony notion of densities...,"As an application, we introduce the notion of ...",We show that if the group of holomorphic auto...
91950,Remark on the finite-dimensional character of ...,This note shows that some assumption on smal...,"('math.ST', 'stat.TH')",,This note shows that some assumption on small...,To complete this result an example of L2 proce...,"More precisely, we address the problem of char..."


In [3]:
# 80/10/10 split: hold out 10% for testing, then keep 8/9 (~0.888889) of the remaining
# 90% for training, which leaves roughly another 10% of the full data for evaluation.
X_treval1, X_test1 = train_test_split(
    df1,
    train_size=0.9,
    random_state=RANDOM_STATE,
)
X_train1, X_eval1 = train_test_split(
    X_treval1,
    train_size=0.888889,
    random_state=RANDOM_STATE,
)
print(f"train: {len(X_train1)} eval: {len(X_eval1)} test: {len(X_test1)}")

train: 73760 eval: 9221 test: 9221


In [4]:
# Labelled title pairs ("title_a", "title_b", "label"); the label is cast to float
# because it is later passed as the target of the cosine-similarity objective.
# TODO: the mathberta tokenizer also recognizes latex within [MATH]...[/MATH] -> convert latex markup in titles accordingly
df2 = pd.read_csv("class-zbmath-dataset.csv", index_col=0).astype({"label": "float"})
# Preview five random rows.
df2.sample(n=5)

Unnamed: 0,title_a,MSC_a,MSC2_a,title_b,MSC_b,MSC2_b,label
59818,Easy computational approach to solution of sys...,65R20,"('45B05',)",Application of Lagrange interpolation for nonl...,65R20,"('45B05', '45J05', '65L05')",1.0
159263,Local and global Maass relations,11F70,"('22E50', '11F55', '11F37')",Siegel modular forms of small weight and the W...,11F46,"('11F27',)",0.0
174009,Parallel scientific computing environment for ...,68U20,"('68N99', '65Y05', '65Y20', '85-08')",CDR: A rewriting based tool to design FPLA cir...,68Q42,"('68W30', '94C10')",0.0
305223,"Anisotropic \(p,q\)-Laplacian equations when \...",35J92,"('35J25', '35B65')",Approximations to wave propagation through a l...,35P25,"('35J05',)",0.0
105926,Strategic advertising: the fat-cat effect and ...,91A10,"('91B38', '90B60', '91A80', '91B42')",Does trust matter for R\&D cooperation? A game...,91A10,"('91B24', '91B38')",1.0


In [5]:
# 80/10/10 split: hold out 10% for testing, then keep 8/9 (~0.888889) of the remaining
# 90% for training, which leaves roughly another 10% of the full data for evaluation.
X_treval2, X_test2 = train_test_split(
    df2,
    train_size=0.9,
    random_state=RANDOM_STATE,
)
X_train2, X_eval2 = train_test_split(
    X_treval2,
    train_size=0.888889,
    random_state=RANDOM_STATE,
)
print(f"train: {len(X_train2)} eval: {len(X_eval2)} test: {len(X_test2)}")

train: 351472 eval: 43935 test: 43935


In [6]:
# Labelled text pairs ("text_a", "text_b", "label"); the label is cast to float
# because it is later passed as the target of the cosine-similarity objective.
# TODO: the mathberta tokenizer also recognizes latex within [MATH]...[/MATH] -> convert latex markup in titles accordingly
df3 = pd.read_csv("class-arxiv-dataset.csv", index_col=0).astype({"label": "float"})
# Preview five random rows.
df3.sample(n=5)

Unnamed: 0,label,categories_a,text_a,categories_b,text_b
31838,0.0,"('math.HO',)",Course of analytical geometry. This book is a ...,"('math.DG',)",Helicoidal minimal surfaces of prescribed genu...
13384,1.0,"('math.CO',)",On reformulated zagreb indices with respect to...,"('math.CO',)",Cycle decompositions of pathwidth-6 graphs. Ha...
48292,1.0,"('math.RA',)",Remarks on Vector Space Generated by the Multi...,"('math.RA',)",Explicit Description of Centralizers for a Mat...
38103,0.0,"('math.NT',)",Bounds on the largest prime factor of a negati...,"('math.OC',)",Approximate dynamic programming for profit est...
18383,1.0,"('math.CT',)",Non abelian cohomology: the point of view of g...,"('math.CT',)",Modal Descent. Any modality in homotopy type t...


In [7]:
# 80/10/10 split: hold out 10% for testing, then keep 8/9 (~0.888889) of the remaining
# 90% for training, which leaves roughly another 10% of the full data for evaluation.
X_treval3, X_test3 = train_test_split(
    df3,
    train_size=0.9,
    random_state=RANDOM_STATE,
)
X_train3, X_eval3 = train_test_split(
    X_treval3,
    train_size=0.888889,
    random_state=RANDOM_STATE,
)
print(f"train: {len(X_train3)} eval: {len(X_eval3)} test: {len(X_test3)}")

train: 43572 eval: 5447 test: 5447


In [8]:
# Validation evaluator: title pairs from the eval split of the zbMATH dataset,
# each wrapped as an InputExample together with its label.
eval_examples2 = (
    X_eval2
    .reset_index(drop=True)
    .apply(
        lambda row: InputExample(texts=[row["title_a"], row["title_b"]], label=row["label"]),
        axis=1,
    )
)
# Disabled alternative: also include the arXiv text pairs and shuffle the combined examples.
#eval_examples3 = X_eval3.reset_index(drop=True).apply(lambda r: InputExample(texts=[r["text_a"], r["text_b"]], label=r["label"]), axis=1)
#eval_examples = pd.concat([eval_examples2, eval_examples3], ignore_index=True)
#eval_examples = eval_examples.sample(len(eval_examples)).reset_index(drop=True)
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples2)

In [9]:
# Training objective 1: triplets built from the "a", "p" and "n" columns of the
# anchor-arxiv training split, optimised with a triplet loss.
# The index is reset first -- presumably so that the DataLoader's integer lookups
# match the Series labels after the shuffled split (TODO confirm).
retrain_examples1 = (
    X_train1
    .reset_index(drop=True)
    .apply(lambda row: InputExample(texts=[row["a"], row["p"], row["n"]]), axis=1)
)
retrain_dataloader1 = DataLoader(dataset=retrain_examples1, batch_size=8, shuffle=True)
retrain_loss1 = losses.TripletLoss(model=MODEL, triplet_margin=5) # TODO: triplet_margin: hyperparameter to optimize

In [10]:
# Training objective 2: labelled title pairs from the zbMATH training split,
# optimised with a cosine-similarity loss against the float label.
# The index is reset first -- presumably so that the DataLoader's integer lookups
# match the Series labels after the shuffled split (TODO confirm).
retrain_examples2 = (
    X_train2
    .reset_index(drop=True)
    .apply(
        lambda row: InputExample(texts=[row["title_a"], row["title_b"]], label=row["label"]),
        axis=1,
    )
)
retrain_dataloader2 = DataLoader(dataset=retrain_examples2, batch_size=8, shuffle=True)
retrain_loss2 = losses.CosineSimilarityLoss(model=MODEL)

In [11]:
# Training objective 3: labelled text pairs from the class-arxiv training split,
# optimised with a cosine-similarity loss against the float label.
# The index is reset first -- presumably so that the DataLoader's integer lookups
# match the Series labels after the shuffled split (TODO confirm).
retrain_examples3 = (
    X_train3
    .reset_index(drop=True)
    .apply(
        lambda row: InputExample(texts=[row["text_a"], row["text_b"]], label=row["label"]),
        axis=1,
    )
)
retrain_dataloader3 = DataLoader(dataset=retrain_examples3, batch_size=8, shuffle=True)
retrain_loss3 = losses.CosineSimilarityLoss(model=MODEL)

In [None]:
%%time
# finetune model
MODEL.fit(train_objectives=[(retrain_dataloader3, retrain_loss3), (retrain_dataloader2, retrain_loss2), (retrain_dataloader1, retrain_loss1)],
          evaluator=evaluator, epochs=20,
        output_path="bert+re-train_mlm_abstracts_arxiv+mean-pooling+retrain_class_arxiv_class_zbmath_anchor_arxiv") 

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5447 [00:00<?, ?it/s]

In [None]:
%%time
# evaluate Spearman-Pearson-rank-coefficient on test data
spear_ranc = MODEL.evaluate(evaluator)
spear_ranc

In [None]:
# Re-display the score stored in `spear_ranc` by the evaluation cell above.
spear_ranc