In [1]:
import simi

import pandas as pd
import random
from sentence_transformers import evaluation, losses, models, InputExample, SentenceTransformer
from sklearn.model_selection import train_test_split
import sqlalchemy
from tqdm.auto import tqdm
tqdm.pandas()

# Seed value for this notebook.
RANDOM_STATE = 1

# Seeds Python's built-in `random` module only. NOTE(review): other RNGs
# (e.g. NumPy's, which pandas sampling draws from by default) are not
# affected by this call.
random.seed(RANDOM_STATE)

2023-12-16 20:57:50.179623: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Load the zbMATH titles. The first CSV column is read as the index and then
# discarded by reset_index(drop=True), so rows end up labelled 0..n-1.
df = pd.read_csv("titles-zbmath-dataset-2000-2019.csv", index_col=0).reset_index(drop=True)

# Seeded preview: without random_state a different sample is drawn on every
# run (random.seed() does not influence pandas sampling).
df.sample(5, random_state=RANDOM_STATE)

Unnamed: 0,title,MSC,MSC2
704788,On the Grüss inequality for unital 2-positive ...,46L05,"('47A63', '47B65')"
333538,Modulated exploratory dynamics can shape self-...,93C40,"('37N25', '68T05')"
193915,Numerical simulation of the viscous nonequilib...,76M20,"('76V05', '76L05')"
54174,Liftings and mean value theorems for automorph...,11F66,"('11M41',)"
283941,Robust stability criteria for uncertain neutra...,93D09,"('93C20', '34K20')"


In [3]:
# Row count of the loaded dataset.
len(df)

1037993

## Model definitions

In [9]:
# Device every model in this notebook is loaded onto. Kept in one place so the
# notebook can be pointed at another GPU (or "cpu") with a single edit.
DEVICE = "cuda:1"

# Prerequisites for the models
# Each `*_base` is a bare transformer; wrapping it with a `models.Pooling`
# module turns its token embeddings into one fixed-size vector per input.
# `models.Pooling` is used with its default mode unless `pooling_mode` is given
# (the "_mp" / "+MP" naming suggests mean pooling - TODO confirm).
bert_base = models.Transformer('bert-base-uncased')
bert_mp = SentenceTransformer(modules=[bert_base, models.Pooling(bert_base.get_word_embedding_dimension())], device=DEVICE)

mathbert_base = models.Transformer('witiko/mathberta')
# NOTE(review): `mathbert_mp` is built but not registered in `eval_models`
# below; only the CLS-pooled variant is evaluated as the untuned MathBERT
# baseline. Confirm that this is intended.
mathbert_mp = SentenceTransformer(modules=[mathbert_base, models.Pooling(mathbert_base.get_word_embedding_dimension())], device=DEVICE)
mathbert_cls = SentenceTransformer(modules=[mathbert_base, models.Pooling(mathbert_base.get_word_embedding_dimension(), pooling_mode="cls")], device=DEVICE)

bert_mlm_base = models.Transformer("./bert+re-train_mlm_abstracts_arxiv")
bert_mlm_mp = SentenceTransformer(modules=[bert_mlm_base, models.Pooling(bert_mlm_base.get_word_embedding_dimension())], device=DEVICE)

sbert = SentenceTransformer('all-mpnet-base-v2', device=DEVICE)
# alternative: sbert = SentenceTransformer('all-distilroberta-v1')

specter2_base = models.Transformer('allenai/specter2_base')

# Definition of models that are evaluated
# combine models into dict
# Keys are display names; the embedding cell uses them verbatim in the column
# names "embedding (<key>)", so they are left exactly as they were (including
# the mixed "Specter 2" / "Specter 2.0" spelling). Relative "./..." paths are
# locally stored checkpoints. All models are instantiated eagerly on DEVICE.
eval_models = {
    "Bert+MP": bert_mp,
    "Bert+MP+class-arx": SentenceTransformer('./bert+mean-pooling+retrain_class_arxiv', device=DEVICE),
    "Bert+MP+class-zbm": SentenceTransformer('./bert+mean-pooling+retrain_class_zbmath', device=DEVICE),

    "Mathbert+CLS": mathbert_cls,
    "Mathbert+MP+class-arx": SentenceTransformer('./mathbert+mean-pooling+retrain_class_arxiv', device=DEVICE),
    "Mathbert+MP+class-zbm": SentenceTransformer('./mathbert+mean-pooling+retrain_class_zbmath', device=DEVICE),

    "Bert+TSDAE+MP": SentenceTransformer('./bert+mean-pooling+re-train_tsdae_abstracts_arxiv', device=DEVICE),
    "Bert+TSDAE+MP+class-arx": SentenceTransformer('./bert+mean-pooling+re-train_tsdae_abstracts_arxiv+retrain_class_arxiv', device=DEVICE),
    "Bert+TSDAE+MP+class-zbm": SentenceTransformer('./bert+mean-pooling+re-train_tsdae_abstracts_arxiv+retrain_class_zbmath', device=DEVICE),

    "Bert+MLM+MP": bert_mlm_mp,
    "Bert+MLM+MP+class-arx": SentenceTransformer('./bert+re-train_mlm_abstracts_arxiv+mean-pooling+retrain_class_arxiv', device=DEVICE),
    "Bert+MLM+MP+class-zbm": SentenceTransformer('./bert+re-train_mlm_abstracts_arxiv+mean-pooling+retrain_class_zbmath', device=DEVICE),
    "Bert+MLM+MP+class-zbm+anch-arx": SentenceTransformer('./bert+re-train_mlm_abstracts_arxiv+mean-pooling+retrain_class_zbmath_anchor_arxiv', device=DEVICE),
    "Bert+MLM+MP+class-arx+anch-arx+class-zbm": SentenceTransformer('./bert+re-train_mlm_abstracts_arxiv+mean-pooling+retrain_class_arxiv_anchor_arxiv_class_zbmath', device=DEVICE),

    "SBert": sbert,
    "SBert+ret_class-arx": SentenceTransformer('./sbert+retrain_class_arxiv', device=DEVICE),
    "SBert+ret_class-zbm": SentenceTransformer('./sbert+retrain_class_zbmath', device=DEVICE),

    "Specter 2+MP": SentenceTransformer(modules=[specter2_base, models.Pooling(specter2_base.get_word_embedding_dimension())], device=DEVICE),
    "Specter 2.0+MP+class-arx": SentenceTransformer('./specter2+mp+retrain_class_arxiv', device=DEVICE),
    "Specter 2.0+MP+class-zbm": SentenceTransformer('./specter2+mp+retrain_class_zbmath', device=DEVICE),
    "Specter 2+MP+class-zbm+anch-arx": SentenceTransformer('./specter2+mp+retrain_class_zbmath_anchor_arxiv', device=DEVICE),
    "Specter 2+MP+anch-arx+class-arx+class-zbm": SentenceTransformer('./specter2+mp+retrain_anchor_arxiv_class_arxiv_class_zbmath', device=DEVICE),

    "SGPT": SentenceTransformer('Muennighoff/SGPT-125M-weightedmean-nli-bitfit', device=DEVICE),
    "SGPT+class-arx": SentenceTransformer('./gpt+retrain_class_arxiv', device=DEVICE),
    "SGPT+class-zbm": SentenceTransformer('./gpt+retrain_class_zbmath', device=DEVICE),
}

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at witiko/mathberta were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight'

In [5]:
%%time

with tqdm(total=len(eval_models), desc="Models") as pb:
    for n,m in eval_models.items():
        df["embedding ({})".format(n)] = pd.Series(list(simi.model_embeddings(m, df["title"], show_progress_bar=True)))
        pb.update()

Models:   0%|          | 0/24 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

Batches:   0%|          | 0/32438 [00:00<?, ?it/s]

CPU times: user 1d 12h 21min 59s, sys: 2h 11min 49s, total: 1d 14h 33min 49s
Wall time: 6h 15min 25s


In [6]:
# Seeded preview of the frame with its embedding columns, so the rows shown
# are the same on every run.
df.sample(5, random_state=RANDOM_STATE)

Unnamed: 0,title,MSC,MSC2,embedding (Bert+MP),embedding (Bert+MP+class-arx),embedding (Bert+MP+class-zbm),embedding (Mathbert+CLS),embedding (Mathbert+MP+class-arx),embedding (Mathbert+MP+class-zbm),embedding (Bert+TSDAE+MP),...,embedding (SBert),embedding (SBert+ret_class-arx),embedding (SBert+ret_class-zbm),embedding (Specter 2+MP),embedding (Specter 2.0+MP+class-arx),embedding (Specter 2.0+MP+class-zbm),embedding (Specter 2+MP+class-zbm+anch-arx),embedding (Specter 2+MP+anch-arx+class-arx+class-zbm),embedding (SGPT),embedding (SGPT+class-zbm)
957273,Quantum polarization characterization and tomo...,81P18,"('81V80',)","[-0.063432164, -0.23202914, -0.34278366, -0.02...","[-0.4152547, -0.51304394, 0.9487813, 0.3881619...","[-0.08613621, 0.00960132, -0.3179743, -0.19455...","[-0.018293507, 0.030732457, -0.019949751, -0.0...","[0.27872553, 0.47079825, -0.006680915, -0.1797...","[0.7827023, 1.0948861, -0.3761158, -0.29462618...","[0.16412546, 0.30016264, -0.4956088, -0.117247...",...,"[-0.038737766, 0.058935776, -0.037607074, -0.0...","[0.041402385, -0.0059386375, -0.0036383953, 0....","[-0.0017820672, 0.0047394894, 0.06710125, -0.0...","[0.7955619, 0.5021455, 0.3540604, 0.24400601, ...","[0.10685164, 0.51360416, 1.9823642, 0.43255758...","[1.8304728, 1.4720514, -0.16571407, 1.3255017,...","[1.1301416, 0.8735996, -0.3935191, 1.2676924, ...","[1.7516968, 1.5288013, 0.030454254, 0.6168692,...","[0.14845072, -1.4453992, -0.45418882, -1.03501...","[0.61106586, -0.79988855, -0.073546216, -0.002..."
164267,Bounded hyperbolic components of quadratic rat...,37F45,"('37F10', '37C25')","[-0.14796267, -0.53204465, -0.14608075, -0.256...","[-0.793318, -0.29023993, 1.7244405, 0.48608264...","[0.103297345, 1.375718, -0.3898696, -0.3682838...","[-0.027765492, 0.024854114, -0.009458648, -0.0...","[0.21786176, 0.11572891, 0.121252276, 0.030319...","[0.5492998, -0.016390346, -0.6217372, 0.266680...","[-0.30539706, -0.29071075, 0.03325594, 0.13029...",...,"[-0.08279195, 0.019909127, 0.009512547, -0.047...","[0.01633139, -0.013048032, 0.0016313468, 0.048...","[0.013032851, -0.018160518, -0.005918124, -0.0...","[0.020724215, 1.4792604, -0.29052952, 0.392878...","[-1.5636588, 1.0203491, -0.71646416, 0.9646309...","[-1.1285477, 1.857897, -0.9151073, 1.5079598, ...","[0.38202873, 1.224297, -0.7986539, 1.6800655, ...","[0.80824673, 2.355677, -0.6491099, 2.2779999, ...","[-0.57877046, -0.7882738, 0.36466536, -0.93821...","[0.112344325, -0.62387514, 0.59256107, -0.0223..."
951711,Rushing into the American dream? House prices ...,91B25,"('62P20',)","[0.06689129, -0.011257838, 0.16491859, 0.07954...","[-0.25303072, 0.29620928, -0.119773366, -0.253...","[-0.050613496, 0.17086436, 0.12243285, 0.35160...","[-0.023410691, 0.009712576, -0.024745816, -0.0...","[0.39348617, -0.56463546, 0.08516597, 1.210385...","[-0.80175453, 0.14457472, 0.0436727, 0.4019577...","[0.17429371, -0.19718169, -0.068741985, -0.066...",...,"[-0.07140249, 0.07277227, -0.011042521, -0.055...","[0.01741535, -0.08810293, 0.055815548, 0.03411...","[0.013911463, 0.012276799, -0.0018635014, 0.03...","[-0.1757004, 0.28939566, 0.05120557, -0.096809...","[0.70469916, -0.9402779, -0.53231543, -0.19127...","[-0.9788738, -1.5956415, -0.93073213, 0.937217...","[-1.0030318, -1.4083282, 0.07066967, -0.083234...","[-1.6226932, -0.5294861, -0.99726117, 0.244339...","[1.4541242, -1.2586735, 0.52305883, -1.4765854...","[0.44467658, -0.6038761, -0.34741497, -0.17426..."
878943,Prediction of RNA-protein interactions by comb...,92C40,"('62P10', '68T05')","[-0.2535846, -0.0038842533, -0.042445537, -0.1...","[-0.0653365, 0.14150672, -0.37443593, -0.22999...","[-0.13712293, -0.3714516, 0.5721169, -0.312481...","[0.0056417612, 0.011302962, -0.016942276, 0.00...","[0.5455376, -0.5641251, -0.6781758, 0.8701684,...","[0.6714612, -0.008678148, 0.31519553, 0.489537...","[0.14119385, 0.16409616, -0.34254953, -0.13352...",...,"[0.006874907, 0.05061671, 0.008243228, 0.02956...","[0.0008126384, -0.07839495, -0.036007833, 0.01...","[0.007974392, -0.056348976, 0.0010901447, 0.00...","[0.27691087, 0.20159784, 0.46666667, 0.0040864...","[0.5882764, -0.57319015, 0.2556239, -0.1156153...","[0.8337276, 0.76826704, -0.6331696, 0.07680680...","[1.3773249, 1.2913536, -0.56751895, 1.0790995,...","[1.1509567, 0.5862368, 0.20008343, 0.5047765, ...","[0.5079371, -0.29196393, 1.4672449, -1.7914301...","[0.8955961, -1.2344561, 0.24118966, 0.4870594,..."
562187,Spherically symmetric inflation,83F05,"('83C15', '83C05')","[-0.1413009, -0.085025825, 0.120352775, -0.104...","[-0.56511503, 0.062857084, 1.1818341, 0.188911...","[0.03375798, 0.40502822, -0.15965164, 0.498430...","[-0.0345734, 0.013859525, -0.0035557444, 0.001...","[0.4196525, 0.7829451, 0.1395383, 0.66512984, ...","[-1.0526856, -0.098272085, 0.156767, 1.1378037...","[-0.0991321, 0.2848365, -0.15209004, -0.015777...",...,"[0.00091834355, -0.035839442, 0.007231686, -0....","[0.045204114, -0.0502862, 0.007350561, 0.08112...","[0.0068418975, -0.06641615, -0.04271351, 0.005...","[-0.054985374, 0.87324524, 0.820823, -0.007153...","[-1.9894906, 2.0581782, -0.9888094, -0.0089059...","[-0.6989663, -1.2046441, -0.6671269, 0.5242559...","[-1.6867175, -0.95734733, 0.16807926, 0.397490...","[-2.2935095, -0.48272637, 0.20029896, 0.412387...","[0.21448907, 0.29152203, -0.5660283, 0.5424022...","[-0.79926556, -0.3510254, -0.14339758, 0.14261..."


In [11]:
# Persist the full frame (title, MSC columns and every embedding column) as a
# pickle file in the working directory.
df.to_pickle("zbmath-embeddings-2000-2019-multi.pkl")