In [1]:
import simi

import pandas as pd
from sentence_transformers import evaluation, losses, models, InputExample, SentenceTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Pretrained sentence-embedding checkpoint; this is the object MODEL.fit is later called on.
MODEL = SentenceTransformer('all-mpnet-base-v2')

# Seed shared by both train_test_split calls and the seeded DataFrame.sample preview.
RANDOM_STATE = 1

def similarity_scoring(df, model, append=False):
    """Score every title pair in ``df`` with the given embedding model.

    For each row, the ``title_a`` and ``title_b`` texts are passed to
    ``simi.model_embeddings`` and the ``[0][0]`` entry of
    ``simi.pairwise_cosine_similarity`` over the result is used as the score.

    Args:
        df: DataFrame with ``title_a`` and ``title_b`` columns.
        model: Model object handed through to ``simi.model_embeddings``.
        append: If true, return ``df`` with the scores attached as a
            ``cosine-sim`` column instead of the bare scores.

    Returns:
        A Series named ``cosine-sim`` indexed like ``df``; with
        ``append=True``, ``df`` merged with that column on the index.
    """
    def score_pair(row):
        # One embedding call per row, covering both titles of the pair.
        embeddings = simi.model_embeddings(model, [row["title_a"], row["title_b"]])
        return simi.pairwise_cosine_similarity(embeddings)[0][0]

    per_row = df.apply(score_pair, axis=1)
    scores = pd.Series(per_row, index=df.index, name="cosine-sim")
    if not append:
        return scores
    return df.merge(scores.to_frame(), left_index=True, right_index=True)

2024-01-10 12:52:28.359300: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Load the labelled title-pair dataset; the first CSV column becomes the row index.
# The label column is cast to float in the same expression, so re-running the cell
# always starts from the file contents.
df = pd.read_csv("class-zbmath-dataset.csv", index_col=0).astype({"label": "float"})
# Seeded preview, so the displayed rows are reproducible (same seed as the later sample cell).
df.sample(5, random_state=RANDOM_STATE)

Unnamed: 0,title_a,MSC_a,MSC2_a,title_b,MSC_b,MSC2_b,label
293106,Optimal risk allocation in reinsurance networks,91B30,"('60E15',)",Belief-weighted Nash aggregation of Savage pre...,91B14,"('91B08',)",0.0
225724,Total acyclicity for complexes of representati...,16G20,"('18E30', '16G10', '18A40', '55U35')",Root vectors of the composition algebra of the...,16G20,"('17B37', '16G10', '16G70')",1.0
62323,Linear optimal control of time delay systems v...,49J15,"('49N05', '90C20')",Linear optimal control problem in plane,49J15,"('49K15', '49N05')",1.0
355805,On \(\zeta(3)\),11M06,"('11M36',)",On a mollifier of the perturbed Riemann zeta-f...,11M06,"('11M36', '11N64')",1.0
73181,Approximation algorithms for max-bisection on ...,68R10,"('68W25',)",Deeper local search for better approximation o...,68R10,"('05C85', '68W25')",1.0


In [3]:
# Hold out 10% as the test split, then keep 0.888889 (~8/9) of the remainder for
# training; the printed sizes below show eval and test end up equally large.
X_treval, X_test = train_test_split(
    df,
    random_state=RANDOM_STATE,
    train_size=0.9,
)
X_train, X_eval = train_test_split(
    X_treval,
    random_state=RANDOM_STATE,
    train_size=0.888889,
)
print("train:", len(X_train), "eval:", len(X_eval), "test:", len(X_test))

train: 351472 eval: 43935 test: 43935


In [4]:
# prepare evaluation data
# One InputExample per pair of the eval split, built as a plain list rather than
# through a row-wise DataFrame.apply (no intermediate object Series, no index reset).
eval_examples = [
    InputExample(texts=[title_a, title_b], label=label)
    for title_a, title_b, label in zip(X_eval["title_a"], X_eval["title_b"], X_eval["label"])
]
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples)

In [5]:
# prepare re-training: training data, loss
# One InputExample per training pair, built as a plain list so the DataLoader
# indexes a regular Python sequence instead of a pandas object Series.
retrain_examples = [
    InputExample(texts=[title_a, title_b], label=label)
    for title_a, title_b, label in zip(X_train["title_a"], X_train["title_b"], X_train["label"])
]
retrain_dataloader = DataLoader(retrain_examples, shuffle=True, batch_size=16)
# Alternative objectives, left disabled:
#retrain_loss = losses.SoftmaxLoss(model=MODEL, num_labels=2,
#        sentence_embedding_dimension=MODEL.get_sentence_embedding_dimension())
#retrain_loss = losses.ContrastiveLoss(model=MODEL)
retrain_loss = losses.CosineSimilarityLoss(model=MODEL)

In [6]:
%%time
# finetune model
MODEL.fit(train_objectives=[(retrain_dataloader, retrain_loss)], evaluator=evaluator, epochs=10,
        output_path="sbert+retrain_class_zbmath") 

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21967 [00:00<?, ?it/s]

CPU times: user 21h 24min 55s, sys: 1h 7s, total: 22h 25min 2s
Wall time: 6h 25min 57s


In [7]:
%%time
# evaluate Spearman-Pearson-rank-coefficient on test data
spear_ranc = MODEL.evaluate(evaluator)
spear_ranc

CPU times: user 9min 56s, sys: 36.5 s, total: 10min 33s
Wall time: 1min 30s


0.5719917817109932

In [4]:
%%time
X_test_score = similarity_scoring(X_test, MODEL, append=True)

CPU times: user 12min 54s, sys: 23 s, total: 13min 17s
Wall time: 8min 12s


In [6]:
# Widen text columns in rendered tables so long titles are not cut off early.
pd.options.display.max_colwidth = 160

In [10]:
# Seeded random preview of scored test pairs.
X_test_score.sample(n=15, random_state=RANDOM_STATE)

Unnamed: 0,title_a,MSC_a,MSC2_a,title_b,MSC_b,MSC2_b,label,cosine-sim
15960,Global exact quadratization of continuous-time nonlinear control systems,93C10,"('93C15', '93A10', '34H05', '34H99', '34A34', '53A04')",Mixed \(\mathcal{H}_2/\mathcal{H}_\infty\) control of hidden Markov jump systems,93E03,"('60J75', '93B36', '93C55', '93C05')",0.0,-0.032483
186573,Edge operators with conditions of Toeplitz type,58J40,"('35S15', '47G30', '35A17', '35J70', '58J32')",A Bismut type theorem for subelliptic heat semigroups,58J20,"('35H20', '47D06')",0.0,0.156134
181977,1-cohomology and splitting of group extensions,20E22,"('20J99', '20E07')",On some products of nilpotent groups,20E22,"('20F16', '20F18', '20E07', '20F14', '20H25')",1.0,0.487127
299961,Reputation in the long-run with imperfect monitoring,91A20,"('91A05',)",Parallel repetition via fortification: analytic view and the quantum case,91A20,"('81P40', '81P45', '91A05', '91A06', '91A12', '91A80')",1.0,0.480388
226982,Traces and quasi-traces on the Boutet de Monvel algebra.,58J42,"('35S15',)",The local and global parts of the basic zeta coefficient for operators on manifolds with boundary,58J42,"('35S15',)",1.0,0.230848
306772,"A cyclic weight algorithm of decoding the \((47, 24, 11)\) quadratic residue code",94B35,"('94B40',)",A result on the weight distributions of binary quadratic residue codes,94B35,"('94B40',)",1.0,0.882133
205892,Properties of the Székely-Móri symmetry criterion statistics in the case of binary vectors,60E05,"('62E20', '62H10')",On deformation technique of the hyperbolic secant distribution,60E05,"('60E10', '62E17', '62E20')",1.0,0.595832
218358,Simultaneous visibility representations of plane \(st\)-graphs using L-shapes,05C62,"('05C10', '05C85', '68R10')",On the minimum order of graphs with given semigroup,05C99,"('05C65', '20M30')",0.0,0.55866
77078,The unsteady MHD boundary-layer flow on a shrinking sheet,76W05,"('76N20', '76M45')",Meridional trapping and zonal propagation of inertial waves in a rotating fluid shell,76U05,"('76B55', '86A05')",0.0,0.033146
63150,On unified contact metric manifold,53C15,"('53C25',)",Two characterizations of the Chern connection,53C10,"('53A55', '53B05', '58A20', '58A32')",0.0,0.233177


In [11]:
# Summary statistics for the test pairs labelled 1.
X_test_score.loc[X_test_score["label"].eq(1)].describe()

Unnamed: 0,label,cosine-sim
count,21860.0,21860.0
mean,1.0,0.600657
std,0.0,0.281971
min,1.0,-0.401192
25%,1.0,0.391832
50%,1.0,0.653625
75%,1.0,0.844304
max,1.0,0.999023


In [12]:
# Summary statistics for the test pairs labelled 0.
X_test_score.loc[X_test_score["label"].eq(0)].describe()

Unnamed: 0,label,cosine-sim
count,22075.0,22075.0
mean,0.0,0.214638
std,0.0,0.263402
min,0.0,-0.374664
25%,0.0,0.005915
50%,0.0,0.159615
75%,0.0,0.382822
max,0.0,0.993558


In [13]:
# The 25 test pairs with the lowest cosine similarity, regardless of label.
X_test_score.sort_values(by="cosine-sim", ascending=True).iloc[:25]

Unnamed: 0,title_a,MSC_a,MSC2_a,title_b,MSC_b,MSC2_b,label,cosine-sim
327880,Classification of Engel knots,53D10,"('53D15', '57R17')",Riemannian geometry on contact Lie groups,53D10,"('53D35', '53C50', '53C25', '57R17')",1.0,-0.401192
128374,Bases for commutator subgroups of a free group,20E05,"('20F05', '20F12', '20E07', '20F14')",The complexities of some simple modules of the symmetric groups.,20C30,"('20C20', '20C05')",0.0,-0.374664
434848,On the large deviation rates of non-entropy-approachable measures,37C40,"('60F10', '28D20', '37A05')",Mixed rational-soliton solutions of two differential-difference equations in Casorati determinant form.,37K40,"('39A12', '34A05')",0.0,-0.358477
194467,Self-affine fractals and fractal dimension.,28A80,"('54F45', '60G20')",Fréchet-space-valued measures and the AL-property,28B05,"('46A04', '46A40', '46B42', '46G10', '46B22')",0.0,-0.344602
22361,"Two-dimensional projective linear group and flag-transitive 4-\((v,k,2)\) designs.",05B05,"('20B25',)",The distribution of degrees in random graphs,05C80,"('05-02',)",0.0,-0.344111
401101,Differentiation of measures on Hilbert spaces,28C20,"('28A15',)",Covering numbers of different points in Dvoretzky covering,28A80,"('28A78', '60G44', '60G57')",0.0,-0.343002
254857,Determination of finite plastic deformations in single crystals,74C15,"('74C20', '74E10')",On the existence of infinitely many periodic solutions for an equation of a rectangular thin plate,74K20,"('35B10', '35J05', '35A15')",0.0,-0.342991
63463,Finite-type invariants of classical and virtual knots,57M27,"('57M25',)",On the fundamental group of a compact Kähler manifold,57R19,"('53C55',)",0.0,-0.342869
205242,Semi-\(g^*\)-closed sets and a new separation axiom in the spaces,54A05,"('54C08', '54C05', '54D10')",Conley index for discrete multi-valued dynamical systems,54H20,"('54C60', '37C80')",0.0,-0.339679
205397,The Burnside ring of a compact Lie group. I,57S15,"('22C05', '57S10')",Knots with bounded cusp volume yet large tunnel number,57M25,"('57M50',)",0.0,-0.337681


In [7]:
# Pairs labelled 1 that received the lowest cosine similarity (first 25 in ascending order).
X_test_score.loc[X_test_score["label"].eq(1)].sort_values(by="cosine-sim").iloc[:25]

Unnamed: 0,title_a,MSC_a,MSC2_a,title_b,MSC_b,MSC2_b,label,cosine-sim
308518,\(C\)-totally real pseudo-parallel submanifolds of Sasakian space forms,53B25,"('53B20', '53B50')",Covariant decompositions of symmetric tensors in the theory of gravitation,53B25,"('53B30', '53B20', '53A45')",1.0,-0.282786
265436,On the class groups of imaginary abelian fields,11R29,"('11R20', '11S40')",Some arithmetic properties of generalized Bernoulli numbers,11R29,"('11R20', '11B68')",1.0,-0.237474
434640,The coloring of the cozero-divisor graph of a commutative ring,05E40,"('05C15', '05C25', '05C69', '13A70', '16N20')","Laplacian ideals, arrangements, and resolutions",05E40,"('05C25', '05C50', '13D02', '52C35')",1.0,-0.167298
199454,Growth functions for Artin monoids.,20M05,"('20F36', '20F05', '17B22')",On generalized quaternion groups and the Hadamard matrices,20M05,"('20F05', '15B57')",1.0,-0.153195
429292,Divisors of generic hypersurfaces of general type,14J70,"('14C20', '14J29')",Identifiability beyond Kruskal's bound for symmetric tensors of degree 4,14J70,"('14C20', '14N05', '15A69', '15A72')",1.0,-0.152072
70499,"Modeling, analysis and timetable design of a helicopter maintenance process based on timed event Petri nets and max-plus algebra",65F15,"('15A80',)",Tropical roots as approximations to eigenvalues of matrix polynomials,65F15,"('15A22', '15A80', '15A12', '65F35')",1.0,-0.146388
256760,On the use of interval mathematics in fuzzy expert systems,68T35,"('65G30',)",Computing the value of a Boolean expression with interval inputs is NP-hard,68T35,"('65G30',)",1.0,-0.1423
356897,Geometric isomorphism check for symmetric factorial designs,65C60,"('65K15',)",Estimation of arbitrary order central statistical moments by the multilevel Monte Carlo method,65C60,"('65C05', '65C30', '60H15', '35R60', '65K15', '49J40', '62J10')",1.0,-0.14206
333197,Calibrations and manifolds with special holonomy,53C38,"('53C29',)",Deformations of associative submanifolds with boundary,53C38,"('35J56', '53C29', '58J32')",1.0,-0.135976
282836,Representation theory of \(U_ 1(H)\) in the symmetric tensors,46L05,"('47D03',)",A note on generators of semigroups,46L05,"('47D03', '22A25')",1.0,-0.132824


In [14]:
# Pairs labelled 0 that received the highest cosine similarity (last 25 in ascending order).
X_test_score.loc[X_test_score["label"].eq(0)].sort_values(by="cosine-sim").iloc[-25:]

Unnamed: 0,title_a,MSC_a,MSC2_a,title_b,MSC_b,MSC2_b,label,cosine-sim
41907,The stability analysis of a general viral infection model with distributed delays and multi-staged infected progression,92D25,"('37N25',)",Dynamical analysis of approximate solutions of HIV-1 model with an arbitrary order,92D30,"('92C60', '34A08')",0.0,0.956253
202847,"Generalized phase retrieval: measurement number, matrix recovery and beyond",42C15,"('94A12', '15A63', '15A83')",Some properties of windowed linear canonical transform and its logarithmic uncertainty principle,42C40,"('30G35',)",0.0,0.956792
81023,Bootstrap-based estimates of uncertainty in subspace identification methods,93B30,"('93C41',)",Blind identification using the kurtosis with applications to field data,93E11,"('94A12',)",0.0,0.959381
231435,Belief base change operations for answer set programming,68N17,"('03B42', '68T27')",Ordered binary decision diagrams representing knowledge-bases,68T30,"('68T35',)",0.0,0.960011
85208,Predator-prey dynamical behavior and stability analysis with square root functional response,34D20,"('34C60', '34C05', '92D25', '37N25')",A predator-prey model with Ivlev's functional response,34C05,"('92D25',)",0.0,0.963358
401173,Vectored injection into compressible laminar and turbulent boundary layers,76N20,"('76J20', '76D05', '65Z05')",Geometrical structure analysis of a zero-pressure-gradient turbulent boundary layer,76F40,"('76F10',)",0.0,0.965251
300812,Behavior of upwind scheme in the low Mach number limit. III: Preconditioned dissipation for a five equation two phase model,76M12,"('76G25',)",A numerical technique for low-speed homogeneous two-phase flow with sharp interfaces,76T99,"('65Z05',)",0.0,0.966594
216151,Disjunctive closures for knowledge compilation,68T30,"('68Q25', '68T27')",Reminiscences on the anniversary of 30 years of nonmonotonic reasoning,68T27,"('68-03',)",0.0,0.967163
368605,Quantum dynamics in phase space: Moyal trajectories 2,81S30,"('53D55', '81S10', '37K10', '83C15', '81Q05')",Coherent states in fermionic Fock-Krein spaces and their amplitudes,81R30,"('30H20', '46C20')",0.0,0.967242
149901,Positive solutions to a Kirchhoff problem with sign-changing and non-Lipschitz nonlinearities,35J20,"('35J25',)",Positive solutions for \(p\)-Laplacian equations of Kirchhoff type problem with a parameter,35B30,"('35B09', '35J50', '35J62')",0.0,0.967411
