In [1]:
import simi

import pandas as pd
from sentence_transformers import evaluation, losses, models, InputExample, SentenceTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

MODEL = SentenceTransformer('all-mpnet-base-v2')

RANDOM_STATE = 1

def similarity_scoring(df, model, append=False):
    scores = df.apply(lambda r: simi.pairwise_cosine_similarity(simi.model_embeddings(model, [r["text_a"], r["text_b"]]))[0][0], axis=1)
    scores = pd.Series(scores, index=df.index, name="cosine-sim")
    if append:
        return df.merge(scores.to_frame(), left_index=True, right_index=True)
    return scores

2024-01-05 16:21:18.093091: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
df = pd.read_csv("class-arxiv-dataset.csv", index_col=0)
df["label"] = df["label"].astype("float")
df.sample(5)

Unnamed: 0,label,categories_a,text_a,categories_b,text_b
28650,1.0,"('math.GR',)","On a conjecture by Haipeng Qu. In this note, w...","('math.GR',)",Subgroup Theorems for the Baer-invariant of Gr...
46,1.0,"('math.AC',)",Bounds for the degree and Betti sequences alon...,"('math.AC',)",Silting complexes and Gorenstein projective mo...
22760,0.0,"('math.DG',)",Anisotropic tensor calculus. We introduce the ...,"('math.NA',)",Symmetric spaces and Lie triple systems in num...
13115,1.0,"('math.CO',)",Eulerian partitions for configurations of skew...,"('math.CO',)",On perfect packings in dense graphs. We say th...
54004,0.0,"('stat.ME',)",Exploring elastic net and multivariate regress...,"('math.NT',)",On fully split lacunary polynomials in finite ...


In [3]:
X_treval, X_test = train_test_split(df, train_size=0.9, random_state=RANDOM_STATE)
X_train, X_eval = train_test_split(X_treval, train_size=0.888889, random_state=RANDOM_STATE)
print("train:", len(X_train), "eval:", len(X_eval), "test:", len(X_test))

train: 43572 eval: 5447 test: 5447


In [4]:
# prepare evaluation data
eval_examples = X_eval.reset_index(drop=True).apply(lambda r: InputExample(texts=[r["text_a"], r["text_b"]], label=r["label"]), axis=1)
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples)

In [5]:
# prepare re-training: training data, loss
retrain_examples = X_train.reset_index(drop=True).apply(lambda r: InputExample(texts=[r["text_a"], r["text_b"]], label=r["label"]), axis=1)
retrain_dataloader = DataLoader(retrain_examples, shuffle=True, batch_size=16)
retrain_loss = losses.CosineSimilarityLoss(model=MODEL)

In [6]:
%%time
# finetune model
MODEL.fit(train_objectives=[(retrain_dataloader, retrain_loss)], evaluator=evaluator, epochs=10,
        output_path="sbert+retrain_class_arxiv") 

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2724 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [7]:
%%time
# evaluate Spearman-Pearson-rank-coefficient on test data
spear_ranc = MODEL.evaluate(evaluator)
spear_ranc

CPU times: user 7min 17s, sys: 7.19 s, total: 7min 25s
Wall time: 1min 18s


0.789947714463526

In [8]:
%%time
X_test_score = similarity_scoring(X_test, MODEL, append=True)

CPU times: user 22min 20s, sys: 3.24 s, total: 22min 23s
Wall time: 1min 46s


In [9]:
pd.set_option('display.max_colwidth', 160)

In [10]:
X_test_score.sample(15, random_state=RANDOM_STATE)

Unnamed: 0,label,categories_a,text_a,categories_b,text_b,cosine-sim
14175,1.0,"('math.CO',)","On Weak Chromatic Polynomials of Mixed Graphs. A \emph{mixed graph} is a graph with directed edges, called arcs, and undirected edges. A $k$-coloring of the...","('math.CO',)","Projective Equivalences of k-neighbourly Polytopes. We prove the following theorem, which is related to McMullen's problem on projective transformations of ...",0.989835
29240,1.0,"('math.GR',)","Galois Theory - a first course. These notes are a self-contained introduction to Galois theory, designed for the student who has done a first course in abst...","('math.GR',)",On p-stability in groups and fusion systems. The aim of this paper is to generalise the notion of p-stability to fusion systems. We study the question how Q...,0.358276
31864,1.0,"('math.KT',)",Controlled $K$-theory and $K$-Homology. An operator $T$ in a $C^*$-algebra is called an $\epsilon$-projection if it satisfies $T=T^*$ and $\|T^2-T\|<\epsilo...,"('math.KT',)","Cyclic homology of braided Hopf crossed products. Let k be a field, A a unitary associative k-algebra and V a k-vector space endowed with a distinguished el...",0.929833
48373,1.0,"('math.RA',)",Simple subalgebras of simple special Jordan algebras. In this paper we determine all types and the canonical forms of simple subalgebras for each type of si...,"('math.RA',)",Normalization of Quaternionic Polynomials. Quaternionic polynomials are generated by quaternionic variables and the quaternionic product. This paper propose...,0.933051
11994,1.0,"('math.CA',)","On some Hadamard-Type Inequalities for Co-ordinated Convex Functions. In this paper, we prove some new inequalities of Hadamard-type for convex functions on...","('math.CA',)",Non-differentiable solutions for local fractional nonlinear Riccati differential equations. We investigate local fractional nonlinear Riccati differential e...,0.584509
48427,1.0,"('math.RA',)",Basic superranks for varieties of algebras. We introduce the notion of basic superrank for varieties of algebras which generalizes that of basic rank. First...,"('math.RA',)","A note on solvable maximal subgroups in subnormal subgroups of ${\mathrm GL}_n(D)$. Let $D$ be a non-commutative division ring, $G$ a subnormal subgroup of ...",0.953668
22315,0.0,"('math.DG',)","Tensor tomography on Cartan-Hadamard manifolds. We study the geodesic X-ray transform on Cartan-Hadamard manifolds, and prove solenoidal injectivity of this...","('math.OC',)","A new conical internal evolutive LP algorithm. In a previous paper, published in 1992, a primal conical LP algorithm with exact finite coonvergence was pres...",-0.008478
51367,1.0,"('stat.AP',)","Data-driven dynamic treatment planning for chronic diseases. In order to deliver effective care, health management must consider the distinctive trajectorie...","('stat.AP',)",Statistical analysis of stellar evolution. Color-Magnitude Diagrams (CMDs) are plots that compare the magnitudes (luminosities) of stars in different wavele...,0.986977
39212,0.0,"('math.NT',)",The distribution of spacings between the fractional parts of $\boldsymbol{n^d\alpha}$. We study the distribution of spacings between the fractional parts of...,"('math.FA',)",Canonical graph contractions of linear relations on Hilbert spaces. Given a closed linear relation $T$ between two Hilbert spaces $\mathcal H$ and $\mathcal...,0.044511
51254,1.0,"('stat.AP',)",Bayesian model averaging for mortality forecasting using leave-future-out validation. Predicting the evolution of mortality rates plays a central role for l...,"('stat.AP',)",Selection of multiple donor gauges via Graphical Lasso for estimation of daily streamflow time series. A fundamental challenge in estimations of daily strea...,0.97038


In [11]:
X_test_score[X_test_score["label"]==1].describe()

Unnamed: 0,label,cosine-sim
count,2766.0,2766.0
mean,1.0,0.771245
std,0.0,0.310622
min,1.0,-0.107418
25%,1.0,0.636887
50%,1.0,0.946416
75%,1.0,0.990949
max,1.0,0.999492


In [12]:
X_test_score[X_test_score["label"]==0].describe()

Unnamed: 0,label,cosine-sim
count,2681.0,2681.0
mean,0.0,0.060589
std,0.0,0.179872
min,0.0,-0.176028
25%,0.0,-0.008492
50%,0.0,0.002141
75%,0.0,0.029445
max,0.0,0.996909


In [13]:
X_test_score.sort_values("cosine-sim").head(25)

Unnamed: 0,label,categories_a,text_a,categories_b,text_b,cosine-sim
33652,0.0,"('math.MG',)",Coverings with congruent and non-congruent hyperballs generated by doubly truncated Coxeter orthoschemes. After the investigation of the congruent and non-c...,"('math.AC',)","About multiplicities and applications to Bezout numbers. Let $(A,\mathfrak{m},\Bbbk)$ denote a local Noetherian ring and $\mathfrak{q}$ an ideal such that $...",-0.176028
39838,0.0,"('math.OA',)",Projective representations of groups using Hilbert right C*-modules. The projective representation of groups was introduced in 1904 by Issai Schur. It diffe...,"('math.AG',)",On the classification of degree 1 elliptic threefolds with constant $j$-invariant. We describe the possible Mordell-Weil groups for degree 1 elliptic threef...,-0.159296
704,0.0,"('math.AC',)","On finite molecularization domains. In this paper, we advance an ideal-theoretic analogue of a ""finite factorization domain"" (FFD), giving such a domain the...","('math.GT',)",Intrinsic knotting and linking of almost complete graphs. We introduce new sufficient conditions for intrinsic knotting and linking. A graph on n vertices w...,-0.158624
40121,0.0,"('math.OA',)",Subproduct systems with quantum group symmetry. We introduce a class of subproduct systems of finite dimensional Hilbert spaces whose fibers are defined by ...,"('math.AG',)",Invariance of plurigenera fails in positive and mixed characteristic. We construct smooth families of elliptic surface pairs with terminal singularities ove...,-0.15568
793,0.0,"('math.AC',)","A principal ideal theorem for compact sets of rank one valuation rings. Let $F$ be a field, and let Zar$(F)$ be the space of valuation rings of $F$ with res...","('math.AT',)","$E_2$-cells and mapping class groups. We prove a new kind of stabilisation result, ""secondary homological stability"", for the homology of mapping class grou...",-0.131835
49896,0.0,"('math.RT',)","A System of Third-Order Differential Operators Conformally Invariant under $\mathfrak{so}(8,\mathbb{C})$. In earlier work, Barchini, Kable, and Zierau const...","('math.LO',)",Possible Size of an ultrapower of omega. Let omega be the first infinite ordinal (or the set of all natural numbers) with the usual order <. In section 1 we...,-0.11002
25352,1.0,"('math.FA',)",On control measures of multimeasures. Let $M$ be a multimeasure defined on a $\sigma$-algebra and taking values in the family of bounded non-empty subsets o...,"('math.FA',)","Caffarelli-Kohn-Nirenberg type inequalities for the weighted biharmonic operator: existence of extremal functions, breaking positivity and breaking symmetry...",-0.107418
3982,0.0,"('math.AG',)",Hodge filtration and Hodge ideals for $\mathbb{Q}$-divisors with weighted homogeneous isolated singularities. We give an explicit formula for the Hodge filt...,"('math.GR',)","Garside structure for the braid group of G(e,e,r). We give a new presentation of the braid group $B$ of the complex reflection group $G(e,e,r)$ which is pos...",-0.10418
3931,0.0,"('math.AG',)","Two polarized K3 surfaces associated to the same cubic fourfold. For infinitely many $d$, Hassett showed that special cubic fourfolds of discriminant $d$ ar...","('math.GR',)","Schur multipliers of special p-groups of rank 2. A group G is called special p-group of rank k if the commutator subgroup [G,G] and centre Z(G) are equal, w...",-0.103824
22671,0.0,"('math.DG',)",Topological invariants for closed hypersurfaces. We consider closed and orientable immersed hypersurfaces of translational manifolds. Given a vector field o...,"('math.RA',)",Iseki spaces of semirings. The aim of this paper is to study Iseki spaces of distinguished classes of ideals of a semiring endowed with a topology. We show ...,-0.103679


In [14]:
X_test_score[X_test_score["label"]==0].sort_values("cosine-sim").tail(25)

Unnamed: 0,label,categories_a,text_a,categories_b,text_b,cosine-sim
48763,0.0,"('math.RA',)",On $\delta$-derivations of Lie algebras and superalgebras. We study $\delta$-derivations -- a construction simultaneously generalizing derivations and centr...,"('math.RT',)","On Universal Deformation Rings for Gorenstein Algebras. Let $\mathbf{k}$ be an algebraically closed field, and let $\Lambda$ be a finite dimensional $\mathb...",0.942985
21498,0.0,"('math.DG',)","The curvature and the integrability of almost-Kahler manifolds: a survey. We survey some recent results and constructions of almost-K\""ahler manifolds whose...","('math.DS',)",Multiple orthogonal geodesic chords in nonconvex Riemannian disks using obstacles. We use nonsmooth critical point theory and the theory of geodesics with o...,0.943611
51674,0.0,"('stat.AP',)",Application of Benford-Newcomb Law with Base Change to Electoral Fraud Detection. The invariance of Benford-Newcomb law under base changing is employed to t...,"('stat.ME',)",Modelling sources of ecological fallacy within a revised Brown and Payne model of voting transitions. We present a model of voting behaviour based on a vers...,0.945088
26448,0.0,"('math.FA',)",Integration of rough paths - the truncated variation approach. Using truncated variation techniques we obtain an improved version of the Loeve-Young inequal...,"('math.CV',)",On logarithmic Holder continuity of mappings on the boundary. We study mappings satisfying the so-called inverse Poletsky inequality. Under integrability of...,0.94615
49027,0.0,"('math.RA',)",On the max-algebraic core of a nonnegative matrix. The max-algebraic core of a nonnegative matrix is the intersection of column spans of all max-algebraic m...,"('math.RT',)","Symmetric subcategories, tilting modules and derived recollements. For any good tilting module $T$ over a ring $A$, there exists an $n$-symmetric subcategor...",0.94865
16159,0.0,"('math.CO',)","On the separation conjecture in Avoider-Enforcer games. Given a fixed graph $H$ with at least two edges and positive integers $n$ and $b$, the strict $(1 \c...","('math.GR',)","Coxeter systems with two-dimensional Davis-Vinberg complexes. In this paper, we study Coxeter systems with two-dimensional Davis-Vinberg complexes. We show ...",0.952991
48832,0.0,"('math.RA',)",Evaluation of Polynomials over Finite Rings via Additive Combinatorics. We give an improved polynomial bound on the complexity of the equation solvability p...,"('math.RT',)","Generalised Temperley-Lieb algebras of type $G(r,1,n)$. In this paper, we define a quotient of the cyclotomic Hecke algebra of type $G(r,1,n)$ as a generali...",0.95553
11043,0.0,"('math.AT',)",A geometric approach to equivariant factorization homology and nonabelian Poincar\'e duality. Fix a finite group G and an n-dimensional orthogonal G-represe...,"('math.GT',)",Upsilon invariants from cyclic branched covers. We extend the construction of upsilon-type invariants to null-homologous knots in rational homology three-sp...,0.958515
9503,0.0,"('math.AP',)",Semirelativistic Choquard equations with singular potentials and general nonlinearities arising from Hartree-Fock theory. We are interested in the general C...,"('math.DG',)",Vanishing distance phenomena and the geometric approach to SQG. In this article we study the induced geodesic distance of fractional order Sobolev metrics o...,0.959228
26506,0.0,"('math.FA',)",The Hardy Operator and Boyd Indices. We give necessary and sufficient conditions for the Hardy operator to be bounded on a rearrangement invariant quasi-Ban...,"('math.CA',)",Some toy Furstenberg sets and projections of the four-corner Cantor set. We give lower bounds for the Hausdorff dimensions of some model Furstenberg sets.,0.964434
