In [1]:
import numpy as np
from numpy.random import default_rng
import transformers
import pandas as pd
from tqdm.notebook import tqdm
import torch
from recommender import Recommender
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap
import hdbscan
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from pylatexenc.latex2text import LatexNodes2Text

In [2]:
# Seed every RNG the notebook touches so that a Restart & Run All is reproducible.
# A single constant replaces the repeated literal; the original also called
# torch.manual_seed twice, which was redundant.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
transformers.set_seed(SEED)

# Enable tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

# Corpus used by the rest of the notebook; later cells read the
# 'abstract_uncleaned' and 'topics' columns from it.
df = pd.read_csv('best_arxiv.csv')

In [3]:
# Identifier of the sentence-embedding model; handed to Recommender further down.
model_name = "all-mpnet-base-v2"

# One-off embedding computation, kept disabled (it targets a CUDA device):
# sentence_model = SentenceTransformer(model_name, device="cuda")
# embeddings = np.array([sentence_model.encode(x) for x in tqdm(df['abstract'].tolist())])

In [4]:
# Disabled one-off step: persist the computed embeddings to disk. The filename
# matches the `embeddings_file` argument passed to Recommender in a later cell.
# np.save('embeddings_for_arxiv.npy', embeddings)

In [6]:
# Build the recommender from the corpus columns plus the on-disk artifacts, then
# run a first batch of free-text queries.
# NOTE(review): the meaning of each argument is defined in recommender.py, which
# is not visible here — confirm there.
rsys = Recommender(
    model_name,
    df["abstract_uncleaned"].tolist(),
    df["topics"].tolist(),
    embeddings_file="embeddings_for_arxiv.npy",
    model_path="best_model_arxiv",
)
result = rsys.recommend(["black hole", "deep learning", "dogs"])
# Bare last expression so Jupyter displays the returned mapping; the recorded
# output is a dict keyed by query string with a list of abstract texts per query.
result

Getting recommendations:   0%|          | 0/3 [00:00<?, ?it/s]

{'black hole': ['  We demonstrate that cosmic string loops may provide a joint resolution of two\nmysteries surrounding recently observed black holes. For a string tension in an\nappropriate range, large radius string loops have the potential to provide the\nnonlinearities in the early universe which seed supermassive black holes. The\nmore numerous smaller radius string loops can then seed intermediate mass black\nholes, including those with a mass in the region between 65 and 135 solar\nmasses in which standard black hole formation scenarios predict no black holes\nare able to form, but which have recently been detected by the LIGO/VIRGO\ncollaboration. We find that there could be as many as $10^6$ of intermediate\nmass black holes per galaxy, providing a tantalizing target for gravitational\nwave observatories to look for.\n',
  '  Gravitational slingshots around a neutron star in a compact binary have been\nproposed as a means of accelerating large masses to potentially relativisti

In [7]:
# Second query batch; `result` is overwritten with this batch's recommendations.
result = rsys.recommend(["materials", "machine learning", "economy"])
result

Getting recommendations:   0%|          | 0/3 [00:00<?, ?it/s]

{'materials': ['  Expanding on our former hypothesis that, in the current information age,\nteaching physics should become more intuition-based and aiming at pattern\nrecognition skills, we present multiple examples of qualitative methods in\ncondensed matter physics. They include the subjects of phonons, thermal and\nelectronic properties of matter, electron-phonon interactions and some\nproperties of semiconductors.\n',
  "  A major motivation for the scientific study of artworks is to understand\ntheir states of preservation and ongoing degradation mechanisms. This enables\npreservation strategies to be developed for irreplaceable works. Intensely-hued\ncadmium sulphide (CdS) yellow pigments are of particular interest because these\nare key to the palettes of many important late 19th and early 20th century\nmasters, including Vincent Van Gogh, Pablo Picasso, Henri Matisse, and Edvard\nMunch. As these paintings age, their cadmium yellow paints are undergoing\nsevere fading, flaking, 

In [8]:
# Single-term query; `result` is overwritten with the recommendations for it.
result = rsys.recommend(["covid"])
result

Getting recommendations:   0%|          | 0/1 [00:00<?, ?it/s]

{'covid': ['  Solid estimates describing the clinical course of SARS-CoV-2 infections are\nstill lacking due to under-ascertainment of asymptomatic and mild-disease\ncases. In this work, we quantify age-specific probabilities of transitions\nbetween stages defining the natural history of SARS-CoV-2 infection from 1,965\nSARS-CoV-2 positive individuals identified in Italy between March and April\n2020 among contacts of confirmed cases. Infected contacts of cases were\nconfirmed via RT-PCR tests as part of contact tracing activities or\nretrospectively via IgG serological tests and followed-up for symptoms and\nclinical outcomes. In addition, we provide estimates of time intervals between\nkey events defining the clinical progression of cases as obtained from a larger\nsample, consisting of 95,371 infections ascertained between February and July\n2020. We found that being older than 60 years of age was associated with a\n39.9% (95%CI: 36.2-43.6%) likelihood of developing respiratory symp

In [10]:
# Single-term query; `result` is overwritten with the recommendations for it.
result = rsys.recommend(["animal"])
result

Getting recommendations:   0%|          | 0/1 [00:00<?, ?it/s]

{'animal': ['  We present LSeg, a novel model for language-driven semantic image\nsegmentation. LSeg uses a text encoder to compute embeddings of descriptive\ninput labels (e.g., "grass" or "building") together with a transformer-based\nimage encoder that computes dense per-pixel embeddings of the input image. The\nimage encoder is trained with a contrastive objective to align pixel embeddings\nto the text embedding of the corresponding semantic class. The text embeddings\nprovide a flexible label representation in which semantically similar labels\nmap to similar regions in the embedding space (e.g., "cat" and "furry"). This\nallows LSeg to generalize to previously unseen categories at test time, without\nretraining or even requiring a single additional training sample. We\ndemonstrate that our approach achieves highly competitive zero-shot performance\ncompared to existing zero- and few-shot semantic segmentation methods, and even\nmatches the accuracy of traditional segmentation alg