In [1]:
import pandas as pd

import string
from gensim import corpora

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
# nltk.download('stopwords')
# nltk.download('wordnet')

from gensim import corpora, utils, models, similarities
from collections import defaultdict

from gensim.models import LdaMulticore

from tqdm import tqdm_notebook as tqdm

import random

In [2]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [3]:
articles = pd.read_csv('../data/arxiv_math.csv')

In [4]:
class Cleaner:
    def __init__(self):
        # Punctuations and stopwords
        self.punctuation = set(string.punctuation)
        self.stoplist = set(stopwords.words('english'))
        self.math_stoplist = ['proof', 'paper', 'result', 'show', 'new', 'equation', 'certain', 'assumption', 'approach',
                             'solution', 'underlying', 'case', 'result']

        # LDA
        self.dictionary = corpora.Dictionary()
        self.lemma = WordNetLemmatizer()
    
    def remove_math(self, text):
        import re
        return re.sub('\$.*?\$', '', text)
    def remove_punctuation(self, text):
        return ''.join([char.lower() for char in text if ((char not in self.punctuation) or (char == '-'))])
    def remove_numbers(self, text):
        return ''.join([char for char in text if not char.isdigit()])
    def remove_stopwords(self, text):
        return ' '.join([word for word in text.split() if word not in self.stoplist])
    def remove_single_chars(self, text):
        return ' '.join([word.lower() for word in text.split() if len(word) > 1])
    def lemmatize(self, text):
        return ' '.join([self.lemma.lemmatize(word) for word in text.split()])
    def remove_math_stopwords(self, text):
        return ' '.join([word for word in text.split() if word not in self.math_stoplist])
    def clean_text(self, text):
        text = text.replace('\n', ' ')
        text = self.remove_math(text)
        text = self.remove_punctuation(text)
        text = self.remove_numbers(text)
        text = self.remove_stopwords(text)
        text = self.remove_single_chars(text)
        text = self.remove_math_stopwords(text)
        text = self.lemmatize(text)
        return text


In [5]:
cleaner = Cleaner()

In [10]:
random_subset = articles.loc[random.sample(list(articles.index), 5000)]

In [12]:
texts = [cleaner.clean_text(articles.loc[i, 'title'] + ' ' + articles.loc[i, 'abstract']).split() for i in tqdm(list(random_subset.index))]

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [13]:
common_dictionary = Dictionary(texts)
common_dictionary.filter_extremes(no_below=5, no_above=0.8)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]
corpus = [common_dictionary.doc2bow(text) for text in texts]

In [33]:
full_texts = [cleaner.clean_text(articles.loc[i, 'title'] + ' ' + articles.loc[i, 'abstract']).split() for i in tqdm(range(len(articles)))]

HBox(children=(IntProgress(value=0, max=384444), HTML(value='')))




In [34]:
full_corpus = [common_dictionary.doc2bow(text) for text in full_texts]

In [14]:
lda = LdaMulticore(common_corpus, num_topics=100, id2word=common_dictionary, passes=10)

  diff = np.log(self.expElogbeta)


In [15]:
topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

(27, '0.018*"rank" + 0.014*"nonnegative" + 0.011*"real" + 0.009*"tensor" + 0.008*"coxeter"')
(45, '0.046*"group" + 0.016*"lie" + 0.014*"property" + 0.012*"quantum" + 0.011*"product"')
(85, '0.021*"function" + 0.015*"set" + 0.015*"measure" + 0.013*"theorem" + 0.009*"property"')
(5, '0.055*"function" + 0.018*"partition" + 0.012*"number" + 0.010*"convex" + 0.010*"order"')
(23, '0.053*"signal" + 0.042*"sparse" + 0.019*"time" + 0.015*"problem" + 0.015*"power"')
(95, '0.019*"stability" + 0.017*"operator" + 0.010*"condition" + 0.010*"matrix" + 0.009*"result"')
(96, '0.026*"model" + 0.022*"estimator" + 0.019*"estimation" + 0.016*"parameter" + 0.016*"algorithm"')
(2, '0.025*"system" + 0.015*"state" + 0.011*"vertex" + 0.011*"three" + 0.010*"solution"')
(50, '0.044*"random" + 0.024*"brownian" + 0.023*"process" + 0.021*"motion" + 0.014*"time"')
(60, '0.017*"problem" + 0.013*"energy" + 0.009*"invariant" + 0.009*"model" + 0.008*"set"')
(67, '0.024*"affine" + 0.020*"hecke" + 0.017*"de" + 0.014*"le" +

In [16]:
len(articles)

384444

In [17]:
articles.tail()

Unnamed: 0,abstract,categories,created,doi,id,title
384439,For the unitary ensembles of $N\times N$ Hermi...,"['solv-int', 'hep-th', 'math.SP', 'nlin.SI']",1998-04-03,10.1023/A:1004536018336,solv-int/9804005,"On the relation between orthogonal, symplectic..."
384440,For the elliptic Gaudin model (a degenerate ca...,"['solv-int', 'hep-th', 'math.QA', 'nlin.SI']",1998-07-23,10.1007/s002200050635,solv-int/9807008,Separation of Variables in the Elliptic Gaudin...
384441,"In this article, we study and settle several s...","['solv-int', 'math-ph', 'math.MP', 'math.SP', ...",1998-10-26,10.1063/1.533012,solv-int/9810017,Algebraic Exact Solvability of trigonometric-t...
384442,A wide class of N=2 reductions of the supersym...,"['solv-int', 'hep-th', 'math-ph', 'math.MP', '...",1999-07-29,10.1016/S0550-3213(99)00653-7,solv-int/9907021,Supersymmetric KP hierarchy in N=1 superspace ...
384443,The asymptotic lattices and their transformati...,"['solv-int', 'math.DG', 'nlin.SI']",1999-09-16,10.1016/S0393-0440(00)00070-X,solv-int/9909015,Discrete asymptotic nets and W-congruences in ...


In [59]:
query = articles.loc[124573, 'abstract']

In [62]:
query

'We relate the geometrical construction of (2+1)-spacetimes via grafting to\nphase space and Poisson structure in the Chern-Simons formulation of\n(2+1)-dimensional gravity with vanishing cosmological constant on manifolds of\ntopology $R\\times S_g$, where $S_g$ is an orientable two-surface of genus\n$g>1$. We show how grafting along simple closed geodesics \\lambda is\nimplemented in the Chern-Simons formalism and derive explicit expressions for\nits action on the holonomies of general closed curves on S_g. We prove that\nthis action is generated via the Poisson bracket by a gauge invariant\nobservable associated to the holonomy of $\\lambda$. We deduce a symmetry\nrelation between the Poisson brackets of observables associated to the Lorentz\nand translational components of the holonomies of general closed curves on S_g\nand discuss its physical interpretation. Finally, we relate the action of\ngrafting on the phase space to the action of Dehn twists and show that grafting\ncan be v

In [61]:
words = common_dictionary.doc2bow(cleaner.clean_text(query).split())
print("Top words identified: ")
for word in words:
    print("{} {}".format(word[0], common_dictionary[word[0]]))

Top words identified: 
47 generated
61 prove
78 holonomy
82 manifold
90 constant
113 parameter
213 geodesic
219 structure
230 physical
232 simple
235 associated
256 derive
268 along
316 explicit
324 implemented
344 relation
391 construction
442 via
510 topology
512 -dimensional
524 space
573 closed
591 vanishing
618 action
638 general
664 invariant
682 finally
716 poisson
897 formalism
899 gauge
913 phase
1016 discus
1020 formal
1021 formulation
1206 curve
1227 expression
1238 genus
1249 relate
1298 dehn
1315 deduce
1368 symmetry
1733 component
1755 chern-simons
1829 observables
1831 bracket
1845 geometrical
1859 observable
1939 cosmological
2317 viewed
2433 satisfying
2718 lambda
2724 interpretation
2821 twist
2841 orientable
2911 gravity
3673 lorentz


In [46]:
index = similarities.MatrixSimilarity(lda[full_corpus])

  if np.issubdtype(vec.dtype, np.int):


In [35]:
def get_similarity(lda, query_vector):
    index = similarities.MatrixSimilarity(lda[full_corpus])
    sims = index[query_vector]
    return sims

In [47]:
def get_similarity(index, query_vector):
    #index = similarities.MatrixSimilarity(lda[full_corpus])
    sims = index[query_vector]
    return sims

In [63]:
query_vector = lda[words]
#sims = get_similarity(lda, query_vector)
sims = get_similarity(index, query_vector)

In [64]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [69]:
query

'We relate the geometrical construction of (2+1)-spacetimes via grafting to\nphase space and Poisson structure in the Chern-Simons formulation of\n(2+1)-dimensional gravity with vanishing cosmological constant on manifolds of\ntopology $R\\times S_g$, where $S_g$ is an orientable two-surface of genus\n$g>1$. We show how grafting along simple closed geodesics \\lambda is\nimplemented in the Chern-Simons formalism and derive explicit expressions for\nits action on the holonomies of general closed curves on S_g. We prove that\nthis action is generated via the Poisson bracket by a gauge invariant\nobservable associated to the holonomy of $\\lambda$. We deduce a symmetry\nrelation between the Poisson brackets of observables associated to the Lorentz\nand translational components of the holonomies of general closed curves on S_g\nand discuss its physical interpretation. Finally, we relate the action of\ngrafting on the phase space to the action of Dehn twists and show that grafting\ncan be v

In [70]:
print(articles.loc[sims[2][0], 'abstract'])

We extend B. Hassett's theory of weighted stable pointed curves ([Has03]) to
weighted stable maps. The space of stability conditions is described
explicitly, and the wall-crossing phenomenon studied. This can be considered as
a non-linear analog of the theory of stability conditions in abelian and
triangulated categories.
  We introduce virtual fundamental classes and thus obtain weighted
Gromov-Witten invariants. We show that by including gravitational descendants,
one obtains an $\LL$-algebra as introduced in [LM04] as a generalization of a
cohomological field theory.


In [195]:
print(query)

For the unitary ensembles of $N\times N$ Hermitian matrices associated with a
weight function $w$ there is a kernel, expressible in terms of the polynomials
orthogonal with respect to the weight function, which plays an important role.
For the orthogonal and symplectic ensembles of Hermitian matrices there are
$2\times2$ matrix kernels, usually constructed using skew-orthogonal
polynomials, which play an analogous role. These matrix kernels are determined
by their upper left-hand entries. We derive formulas expressing these entries
in terms of the scalar kernel for the corresponding unitary ensembles. We also
show that whenever $w'/w$ is a rational function the entries are equal to the
scalar kernel plus some extra terms whose number equals the order of $w'/w$.
General formulas are obtained for these extra terms. We do not use
skew-orthogonal polynomials in the derivations.


In [203]:
list(common_dictionary.token2id.keys())

['automaton',
 'bijection',
 'count',
 'cycle',
 'determinant',
 'evaluate',
 'involution',
 'involves',
 'lattice',
 'marked',
 'number',
 'path',
 'stirling',
 'thedeterminant',
 'tocertain',
 'unlabeled',
 'arbitrarydimension',
 'bipartite',
 'calculated',
 'cartesian',
 'characterization',
 'characterize',
 'construct',
 'cube',
 'dimension',
 'established',
 'expansion',
 'given',
 'graph',
 'hypercubes',
 'important',
 'isometric',
 'knownresults',
 'mean',
 'obtained',
 'old',
 'one',
 'operation',
 'partial',
 'particular',
 'pasting',
 'process',
 'product',
 'proof',
 'relation',
 'role',
 'structure',
 'subgraphs',
 'theory',
 'utilized',
 'also',
 'analogue',
 'application',
 'bracket',
 'certain',
 'class',
 'classified',
 'cusp',
 'derive',
 'distribution',
 'doesnot',
 'extend',
 'form',
 'hurwitz',
 'integral',
 'main',
 'modular',
 'modulo',
 'modulus',
 'odd',
 'ofsingular',
 'ono',
 'overpartitions',
 'prime',
 'property',
 'rankin-cohen',
 'recently',
 'satisfy',
 '

In [99]:
def find_GT(categories):
    return "math.AG" in categories

In [100]:
articles_MG = articles.loc[articles.categories.apply(find_GT)].reset_index(drop = True)

In [101]:
articles_MG

Unnamed: 0,abstract,categories,created,doi,id,title
0,In this note we give a new method for getting ...,"['math.PR', 'math.AG']",2007-03-31,,0704.0019,Approximation for extinction probability of th...
1,We prove a duality theorem for certain graded ...,"['math.AC', 'math.AG']",2007-04-01,,0704.0102,Duality and Tameness
2,These notes accompany a lecture about the topo...,"['math.SG', 'math.AG']",2007-04-02,,0704.0257,Orbifold cohomology of abelian symplectic redu...
3,As main result we show that for each g > 1 the...,"['math.GT', 'math.AG']",2007-04-03,,0704.0416,Origamis with non congruence Veech groups
4,In this note we describe the natural coordinat...,"['math.SG', 'math.AG']",2007-04-03,,0704.0430,Reduced phase space and toric variety coordina...
5,We introduce a new class of canonical AZD's (c...,"['math.AG', 'math.CV']",2007-04-04,,0704.0566,Canonical singular hermitian metrics on relati...
6,We study birational maps with empty base locus...,"['math.AC', 'math.AG']",2007-04-04,,0704.0608,On the homology of two-dimensional elimination
7,Consider the family S of irreducible plane cur...,['math.AG'],2007-04-04,,0704.0618,Number of moduli of irreducible families of pl...
8,Let S be the variety of irreducible sextics wi...,['math.AG'],2007-04-04,,0704.0622,On the number of moduli of plane sextics with ...
9,Every finite branch solutions to the sixth Pai...,"['math.AG', 'math.CA']",2007-04-05,,0704.0679,Finite branch solutions to Painleve VI around ...


In [102]:
cleaner = Cleaner()

In [103]:
texts = [cleaner.clean_text(articles_MG.loc[i, 'title'] + ' ' + articles_MG.loc[i, 'abstract']).split() for i in tqdm(range(len(articles_MG)))]

HBox(children=(IntProgress(value=0, max=34704), HTML(value='')))

In [104]:
common_dictionary = Dictionary(texts)
common_dictionary.filter_extremes(no_below=5, no_above=0.8)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]
corpus = [common_dictionary.doc2bow(text) for text in texts]

In [105]:
lda_MG = LdaMulticore(common_corpus, num_topics=10, id2word=common_dictionary, passes=10)

In [106]:
topics = lda_MG.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.035*"polynomial" + 0.014*"ideal" + 0.013*"set" + 0.011*"number" + 0.010*"matrix"')
(1, '0.024*"theory" + 0.016*"invariant" + 0.013*"calabi-yau" + 0.012*"function" + 0.012*"mirror"')
(2, '0.062*"surface" + 0.042*"curve" + 0.016*"point" + 0.015*"degree" + 0.015*"plane"')
(3, '0.053*"variety" + 0.028*"singularity" + 0.020*"toric" + 0.015*"projective" + 0.013*"divisor"')
(4, '0.064*"space" + 0.056*"bundle" + 0.053*"modulus" + 0.024*"vector" + 0.020*"stable"')
(5, '0.041*"manifold" + 0.030*"complex" + 0.016*"space" + 0.015*"compact" + 0.015*"tropical"')
(6, '0.027*"de" + 0.021*"real" + 0.021*"algebraic" + 0.019*"geometry" + 0.012*"theorem"')
(7, '0.053*"algebra" + 0.033*"category" + 0.017*"derived" + 0.016*"module" + 0.015*"ring"')
(8, '0.040*"field" + 0.038*"curve" + 0.020*"point" + 0.019*"function" + 0.017*"number"')
(9, '0.055*"group" + 0.018*"variety" + 0.018*"cohomology" + 0.014*"algebraic" + 0.010*"field"')


In [97]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [107]:
import pyLDAvis.gensim

pyLDAvis.gensim.prepare(lda_MG, corpus, common_dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
