In [1]:
import spacy
import pandas as pd
import re

In [86]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
articles = pd.read_csv('../data/arxiv_math.csv')

In [4]:
#Preprocessing

def preprocess_abstract(abstract):
    """Clean a raw arXiv abstract before noun-chunk extraction.

    The steps run in a fixed order: two math-heavy terms are rewritten to
    plain text first, then every remaining inline ``$...$`` span is dropped,
    then filler phrases, quoting/escape characters, bracketed and
    parenthesised spans, isolated single letters / numbers / initials and
    leftover braces are removed, and finally runs of spaces are collapsed.

    Parameters
    ----------
    abstract : str
        Raw abstract text. Non-string values (for example NaN) raise
        ``AttributeError`` on the first ``.replace`` call.

    Returns
    -------
    str
        The cleaned abstract.
    """
    abstract = abstract.replace('\n', ' ')  # remove new line characters

    # Rewrite these two terms before the generic math removal below would
    # otherwise delete the `$K$` / `$C^*$` part and leave a dangling "-theory".
    abstract = abstract.replace('$K$-theory', 'k-theory').replace('$C^*$-algebra', 'C-algebra').replace('\\emph', '')

    # Raw strings are used for every pattern: the previous non-raw literals
    # relied on invalid escape sequences, which warn on modern Python.
    abstract = re.sub(r'\$.*?\$', '', abstract)  # inline math

    # Filler phrases, replaced one after another in this exact order.
    for filler in ('such a', 'previously known', 'so called', 'more general', 'all the', 'all these',
                   'very challenging', 'so-called', 'well known', 'particularly nice'):
        abstract = abstract.replace(filler, ' ')

    abstract = abstract.replace('"', '').replace("'", '').replace('`', '').replace('\\', '').replace('--', '-').replace('^*', '')
    abstract = re.sub(r'\[.*?\]', '', abstract)           # bracketed spans, e.g. [1]
    abstract = re.sub(r'\s[a-zA-Z]{1}\s', ' ', abstract)  # isolated single letters
    abstract = re.sub(r'\s[0-9]+\s', ' ', abstract)       # isolated numbers
    abstract = re.sub(r'\(.*?\)', '', abstract)           # parenthesised spans
    abstract = re.sub(r'\s[A-Z]{1}\.\s', ' ', abstract)   # initials such as "M."
    abstract = abstract.replace('*', '').replace('{', '').replace('}', '')
    abstract = re.sub(' +', ' ', abstract)                # collapse repeated spaces
    return abstract

In [5]:
# Use tqdm's public, environment-aware entry point. `tqdm._tqdm_notebook` is a
# private, deprecated module, and binding the name `tqdm` here is what makes
# the bare `tqdm(...)` progress bars used further down resolvable on a fresh
# kernel.
from tqdm.auto import tqdm

tqdm_notebook = tqdm  # backwards-compatible alias for the previously imported name

# Registers `progress_apply` on pandas objects.
tqdm.pandas()

In [9]:
articles.abstract = articles.abstract.progress_apply(preprocess_abstract)

HBox(children=(IntProgress(value=0, max=384444), HTML(value='')))




In [6]:
# Words that `shorten_abstract` strips from the front of a noun chunk. The
# check compares the chunk's first token text against this list as-is, so it
# is case-sensitive. Despite the name, the list also holds possessives,
# demonstratives and the conjunction 'and'.
remove_adjectives = ['certain', 'new', 'corresponding', 'their', 'our', 'such', 'whose', 'following',
                    'known','different', 'its', 'so-called', 'only', 'namely', 'just', 'same', 'particular',
                    'various', 'interesting', 'given', 'underlying', 'this', 'explicit', 'other', 'celebrated',
                    'respectively', 'associated', 'above', 'many', 'claimed', 'useful', 'that', 'and', 'conjectured',
                    'two', 'efficient', 'his', 'her']

Use spaCy's noun chunker to create potential tags. Remove things such as articles and pronouns from the beginning and end of each chunk, as well as certain adjectives.

In [7]:
def shorten_abstract(abstract):
    """Reduce an abstract to its noun chunks, one underscore-joined item per chunk.

    Each spaCy noun chunk is split on single spaces. Its first word is dropped
    when the chunk's first token is listed in `remove_adjectives` or has one
    of the POS tags below; its last word is dropped when the chunk's last
    token has one of those POS tags. The remaining words are joined with '_'
    and the chunks are joined with ' '.

    A chunk whose words are all dropped contributes an empty string, so the
    result can contain consecutive spaces.

    Parameters
    ----------
    abstract : str
        Text to parse with the module-level `nlp` pipeline.

    Returns
    -------
    str
        Space-separated, underscore-joined noun chunks, stripped at both ends.
    """
    # Coarse POS tags that should not open or close a candidate tag.
    # NOTE(review): 'APD' looks like a typo for 'ADP'; kept unchanged so the
    # output stays the same - confirm the intent before altering it.
    droppable_pos = ('PRON', 'DET', 'INTJ', 'AUX', 'CCONJ', 'APD', 'NUM', 'PART', 'SCONJ', 'PUNCT', 'SYM', 'X')

    parsed = nlp(abstract)
    pieces = []
    for chunk in parsed.noun_chunks:
        first_token = parsed[chunk.start]
        last_token = parsed[chunk.end - 1]

        drop_first = (first_token.text in remove_adjectives) or (first_token.pos_ in droppable_pos)
        drop_last = last_token.pos_ in droppable_pos

        words = chunk.text.split(' ')
        begin = 1 if drop_first else 0
        end = -1 if drop_last else None
        pieces.append('_'.join(words[begin:end]))

    return ' '.join(pieces).strip()
    

In [8]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
#stop_words = set(stopwords.words('english')) 

# Hand-picked stop words removed token-by-token before noun chunking.
# `remove_stopwords` and `remove_stopwords_and_lemmatize` lowercase each token
# before looking it up here, so the entries are lowercase. 'new' is listed
# twice; the duplicate is harmless for membership tests.
stop_words = ['proof', 'first', 'new', 'the', 'a', 'an','certain', 'new', 'corresponding', 'their', 'our', 'such', 'whose', 'following',
                    'known','different', 'its', 'so-called', 'only', 'namely', 'just', 'same', 'particular',
                    'various', 'interesting', 'given', 'underlying', 'this', 'explicit', 'other', 'celebrated',
                    'respectively', 'associated', 'above', 'many', 'claimed', 'useful', 'that', 'conjectured',
                    'two', 'efficient', 'his', 'her', 'several', 'related', 'purely', 'well-known', 'important', 'technical',
             'assumption', 'notion', 'textin', 'doe', 'aforementioned', 'specific', 'nice', 'priori']

In [9]:
def remove_stopwords(sent):
    """Drop stop words from `sent` and return the remaining tokens joined by spaces.

    The text is split with NLTK's `word_tokenize`; a token is discarded when
    its lowercase form appears in the module-level `stop_words` list. Kept
    tokens retain their original casing.
    """
    kept = []
    for token in word_tokenize(sent):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [10]:
def remove_stopwords_and_lemmatize(sent):
    """Drop stop words from `sent`, lemmatize what is left and re-join with spaces.

    Tokens come from NLTK's `word_tokenize`. The stop-word test uses the
    lowercase token against `stop_words`; the surviving token is passed
    unchanged (original casing) to the module-level `lemmatizer`.
    """
    lemmas = []
    for token in word_tokenize(sent):
        if token.lower() in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

## Using Jason King's tokenizer function to tokenize the shortened abstracts

In [11]:
def get_ngrams(tokens, n=2):
    """Return every run of `n` consecutive tokens as a space-joined string.

    Parameters
    ----------
    tokens : sequence of str
        Tokens in document order.
    n : int, default 2
        Length of each n-gram.

    Returns
    -------
    list of str
        N-grams in order of appearance; empty when `n` is not positive or
        exceeds the number of tokens.
    """
    if n <= 0:
        return []
    window_count = len(tokens) - n + 1
    return [' '.join(tokens[start:start + n]) for start in range(window_count)]

def tokenize(text, ngram_range=(1,1)):
    """Lowercase `text`, extract word-like tokens and expand them into n-grams.

    A token is a maximal run of lowercase letters, digits, underscores,
    apostrophes or hyphens. For every size from `ngram_range[0]` through
    `ngram_range[1]` (inclusive) the n-grams from `get_ngrams` are appended,
    smaller sizes first.

    Returns
    -------
    list of str
    """
    words = re.findall(r'[a-z0-9_\'-]+', text.lower())
    smallest, largest = ngram_range[0], ngram_range[1]
    collected = []
    for size in range(smallest, largest + 1):
        collected.extend(get_ngrams(words, size))
    return collected

In [12]:
def label_tokenizer(abstract):
    """Turn an abstract into candidate tag tokens.

    Pipeline: `remove_stopwords` -> `shorten_abstract` (noun chunks joined
    with underscores) -> `tokenize` with its default unigram range, so each
    returned token corresponds to one lowercased noun chunk fragment.
    """
    without_stopwords = remove_stopwords(abstract)
    noun_chunks = shorten_abstract(without_stopwords)
    return tokenize(noun_chunks)

In [13]:
import random

In [14]:
def get_tags(abstract):
    """Extract de-duplicated, human-readable tags from a title/abstract string.

    Tokens from `label_tokenizer` that appear in `stop_words_tags` are
    discarded. Each remaining token has its underscores turned into spaces,
    is stripped, and loses one leading hyphen if present. Tags of two
    characters or fewer are dropped.

    Tags are returned in order of first appearance and de-duplicated *after*
    normalisation, so the result is reproducible between runs and variants
    such as '-spaces' / 'spaces' cannot both survive.

    Parameters
    ----------
    abstract : str
        Text to tag (typically "<title>. <abstract>").

    Returns
    -------
    list of str
    """
    unique_tags = {}  # used as an insertion-ordered set
    for token in label_tokenizer(abstract):
        # The stop list is matched against the raw token (underscored form).
        if token in stop_words_tags:
            continue
        tag = token.replace('_', ' ').strip()
        if tag.startswith('-'):
            tag = tag[1:]
        if len(tag) > 2:
            unique_tags[tag] = None
    return list(unique_tags)

In [91]:
abstract = """
Global weak solutions for quantum isothermal fluids
We construct global weak solutions to isothermal quantum Navier-Stokes equations, with or without Korteweg term, in the whole space of dimension at most three. Instead of working on the initial set of unknown functions, we consider an equivalent reformulation, based on a time-dependent rescaling, that we introduced in a previous paper to study the large time behavior, and which provides suitable a priori estimates, as opposed to the initial formulation where the potential energy is not signed. We proceed by working on tori whose size eventually becomes infinite. On each fixed torus, we consider the equations in the presence of drag force terms. Such equations are solved by regularization, and the limit where the drag force terms vanish is treated by resuming the notion of renormalized solution developed by I. Lacroix-Violet and A. Vasseur. We also establish global existence of weak solutions for the isothermal Korteweg equation (no viscosity), when initial data are well-prepared, in the sense that they stem from a Madelung transform. 
"""

In [92]:
print(get_tags(abstract))

['initial formulation', 'fixed torus', 'time-dependent rescaling', 'drag force terms', 'initial set', 'initial data', 'isothermal korteweg equation', 'lacroix-violet', 'regularization', 'renormalized solution', 'global weak solutions', 'unknown functions', 'potential energy', 'isothermal quantum', 'viscosity', 'sense', 'navier-stokes equations', 'global existence', 'equivalent reformulation', 'suitable estimates', 'vasseur', 'quantum isothermal fluids', 'large time behavior', 'korteweg term', 'weak solutions', 'tori size']


In [31]:
articles['title_and_abstract'] = articles.title + '. ' + articles.abstract

In [32]:
articles['tags'] = articles.title_and_abstract.progress_apply(get_tags)

HBox(children=(IntProgress(value=0, max=384444), HTML(value='')))




In [33]:
articles.to_csv('../data/tagged.csv', index = False)

In [62]:
i = random.choice(range(len(articles)))
print(articles.loc[i, 'tags'])

['constant mean curvature foliation', '3-manifolds', 'stable spheres', 'positive mass', 'einstein equation', 'timeslices', 'foliations', 'anti-de sitter-schwarzschild metrics', 'negative cosmological constant', 'constant mean curvature']


In [128]:
articles.loc[articles.tags.apply(lambda x: 'taskmaster' in x)]

Unnamed: 0,abstract,categories,created,doi,id,title,title_and_abstract,tags
75470,Today's massively-sized datasets have made it ...,"['cs.IT', 'cs.DC', 'math.IT']",2017-06-16,,1706.05436,Improving Distributed Gradient Descent Using R...,Improving Distributed Gradient Descent Using R...,"[competing schemes, taskmaster, stragglers, cl..."
78756,Solving a large-scale system of linear equatio...,"['cs.LG', 'cs.DC', 'math.NA']",2017-08-04,,1708.01413,Distributed Solution of Large-Scale Linear Sys...,Distributed Solution of Large-Scale Linear Sys...,"[scaled version, taskmaster, accelerated proje..."
297204,Digital crowdsourcing (CS) is a modern approac...,"['cs.LG', 'cs.IT', 'math.IT']",2016-08-25,,1608.07328,Fundamental Limits of Budget-Fidelity Trade-of...,Fundamental Limits of Budget-Fidelity Trade-of...,"[taskmaster, information theoretic rate-distor..."


In [69]:
articles.loc[0, 'abstract']

'We show that a determinant of Stirling cycle numbers counts unlabeled acyclic\nsingle-source automata. The proof involves a bijection from these automata to\ncertain marked lattice paths and a sign-reversing involution to evaluate the\ndeterminant.'

In [63]:
articles.loc[0, 'tags']

['lattice path',
 'bijection',
 'automaton',
 'stirling cycle number count unlabeled acyclic single-source automaton',
 'sign-reversing involution']

In [84]:
articles.loc[:10000][articles.loc[:10000].tags.map(lambda x: 'korteweg term' in x)]

Unnamed: 0,abstract,categories,created,doi,id,title,title_and_abstract,tags


In [29]:
#abstract = articles.loc[4050,'abstract']
i = random.choice(range(len(articles)))
print(articles.loc[i, 'abstract'])
print(get_tags(articles.loc[i, 'title'] + '. ' + articles.loc[i, 'abstract']))
#top_tfidf_feats(label_tfidf.transform([articles.loc[i, 'abstract']])[0], label_tfidf.get_feature_names(), 20)

This is the companion article to the Bourbaki talk of the same name given in
March 2009. The main theme of the talk and the article is to explain the
interplay between homotopy theory and algebraic geometry through the
Hopkins-Miller-Lurie theorem on topological modular forms, from which we learn
that the Deligne-Mumford moduli stack for elliptic curves is canonically
realized as an object in derived algebraic geometry.
['name', 'march', 'miller', 'lurie', 'companion article', 'aftern hopkins', 'main theme', 'algebraic geometry', 'object', 'bourbaki talk', 'homotopy theory', 'derived algebraic geometry', 'deligne-mumford', 'elliptic curves', 'topological modular forms', 'interplay']


In [26]:
# Candidate tags that are too generic to be useful. Entries are compared with
# the raw tokens from `label_tokenizer`, i.e. lowercase noun chunks whose words
# are joined by underscores, so an entry containing a space can never match.
# Three such entries were repaired: ' certain_algebraic_invariants' (leading
# space), 'proposed scheme' (space instead of underscore) and the misspelt
# 'interesections'. Duplicates are harmless and left in place.
stop_words_tags = ['map', 'mathcal',
    'people', 'constructive_way', 'few_example', 'criterion', 'old_solution', '_recent_result',
    'relationship', 'difficulties', 'such_problems', 'basic_understanding', 'same_parameter_values', 'physical_motivations',
    'self', 'attention', 'alternative', 'second-named_author', 'recent_breakthrough_result', 'show', 'previous_result',
    'our_main_new_result', 'several_related_results', 'same_approach', 'streamlined_manner', 'increases', 'increase', 'certain_algebraic_invariants',
    'implications', 'presence', 'data', 'notice', 'difficulty', 'to_one_correspondence', 'plenty', 'cn', 'constant_c',
    'arguement', 'arguments', 'mild_hypotheses', 'bound', 'bounds', 'assume', 'major_ingredient', 'general_framework', 'time',
    'objective', 'objectives', 'novel', 'relation', 'unified_approach', 'suitable_variant', 'our_results', 'similar_result',
    'rise', 'statement', 'statements', 'correspond', 'corresponds', 'entry', 'entries', 'original_approach', 'current_paper',
    'boundedness', 'function', 'functions', 'symbol', 'symbols', 'who', 'important_ideas', 'main_concern', 'physical_importance',
    'key_concept', 'usual_way', 'newly_developed_logic', 'existing_results', 'exposition', 'minor_changes', 'previous_version',
    'small_corrections', 'research_work', 'novelty', 'several_important_classes', 'special_instances', 'state', 'states',
    'mini-course', 'intuitive_idea', 'old-standing_problem', 'journal', 'open_problem', 'such_formulas', 'special_cases',
    'related_formulas', 'recent_result', 'simple_application', 'enough_and_sufficient_condition', 'several_other_identities',
    'brief_introduction', 'his_formula', 'brussels', 'pqr2003_euroschool', 'employed_mathematical_tools',
    'additional_condition', 'complete_classification', 'ratio', 'generalises', 'similar_results', 'pqr2003_euroschool',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'short-comings',
    'length', 'positive_answer', 'completely_elementary_way', 'present_writer',
    'generality', 'previous_paper_cite', 'such_sets', 'our_aim', 'unusual_simplification', 'unexpectedly_neat_manner',
    'formulation', 'appropriate_site', 'original_map', 'typical_examples', 'period', 'periods',
    'dimension', 'weight', 'power', 'present_new_geometric_approach', 'proposed_scheme', 'intersection', 'intersections',
    'uniqueness', 'inequality', 'metrics','structures', 'brief_note', 'function', 'large_class', 'investigation',
    'proof', 'proofs', 'analogue', 'paper', 'papers', 'result', 'results', 'note', 'notes', '_',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'article', 'articles','examples', 'simplified_proofs', 'open_question', 'seemingly_new_method',
    'developed_method', 'method', 'methods', 'present_paper', 'solution', 'solutions', 'example', 'examples',
    'part', 'parts', 'case', 'cases', 'our_main_result', 'our_emphasis', 'recent_progress', 'use',
    'exactly_this_question', 'following_interpretation', 'different_kinds', 'various_examples',
    'failure', 'rich_and_interesting_algebraic_structure', 'first_step', 'search', 'its_action', 'arbitrary_',
    'remark', 'remarks', 'detail', 'details', 'conjecture', 'conjectures', 'tool', 'tools', 'others',
    'context', 'contexts', 'problem', 'problems', 'above_mentioned_results', 'theorem', 'theorems',
    'our_arguments', 'consequence', 'consequences', 'term', 'terms', 'our_classification', 'fact', 'facts',
    'lecture', 'lectures', 'technique', 'techniques', 'necessary_and_sufficient_conditions', 'good_performance',
    'use', 'uses', 'diagram', 'diagrams', 'application', 'applications', 'past_two_decades', 'special_case',
    'arguement', 'arguments', 'simple_necessary_and_sufficient_condition', 'condition', 'conditions',
    'analogous_result', 'property', 'properties', 'class', 'classes', 'achieved_result', 'other_components', 'component', 'components',
    'math', 'main_result', 'part', 'parts', 'first_part', 'second_part', 'our_examples', 'form', 'forms', 'construction', 'constructions',
    'first_application', 'consideration', 'considerations', 'explicit_formula', 'explicit_formulas', 'definition', 'definitions',
    'present_authors', 'talk', 'talks', 'theory', 'recent_conjecture', 'our_main_result', 'practical_purpose', 'our_problem',
    'our_theory', 'proposed_algorithm', 'intriguing_novelty', 'point', 'points',
    '_systematic_and_explicit_way', 'new', 'previous_papers', 'author', 'authors', 'complete_description', 'study', 'studies',
    'general_sufficient_condition', 'main_theorem', 'sufficient_condition', 'necessary_condition',
    'typical_example', 'discussed_problem', 'what', 'novel_approach', 'our_proposed_approach',
    'prior_studies', 'our_simulations', 'advantage', 'advantages', 'last_conjecture', 'nonempty_sum', 'right_value',
    'left_value', 'all_the_elements', 'distinct_elements', 'element', 'elements', 'subset', 'subsets',
    'way', 'ways', 'point', 'points', 'relatively_effective_method', 'their_calculation', 'the_structure',
    'full_set', 'above_result', 'our_result', 'notion', 'notions', 'set', 'sets', 'space', 'spaces', 'respect', 'respects',
    'goal', 'goals', 'value', 'values', 'extended_review', 'particularly_easy_construction', 'background_material', 'recent_work',
    'course', 'affirmative_answer', 'generalization', 'generalizations', 'analogous_results', 'analogous_result', 'our_result',
    'many_important_practical_applications', 'our_best_knowledge', 'last_years', 'a_priori', 'much_attention',
    'body', 'work', 'original_parameter', 'term', 'terms', 'new_procedure', 'unknown', 'unknowns', 'similar_properties',
    'former_paper', 'two_oversights', 'overlooked_results', 'original_paper', 'simple_proof', 'unit', 'units',
    'joint_papers', 'recent_results', 'rigorous_derivation', 'number', 'numbers', 'first_section', 'next_section', 'column', 'columns',
    'paper_studies', 'easy_counter-example', 'open_questions', 'open_question', 'recent_joint-work', 'classical_method',
    'new_technique', 'stronger_ones', 'question', 'questions', 'important_consequences', 'classical_constructions',
    'quite_recent_subject', 'available_results', 'researcher', 'researchers', 'other_main_ingredient', 'ingredient', 'ingredients',
    'idea', 'ideas', 'rapid_introduction', 'exercise', 'exercises', 'glimps', 'glimpse', 'following_classes', 'following_class',
    '-spaces', 'main_point', 'formula', 'formulas', 'formulae', 'further_questions', 'last_section', 'first_author',
    'previous_work', 'contrast', 'family', 'families', 'best_known_example', 'the_sum', 'contributions', 'contribution',
    'revised_version_mistake', 'other_hand', 'hand', 'affairs', 'lecture_notes', 'current_state',
    'basic_properties', 'setting', 'settings', 'concept', 'concepts', 'existence', 'then_any_model', 'earlier_result_math',
    'upper_and_lower_bounds', 'classification', 'system', 'several_different_classes', 'previous_paper',
    'whole_space', 'improved_versions', 'version', 'versions', 'variable', 'variables', 'two_kinds', 'kind', 'kinds',
    'same_or_longer_length', 'two_distinct_vectors', 'lower_bounds', 'vector', 'vectors', 'excellent_performance',
    'cite', 'purpose', 'research_project', 'literature', 'framework', 'classification', 'characterization', 'characterizations',
    'concept', 'past_decades', 'recent_reports', 'similar_way', 'same_canonical_role', 'invent', 'blog', 'my_results',
    'converse', 'long-standing_conjecture', 'simpler_proof', 'corollaries', 'long_and_complicated_expressions', 'most_accurate_results',
    'wide_range', 'the_set', 'very_first_results', 'universal', 'processes', 'processe', 'theories', 'model', 'models',
    'well-known_results', 'support', 'second_author', 'last_results', 'great_efforts', 'theoretical_properties',
    'future_work', 'several_authors', 'one_hand', 'review', 'general_case', 'that', 'parameter', 'parameters', 'approach', 'upper_bound',
    'propose', 'structure', 'main_contribution', 'previous_works', 'announcements', 'more_detailed_look', 'brief_history',
    'newly_apparent_role', 'programs_successes', 'several_sufficient_conditions', 'it', 'second_named_author',
    'similar_features', 'other_contexts', 'emphasis', 'fulfills', 'fulfill', 'reason', 'reasons', 'well_known_fact',
    'equation', 'equations', 'appendix', 'aim', 'size', 'extended_english_abstract',
    'systematic_way', 'general_results', 'approach', 'previous_results', 'his_joint_work', 'his_own_work', 'short_note',
    'first_attempt', 'new_applications', 'more_difficult_problem', 'behavior',
    'alternative_proofs', 'approach', 'merit', 'original_proof', 'several_applications', 'that_equation', 'general_result',
    'law', 'interpretation', 'estimate', 'estimates', 'viewpoint', 'type', 'types', 'short_introduction',
    'novel_estimation_procedure', 'novel_notion', 'new_series', 'new_examples', 'several_properties',
    'very_short_paper', 'same_result', 'arxiv', 'somewhat_stronger_and_more_precise_version', 'first_paper',
    'total_number', 'assumption', 'assumptions', 'important_case', 'main_difficulty', 'settles_question',
    'his_result', 'stronger_conditions', 'partial_positive_answer', 'rather_recent_subject', 'short_proofs', 'new_proof',
    'main_results', 'keywords', '_textit', 'textsc', 'uniqueness_and_existence_results', 'rather_general_setting',
    'right-hand_side', 'above_condition', 'very_explicit_way', 'present_papers', 'second_application',
    'extensive_recent_literature', 'introductory_level_survey', 'previous_preprint', 'insight', 'insights',
    'and_related_questions', 'negative_consistency_results', 'positive_results', 'already_published_results',
    'evaluation', 'depth', 'subclass', 'join', 'multiplicity', 'degree', 'degrees',
]

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [65]:
label_tfidf = TfidfVectorizer(tokenizer = label_tokenizer, stop_words=stop_words_tags)
#label_tfidf = CountVectorizer(tokenizer = label_tokenizer, stop_words=stop_words)

In [66]:
tfidf = label_tfidf.fit_transform(articles.loc[:10000, 'abstract'])

  'stop_words.' % sorted(inconsistent))


In [None]:
from IPython.display import Audio

Audio('tolling-bell_daniel-simion.wav', autoplay = True)

In [67]:
import numpy as np
def top_tfidf_feats(row, features, top_n=15):
    """Return the `top_n` highest-weighted features of a single tf-idf row.

    Parameters
    ----------
    row : object with ``.toarray()`` and ``row[0, i]`` indexing
        One-row matrix of weights (e.g. a 1 x n scipy sparse matrix).
    features : sequence
        Feature names, indexable by column position.
    top_n : int, default 15
        Maximum number of features to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'`` and ``'tfidf'``, sorted by descending weight.
        The frame is empty (but still has both columns) when nothing is
        selected, e.g. ``top_n=0``.
    """
    # argsort is ascending, so reverse the first (only) row before truncating.
    ranked_ids = np.argsort(row.toarray())[0, ::-1][:top_n]
    records = [(features[idx], row[0, idx]) for idx in ranked_ids]
    # Passing the column names to the constructor (rather than assigning them
    # afterwards) keeps this working when `records` is empty.
    return pd.DataFrame(records, columns=['feature', 'tfidf'])

In [127]:
tags = []

In [128]:
# Collect the tags of every article into `tags`. Rows that cannot be processed
# (presumably those with a missing title or abstract - verify) are skipped,
# but their positions are kept in `failed_indices` so they can be inspected.
failed_indices = []
for i in tqdm(range(len(articles))):
    try:
        tags.extend(get_tags(articles.loc[i, 'title'] + '. ' + articles.loc[i, 'abstract']))
    except Exception:
        # `except Exception` rather than a bare `except`, so a keyboard/kernel
        # interrupt still stops this long-running loop.
        failed_indices.append(i)

HBox(children=(IntProgress(value=0, max=384444), HTML(value='')))

In [129]:
from collections import Counter

In [130]:
counter = Counter(tags)

In [131]:
counter.most_common()

[('order', 23987),
 ('algorithm', 17566),
 ('graph', 14120),
 ('group', 12203),
 ('convergence', 11716),
 ('graphs', 11529),
 ('analysis', 11135),
 ('vertices', 9785),
 ('groups', 9665),
 ('addition', 9442),
 ('sense', 8910),
 ('sequence', 8792),
 ('systems', 8746),
 ('field', 8044),
 ('distribution', 7965),
 ('boundary', 7845),
 ('category', 7834),
 ('infinity', 7773),
 ('action', 7526),
 ('operators', 7258),
 ('dimensions', 7056),
 ('algebra', 7051),
 ('coefficients', 6987),
 ('sum', 6775),
 ('product', 6637),
 ('limit', 6604),
 ('extension', 6546),
 ('performance', 6475),
 ('means', 6434),
 ('polynomials', 6430),
 ('representations', 6365),
 ('eigenvalues', 6242),
 ('dynamics', 6226),
 ('connection', 6142),
 ('curves', 5836),
 ('information', 5561),
 ('surfaces', 5501),
 ('algebras', 5478),
 ('process', 5389),
 ('probability', 5322),
 ('series', 5271),
 ('spectrum', 5208),
 ('geometry', 5187),
 ('pair', 5167),
 ('stability', 5151),
 ('matrices', 4908),
 ('manifolds', 4896),
 ('repre

In [135]:
def tag_ranker(article, counter):
    """Return the tags of `article` that are already known to `counter`.

    Parameters
    ----------
    article : str
        Text passed to `get_tags`.
    counter : container supporting ``in``
        Previously seen tags (e.g. a ``collections.Counter`` keyed by tag).

    Returns
    -------
    set of str
        Tags extracted from `article` that are present in `counter`. No
        ordering or scoring is applied despite the function's name.
    """
    return {tag for tag in get_tags(article) if tag in counter}

In [139]:
abstract = \
"This paper introduces a complex representation for spacelike surfaces in the Lorentz-Minkowski space L4, based in two complex valued functions which can be assumed to be holomorphic or anti-holomorphic. When the immersion is contained in quadrics of L4, the representation then allows us to obtain interesting partial differential equations with holomorphic or anti-holomorphic parameters, within which we find the partial Riccati Equation. Using then theory of holomorphic complex functions we construct explicitly new local solutions for those PDEs together with its associated geometric solutions. So, several explicit examples are given. As geometric consequence, through of our approach we characterize all conformal totally umbilical spacelike immersions into L4, and in addition, we also show that for each conformal immersion in L4 which satisfies the partial Riccati equation there exists a Bryant immersion in H3, both immersions being congruent by a translation vector. "
tag_ranker(abstract, counter)

{'addition',
 'complex representation',
 'complex valued functions',
 'conformal immersion',
 'geometric consequence',
 'geometric solutions',
 'h3',
 'holomorphic complex functions',
 'immersion',
 'immersions',
 'l4',
 'partial differential equations',
 'pdes',
 'quadrics',
 'representation',
 'spacelike surfaces',
 'translation vector'}

In [160]:
import pickle
from scipy.sparse import load_npz
# SECURITY: unpickling can execute arbitrary code, so only load this file if
# it was produced by a trusted source.
with open('../trained_models/tf_vect.pickle', 'rb') as pickle_file:
    # Presumably a fitted vectorizer: a later cell calls `tf_vect.transform(...)` on it.
    tf_vect = pickle.load(pickle_file)
# Sparse matrix that a later cell compares against `tf_vect.transform(...)`
# output via `linear_kernel`.
tfidf_sim = load_npz('../trained_models/tfidf.npz')

In [230]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when the installed version provides it; wrap in
# `list` so `features` keeps the same type either way.
if hasattr(label_tfidf, 'get_feature_names_out'):
    features = list(label_tfidf.get_feature_names_out())
else:
    features = label_tfidf.get_feature_names()

In [216]:
from sklearn.metrics.pairwise import linear_kernel

In [237]:
abstract = articles.loc[1500,'abstract']
print(abstract)

Let be finite group with derived subgroup of rank . We prove that . Motivated by the results of M. Isaacs in cite{isa} we show that if is capable then . This answers question of Pyber. We prove that if is capable -group then the rank of is bounded above in terms of the rank of .


In [238]:
cosine_similarities = linear_kernel(tf_vect.transform([abstract]), tfidf_sim).flatten()

In [239]:
related_docs_indices = cosine_similarities.argsort()[:-11:-1]

In [240]:
related_docs_indices

array([  1500, 314476, 110003,  24282,  88591,  88843, 290194, 246954,
       107577, 275822])

In [241]:
# Get tags from the 5 most similar papers based on tfidf similarity:
# sum their tag tf-idf vectors, then keep the 20 heaviest features.
# NOTE(review): when `abstract` itself comes from `articles`, the first entry
# of `related_docs_indices` is that same paper, so it is counted as one of the
# five - confirm this is intended.
# NOTE: the loop variable `i` and `matching_abstract` remain defined at module
# level after this cell runs.
for i in range(5):
    matching_abstract = articles.loc[related_docs_indices[i],'abstract']
    if i == 0:
        # The first document initialises the accumulator ...
        tag_weights = label_tfidf.transform([matching_abstract])
    else:
        # ... and the remaining ones are added to it.
        tag_weights += label_tfidf.transform([matching_abstract])
# argsort is ascending: reverse the single row, then take the top 20 columns.
topn_ids = np.argsort(tag_weights.toarray())[0,::-1][:20]
top_feats = [(features[i], tag_weights[0,i]) for i in topn_ids]
best_tags = pd.DataFrame(top_feats)
best_tags.columns = ['feature', 'tfidf']

In [242]:
print(abstract)

Let be finite group with derived subgroup of rank . We prove that . Motivated by the results of M. Isaacs in cite{isa} we show that if is capable then . This answers question of Pyber. We prove that if is capable -group then the rank of is bounded above in terms of the rank of .


In [243]:
best_tags

Unnamed: 0,feature,tfidf
0,rank,6
1,groups,4
2,order,3
3,group,3
4,connections,1
5,answers,1
6,instance,1
7,interpolation,1
8,flat_knot_types,0
9,fixed_metric,0


In [None]:
245832

In [59]:
import random

In [128]:
tags = []
for abstract in tqdm(articles.loc[:10000].abstract):
    tags.extend([x.replace('_', ' ').strip() for x in label_tokenizer(abstract) if x not in stop_words])

HBox(children=(IntProgress(value=0, max=10001), HTML(value='')))

In [122]:
from collections import Counter

In [129]:
counter = Counter(tags)

In [130]:
counter.most_common()

[('order', 519),
 ('algorithm', 352),
 ('time', 318),
 ('group', 306),
 ('functions', 286),
 ('analysis', 262),
 ('systems', 258),
 ('relation', 229),
 ('eigenvalues', 229),
 ('infinity', 210),
 ('spectrum', 210),
 ('sense', 209),
 ('action', 208),
 ('connection', 200),
 ('operators', 195),
 ('dimensions', 193),
 ('field', 193),
 ('states', 193),
 ('graph', 191),
 ('sequence', 191),
 ('convergence', 190),
 ('algebra', 190),
 ('addition', 188),
 ('limit', 187),
 ('codes', 184),
 ('means', 170),
 ('distribution', 169),
 ('extension', 169),
 ('motion', 169),
 ('boundary', 168),
 ('probability', 167),
 ('representations', 165),
 ('degree', 164),
 ('series', 161),
 ('information', 160),
 ('sum', 159),
 ('graphs', 150),
 ('performance', 149),
 ('product', 147),
 ('description', 145),
 ('geometry', 144),
 ('capacity', 142),
 ('basis', 142),
 ('bounds', 142),
 ('particles', 140),
 ('energy', 140),
 ('coefficients', 140),
 ('dynamics', 139),
 ('presence', 139),
 ('vertices', 139),
 ('groups', 1

In [176]:
papers = pd.read_csv('../data/arxiv_math.csv')

In [231]:
#preprocess_abstract(papers.loc[i,'abstract'])
papers.loc[i,'abstract']

'If the space $\\mathcal{Q}$ of quadratic forms in $\\mathbb{R}^n$ is splitted\nin a direct sum $\\mathcal{Q}_1\\oplus...\\oplus \\mathcal{Q}_k$ and if $X$ and $Y$\nare independent random variables of $\\mathbb{R}^n$, assume that there exist a\nreal number $a$ such that $E(X|X+Y)=a(X+Y)$ and real distinct numbers\n$b_1,...,b_k$ such that $E(q(X)|X+Y)=b_iq(X+Y)$ for any $q$ in $\\mathcal{Q}_i.$\nWe prove that this happens only when $k=2$, when $\\mathbb{R}^n$ can be\nstructured in a Euclidean Jordan algebra and when $X$ and $Y$ have Wishart\ndistributions corresponding to this structure.'

In [353]:
i = random.choice(range(len(articles)))
abstract = articles.loc[i, 'abstract']
print(abstract)

We study relations between two fundamental constructions associated to vector bundles on smooth complex projective curve: the theta function and the Szego kernel . Two types of relations are demonstrated. First, we establish higher-rank version of the prime form, describing the pullback of determinant line bundles by difference maps, and show the theta function pulls back to the determinant of the Szego kernel. Next, we prove that the expansion of the Szego kernel at the diagonal gives the logarithmic derivative of the theta function over the moduli space of bundles for fixed, or moving, curve. In particular, we recover the identification of the space of connections on the theta line bundle with moduli space of flat vector bundles, when the curve is fixed. When the curve varies, we identify this space of connections with the moduli space of em extended connections, which we introduce.


In [354]:
#set([x.replace('_', ' ').replace('certain ', '').strip() for x in label_tokenizer(abstract) if x not in stop_words])
set([x for x in label_tokenizer(abstract) if x not in stop_words])

{'bundles',
 'connections',
 'curve',
 'determinant',
 'determinant_line_bundles',
 'diagonal',
 'difference_maps',
 'expansion',
 'extended_connections',
 'flat_vector_bundles',
 'fundamental_constructions',
 'higher-rank_version',
 'identification',
 'logarithmic_derivative',
 'moduli_space',
 'moving',
 'prime_form',
 'pullback',
 'relations',
 'smooth_complex_projective_curve',
 'szego_kernel',
 'theta_function',
 'theta_line_bundle',
 'vector_bundles'}

In [55]:
from gensim.corpora.dictionary import Dictionary

In [139]:
texts = [label_tokenizer(abstract) for abstract in tqdm(articles.loc[:10000, 'abstract'])]

HBox(children=(IntProgress(value=0, max=10001), HTML(value='')))

In [154]:
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.8)

In [155]:
tags = list(dictionary.token2id.keys())

In [156]:
abstract = articles.loc[100, 'abstract']

In [157]:
set([x.replace('_', ' ').strip() for x in label_tokenizer(preprocess_abstract(abstract)) if ((x not in stop_words) and (x in tags))])

{'arbitrary characteristic',
 'completely prime factor algebras',
 'field',
 'higher q i-skew tau i-derivation',
 'iterated skew polynomial ring r',
 'k-algebra',
 'pi degree',
 'pi degrees',
 'quantized coordinate rings',
 'quantum gelfand-kirillov conjecture',
 'satisfies',
 'times'}

In [152]:
print(preprocess_abstract(abstract))

For field of arbitrary characteristic, and k-algebra, we show that the PI degree of an iterated skew polynomial ring R...b agrees with the PI degree of R...b when each satisfies q_i-skew relation for q_i in k^{times} and extends to higher q_i-skew tau_i-derivation. We confirm the quantum Gelfand-Kirillov conjecture for various quantized coordinate rings, and calculate their PI degrees. We extend these results to completely prime factor algebras.


In [158]:
abstract

'For a field of arbitrary characteristic, and a k-algebra, we show that the PI degree of an iterated skew polynomial ring R...b agrees with the PI degree of R...b when each satisfies q_i-skew relation for q_i in k^{times} and extends to higher q_i-skew tau_i-derivation. We confirm the quantum Gelfand-Kirillov conjecture for various quantized coordinate rings, and calculate their PI degrees. We extend these results to completely prime factor algebras.'

In [258]:
shorten_abstract(preprocess_abstract(abstract))

'unitary_ensembles Hermitian_matrices weight_function kernel terms polynomials respect weight_function important_role orthogonal_and_symplectic_ensembles Hermitian_matrices matrix_kernels skew-orthogonal_polynomials analogous_role matrix_kernels upper_left-hand_entries  formulas entries terms scalar_kernel corresponding_unitary_ensembles  rational_function entries scalar_kernel extra_terms number order General_formulas extra_terms  skew-orthogonal_polynomials derivations'

In [250]:
import yake

In [263]:
# Baseline comparison with YAKE: default settings apart from `top = 20`,
# applied to the cleaned abstract of the article at index 1.
simple_kwextractor = yake.KeywordExtractor(top = 20)
keywords = simple_kwextractor.extract_keywords(preprocess_abstract(articles.loc[1, 'abstract']))

# Each result is indexable; element 0 is printed as the keyword text.
# NOTE(review): the position of keyword vs. score in the result may depend on
# the installed yake version - confirm.
for kw in keywords:
    print(kw[0])

partial cubes
subgraphs of hypercubes
partial
cubes
winklers relations play
isometric subgraphs
winklers relations
finite partial cubes
partial cubes obtained
hypercubes
djokovi
subgraphs
paper
structures
winklers
graph defined
isometric
relations play
play an important
important role
