In [18]:
import spacy
import pandas as pd
import re

In [19]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [20]:
nlp = spacy.load('en_core_web_sm')

## This notebook will walk through the steps of how the tagger generates tags.

#### Start with an example title and abstract:

In [7]:
title = """
On the homotopy hypothesis in dimension 3
"""

abstract = """
We show that if the canonical left semi-model structure on the category of Grothendieck n-groupoids exists, then it satisfies the homotopy hypothesis, i.e. the associated (∞,1)-category is equivalent to that of homotopy n-types, thus generalizing a result of the first named author. As a corollary of the second named author's proof of the existence of the canonical left semi-model structure for Grothendieck 3-groupoids, we obtain a proof of the homotopy hypothesis for Grothendieck 3-groupoids. 
"""

example = title + '. ' + abstract

In [5]:
#Preprocessing

def preprocess_abstract(abstract):
    import re
    okay = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\$()[]{}- ")
    abstract = abstract.replace('$K$-theory', 'k-theory').replace('$C^*$-algebra', 'C-algebra').replace('\\emph', '').replace('\\emph{', '').replace('\\texit{', '')
    abstract = ' '.join([word for word in abstract.split(' ') if set([c for c in word]).issubset(okay)])
    #abstract = ''.join([c for c in abstract if c in okay])
    abstract = abstract.replace('\n', ' ') #remove new line characters
    abstract = re.sub(r'\\\S+', '', abstract) #remove anything after a slash
    abstract = abstract.replace('ä', 'a').replace('ö', 'o').replace('é', 'e')

    abstract = re.sub('\$.*?\$', '', abstract)
    abstract = abstract.replace('such a', ' ').replace('previously known', ' ').replace('so called', ' ').replace('more general', ' ').replace('all the', ' ').replace('all these', ' ').replace('very challenging', ' ')
    abstract = abstract.replace('so-called', ' ').replace('well known', ' ').replace('particularly nice', ' ')
    abstract = abstract.replace('"', '').replace("'", '').replace('`','').replace('\\', '').replace('--', '-').replace('^*', '')
    abstract = re.sub('\[.*?\]', '', abstract)
    abstract = re.sub('\s[a-zA-Z]{1}[\.\,\;]?\s', '. ', abstract)
    abstract = re.sub('\s[0-9]+\s', ' ', abstract)
    abstract = re.sub('\(.*?\)', '', abstract)
    abstract = abstract.replace('*', '').replace('{', '').replace('}', '')
    abstract = re.sub(' +', ' ', abstract)
    return abstract

In [12]:
preprocessed = preprocess_abstract(example)

print("preprocessed title and abstract:")

print(preprocessed)

preprocessed title and abstract:
the homotopy hypothesis in dimension show that if the canonical left semi-model structure on the category of Grothendieck n-groupoids exists, then it satisfies the homotopy hypothesis, i.e. the associated is equivalent to that of homotopy n-types, thus generalizing. result of the first named author. As. corollary of the second named authors proof of the existence of the canonical left semi-model structure for Grothendieck 3-groupoids, we obtain. proof of the homotopy hypothesis for Grothendieck 3-groupoids.


#### Load in stopwords:

In [10]:
from models.stop_words import make_stop_words
stop_words = make_stop_words()

In [14]:
def remove_stopwords_and_lemmatize(text):
    from nltk.tokenize import word_tokenize 
    return ' '.join([lemmatizer.lemmatize(w) for w in word_tokenize(text) if w.lower() not in stop_words])

In [21]:
lemmatized = remove_stopwords_and_lemmatize(preprocessed)

print('with stopwords and lemmatization:')

print(lemmatized)

with stopwords and lemmatization:
homotopy hypothesis in dimension show if canonical left semi-model structure on category of Grothendieck n-groupoids exists , then it satisfies homotopy hypothesis , i.e . is equivalent to of homotopy n-types , thus generalizing . result of named author . As . corollary of second named author of existence of canonical left semi-model structure for Grothendieck 3-groupoids , we obtain . of homotopy hypothesis for Grothendieck 3-groupoids .


## Use spacy's noun chunker to generate potential tags

In [27]:
from models.stop_words import make_remove_adjectives
remove_adjectives = make_remove_adjectives()

In [28]:
def shorten_abstract(abstract):
    doc = nlp(abstract)
    shortened = []
    for chunk in doc.noun_chunks:
        if ((doc[chunk.start].text in remove_adjectives) or (doc[chunk.start].pos_ in ['PRON', 'DET', 'INTJ', 'AUX', 'CCONJ', 'APD', 'NUM', 'PART', 'SCONJ', 'PUNCT', 'SYM', 'X'])) and (doc[chunk.end - 1].pos_ in ['PRON', 'DET', 'INTJ', 'AUX', 'CCONJ', 'APD', 'NUM', 'PART', 'SCONJ', 'PUNCT', 'SYM', 'X']):
            shortened.append('_'.join(chunk.text.split(' ')[1:-1]))

        elif ((doc[chunk.start].text in remove_adjectives) or (doc[chunk.start].pos_ in ['PRON', 'DET', 'INTJ', 'AUX', 'CCONJ', 'APD', 'NUM', 'PART', 'SCONJ', 'PUNCT', 'SYM', 'X'])):
            shortened.append('_'.join(chunk.text.split(' ')[1:]))

        elif (doc[chunk.end - 1].pos_ in ['PRON', 'DET', 'INTJ', 'AUX', 'CCONJ', 'APD', 'NUM', 'PART', 'SCONJ', 'PUNCT', 'SYM', 'X']):
            shortened.append('_'.join(chunk.text.split(' ')[:-1]))

        else: 
            shortened.append('_'.join(chunk.text.split(' ')))
    return ' '.join(shortened).strip()

In [29]:
shortened = shorten_abstract(lemmatized)

print('shortened:')

print(shortened)

shortened:
homotopy hypothesis dimension show semi-model_structure category Grothendieck   hypothesis homotopy  result named_author second_named_author existence canonical_left_semi-model_structure Grothendieck  homotopy hypothesis Grothendieck


In [31]:
good_chars = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 -_")
cleaned = ''.join([c for c in shortened if c in good_chars])
print("cleaned:")
print(cleaned)

cleaned:
homotopy hypothesis dimension show semi-model_structure category Grothendieck   hypothesis homotopy  result named_author second_named_author existence canonical_left_semi-model_structure Grothendieck  homotopy hypothesis Grothendieck


In [33]:
cleaned

'homotopy hypothesis dimension show semi-model_structure category Grothendieck   hypothesis homotopy  result named_author second_named_author existence canonical_left_semi-model_structure Grothendieck  homotopy hypothesis Grothendieck'

## Tokenize, using functions borrowed from Jason King

In [41]:
def get_ngrams(tokens, n=2):
    ngrams = zip(*[tokens[i:] for i in range(n)])
    ngrams = [' '.join(ngram) for ngram in ngrams]
    return ngrams

def tokenize(text, ngram_range=(1,1)):
    import re
    tokens = re.findall(r'[a-z0-9_\'-]+', text.lower())
    ngrams = []
    for n in range(ngram_range[0], ngram_range[1]+1):
        ngrams += get_ngrams(tokens, n)
    return ngrams

In [43]:
tokenized = tokenize(cleaned)
print("tokenized: ")
print(tokenized)

tokenized: 
['homotopy', 'hypothesis', 'dimension', 'show', 'semi-model_structure', 'category', 'grothendieck', 'hypothesis', 'homotopy', 'result', 'named_author', 'second_named_author', 'existence', 'canonical_left_semi-model_structure', 'grothendieck', 'homotopy', 'hypothesis', 'grothendieck']


## Finally, remove duplicates and stop words:

In [44]:
from models.stop_words import make_stop_words_tags
stop_words_tags = make_stop_words_tags()
tags = set([x.replace('_', ' ').strip() for x in tokenized if x not in stop_words_tags])
print("tags: ")
print(tags)

tags: 
{'named author', 'category', 'grothendieck', 'homotopy', 'semi-model structure', 'canonical left semi-model structure', 'hypothesis'}


# The above steps are implemented in the Tagger class

In [45]:
from models.tagger import Tagger

In [46]:
tagger = Tagger()

In [47]:
tagger.generate_tags(example)

{'canonical left semi-model structure',
 'category',
 'grothendieck',
 'homotopy',
 'hypothesis',
 'named author',
 'semi-model structure'}

## Tagging the dataset for search purposes:

In [3]:
articles = pd.read_csv('../data/arxiv_math.csv')

In [5]:
from tqdm._tqdm_notebook import tqdm_notebook

tqdm_notebook.pandas()

In [32]:
articles['tags'] = articles.title_and_abstract.progress_apply(get_tags)

HBox(children=(IntProgress(value=0, max=384444), HTML(value='')))




In [33]:
articles.to_csv('../data/tagged.csv', index = False)