In [2]:
import pandas as pd
import re

In [3]:
arxiv = pd.read_csv('../data/arxiv_math.csv')

In [4]:
def preprocess_abstract(abstract):
    abstract = abstract.replace('\n', ' ') #remove new line characters
    #abstract = re.sub('\$.*?\$', '', abstract)
    #abstract = abstract.replace('such a', ' ').replace('previously known', ' ').replace('so called', ' ').replace('more general', ' ').replace('all the', ' ').replace('all these', ' ').replace('very challenging', ' ')
    #abstract = abstract.replace('so-called', ' ').replace('well known', ' ').replace('particularly nice', ' ')
    #abstract = abstract.replace('"', '').replace("'", '').replace('`','').replace('\\', '').replace('--', '-').replace('^*', '')
    #abstract = re.sub('\[.*?\]', '', abstract)
    #abstract = re.sub('\s[a-zA-Z]{1}\s', ' ', abstract)
    #abstract = re.sub('\s[0-9]+\s', ' ', abstract)
    #abstract = re.sub('\(.*?\)', '', abstract)
    #abstract = re.sub('\s[A-Z]{1}\.\s', ' ', abstract)
    #abstract = abstract.replace('*', '')
    #abstract = re.sub(' +', ' ', abstract)
    return abstract

In [5]:
arxiv.abstract = arxiv.abstract.apply(lambda x : x.replace('\n', ' '))
arxiv.title = arxiv.title.apply(lambda x: x.replace('\n', ' '))

In [67]:
def first_sentence(abstract):
    try:
        return abstract.split('.')[1]
    except:
        pass

In [68]:
arxiv.columns

Index(['title', 'abstract', 'categories', 'created', 'id', 'doi'], dtype='object')

In [6]:
import ast
def get_categories(categories):
    try:  
      return ' '.join(['__label__' + x[5:] for x in ast.literal_eval(categories) if x[:5] == 'math.'])
    except:
      return 'Missing'

In [7]:
arxiv['labels'] = arxiv.categories.apply(get_categories)

In [71]:
arxiv['first_sentence'] = arxiv.abstract.apply(first_sentence)

In [8]:
import csv

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train, test = train_test_split(arxiv, test_size = 0.2)

In [12]:
test.to_csv('../embeddings/test.csv')

In [11]:
pd.DataFrame(train.labels + ' ' + train.title + ' ' + train.abstract).dropna().to_csv('../embeddings/train.csv', index = False, header = False,quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\")

./fasttext supervised -input cooking.train -output model_cooking -lr 0.5 -epoch 25 -wordNgrams 2 -bucket 200000 -dim 50 -loss one-vs-all

In [36]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

In [17]:
def get_math_categories(categories):
    import ast
    return [x[5:] for x in ast.literal_eval(categories) if x[:5] == 'math.']

In [19]:
arxiv['math_categories'] = arxiv.categories.apply(get_math_categories)

In [21]:
mlb = MultiLabelBinarizer()

In [24]:
mlb.fit(arxiv['math_categories'])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [25]:
train['math_categories'] = train.categories.apply(get_math_categories)
test['math_categories'] = test.categories.apply(get_math_categories)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [26]:
y_train = mlb.transform(train.math_categories)
y_test = mlb.transform(test.math_categories)

In [27]:
X_train = train.title + ' ' + train.abstract
X_test = test.title + ' ' + test.abstract

In [29]:
import fastText as ft

In [43]:
loaded_model = ft.load_model('../embeddings/train_model.bin')

In [44]:
class FTVectorizer():
    def __init__(self, ft_model):
        self.model = ft_model
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        def get_sentence_vector(abstract):
            return self.model.get_sentence_vector(abstract.replace('\n', ' '))
        return np.vstack(X.apply(get_sentence_vector).values)        

In [45]:
pipeline = Pipeline( steps = [
                ('vectorizer', FTVectorizer(loaded_model)),
                ('clf', OneVsRestClassifier(LogisticRegression())),
                #('clf', OneVsRestClassifier(MultinomialNB())),
                #('clf', OneVsRestClassifier(SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None))),
                    ])

In [46]:
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vectorizer', <__main__.FTVectorizer object at 0x7fc89a7bfd30>), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [47]:
y_pred = pipeline.predict(X_test)

In [48]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, hamming_loss, f1_score

In [49]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    import numpy as np
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [50]:
print("Hamming Loss: ", hamming_loss(y_test, y_pred))
print("Hamming Score: ", hamming_score(y_test, y_pred))
print("micro f1 score: ", f1_score(y_test, y_pred, average='micro'))

Hamming Loss:  0.025520149176085007
Hamming Score:  0.6125901965046706
micro f1 score:  0.6792138591301771


In [51]:
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

              precision    recall  f1-score   support

          AC       0.71      0.57      0.63      1739
          AG       0.79      0.72      0.75      6981
          AP       0.76      0.69      0.72      6738
          AT       0.65      0.53      0.59      1919
          CA       0.57      0.44      0.50      2917
          CO       0.77      0.71      0.74      7492
          CT       0.62      0.51      0.56       926
          CV       0.65      0.50      0.56      2160
          DG       0.74      0.66      0.70      5530
          DS       0.69      0.58      0.63      4097
          FA       0.62      0.50      0.55      3766
          GM       0.43      0.21      0.28       449
          GN       0.62      0.43      0.51       614
          GR       0.69      0.60      0.64      2745
          GT       0.72      0.61      0.66      3044
          HO       0.65      0.36      0.46       426
          IT       0.89      0.85      0.87      5353
          KT       0.60    

  'precision', 'predicted', average, warn_for)


In [10]:
pd.DataFrame(arxiv.labels + ' ' + arxiv.title + ' ' + arxiv.abstract).dropna().to_csv('../data/ftabstract_full.csv', index = False, header = False,quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\")

In [28]:
abstracts = arxiv.abstract.apply(first_sentence)

In [29]:
abstracts.to_csv('../data/abstracts.csv')

In [126]:
new_papers = pd.read_csv('../data/arxiv_math_2008.csv')
new_papers.abstract = new_papers.abstract.apply(lambda x : x.replace('\n', ' '))

In [101]:
classifier = ft.supervised('../data/ftabstract.csv', 'model', label_prefix='__label__')

In [147]:
abstract = \
"We apply modern techniques of dyadic harmonic analysis to obtain sharp estimates for the Bergman projection in weighted Bergman spaces. Our main theorem focuses on the Bergman projection on Hartogs triangle. The estimates of the operator norm are in terms of a Bekollé-Bonami type constant. As an application of the results obtained, we give, for example, an upper bound for the Lp norm of the Bergman projection on the generalized Hartogs triangle Hm/n in C2. "

In [148]:
print(classifier.predict([abstract], 6))

[['AP', 'FA', 'CA', 'DS', 'CV', 'DG']]


In [133]:
i = 10
print(new_papers.loc[i, 'categories'])
print(classifier.predict([new_papers.loc[i, 'abstract']], 6))
print(new_papers.loc[i, 'abstract'])

['physics.optics', 'math-ph', 'math.MP']
[['MP', 'AP', 'DS', 'DG', 'PR', 'QA']]
There is currently a great deal of interest in the theoretical and practical possibility of cloaking objects from the observation by electromagnetic waves. The basic idea of these invisibility devices \cite{glu1, glu2, le},\cite{pss1} is to use anisotropic {\it transformation media} whose permittivity and permeability $\var^{\lambda\nu}, \mu^{\lambda\nu}$, are obtained from the ones, $\var_0^{\lambda\nu}, \mu^{\lambda\nu}_0$, of isotropic media, by singular transformations of coordinates. In this paper we study electromagnetic cloaking in the time-domain using the formalism of time-dependent scattering theory. This formalism allows us to settle in an unambiguous way the mathematical problems posed by the singularities of the inverse of the permittivity and the permeability of the {\it transformation media} on the boundary of the cloaked objects. We write Maxwell's equations in Schr\"odinger form with the el

In [86]:
model = ft.skipgram('../data/ftabstract.csv', 'model')

In [87]:
model.words

{'Lie',
 'let',
 'paper',
 'volatility',
 'advantages',
 '\\\\gamma',
 '$\\\\ell$',
 'jet',
 'sufficient',
 'momenta',
 'orientations',
 'did',
 'trying',
 'supremum',
 'viewed',
 'p-cyclotomic',
 '$A_0$',
 'his',
 'allowing',
 'irreversible',
 'saddle-node',
 'B$',
 'du',
 'top',
 'convenient',
 '$f',
 'BPS',
 'routing',
 'nothing',
 'Kazhdan',
 'identification',
 'K\\,',
 'iteration',
 'dual\\,',
 'transparent',
 'Probab',
 'provide',
 'representative',
 'NP',
 'monomials\\,',
 'foundation',
 'times\\,',
 '$G',
 'Gelman',
 'arrays',
 'choose',
 'transmission',
 'Veronese',
 'multivariate',
 'nilpotent\\,',
 'context\\,',
 'encoding',
 'strengthened',
 'surface\\,',
 'holomorphically',
 'discover',
 'Goldie',
 'categorification',
 'work\\,',
 'curves',
 'trivially',
 'Chen',
 'hold',
 'definable',
 'homogenization',
 'axioms',
 'representations\\,',
 'Oka',
 'noncommutativity',
 'possessing',
 'deriving',
 'degenerate',
 'leading',
 'spinor',
 'abundance',
 'autoequivalences',
 'ways'