In [1]:
import time
import urllib.request
import urllib.error
#import urllib2
import datetime
#from itertools import ifilter
from collections import Counter, defaultdict
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
import bibtexparser

pd.set_option('mode.chained_assignment','warn')

In [42]:
# The harvest() function utilizes arxiv.org's API to pull metadata for the specified data range.

OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"

def harvest(arxiv="math"):
    df = pd.DataFrame(columns=("title", "abstract", "categories", "created", "id", "doi"))
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = (base_url +
           "from=2006-01-01&until=2007-01-01&" +
           "metadataPrefix=arXiv&set=%s"%arxiv)
    
    while True:
        print ("fetching", url)
        try:
            response = urllib.request.urlopen(url)
            
        except urllib.error.HTTPError as e:
            if e.code == 503:
                to = int(e.hdrs.get("retry-after", 30))
                print ("Got 503. Retrying after {0:d} seconds.".format(to))

                time.sleep(to)
                continue
                
            else:
                raise
            
        xml = response.read()

        root = ET.fromstring(xml)

        for record in root.find(OAI+'ListRecords').findall(OAI+"record"):
            try:
                arxiv_id = record.find(OAI+'header').find(OAI+'identifier')
                meta = record.find(OAI+'metadata')
                info = meta.find(ARXIV+"arXiv")
                created = info.find(ARXIV+"created").text
                created = datetime.datetime.strptime(created, "%Y-%m-%d")
                categories = info.find(ARXIV+"categories").text

                # if there is more than one DOI use the first one
                # often the second one (if it exists at all) refers
                # to an eratum or similar
                doi = info.find(ARXIV+"doi")
                if doi is not None:
                    doi = doi.text.split()[0]

                contents = {'title': info.find(ARXIV+"title").text,
                            'id': info.find(ARXIV+"id").text,#arxiv_id.text[4:],
                            'abstract': info.find(ARXIV+"abstract").text.strip(),
                            'created': created,
                            'categories': categories.split(),
                            'doi': doi,
                            }

                df = df.append(contents, ignore_index=True)
            except: pass

        # The list of articles returned by the API comes in chunks of
        # 1000 articles. The presence of a resumptionToken tells us that
        # there is more to be fetched.
        token = root.find(OAI+'ListRecords').find(OAI+"resumptionToken")
        if token is None or token.text is None:
            break

        else:
            url = base_url + "resumptionToken=%s"%(token.text)
            
    return df

In [43]:
df = harvest()

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2006-01-01&until=2007-01-01&metadataPrefix=arXiv&set=math


AttributeError: 'NoneType' object has no attribute 'findall'

In [38]:
df.to_csv('../data/arxiv_math_2007.csv', index = False)

In [16]:
def test_category(row, category):
    return category in row.categories

In [17]:
for category in sorted(categories):
    category = 'math.' + category
    print(category + ':', df.apply(lambda row: test_category(row, category), axis = 1).sum())

math.AC: 276
math.AG: 1229
math.AP: 1329
math.AT: 349
math.CA: 503
math.CO: 1373
math.CT: 204
math.CV: 351
math.DG: 861
math.DS: 923
math.FA: 661
math.GM: 84
math.GN: 117
math.GR: 548
math.GT: 522
math.HO: 66
math.IT: 988
math.KT: 149
math.LO: 356
math.MG: 236
math.MP: 1339
math.NA: 870
math.NT: 1157
math.OA: 304
math.OC: 1096
math.PR: 1132
math.QA: 315
math.RA: 364
math.RT: 639
math.SG: 211
math.SP: 175
math.ST: 449


In [21]:
for category in sorted(categories):
    df[category] = df.apply(lambda row: int(test_category(row, 'math.' + category)), axis = 1)

In [22]:
df.loc[0]

title         Decomposition and Enumeration of Triangulated ...
abstract      We describe some theoretical results on triang...
categories                                            [math.CO]
created                                     2007-05-13 00:00:00
id                                                    0705.1835
doi                              10.1080/10586458.2008.10129027
AC                                                            0
AG                                                            0
AP                                                            0
AT                                                            0
CA                                                            0
CO                                                            1
CT                                                            0
CV                                                            0
DG                                                            0
DS                                      

In [23]:
df.loc[2, 'title']

'On a certain continuity property of the residues of the poles of\n  $\\sum_{n \\geq 1} \\Lambda(n) e^{- \\pi i q n } n^{-s}$ with respect to $q \\in\n  \\mathbb{Q} \\cap (0, 2)$'

## Difficulty: Mathematical Notation

### Option 1: Remove all Mathematical Notation (and line breaks)

In [35]:
re.sub(r'\$[^\$]+\$', '', df.loc[8, 'title'].replace('\n', ''))

'On the Geometry of the Moduli Space of Real Binary Octics'

In [66]:
df['scrubbed_title'] = df.title.apply(lambda x: re.sub(r'\$[^\$]+\$', '', x.replace('\n', ' ')))

In [67]:
df['scrubbed_abstract'] = df.abstract.apply(lambda x: re.sub(r'\$[^\$]+\$', '', x.replace('\n', ' ')))

In [24]:
df.loc[2, 'title'].replace('\n', ' ')

'On a certain continuity property of the residues of the poles of   $\\sum_{n \\geq 1} \\Lambda(n) e^{- \\pi i q n } n^{-s}$ with respect to $q \\in   \\mathbb{Q} \\cap (0, 2)$'

In [25]:
[x.replace('-', '.').split('.') for x in df.loc[4, 'categories']]

[['math', 'ph'], ['math', 'MP']]

In [66]:
def find_first_category(row):
    return [x for x in row.categories if 'math' in x][0]
def find_math_categories(row):
    return [x for x in row.categories if 'math' in x]

In [67]:
df['first_category'] = df.apply(find_first_category, axis = 1)
df['math_categories'] = df.apply(find_math_categories, axis = 1)

In [68]:
df.head()

Unnamed: 0,title,abstract,categories,created,id,doi,first_category,math_categories
0,Dynamical Objects for Cohomologically Expandin...,The goal of this paper is to construct invaria...,[math.DS],2007-04-01,704.0069,,math.DS,[math.DS]
1,Decomposition numbers for finite Coxeter group...,"Given a finite irreducible Coxeter group $W$, ...","[math.CO, math.GR]",2007-04-02,704.0199,,math.CO,"[math.CO, math.GR]"
2,Capacity of a Multiple-Antenna Fading Channel ...,Given a multiple-input multiple-output (MIMO) ...,"[cs.IT, math.IT]",2007-04-02,704.0217,10.1109/TIT.2008.2011437,math.IT,[math.IT]
3,Optimal Routing for Decode-and-Forward based C...,We investigate cooperative wireless relay netw...,"[cs.IT, math.IT]",2007-04-04,704.0499,10.1109/SAHCN.2007.4292845,math.IT,[math.IT]
4,Linearisation of finite abelian subgroups of t...,This article gives the proof of results announ...,[math.AG],2007-04-04,704.0537,10.4171/GGD/55,math.AG,[math.AG]


In [60]:
df.groupby('first_category').title.count()

first_category
math-ph    1535
math.AC     243
math.AG    1122
math.AP     935
math.AT     257
math.CA     446
math.CO    1078
math.CT     132
math.CV     328
math.DG     939
math.DS     523
math.FA     488
math.GM     118
math.GN     116
math.GR     438
math.GT     399
math.HO      78
math.IT     856
math.KT      83
math.LO     210
math.MG     184
math.NA     354
math.NT     796
math.OA     263
math.OC     348
math.PR    1170
math.QA     337
math.RA     325
math.RT     497
math.SG     112
math.SP     146
math.ST     393
Name: title, dtype: int64

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_curve, auc, confusion_matrix, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

In [83]:
v = TfidfVectorizer()

#X = v.fit_transform(df.abstract.replace('\n', ' '))
#X = v.fit_transform((df.title + ' ' + df.abstract.str.replace('\n', ' ')))
X = v.fit_transform(df.scrubbed_title + ' ' + df.scrubbed_abstract)

y = df.AT

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y) 

In [85]:
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [86]:
print('log_loss', log_loss(y_test, model.predict_proba(X_test)))
print('AUC', roc_auc_score(y_test, model.predict(X_test)))

log_loss 0.077198197388453
AUC 0.510752688172043


In [87]:
confusion_matrix(y_test, model.predict(X_test))

array([[3426,    0],
       [  91,    2]])

In [88]:
y_test.sum()

93

In [99]:
df.loc[6677].categories

['math.AT']

In [100]:
df.loc[11006].scrubbed_abstract

"We compute the topological Witt groups of every complex flag manifold of ordinary type, and thus the interesting (i.e. torsion) part of the KO-groups of these manifolds. Equivalently, we compute Balmer's Witt groups of each flag variety of ordinary type over an algebraically closed field of characteristic not two. Our computation is based on an approach developed by Zibrowius. For types A, B and C, we obtain a full description not only of the additive but also of the multiplicative structure of the graded Witt rings."

In [90]:
y_test[(y_test != model.predict(X_test)) & (y_test == 1)]

6136     1
13686    1
11132    1
6677     1
5559     1
10661    1
6098     1
3652     1
2391     1
7015     1
6233     1
1464     1
8685     1
11006    1
6975     1
12712    1
5640     1
9116     1
2936     1
6216     1
11639    1
2310     1
1031     1
9092     1
3522     1
11041    1
6565     1
2447     1
5639     1
6175     1
        ..
13306    1
8368     1
2467     1
8450     1
10983    1
8069     1
6265     1
5913     1
1467     1
12398    1
3420     1
13641    1
1187     1
429      1
11559    1
7415     1
8888     1
7468     1
8564     1
12560    1
10052    1
4259     1
2409     1
2745     1
7958     1
10134    1
6544     1
1916     1
3227     1
375      1
Name: AT, Length: 91, dtype: int64

In [80]:
v = TfidfVectorizer()
X = v.fit_transform(df['abstract'])
y = df.first_category

In [108]:
enc = OneHotEncoder()
enc.fit(np.array(y).reshape(-1, 1))
y = enc.transform(np.array(y).reshape(-1,1))

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0) 

In [130]:
etc = OneVsRestClassifier(LogisticRegression())
etc.fit(X_train, y_train)



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None)

In [142]:
y_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [146]:
cm = confusion_matrix(y_test.toarray().argmax(axis=1), etc.predict(X_test).toarray().argmax(axis=1))

In [171]:
predictions.toarray()[:,0].sum()

110

In [173]:
print(cm[:,0])

[353  58 193 174  49 107 206  39  84 146 126 100  20  31  86 105  14  92
  17  52  46  88 179  47  69 200  84  80 111  22  36  78]


In [137]:
print('log_loss', log_loss(y_test, etc.predict_proba(X_test)))
#print('AUC', roc_auc_score(y_test, etc.predict_proba(X_test)))

log_loss 1.724550112750738


In [129]:
etc = OneVsRestClassifier(GradientBoostingClassifier())
etc.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print('log_loss', log_loss(y_test, etc.predict_proba(X_test)))
#print('AUC', metrics.roc_auc_score(y_test, etc.predict_proba(X_test)))

In [10]:
import requests
import re

In [11]:
r = requests.get('https://arxiv.org/archive/math')

In [12]:
categories = [x for x in list(set(re.findall('math\.([\w]+)', r.text))) if len(x) == 2]

In [14]:
len(categories)

32