In [33]:
import os
import random
import re

import numpy as np
import pandas as pd
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer #, HashingVectorizer

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
#from sklearn.linear_model import SGDlassifier(loss = ...)
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, f1_score

## Read in the two-column data frame (text + labels)

In [35]:
file = os.path.join("data", "bare_all.csv")

text_data = pd.read_csv(file, delimiter='\t')

text = text_data.text
label = text_data.label

## Train-test split

In [36]:
# stratified split (unbalanced data, see 'arXiv_cleanup.ipynb')
text_train, text_test, label_train, label_test = train_test_split(text, label, stratify=label, shuffle=True, random_state=123)

## Have a look at the data

In [4]:
print(f"No records: {round(len(text_data)/1000)}'000")
with pd.option_context('display.max_colwidth', 120):
    display(text_data[:3])

No records: 837'000


Unnamed: 0,text,label
0,"The Spitzer c2d Survey of Large, Nearby, Insterstellar Clouds. IX. The\r\n Serpens YSO Population As Observed With ...",phys
1,"On-line Viterbi Algorithm and Its Relationship to Random Walks In this paper, we introduce the on-line Viterbi alg...",cs
2,Dynamical Objects for Cohomologically Expanding Maps The goal of this paper is to construct invariant dynamical ob...,math


Abstracts of scientific papers tend to be written in a formal style, to not contain typos, nor direct citations, little references, ...

In [5]:
for _ in range(2):
    print(text.iloc[random.choice(range(len(text_data)))])
    print("---")

High-energy irradiation and mass loss rates of hot Jupiters in the solar
  neighborhood   Giant gas planets in close proximity to their host stars experience strong
irradiation. In extreme cases photoevaporation causes a transonic, planetary
wind and the persistent mass loss can possibly affect the planetary evolution.
We have identified nine hot Jupiter systems in the vicinity of the Sun, in
which expanded planetary atmospheres should be detectable through Lyman alpha
transit spectroscopy according to predictions. We use X-ray observations with
Chandra and XMM-Newton of seven of these targets to derive the high-energy
irradiation level of the planetary atmospheres and the resulting mass loss
rates. We further derive improved Lyman alpha luminosity estimates for the host
stars including interstellar absorption. According to our estimates WASP-80 b,
WASP-77 b, and WASP-43 b experience the strongest mass loss rates, exceeding
the mass loss rate of HD 209458 b, whe

### One messy but informative kind of writing they have are LaTeX formulas (*\$...\$*)

In [6]:
text[3]

'Decomposition numbers for finite Coxeter groups and generalised\r\n  non-crossing partitions   Given a finite irreducible Coxeter group $W$, a positive integer $d$, and\r\ntypes $T_1,T_2,...,T_d$ (in the sense of the classification of finite Coxeter\r\ngroups), we compute the number of decompositions $c=\\si_1\\si_2 cdots\\si_d$ of a\r\nCoxeter element $c$ of $W$, such that $\\si_i$ is a Coxeter element in a\r\nsubgroup of type $T_i$ in $W$, $i=1,2,...,d$, and such that the factorisation\r\nis "minimal" in the sense that the sum of the ranks of the $T_i$\'s,\r\n$i=1,2,...,d$, equals the rank of $W$. For the exceptional types, these\r\ndecomposition numbers have been computed by the first author. The type $A_n$\r\ndecomposition numbers have been computed by Goulden and Jackson, albeit using a\r\nsomewhat different language. We explain how to extract the type $B_n$\r\ndecomposition numbers from results of B\\\'ona, Bousquet, Labelle and Leroux on\r\nmap enumeration. Our formula for the 

### We choose to either mask them with * \_latex\_ * or flag them by appending * \_latex\_ * in front of each such expression. 

In [7]:
# mask or flag LaTeX expression with a word ' _LATEX_ '

from sklearn.base import BaseEstimator, TransformerMixin

class DeLaTeX(BaseEstimator, TransformerMixin):
    """
    Replace r"(\$[\s\w\d\\,\.=\(\)*{}/\[\]^;:'`<>|%&@\"!\?~#+-]*?\$)" with ' _latex_ ' or 'latex \1'
    """
    # why does it differ from  r'(\$.+?\$)' ?
    
    def __init__(self, behave = 'mask', latex_re = r"(\$[\s\w\d\\,\.=\(\)*{}/\[\]^;:'`<>|%&@\"!\?~#+-]*?\$)"):
        self.pattern = latex_re
        self.repl = ' _LATEX_ ' if behave == 'mask' else  r' _LATEX_ \1'
        return None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.str.replace(self.pattern, self.repl)
            

In [8]:
delatex = DeLaTeX(behave = 'mask')
delatex.transform(text[3:4])[3][:300]

'Decomposition numbers for finite Coxeter groups and generalised\r\n  non-crossing partitions   Given a finite irreducible Coxeter group  _LATEX_ , a positive integer  _LATEX_ , and\r\ntypes  _LATEX_  (in the sense of the classification of finite Coxeter\r\ngroups), we compute the number of decompositions '

In [9]:
delatex = DeLaTeX(behave = 'flag')
delatex.transform(text[3:4])[3][:300]

'Decomposition numbers for finite Coxeter groups and generalised\r\n  non-crossing partitions   Given a finite irreducible Coxeter group  _LATEX_ $W$, a positive integer  _LATEX_ $d$, and\r\ntypes  _LATEX_ $T_1,T_2,...,T_d$ (in the sense of the classification of finite Coxeter\r\ngroups), we compute the nu'

## Encode the labels

In [10]:
# # sklearn 0.2
# one_hot = OneHotEncoder(handle_unknown='ignore')
# y_train = one_hot.fit_transform(label_train)
# y_test = one_hot.transform(label_test)

In [40]:
# 'cs' -> 0, ..., 'stat' -> 5

label_e = LabelEncoder()
y_train = label_e.fit_transform(label_train)
y_test = label_e.transform(label_test)

print(list(label_e.classes_))

#label_e.inverse_transform([0]) # array(['cs'], dtype=object)

['cs', 'math', 'phys', 'q-bio', 'q-fin', 'stat']


## Optionally, have a look at the most frequent words.

In [38]:
delatex = DeLaTeX(behave = 'flag')
count_v = CountVectorizer(strip_accents='unicode')
word_counts_train = count_v.fit_transform(delatex.fit_transform(tex_text_train))

In [39]:
sum_word_counts_train = word_counts_train.sum(axis=0)
sorted([(round(sum_word_counts_train[0, i],-3), word) for word, i in count_v.vocabulary_.items()],reverse=True)[:10]

[(6640000, 'the'),
 (4214000, 'of'),
 (2336000, 'and'),
 (2242000, '_latex_'),
 (2089000, 'in'),
 (1737000, 'to'),
 (1339000, 'we'),
 (1224000, 'is'),
 (1136000, 'for'),
 (889000, 'that')]

## Go step by step through an arbitrary pipeline 

In [41]:
delatex = DeLaTeX(behave = 'flag')
tex_text_train = delatex.fit_transform(text_train)

In [42]:
count_v = CountVectorizer(strip_accents='unicode', min_df = 2, max_df = 0.8)
word_counts_train = count_v.fit_transform(tex_text_train)

In [43]:
tfidf_t = TfidfTransformer(use_idf=False)
tfidf_scores_train = tfidf_t.fit_transform(word_counts_train)

In [44]:
lsvc = LinearSVC(C=1, class_weight='balanced')
lsvc.fit(tfidf_scores_train, y_train)

LinearSVC(C=1, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Ok, let's just have some fun first:

In [51]:
random_abstracts = pd.Series(["""
The Lack of A Priori Distinctions Between Learning Algorithms  This is the first of
two papers that use off-training set (OTS) error to investigate the assumption-free
relationship between learning algorithms. This first paper discusses the senses in
which there are no a priori distinctions between learning algorithms. (The second
paper discusses the senses in which there are such distinctions.) In this first paper
it is shown, loosely speaking, that for any two algorithms A and B, there are "as many"
targets (or priors over targets) for which A has lower expected OTS error than B as
vice versa, for loss functions like zero-one loss. In particular, this is true if A
is cross-validation and B is "anti-cross-validation'' (choose the learning algorithm
with largest cross-validation error). This paper ends with a discussion of the
implications of these results for computational learning theory. It is shown that one
cannot say: if empirical misclassification rate is low, the Vapnik-Chervonenkis
dimension of your generalizer is small, and the training set is large, then with high
probability your OTS error is small. Other implications for "membership queries"
algorithms and "punting" algorithms are also discussed.
""",
"""
X-rays quarks lepton scattering experiment field
""",
"""
genes DNA RNA sequencing protein species fenotype 
""",
"""
computer algorithm graph sorting depth first interface
""",
"""
We offer a novel less intuitive proof of $\limit_{x\to 0} x = 0$,
""",
"""
inflation resources market stock bonds derivatives
""",
"""
distribution Bayesian p value marginalization Monte Carlo
"""
])

In [52]:
label_e.inverse_transform(lsvc.predict(tfidf_t.transform(count_v.transform(delatex.transform(random_abstracts )))))

array(['cs', 'phys', 'q-bio', 'cs', 'math', 'q-fin', 'stat'], dtype=object)

Looks promising :)
Here's the actual score:

In [None]:
predicted_y_test = lsvc.predict(tfidf_t.transform(count_v.transform(delatex.transform(text_test))))

In [54]:
print("Macro F1:", f1_score(y_test, predicted_y_test, average="macro"))
print(classification_report(y_test, predicted_y_test, target_names=label_e.classes_))
print(confusion_matrix(y_test, predicted_y_test))

Macro F1: 0.7961139528610754
              precision    recall  f1-score   support

          cs       0.87      0.88      0.88     27574
        math       0.91      0.92      0.91     47641
        phys       0.98      0.96      0.97    126481
       q-bio       0.58      0.78      0.67      2645
       q-fin       0.71      0.81      0.76      1156
        stat       0.51      0.70      0.59      3641

   micro avg       0.93      0.93      0.93    209138
   macro avg       0.76      0.84      0.80    209138
weighted avg       0.94      0.93      0.93    209138

[[ 24360   1432    443    285     67    987]
 [  1491  43644   1229    211    153    913]
 [  1319   2675 121130    875    110    372]
 [   174     62    230   2063      5    111]
 [    51     81     30      2    940     52]
 [   641    223     64    118     40   2555]]


## Build a pipeline

In [63]:
def build_clf(name, model):
    pipe = []
    pipe.append(( 'delatex', DeLaTeX(behave='flag') ))
    pipe.append(( 'tfidf_v', TfidfVectorizer(use_idf=False, strip_accents='unicode', min_df = 2, max_df = 0.8)  ))
    pipe.append(( name,     model  ))
    return Pipeline(pipe)

In [64]:
def build_pipe_params(name, model_params):
    params = {
        'delatex__behave': ['flag'],
        'count_v__ngram_range': [(1, 1)],
        'tfidf_t__use_idf': [False],
    }
    for (param_name, range_) in model_params:
        params[name + '__' + param_name] = range_
    return params

In [65]:
models = [
    ('LinearSVC', LinearSVC(class_weight='balanced'), [( 'C', [0.01, 1] )] )
]

In [67]:
scores = {}

for name, model, model_params in models:
    pipe = build_clf(name, model)
    params = build_pipe_params(name, model_params)
    grid_s = GridSearchCV(pipe, params, cv=5, scoring='f1_macro', iid=False, n_jobs=-1)
    grid_s.fit(text_train, y_train)
    test_macrof1 = f1_score(y_test, grid_s.predict(text_test), average="macro")
    scores[name] = [test_macrof1, grid_s.best_score_, grid_s.best_params_]

exception calling callback for <Future at 0x220458d5d30 state=finished raised BrokenProcessPool>
sklearn.externals.joblib.externals.loky.process_executor._RemoteTraceback: 
'''
Traceback (most recent call last):
  File "C:\Users\olspa\Anaconda3\lib\site-packages\sklearn\externals\joblib\externals\loky\process_executor.py", line 393, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "C:\Users\olspa\Anaconda3\lib\multiprocessing\queues.py", line 113, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'DeLaTeX' on <module '__main__' (built-in)>
'''

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\olspa\Anaconda3\lib\site-packages\sklearn\externals\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "C:\Users\olspa\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 375, in __call__
    self.par

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)