In [396]:
import os
import random
import re

import numpy as np
import pandas as pd
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#from sklearn.linear_model import SGDlassifier(loss = ...)
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score

## Read in the two-column data frame (text + labels)

In [367]:
file = os.path.join("data", "bare_all.csv")

text_data = pd.read_csv(file, delimiter='\t')

text = text_data.text
label = np.array(text_data.label).reshape(-1,1)

## Train-test split

In [368]:
# stratified split (unbalanced data, see 'arXiv_cleanup.ipynb')
text_train, text_test, label_train, label_test = train_test_split(text, label, stratify=label, shuffle=True, random_state=123)

## Have a look at the data

In [366]:
print(f"No records: {round(len(text_data)/1000)}'000")
with pd.option_context('display.max_colwidth', 120):
    display(text_data[:3])

No records: 837'000


Unnamed: 0,text,label
0,"The Spitzer c2d Survey of Large, Nearby, Insterstellar Clouds. IX. The\r\n Serpens YSO Population As Observed With ...",phys
1,"On-line Viterbi Algorithm and Its Relationship to Random Walks In this paper, we introduce the on-line Viterbi alg...",cs
2,Dynamical Objects for Cohomologically Expanding Maps The goal of this paper is to construct invariant dynamical ob...,math


Abstracts of scientific papers tend to be written in a formal style, to not contain typos, nor direct citations, little references, ...

In [354]:
for _ in range(2):
    print(text.iloc[random.choice(range(len(text_data)))])
    print("---")

MULTI-CHANNEL SEARCH FOR SUPERGRAVITY AT THE LARGE HADRON COLLIDER   The potential of seeing supersymmetry (SUSY) at the CERN Large Hadron
Collider (LHC) was studied by looking at 3 types of signals: dilepton events
from slepton pair productions, trilepton events from chargino/neutralino
productions and missing energy plus multi-jet events from gluino/squark
productions. I described my results by mapping out reachable areas in the
supergravity parameter space. Areas explorable at LEP II were also mapped out
for comparison.

---
All-Angle Collimation for Spin Waves   We studied the effect of collimation for monochromatic beams of spin waves,
resulting from the refraction at the interface separating two magnetic
half-planes. The collimation was observed in broad range of the angles of
ncidence for homogenous Co and Py half-planes, due to significant intrinsic
anisotropy of spin wave propagation in these materials. The effect exists for
the sample saturated by in p

### One messy but informative kind of writing they have are LateX formulas (*\$...\$*)

In [369]:
text[3]

'Decomposition numbers for finite Coxeter groups and generalised\r\n  non-crossing partitions   Given a finite irreducible Coxeter group $W$, a positive integer $d$, and\r\ntypes $T_1,T_2,...,T_d$ (in the sense of the classification of finite Coxeter\r\ngroups), we compute the number of decompositions $c=\\si_1\\si_2 cdots\\si_d$ of a\r\nCoxeter element $c$ of $W$, such that $\\si_i$ is a Coxeter element in a\r\nsubgroup of type $T_i$ in $W$, $i=1,2,...,d$, and such that the factorisation\r\nis "minimal" in the sense that the sum of the ranks of the $T_i$\'s,\r\n$i=1,2,...,d$, equals the rank of $W$. For the exceptional types, these\r\ndecomposition numbers have been computed by the first author. The type $A_n$\r\ndecomposition numbers have been computed by Goulden and Jackson, albeit using a\r\nsomewhat different language. We explain how to extract the type $B_n$\r\ndecomposition numbers from results of B\\\'ona, Bousquet, Labelle and Leroux on\r\nmap enumeration. Our formula for the 

### We choose to either mask them with * \_latex\_ * or flag them by appending * \_latex\_ * in front of each such expression. 

In [356]:
# mask or flag LaTeX expression with a word ' _LATEX_ '

from sklearn.base import BaseEstimator, TransformerMixin

class DeLaTeX(BaseEstimator, TransformerMixin):
    """
    Replace r"(\$[\s\w\d\\,\.=\(\)*{}/\[\]^;:'`<>|%&@\"!\?~#+-]*?\$)" with ' _latex_ ' or 'latex \1'
    """
    # why does it differ from  r'(\$.+?\$)' ?
    
    def __init__(self, behave = 'mask', latex_re = r"(\$[\s\w\d\\,\.=\(\)*{}/\[\]^;:'`<>|%&@\"!\?~#+-]*?\$)"):
        self.pattern = latex_re
        self.repl = ' _LATEX_ ' if behave == 'mask' else  r' _LATEX_ \1'
        return None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.str.replace(self.pattern, self.repl)
            

In [357]:
delatex = DeLaTeX(behave = 'flag')
delatex.transform(text[3:4])[3][:300]

'Decomposition numbers for finite Coxeter groups and generalised\r\n  non-crossing partitions   Given a finite irreducible Coxeter group  _LATEX_ $W$, a positive integer  _LATEX_ $d$, and\r\ntypes  _LATEX_ $T_1,T_2,...,T_d$ (in the sense of the classification of finite Coxeter\r\ngroups), we compute the nu'

In [358]:
delatex = DeLaTeX(behave = 'mask')
delatex.transform(text[3:4])[3][:300]

'Decomposition numbers for finite Coxeter groups and generalised\r\n  non-crossing partitions   Given a finite irreducible Coxeter group  _LATEX_ , a positive integer  _LATEX_ , and\r\ntypes  _LATEX_  (in the sense of the classification of finite Coxeter\r\ngroups), we compute the number of decompositions '

## One-hot encode *y*'s

In [370]:
one_hot = preprocessing.OneHotEncoder(handle_unknown='ignore')
y_train = one_hot.fit_transform(label_train)
y_test = one_hot.transform(label_test)

In [371]:
one_hot.inverse_transform([[0,0,1,0,0,0]])

array([['phys']], dtype=object)

### Optionally, have a look at the most frequent words.

In [391]:
count_v = CountVectorizer(strip_accents='unicode')
word_counts_train = count_v.fit_transform(tex_text_train)

In [394]:
# examine the most fequent words
sum_word_counts_train = word_counts_train.sum(axis=0)
sorted([(round(sum_word_counts_train[0, i],-3), word) for word, i in count_v.vocabulary_.items()],reverse=True)[:10]

[(6640000, 'the'),
 (4214000, 'of'),
 (2336000, 'and'),
 (2089000, 'in'),
 (1737000, 'to'),
 (1339000, 'we'),
 (1224000, 'is'),
 (1136000, 'for'),
 (1121000, '_latex_'),
 (889000, 'that')]

## Go step by step by an arbitrary pipeline 

In [None]:
delatex = DeLaTeX(behave = 'flag')


In [None]:
count_v = CountVectorizer(strip_accents='unicode', min_df = 2, max_df = 0.8)

## Build a pipeline

In [431]:
for a,b in [(1,2),(3,4)]:
    print(a,b)

1 2
3 4


In [397]:
def build_clf(model):
    pipe = []
    pipe.append(( 'delatex', DeLaTeX(behave='flag') ))
    pipe.append(( 'count_v', CountVectorizer(strip_accents='unicode', min_df = 2, max_df = 0.8)  ))
    pipe.append(( 'tfidf_t', TfidfTransformer(use_idf=False)  ))
    pipe.append(( 'clf',     model  ))
    return Pipeline(pipe)

In [436]:
lsvc = LinearSVC(C=1, class_weight='balanced')

In [437]:
clf = build_clf(lsvc)

In [438]:
clf.fit(text_train)

ValueError: bad input shape ()

In [433]:
def build_pipe_params(model_params):
    params = {
        'delatex__behave': ['flag'],
        'count_v__ngram_range': [(1, 1)],
        'tfidf_t__use_idf': (False),
    }
    for (name, range_) in model_params:
        params['clf__' + name] = range_
    return params

In [434]:
models = [
    ( LinearSVC(class_weight='balanced'), [( 'C', [0.01, 0.1, 1] )] )
]

In [435]:
for model, model_params in models:
    pipe = build_clf(model)
    params = build_pipe_params(model_params)
    gs_clf = GridSearchCV(pipe, params, cv=5, iid=False, n_jobs=-1)

{'delatex__behave': ['flag'], 'count_v__ngram_range': [(1, 1)], 'tfidf_t__use_idf': False, 'clf__C': [0.01, 0.1, 1]}


In [None]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)