In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

In [4]:
df = pd.read_csv('../data/arxiv_math.csv')

In [5]:
# Preprocesser
def preprocess_abstract(abstract):
    import re
    abstract = abstract.replace('\n', ' ') #remove new line characters
    abstract = abstract.replace('$K$-theory', 'k-theory').replace('$C^*$-algebra', 'C-algebra').replace('\\emph', '') #fix a few common
    abstract = re.sub('\$.*?\$', '', abstract) #remove math
    bstract = re.sub('\[.*?\]', '', abstract) #remove anything in brackets
    abstract = re.sub('\s[a-zA-Z]{1}\s', ' ', abstract) #remove single letters - eg. consider a group G - the G does not add anything
    abstract = re.sub('\s[0-9]+\s', ' ', abstract) #remove any single numbers
    abstract = re.sub('\(.*?\)', '', abstract) #remove parentheses
    abstract = re.sub('\s[A-Z]{1}\.\s', ' ', abstract) #remove first initials
    abstract = abstract.replace('*', '').replace('{', '').replace('}', '')
    abstract = re.sub(' +', ' ', abstract) #remove extra spaces
    return abstract

In [7]:
df['title_and_abstract'] = (df.title + ' ' + df.abstract).progress_apply(preprocess_abstract)

HBox(children=(IntProgress(value=0, max=384444), HTML(value='')))




In [64]:
tf_vect = TfidfVectorizer(stop_words = 'english')

In [None]:
tfidf = tf_vect.fit_transform(df.loc[:, 'title_and_abstract'])

In [None]:
title = """
Koopman Operator and its Approximations for Systems with Symmetries
"""

In [None]:
abstract =  """
Nonlinear dynamical systems with symmetries exhibit a rich variety of behaviors, including complex attractor-basin portraits and enhanced and suppressed bifurcations. Symmetry arguments provide a way to study these collective behaviors and to simplify their analysis. The Koopman operator is an infinite dimensional linear operator that fully captures a system's nonlinear dynamics through the linear evolution of functions of the state space. Importantly, in contrast with local linearization, it preserves a system's global nonlinear features. We demonstrate how the presence of symmetries affects the Koopman operator structure and its spectral properties. In fact, we show that symmetry considerations can also simplify finding the Koopman operator approximations using the extended and kernel dynamic mode decomposition methods (EDMD and kernel DMD). Specifically, representation theory allows us to demonstrate that an isotypic component basis induces block diagonal structure in operator approximations, revealing hidden organization. Practically, if the data is symmetric, the EDMD and kernel DMD methods can be modified to give more efficient computation of the Koopman operator approximation and its eigenvalues, eigenfunctions, and eigenmodes. Rounding out the development, we discuss the effect of measurement noise.
"""

In [None]:
# Find the cosine similarity of a given abstract to all of those in the dataset
cosine_similarities = linear_kernel(tf_vect.transform([title + ' ' + abstract]), tfidf).flatten()

In [14]:
# Find the top 10 most similar papers
related_docs_indices = cosine_similarities.argsort()[:-11:-1]

array([162758, 164789,  73092, 279615, 165545, 359935, 285898,  86959,
        82735, 361748])

In [19]:
i = 0
print('Title: ', df.loc[related_docs_indices[i], 'title'])
print('Categories: ', df.loc[related_docs_indices[i], 'categories'])
print('Abstract: ', df.loc[related_docs_indices[i], 'abstract'])
print('URL: ', 'https://arxiv.org/abs/' + df.loc[related_docs_indices[i], 'id'])


Title:  Koopman invariant subspaces and finite linear representations of
  nonlinear dynamical systems for control
Categories:  ['math.DS']
Abstract:  In this work, we explore finite-dimensional linear representations of
nonlinear dynamical systems by restricting the Koopman operator to an invariant
subspace. The Koopman operator is an infinite-dimensional linear operator that
evolves observable functions of the state-space of a dynamical system [Koopman
1931, PNAS]. Dominant terms in the Koopman expansion are typically computed
using dynamic mode decomposition (DMD). DMD uses linear measurements of the
state variables, and it has recently been shown that this may be too
restrictive for nonlinear systems [Williams et al. 2015, JNLS]. Choosing
nonlinear observable functions to form an invariant subspace where it is
possible to obtain linear models, especially those that are useful for control,
is an open challenge.
  Here, we investigate the choice of observable functions for Koopman an

In [20]:
from scipy.sparse import save_npz
save_npz('../trained_models/tfidf.npz', tfidf)

In [11]:
import pickle

pickle.dump(tf_vect, open("../trained_models/tf_vect.pickle", "wb"))

# Predictor

In [17]:
import pickle
from scipy.sparse import load_npz
tfidf = load_npz('../trained_models/tfidf.npz')
with open('../trained_models/tf_vect.pickle', 'rb') as pickle_file:
    tf_vect = pickle.load(pickle_file)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
def get_math_categories(categories):
    import ast
    return [x[5:] for x in ast.literal_eval(categories) if x[:5] == 'math.']

In [11]:
df['math_categories'] = df.categories.apply(get_math_categories)

In [23]:
#y = df['math_categories'].str.join(sep='*').str.get_dummies(sep='*')

In [12]:
mlb = MultiLabelBinarizer()
mlb.fit(df['math_categories'])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [13]:
df = pd.concat([df, pd.DataFrame(mlb.transform(df['math_categories']), columns = mlb.classes_)], axis = 1)

In [14]:
train, test = train_test_split(df, test_size = 0.2)

In [15]:
X_train = train.title_and_abstract
X_test = test.title_and_abstract

y_train = train[mlb.classes_]
y_test = test[mlb.classes_]

In [71]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [20]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

# tf_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
#                                 strip_accents = 'unicode', # works 
#                                 stop_words = 'english', # works
#                                 lowercase = True, # works
#                                 max_df = 0.5, # works
#                                 min_df = 10)

In [30]:
from sklearn.svm import LinearSVC

In [31]:
pipeline = Pipeline( steps =
                    [
                        #('vectorizer', TfidfVectorizer(stop_words = 'english')),
                        ('vectorizer', TfidfVectorizer(stop_words = 'english', lowercase = True, min_df = 10)),
                ('clf', OneVsRestClassifier(LinearSVC())),
                #('clf', OneVsRestClassifier(MultinomialNB())),
                #('clf', OneVsRestClassifier(SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None))),
                    ])

In [32]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf..._class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))])

In [33]:
#y_pred_prob = pipeline.predict_proba(X_test)
y_pred = pipeline.predict(X_test)

In [34]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, hamming_loss, f1_score

In [35]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    import numpy as np
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [36]:
print("Hamming Loss: ", hamming_loss(y_test, y_pred))
#print("Hamming Score: ", hamming_score(y_test, y_pred))
print("micro f1 score: ", f1_score(y_test, y_pred, average='micro'))

Hamming Loss:  0.022201647829988683
micro f1 score:  0.7033807191494444


In [37]:
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

              precision    recall  f1-score   support

          AC       0.84      0.55      0.67      1647
          AG       0.84      0.72      0.78      6823
          AP       0.82      0.70      0.75      6609
          AT       0.78      0.51      0.62      1909
          CA       0.72      0.37      0.49      2858
          CO       0.84      0.71      0.77      7554
          CT       0.73      0.51      0.60       920
          CV       0.79      0.48      0.60      2180
          DG       0.82      0.65      0.73      5535
          DS       0.80      0.55      0.65      4168
          FA       0.75      0.44      0.56      3674
          GM       0.67      0.12      0.20       494
          GN       0.79      0.47      0.59       643
          GR       0.78      0.60      0.68      2629
          GT       0.81      0.60      0.69      3028
          HO       0.78      0.35      0.48       423
          IT       0.92      0.85      0.88      5358
          KT       0.70    

  'precision', 'predicted', average, warn_for)


In [145]:
train_res = mlo.MLROS(train, labels=mlb.classes_, percentage=25)

1100
2100
3100
4000
4900
5800
6700
7600
8400
9200
10000
10800
11600
12400
13200
14000
14800
15500
16200
16900
17600
18300
18900
19400
19900
20400
20900
21400
21900
22400
22800
23200
23600
24000
24400
24800
25200
25600
25900
26200
26500
26800
27100
27400
27700
28000
28200
28400
28600
28800
29000
29200
29400
29500


In [146]:
X_res = train_res.title_and_abstract
y_res = train_res[mlb.classes_]

In [147]:
pipeline.fit(X_res, y_res)



Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...te=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [148]:
y_pred = pipeline.predict(X_test)

In [150]:
print("Hamming Loss: ", hamming_loss(y_test, y_pred))
#print("Hamming Score: ", hamming_score(y_test, y_pred))
print("micro f1 score: ", f1_score(y_test, y_pred, average='micro'))

Hamming Loss:  0.023494501814303737
micro f1 score:  0.6739943266091057


In [151]:
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

              precision    recall  f1-score   support

          AC       0.86      0.48      0.62      1666
          AG       0.86      0.67      0.76      6971
          AP       0.83      0.67      0.74      6568
          AT       0.80      0.43      0.56      1881
          CA       0.73      0.29      0.41      2878
          CO       0.86      0.65      0.74      7517
          CT       0.72      0.52      0.61       904
          CV       0.80      0.39      0.53      2130
          DG       0.84      0.61      0.70      5505
          DS       0.82      0.48      0.60      3997
          FA       0.76      0.41      0.53      3748
          GM       0.56      0.19      0.28       483
          GN       0.67      0.52      0.59       616
          GR       0.77      0.51      0.61      2764
          GT       0.84      0.55      0.66      2969
          HO       0.67      0.40      0.50       409
          IT       0.94      0.81      0.87      5374
          KT       0.59    

  'precision', 'predicted', average, warn_for)


## Notice that the F1 Scores of the minority classes are improved!

In [154]:
import fastText as ft
import numpy as np

In [163]:
#loaded_model = ft.load_model('../category-prediction/model_abstract.bin')
loaded_model = ft.load_model('../embeddings/train_model.bin')

In [156]:
def get_sentence_vector(abstract):
    return loaded_model.get_sentence_vector(abstract.replace('\n', ' '))

In [30]:
#X = df.title_and_abstract.apply(get_sentence_vector)

In [37]:
#X = np.vstack(X.values)

In [172]:
articles = pd.read_csv('../data/arxiv_math.csv')

In [173]:
X = articles.title + ' ' + articles.abstract

In [174]:
articles['math_categories'] = articles.categories.apply(get_math_categories)

In [175]:
mlb = MultiLabelBinarizer()

In [176]:
y = mlb.fit_transform(articles.math_categories)

In [177]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [164]:
pipeline = Pipeline( steps = [
                ('vectorizer', FTVectorizer(loaded_model)),
                ('clf', OneVsRestClassifier(LogisticRegression())),
                #('clf', OneVsRestClassifier(MultinomialNB())),
                #('clf', OneVsRestClassifier(SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None))),
                    ])

In [165]:
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vectorizer', <__main__.FTVectorizer object at 0x7f35c9866940>), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [166]:
y_pred = pipeline.predict(X_test)

In [167]:
print("Hamming Loss: ", hamming_loss(y_test, y_pred))
#print("Hamming Score: ", hamming_score(y_test, y_pred))
print("micro f1 score: ", f1_score(y_test, y_pred, average='micro'))

Hamming Loss:  0.014967192966484152
micro f1 score:  0.8101654724470334


In [168]:
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

              precision    recall  f1-score   support

          AC       0.90      0.71      0.79      1666
          AG       0.90      0.79      0.84      6971
          AP       0.87      0.79      0.83      6568
          AT       0.88      0.69      0.77      1881
          CA       0.82      0.54      0.65      2878
          CO       0.89      0.81      0.85      7517
          CT       0.90      0.77      0.83       904
          CV       0.86      0.64      0.73      2130
          DG       0.88      0.77      0.82      5505
          DS       0.86      0.71      0.78      3997
          FA       0.82      0.62      0.70      3748
          GM       0.90      0.57      0.70       483
          GN       0.91      0.70      0.79       616
          GR       0.86      0.70      0.77      2764
          GT       0.89      0.74      0.81      2969
          HO       0.95      0.76      0.84       409
          IT       0.96      0.93      0.95      5374
          KT       0.87    

  'precision', 'predicted', average, warn_for)


In [52]:
loaded_model.predict([df.loc[0, 'abstract'].replace('\n', ' ')])

([['__label__CO']], array([[1.00001001]]))

In [81]:
ftvect = FTVectorizer(loaded_model)

In [82]:
ftvect.transform(X_test)

array([[ 0.15453576,  0.20995072,  0.01536667, ...,  0.2846516 ,
        -0.26455224, -0.13469005],
       [-0.17050149,  0.18350409, -0.00755677, ...,  0.0728174 ,
        -0.21852271,  0.13495567],
       [ 0.01654368,  0.10911454, -0.10993221, ...,  0.14175668,
        -0.15522371,  0.04105955],
       ...,
       [-0.11207704,  0.10648987, -0.0222425 , ...,  0.29313147,
        -0.08673435,  0.06448122],
       [ 0.1903944 ,  0.03458123,  0.01692822, ...,  0.14078929,
        -0.16159515,  0.06207137],
       [ 0.09276389,  0.05830173, -0.0080329 , ..., -0.02131573,
        -0.17205091,  0.08539037]], dtype=float32)

In [153]:
class FTVectorizer():
    def __init__(self, ft_model):
        self.model = ft_model
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        def get_sentence_vector(abstract):
            return self.model.get_sentence_vector(abstract.replace('\n', ' '))
        return np.vstack(X.apply(get_sentence_vector).values)        

In [163]:
class FTPredictor():
    def __init__(self, ft_model):
        self.model = ft_model
        
    def fit(self, X, y = None):
        return self
    
    def predict(self, X, y = None):
        predictions = []
        for x in X: 
            try:
                prediction = [label[-2:] for label in self.model.predict(x, threshold = 0.5, k = 6)[0]]
            except:
                prediction = list()
            prediction = self.predict_class(prediction)
            predictions.append(prediction)
        return np.array(predictions)
    
    def predict_class(self, prediction):
        predicted_classes = []
        for category in ['AC', 'AG', 'AP', 'AT', 'CA', 'CO', 'CT', 'CV', 'DG', 'DS', 'FA',
       'GM', 'GN', 'GR', 'GT', 'HO', 'IT', 'KT', 'LO', 'MG', 'MP', 'NA',
       'NT', 'OA', 'OC', 'PR', 'QA', 'RA', 'RT', 'SG', 'SP', 'ST']:
            if category in prediction:
                predicted_classes.append(1)
            else:
                predicted_classes.append(0)
        return predicted_classes

In [164]:
ftpred = FTPredictor(loaded_model)

In [165]:
ftpred.predict(X_test)

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [166]:
pipeline = Pipeline( steps = [
                #('vectorizer', FTVectorizer(loaded_model)),
                ('clf', FTPredictor(loaded_model)),
                #('clf', OneVsRestClassifier(MultinomialNB())),
                #('clf', OneVsRestClassifier(SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None))),
                    ])

In [167]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('clf', <__main__.FTPredictor object at 0x7f6468b3b208>)])

In [169]:
y_pred = pipeline.predict(X_test)

In [170]:
print("Hamming Loss: ", hamming_loss(y_test, y_pred))
print("Hamming Score: ", hamming_score(y_test, y_pred))
print("micro f1 score: ", f1_score(y_test, y_pred, average='micro'))

Hamming Loss:  0.00969579523729012
Hamming Score:  0.8415664139213671
micro f1 score:  0.8815385684917222


In [171]:
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

              precision    recall  f1-score   support

          AC       0.96      0.83      0.89      1659
          AG       0.95      0.88      0.91      6901
          AP       0.90      0.83      0.87      6715
          AT       0.96      0.84      0.90      1886
          CA       0.87      0.72      0.78      2867
          CO       0.94      0.87      0.91      7425
          CT       0.97      0.87      0.92       921
          CV       0.92      0.78      0.85      2182
          DG       0.93      0.87      0.90      5590
          DS       0.89      0.82      0.85      4162
          FA       0.85      0.81      0.82      3731
          GM       0.97      0.72      0.83       492
          GN       0.95      0.80      0.87       574
          GR       0.94      0.82      0.87      2660
          GT       0.95      0.88      0.91      2983
          HO       0.98      0.85      0.91       415
          IT       0.97      0.95      0.96      5357
          KT       0.96    

  'precision', 'predicted', average, warn_for)


In [54]:
class MLROSOversampler:
    def __init__(self):
        pass
        
    def IRLbl(self, dataset, label, labelset):
        ''' calculates the imbalance ratio per label
        needs dataset with dummy columns '''
        num = max(dataset.loc[:,labelset].sum())
        denom = dataset.loc[:,label].sum()
        return num/denom
    
    def MeanIR(self,dataset, labelset):
        ''' calculates the mean imbalance ratio'''
        import numpy as np
        return np.mean([self.IRLbl(dataset, label, labelset) for label in labelset])
    
    def MLROS(self, dataset, labels, percentage, batch_size = 100):
        from numpy import random
        starting_size = len(dataset)
        samplesToClone = int(len(dataset) * percentage / 100)
        mir = self.MeanIR(dataset, labels)
        cloners = [label for label in labels if self.IRLbl(dataset, label, labels) > mir]
        clone_sets = [dataset.loc[dataset[label] == 1].reset_index(drop = True) for label in cloners]
        clone_set_lengths = [len(x) for x in clone_sets]


        cloneCount = 0

        while(cloneCount < samplesToClone and len(cloners) > 0):
            clones = pd.DataFrame()
            for i,label in enumerate(cloners):
                clones = clones.append(clone_sets[i].loc[random.choice(range(clone_set_lengths[i]), batch_size, replace = True)])
                #clones = clones.append(clone_sets[i].loc[random.choice(range(len(clone_sets[i])))])
            cloneCount += batch_size * len(cloners)
            #print(len(clones))
            print(cloneCount)

            dataset = dataset.append(clones, ignore_index = True)

            for label in cloners:
                if self.IRLbl(dataset, label, labels) <= mir:
                    idx = cloners.index(label)
                    cloners = cloners[:idx] + cloners[idx+1:]
                    clone_sets = clone_sets[:idx] + clone_sets[idx+1:]
                    clone_set_lengths = clone_set_lengths[:idx] + clone_set_lengths[idx+1:]

        return dataset

In [55]:
mlb.classes_

array(['AC', 'AG', 'AP', 'AT', 'CA', 'CO', 'CT', 'CV', 'DG', 'DS', 'FA',
       'GM', 'GN', 'GR', 'GT', 'HO', 'IT', 'KT', 'LO', 'MG', 'MP', 'NA',
       'NT', 'OA', 'OC', 'PR', 'QA', 'RA', 'RT', 'SG', 'SP', 'ST'],
      dtype=object)

In [56]:
mlo = MLROSOversampler()

In [57]:
mlo.MLROS(pd.DataFrame(y, columns=mlb.classes_), labels=mlb.classes_, percentage=10)

1000
2000
3000
4000
4900
5800
6700
7600
8500
9300
10100
10900
11700
12500
13300
14100
14900
15700
16500
17300
18100
18800
19500
20200
20900
21600
22300
22900
23400
23900
24400
24900
25400
25900
26400
26900
27400
27900
28400
28800
29200
29600
30000
30400
30800
31200
31600
32000
32300
32600
32900
33200
33500
33800
34100
34400
34700
34900
35100
35300
35500
35700
35900
36100
36300
36500
36600
36700


Unnamed: 0,AC,AG,AP,AT,CA,CO,CT,CV,DG,DS,...,NT,OA,OC,PR,QA,RA,RT,SG,SP,ST
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1]:
from fastText import train_supervised

In [2]:
ft_model = train_supervised(input='../embeddings/train.csv', epoch=25, lr=1.0, minCount=1, loss="ova")

In [4]:
import pandas as pd
articles = pd.read_csv('../data/arxiv_math.csv', nrows = 100)

In [30]:
ft_model.predict([articles.loc[20, 'abstract'].replace('\n', ' ')], threshold = 0.25, k = 4)

([['__label__RA']], array([[0.98935753]]))

In [31]:
articles.loc[20, 'categories']

"['math.RA']"