In [1]:
import pandas as pd
import sklearn
import numpy as np
import pickle
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
import logging

# Send INFO-and-above records of the root logger to a log file (path is relative
# to the current working directory).
# NOTE(review): if nothing is written to the file, the root logger was presumably
# configured earlier in this kernel; reloading the module first is one workaround:
#from importlib import reload
#reload(logging)
logging.basicConfig(
    filename="predictSubjects_interesting_2.log",
    level=logging.INFO,
)

Load the paper texts:

In [3]:
%%time
# Pickled DataFrame of paper texts; the frame displayed below has the columns
# `paper_id` and `text`.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
dataset_path = "../resources/interesting_arxiv_papers_textclean.pkl"
df_text = pd.read_pickle(dataset_path)
df_text

Wall time: 601 ms


Unnamed: 0,paper_id,text
0,100036,paper present multimod biometr system fingerpr...
1,100075,random trial known ab test select polici contr...
2,100147,deep qnetwork returnbas reinforc learn promis ...
3,100161,studi problem learn polici demonstr combinator...
4,100208,present novel method compress deep convolut ne...
...,...,...
21797,99842,case combin classifi show product rule aris ma...
21798,99920,propos novel dialogu model framework use binar...
21799,99922,studi classif problem featur acquir cost goal ...
21800,99932,challeng imag process task describ illpos line...


In [4]:
# apply more text_cleaning
def initial_text_cleaning(text):
    """Remove very short tokens and formula-style letter runs, then tidy whitespace.

    The substitutions are applied in order:

    1. drop every word of one or two word-characters;
    2. drop every word made up only of the letters h, i, j, k, x, y
       (letters commonly used as symbols in formulas);
    3. collapse runs of two or more whitespace characters into one space.

    The result is finally stripped of leading/trailing whitespace.

    NOTE(review): lower-casing, URL removal and punctuation stripping are not done
    here; the input is presumably already normalised upstream - confirm.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    substitutions = (
        (r'\b\w{1,2}\b', ''),           # one- and two-character words
        (r'\b(h|i|j|k|x|y)+\b', ''),    # words built only from formula letters
        (r'\s{2,}', ' '),               # repeated whitespace -> single space
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run the extra cleaning pass over every paper text, overwriting the `text` column.
df_text['text'] = df_text['text'].map(initial_text_cleaning)

Feature Extraction:

In [5]:
#%%time
#from sklearn.feature_extraction.text import TfidfVectorizer
#
#corpus = df_text['text']  # `df` is not defined yet at this point; the loaded texts are in df_text
#tfidf_vect = TfidfVectorizer(max_features=40000, min_df=3, norm='l2', ngram_range=(1, 2))
#features = tfidf_vect.fit_transform(corpus).toarray()
#print(len(tfidf_vect.vocabulary_), features.shape)

In [6]:
## save vectorizer:
#vectorizer_path = "../resources/tdidf_bigram_interesting_vectorizer.pkl"
#with open(vectorizer_path, 'wb') as picklefile:
#    pickle.dump(tfidf_vect, picklefile)

In [7]:
# Restore the vectorizer that was pickled to disk (see the commented-out save cell above).
# NOTE(review): pickle.load can run arbitrary code - only open files you trust.
vectorizer_path = "../resources/tdidf_bigram_interesting_vectorizer.pkl"
tfidf_vect = None  # remains None if the file cannot be opened below
with open(vectorizer_path, mode='rb') as vectorizer_file:
    tfidf_vect = pickle.load(vectorizer_file)

Split into train and test sets:

In [8]:
# Build the working frame: subject indicator columns for the papers we have text
# for, with the cleaned text attached and the 'primary-subject' column removed.
dataset_path = "../resources/papers-subjects-dataset.pkl"
ids = list(df_text.paper_id)
df = (
    pd.read_pickle(dataset_path)
    .loc[lambda frame: frame.paper_id.isin(ids)]   # keep only papers present in df_text
    .reset_index(drop=True)
    .merge(df_text, how='left', on="paper_id")     # adds the `text` column
    .drop(columns='primary-subject')
)
df

Unnamed: 0,paper_id,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat,cond-mat.dis-nn,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,text
0,41513,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,investig experienti learn paradigm acquir inte...
1,61821,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,work establish empir success framework adapt l...
2,61822,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,paper present oimplement tsne embed techniqu c...
3,61823,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonneg matrix factor ubiquit tool data analysi...
4,61824,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,paper describ serial parallel composit model m...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21797,101218,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,stateoftheart perform deep learn algorithm led...
21798,101219,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,mixup propos dataaugment scheme linearli inter...
21799,101220,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,linear discrimin analysi wellknown method dime...
21800,101222,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,research investig techniqu data learn bayesian...


In [9]:
# Collect the subject (label) columns that have no positive example in this corpus.
# Label columns are selected by name rather than by position: in the frame built
# above 'paper_id' is the first column and the first subject ('astro-ph') comes
# right after it, so a positional slice starting at 2 would silently skip that
# subject and never check it.
subjects = [c for c in df.columns if c not in ('paper_id', 'text')]
subjects_to_remove = []
for s in subjects:
    count_papers = df[s].sum()   # indicator columns: the sum is the number of papers
    if count_papers == 0:
        print(s, ':', count_papers)
        subjects_to_remove.append(s)

astro-ph.HE : 0
cond-mat.quant-gas : 0
cond-mat.soft : 0
cond-mat.str-el : 0
math.CT : 0
math.GM : 0
math.GN : 0
math.RT : 0
math.SP : 0
nlin.CG : 0
nlin.PS : 0
nucl-ex : 0


In [10]:
# Discard the subject columns collected in `subjects_to_remove`, then preview the result.
df = df.drop(columns=subjects_to_remove)
df.head(2)

Unnamed: 0,paper_id,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.IM,astro-ph.SR,cond-mat,cond-mat.dis-nn,cond-mat.mes-hall,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,text
0,41513,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,investig experienti learn paradigm acquir inte...
1,61821,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,work establish empir success framework adapt l...


In [11]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Inputs (id + text) and the multilabel indicator matrix (every other column).
X_df = df[['paper_id', 'text']]
y_df = df.drop(columns=['paper_id', 'text'])

# A single shuffle split with 20% of the rows held out for testing; random_state
# is fixed so the split can be reproduced.
msss = MultilabelStratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    train_size=None,
    random_state=0,
)
split_indices = msss.split(X_df, y_df)
train_index, test_index = next(split_indices)   # positional row indices into df
print(len(train_index), len(test_index))

17442 4360


In [12]:
# Row subsets for each side of the split (train_index/test_index are positional).
df_train, df_test = df.iloc[train_index], df.iloc[test_index]

# Turn the texts into feature matrices with the loaded vectorizer (transform only;
# the vectorizer is not refitted here).
X_train = tfidf_vect.transform(df_train['text'])
X_test = tfidf_vect.transform(df_test['text'])
print(X_train.shape, X_test.shape)

(17442, 40000) (4360, 40000)


In [13]:
# Label matrices for the same positional split as the feature matrices.
y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
print(y_train.shape, y_test.shape)

(17442, 121) (4360, 121)


Classification:

In [14]:
# Subject names in label-column order, and the matching integer positions
# (passed to classification_report as `target_names` / `labels`).
subjects = list(y_train.columns)
labels = list(range(len(subjects)))

In [15]:
%%time
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix #, multilabel_confusion_matrix

# Keys of `models` to run in this session. Another selection that was used:
# ['mnb', 'svc', 'lr', 'dummy_rnd', 'dummy_mf', 'dummy_stf']
model_modes = ['cnb', 'cnb_norm']

# Registry of candidate base estimators, keyed by a short mode id.
# 'name' is the label written to the log; 'estimator' is wrapped in a
# OneVsRestClassifier in the loop below.
models = dict(
    lr=dict(
        name='Logistic Regression',
        estimator=LogisticRegression(solver='sag', class_weight='balanced'),
    ),
    svc=dict(
        name='Linear SVC         ',
        estimator=LinearSVC(class_weight='balanced'),
    ),
    mnb=dict(
        name='Multinomial NB     ',
        estimator=MultinomialNB(fit_prior=True, class_prior=None),
    ),
    cnb=dict(
        name='Complement NB      ',
        estimator=ComplementNB(fit_prior=True, class_prior=None, norm=False),
    ),
    cnb_norm=dict(
        name='Complement NB norm ',
        estimator=ComplementNB(fit_prior=True, class_prior=None, norm=True),
    ),
    dummy_rnd=dict(
        name='Dummy Uniform      ',
        estimator=DummyClassifier(strategy='uniform'),
    ),
    dummy_mf=dict(
        name='Dummy Most-Frequent',
        estimator=DummyClassifier(strategy='most_frequent'),
    ),
    dummy_stf=dict(
        name='Dummy Stratified   ',
        estimator=DummyClassifier(strategy='stratified'),
    ),
)


def _log_scores(y_true, y_pred):
    """Log accuracy, macro-averaged F1 and the per-subject report for one split.

    Reads the notebook-level `labels` and `subjects` lists for the report.
    """
    logging.info("\n\t\t accuracy = %s", accuracy_score(y_true, y_pred))
    logging.info("\n\t\t f1_macro = %s", f1_score(y_true, y_pred, average="macro"))
    logging.info("\n%s", classification_report(y_true, y_pred, labels=labels, target_names=subjects))


logging.info("\n\n>>> TAXONOMY WITH %s SUBJECTS\n\n", len(subjects))

for model_mode in model_modes:
    spec = models[model_mode]
    logging.info("OneVsRest - %s", spec['name'])
    classifier = OneVsRestClassifier(spec['estimator'], n_jobs=-1)
    # TRAIN
    logging.info("... training on %s samples", X_train.shape[0])
    classifier.fit(X_train, y_train)
    logging.info("... predicting on train set")
    prediction_train = classifier.predict(X_train)
    _log_scores(y_train, prediction_train)
    # TEST
    logging.info("... predicting on test set")
    prediction_test = classifier.predict(X_test)
    _log_scores(y_test, prediction_test)
    logging.info("=" * 60)

    # NOTE: a per-subject breakdown (confusion matrix, accuracy, macro
    # precision/recall/F1 for each label column, on train and on test) was
    # prototyped at this point but is disabled; it relied on a `logger` object
    # that this notebook does not define.

Wall time: 39.9 s


### New attempt with a simplified taxonomy:

In [16]:
dataset_path = "../resources/papers-subjects-dataset-56.pkl"
df_56 = pd.read_pickle(dataset_path)
# Align the simplified labels to `df` row-for-row via paper_id.
# train_index/test_index (and therefore X_train/X_test) are positional with respect
# to `df`, so merely filtering with isin() would only be correct if both pickles
# happened to store the papers in exactly the same order. An inner merge keeps the
# order of the left-hand keys, i.e. the row order of `df`.
df_56 = df[['paper_id']].merge(df_56, how='inner', on='paper_id')
# Fail loudly on missing or duplicated papers instead of training on shifted labels.
assert len(df_56) == len(df) and (df_56['paper_id'].values == df['paper_id'].values).all(), \
    "simplified-taxonomy labels are not aligned one-to-one with df"
df_56

Unnamed: 0,paper_id,astro-ph,cond-mat,gr-qc,math-ph,nlin,physics,quant-ph,math,cs.AI,...,cs.SC,cs.SY,q-bio,q-fin,stat.ML,stat,eess,econ,hep,nucl
0,41513,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,61821,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,61822,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,61823,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,61824,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21797,101218,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
21798,101219,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
21799,101220,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
21800,101222,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


Split into train and test sets:

In [17]:
# The feature matrices from the first experiment are reused unchanged; the
# commented-out lines below show how they were built. Only the label matrix is
# replaced for the simplified taxonomy.
#df_train = df.iloc[train_index]
#df_test = df.iloc[test_index]
#
#X_train = tfidf_vect.transform(df.iloc[train_index]['text'])
#X_test = tfidf_vect.transform(df_test['text'])
print(X_train.shape, X_test.shape)

(17442, 40000) (4360, 40000)


In [18]:
# Labels of the simplified taxonomy, split with the same positional indices that
# produced X_train / X_test.
y_df = df_56.drop(columns=['paper_id'])
y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
print(y_train.shape, y_test.shape)

(17442, 56) (4360, 56)


Classification:

In [19]:
# Subject names in label-column order, and the matching integer positions
# (passed to classification_report as `target_names` / `labels`).
subjects = list(y_train.columns)
labels = list(range(len(subjects)))

In [20]:
# Same train/evaluate protocol as for the full taxonomy, now against the
# simplified label matrix. Everything is written to the log file, not to the cell output.
logging.info("\n\n>>> TAXONOMY WITH %s SUBJECTS\n\n", len(subjects))

for model_mode in model_modes:
    model_name = models[model_mode]['name']
    base_estimator = models[model_mode]['estimator']
    logging.info("OneVsRest - %s", model_name)
    classifier = OneVsRestClassifier(base_estimator, n_jobs=-1)

    # TRAIN
    logging.info("... training on %s samples", X_train.shape[0])
    classifier.fit(X_train, y_train)
    logging.info("... predicting on train set")
    prediction_train = classifier.predict(X_train)
    logging.info("\n\t\t accuracy = %s", accuracy_score(y_train, prediction_train))
    logging.info("\n\t\t f1_macro = %s", f1_score(y_train, prediction_train, average="macro"))
    logging.info("\n%s", classification_report(y_train, prediction_train, labels=labels, target_names=subjects))

    # TEST
    logging.info("... predicting on test set")
    prediction_test = classifier.predict(X_test)
    logging.info("\n\t\t accuracy = %s", accuracy_score(y_test, prediction_test))
    logging.info("\n\t\t f1_macro = %s", f1_score(y_test, prediction_test, average="macro"))
    logging.info("\n%s", classification_report(y_test, prediction_test, labels=labels, target_names=subjects))

    logging.info("=" * 60)