In [1]:
import pandas as pd
import sklearn
import numpy as np
import snappy
import fastparquet
import dask
import dask.dataframe as dd
import pickle
import sys

import warnings
warnings.filterwarnings('ignore')

In [2]:
import logging

# Route root-logger records of level INFO and above into a log file.
# (Uncomment the two lines below to reset the logging module first.)
#from importlib import reload
#reload(logging)
logging.basicConfig(filename="predictSubjects_2.log", level=logging.INFO)

# Only let ERROR (and above) through from the 'distributed.worker' logger.
logger = logging.getLogger('distributed.worker')
logger.setLevel(logging.ERROR)

In [3]:
from dask.distributed import Client

# Start a dask client with processes disabled: one worker, four threads,
# and a 6GB memory limit.
client = Client(
    processes=False,
    n_workers=1,
    threads_per_worker=4,
    memory_limit='6GB',
)
# Last expression of the cell: display the client summary.
client

0,1
Client  Scheduler: inproc://192.168.1.4/16388/1  Dashboard: http://192.168.1.4:54283/status,Cluster  Workers: 1  Cores: 4  Memory: 6.00 GB


Load paper-text:

In [4]:
%%time
# Open the cleaned paper texts, reading only the id and text columns and
# without using any parquet column as the index.
parquets_dir = "../resources/papers-textclean-parquets"
ddf = dd.read_parquet(
    parquets_dir,
    columns=['paper_id', 'text'],
    engine='fastparquet',
    index=False,
)
# Materialise the frame so the cell displays it and the cell timing covers the load.
ddf.compute()

Wall time: 4.05 s


Unnamed: 0,paper_id,text
0,41513,learn poke poke experienti learn intuit physic...
1,41587,sampl complex episod fixedhorizon reinforc lea...
2,61821,adapt learn rate parallel stochast spars nonsm...
3,61822,barneshutsn lauren der maaten pattern recognit...
4,61823,block coordin descent spars nmf vamsi potluru ...
...,...,...
1363,101219,nov mixup local linear outofmanifold regular h...
1364,101220,latent fisher discrimin analysi gang chen depa...
1365,101221,stochast graphlet embed anjan dutta member iee...
1366,101222,bayesian approach learn bayesian network local...


Feature Extraction:

In [5]:
#%%time
#from sklearn.feature_extraction.text import TfidfVectorizer
#
#corpus = ddf['text'].compute()
#tfidf_vect = TfidfVectorizer(max_features=40000, min_df=3, norm='l2', ngram_range=(1, 2))
#features = tfidf_vect.fit_transform(corpus).toarray()
#print(len(tfidf_vect.vocabulary_), features.shape)

In [6]:
## save vectorizer:
#vectorizer_path = "../resources/tdidf_bigram_vectorizer.pkl"
#with open(vectorizer_path, 'wb') as picklefile:
#    pickle.dump(tfidf_vect, picklefile)

In [7]:
# Load the TF-IDF vectorizer that was pickled to disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
vectorizer_path = "../resources/tdidf_bigram_vectorizer.pkl"
tfidf_vect = None  # reset first so a stale object never survives a failed load
with open(vectorizer_path, mode='rb') as picklefile:
    tfidf_vect = pickle.load(picklefile)

Split into train and test sets:

In [8]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Paper/subject table: `paper_id`, `primary-subject`, and the label columns.
dataset_path = "../resources/papers-subjects-dataset.pkl"
df = pd.read_pickle(dataset_path)

# X_df is the column slice from `paper_id` through `primary-subject`;
# y_df is every other column and serves as the multilabel target matrix.
non_label_columns = ['paper_id', 'primary-subject']
X_df = df.loc[:, 'paper_id':'primary-subject']
y_df = df.drop(columns=non_label_columns)

# A single stratified shuffle split, 20% test, with a fixed seed.
msss = MultilabelStratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    train_size=None,
    random_state=0,
)
train_index, test_index = next(msss.split(X_df, y_df))
print(len(train_index), len(test_index))

29951 7417


In [9]:
# Paper ids of each split, in the row order selected by the split indices.
# The label rows are taken from the same positions, so this is the order the
# feature rows must follow as well.
ids_train = df.iloc[train_index]['paper_id'].tolist()
ids_test = df.iloc[test_index]['paper_id'].tolist()

# Restrict the text frame to the papers of each split.
ddf_train = ddf.loc[ddf['paper_id'].isin(ids_train)]
ddf_test = ddf.loc[ddf['paper_id'].isin(ids_test)]

# The isin() filter keeps the parquet row order, which nothing guarantees to
# match the order of `ids_train` / `ids_test`. Select the texts by paper id in
# exactly that order so every feature row lines up with its label row.
# A paper id missing from the texts raises a KeyError here.
train_texts = ddf_train.compute().set_index('paper_id')['text'].loc[ids_train]
test_texts = ddf_test.compute().set_index('paper_id')['text'].loc[ids_test]

X_train = tfidf_vect.transform(train_texts)
X_test = tfidf_vect.transform(test_texts)

# One feature row per split id, otherwise features and labels cannot be paired.
assert X_train.shape[0] == len(ids_train), "train texts do not match train ids"
assert X_test.shape[0] == len(ids_test), "test texts do not match test ids"
print(X_train.shape, X_test.shape)

(29951, 40000) (7417, 40000)


In [10]:
# Label rows of each split, selected by position with the split indices.
y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
print(y_train.shape, y_test.shape)

(29951, 133) (7417, 133)


Classification:

In [11]:
# Subject names (the label-matrix column names) and their positional indices.
# These are passed to classification_report as target_names= and labels=.
subjects = list(y_train.columns)
labels = list(range(len(subjects)))

In [12]:
%%time
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix #, multilabel_confusion_matrix

# Keys of `models` that are trained and evaluated in this run.
#model_modes = ['mnb', 'svc', 'lr', 'dummy_rnd', 'dummy_mf', 'dummy_stf']
model_modes = ['cnb', 'cnb_norm']

# Registry of candidate base estimators. Each entry holds a display name
# (space-padded to a common width for the log) and the estimator that is
# wrapped in a OneVsRestClassifier below.
models = {
    'lr': {'name': 'Logistic Regression',
           'estimator': LogisticRegression(solver='sag', class_weight='balanced')
    },
    'svc': {'name': 'Linear SVC         ',
            'estimator': LinearSVC(class_weight='balanced')
    },
    'mnb': {'name': 'Multinomial NB     ',
            'estimator': MultinomialNB(fit_prior=True, class_prior=None)
    },
    'cnb': {'name': 'Complement NB      ',
            'estimator': ComplementNB(fit_prior=True, class_prior=None, norm=False)
    },
    'cnb_norm': {'name': 'Complement NB norm ',
                 'estimator': ComplementNB(fit_prior=True, class_prior=None, norm=True)
    },
    'dummy_rnd': {'name': 'Dummy Uniform      ',
                  'estimator': DummyClassifier(strategy='uniform')
    },
    'dummy_mf': {'name': 'Dummy Most-Frequent',
                 'estimator': DummyClassifier(strategy='most_frequent')
    },
    'dummy_stf': {'name': 'Dummy Stratified   ',
                 'estimator': DummyClassifier(strategy='stratified')
    }
}

# Log arguments are passed separately (lazy formatting) instead of being
# interpolated with `%` up front; the emitted messages are unchanged.
logging.info("\n\n>>> TAXONOMY WITH %s SUBJECTS\n\n", len(subjects))

for model_mode in model_modes:
    model_spec = models[model_mode]
    logging.info("OneVsRest - %s", model_spec['name'])
    classifier = OneVsRestClassifier(model_spec['estimator'], n_jobs=-1)

    # TRAIN
    logging.info("... training on %s samples", X_train.shape[0])
    classifier.fit(X_train, y_train)

    # EVALUATE: the same metrics are logged for the train split, then the test split.
    predictions = {}
    for split_name, X_split, y_split in (('train', X_train, y_train), ('test', X_test, y_test)):
        logging.info("... predicting on %s set", split_name)
        y_pred = classifier.predict(X_split)
        predictions[split_name] = y_pred
        logging.info("\n\t\t accuracy = %s", accuracy_score(y_split, y_pred))
        logging.info("\n\t\t f1_macro = %s", f1_score(y_split, y_pred, average="macro"))
        logging.info("\n%s", classification_report(y_split, y_pred, labels=labels, target_names=subjects))

    # Keep the per-split predictions available under their usual names.
    prediction_train, prediction_test = predictions['train'], predictions['test']
    logging.info("="*60)
    
    ## compare prediction_train-train ; compare prediction_test-test
    #y_ntrain=y_train.to_numpy()
    #y_ntest=y_test.to_numpy()
    #subjects = list(y_train.columns)
    #for i,subject in enumerate(subjects):
    #    logger.debug("\nsubject '%s':" % subject)
    #    logger.debug("- train:")
    #    logger.debug(confusion_matrix(y_ntrain[:,i],prediction_train[:,i]))
    #    logger.debug("accuracy = %s" % accuracy_score(y_ntrain[:,i],prediction_train[:,i]), ';\t',
    #                 "f1_macro = %s" % f1_score(y_ntrain[:,i],prediction_train[:,i], average="macro"))
    #    logger.debug("precision_macro = %s" % precision_score(y_ntrain[:,i],prediction_train[:,i], average="macro"), ';\t',
    #                 "recall_macro = %s" % recall_score(y_ntrain[:,i],prediction_train[:,i], average="macro"))
    #    logger.debug("- test:")
    #    logger.debug(confusion_matrix(y_ntest[:,i],prediction_test[:,i]))
    #    logger.debug("accuracy = %s" % accuracy_score(y_ntest[:,i],prediction_test[:,i]), ';\t',
    #                 "f1_macro = %s" % f1_score(y_ntest[:,i],prediction_test[:,i], average="macro"))
    #    logger.debug("precision_macro = %s" % precision_score(y_ntest[:,i],prediction_test[:,i], average="macro"), ';\t',
    #                 "recall_macro = %s" % recall_score(y_ntest[:,i],prediction_test[:,i], average="macro"))

Wall time: 2min 19s


### New attempt with a simplified taxonomy:

In [13]:
# Switch to the second papers/subjects dataset. `dataset_path` and `df` are
# rebound here, so the cells below work on this table.
dataset_path = "../resources/papers-subjects-dataset-56.pkl"
df = pd.read_pickle(filepath_or_buffer=dataset_path)
# Last expression of the cell: display the loaded table.
df

Unnamed: 0,paper_id,astro-ph,cond-mat,gr-qc,math-ph,nlin,physics,quant-ph,math,cs.AI,...,cs.SC,cs.SY,q-bio,q-fin,stat.ML,stat,eess,econ,hep,nucl
0,41513,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,41587,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,61821,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,61822,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,61823,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37363,101219,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
37364,101220,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
37365,101221,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
37366,101222,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


Split into train and test sets:

In [14]:
# Paper ids of each split, in the row order selected by the split indices.
# NOTE(review): `train_index` / `test_index` are reused from the earlier split;
# this assumes the current `df` lists the same papers in the same row order —
# TODO confirm.
ids_train = df.iloc[train_index]['paper_id'].tolist()
ids_test = df.iloc[test_index]['paper_id'].tolist()

# Restrict the text frame to the papers of each split.
ddf_train = ddf.loc[ddf['paper_id'].isin(ids_train)]
ddf_test = ddf.loc[ddf['paper_id'].isin(ids_test)]

# The isin() filter keeps the parquet row order, which nothing guarantees to
# match the order of `ids_train` / `ids_test`. Select the texts by paper id in
# exactly that order so every feature row lines up with its label row.
# A paper id missing from the texts raises a KeyError here.
train_texts = ddf_train.compute().set_index('paper_id')['text'].loc[ids_train]
test_texts = ddf_test.compute().set_index('paper_id')['text'].loc[ids_test]

X_train = tfidf_vect.transform(train_texts)
X_test = tfidf_vect.transform(test_texts)

# One feature row per split id, otherwise features and labels cannot be paired.
assert X_train.shape[0] == len(ids_train), "train texts do not match train ids"
assert X_test.shape[0] == len(ids_test), "test texts do not match test ids"
print(X_train.shape, X_test.shape)

(29951, 40000) (7417, 40000)


In [15]:
# Label matrix: every column of `df` except the paper id.
y_df = df.drop(columns=['paper_id'])
# Label rows of each split, selected by position with the split indices.
y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
print(y_train.shape, y_test.shape)

(29951, 56) (7417, 56)


Classification:

In [16]:
# Subject names (the label-matrix column names) and their positional indices.
# These are passed to classification_report as target_names= and labels=.
subjects = list(y_train.columns)
labels = list(range(len(subjects)))

In [17]:
# Train and evaluate every selected model on the current label set.
# Log arguments are passed separately (lazy formatting) instead of being
# interpolated with `%` up front; the emitted messages are unchanged.
logging.info("\n\n>>> TAXONOMY WITH %s SUBJECTS\n\n", len(subjects))

for model_mode in model_modes:
    model_spec = models[model_mode]
    logging.info("OneVsRest - %s", model_spec['name'])
    classifier = OneVsRestClassifier(model_spec['estimator'], n_jobs=-1)

    # TRAIN
    logging.info("... training on %s samples", X_train.shape[0])
    classifier.fit(X_train, y_train)

    # EVALUATE: the same metrics are logged for the train split, then the test split.
    predictions = {}
    for split_name, X_split, y_split in (('train', X_train, y_train), ('test', X_test, y_test)):
        logging.info("... predicting on %s set", split_name)
        y_pred = classifier.predict(X_split)
        predictions[split_name] = y_pred
        logging.info("\n\t\t accuracy = %s", accuracy_score(y_split, y_pred))
        logging.info("\n\t\t f1_macro = %s", f1_score(y_split, y_pred, average="macro"))
        logging.info("\n%s", classification_report(y_split, y_pred, labels=labels, target_names=subjects))

    # Keep the per-split predictions available under their usual names.
    prediction_train, prediction_test = predictions['train'], predictions['test']
    logging.info("="*60)