In [1]:
import pandas as pd
import sklearn
import numpy as np
import snappy
import fastparquet
import dask
import dask.dataframe as dd
import pickle

import logging
# Raise the threshold of the 'distributed.worker' logger to ERROR so that
# lower-severity records from it do not show up in the cell outputs.
logger = logging.getLogger('distributed.worker')
logger.setLevel(logging.ERROR)

In [2]:
from dask.distributed import Client

# Local dask cluster: a single worker with four threads, run inside this
# process (processes=False) and capped at 6GB of memory.
client = Client(
    n_workers=1,
    threads_per_worker=4,
    processes=False,
    memory_limit='6GB',
)
# Bare last expression so the notebook renders the client summary.
client

0,1
Client  Scheduler: inproc://192.168.1.11/13232/1  Dashboard: http://192.168.1.11:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 6.00 GB


Load the paper texts:

In [3]:
%%time
parquets_dir = "../resources/papers-textclean-parquets"
ddf = dd.read_parquet(parquets_dir, index=False, engine='fastparquet', columns=['paper_id', 'text'])
ddf.compute()

Wall time: 5.57 s


Unnamed: 0,paper_id,text
0,41513,learn poke poke experienti learn intuit physic...
1,41587,sampl complex episod fixedhorizon reinforc lea...
2,61821,adapt learn rate parallel stochast spars nonsm...
3,61822,barneshutsn lauren der maaten pattern recognit...
4,61823,block coordin descent spars nmf vamsi potluru ...
...,...,...
1363,101219,nov mixup local linear outofmanifold regular h...
1364,101220,latent fisher discrimin analysi gang chen depa...
1365,101221,stochast graphlet embed anjan dutta member iee...
1366,101222,bayesian approach learn bayesian network local...


Feature Extraction:

In [4]:
#%%time
# NOTE: disabled cell, kept for reference. It fits a bigram TF-IDF vectorizer
# on the whole corpus; the recorded run below took about six minutes of wall
# time. A later cell loads a pickled vectorizer from disk instead of refitting.
# NOTE(review): .toarray() densifies the (37368, 40000) matrix reported below,
# which would be roughly 12 GB if the values are float64 -- confirm this is
# intended before re-enabling.
#from sklearn.feature_extraction.text import TfidfVectorizer
#
#corpus = ddf['text'].compute()
#tfidf_vect = TfidfVectorizer(max_features=40000, min_df=3, norm='l2', ngram_range=(1, 2))
#features = tfidf_vect.fit_transform(corpus).toarray()
#print(len(tfidf_vect.vocabulary_), features.shape)

40000 (37368, 40000)
CPU times: user 5min 30s, sys: 34 s, total: 6min 4s
Wall time: 6min 16s


In [5]:
## save vectorizer:
# NOTE: disabled cell, kept for reference. It pickles `tfidf_vect` to the same
# path that the loading cell reads from; presumably left commented out so a
# re-run does not overwrite the stored vectorizer -- confirm.
#vectorizer_path = "../resources/tdidf_bigram_vectorizer.pkl"
#with open(vectorizer_path, 'wb') as picklefile:
#    pickle.dump(tfidf_vect, picklefile)

In [4]:
# Restore the previously pickled vectorizer from disk.
# SECURITY: pickle.load executes arbitrary code embedded in the file, so only
# point this at a pickle produced by a trusted run of this project.
vectorizer_path = "../resources/tdidf_bigram_vectorizer.pkl"
tfidf_vect = None  # remains None if opening or unpickling fails
with open(vectorizer_path, mode='rb') as vectorizer_file:
    tfidf_vect = pickle.load(vectorizer_file)

Split into train and test sets:

In [9]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Load the labelled dataset.
dataset_path = "../resources/papers-subjects-dataset.pkl"
df = pd.read_pickle(dataset_path)

# X_df: the column range from 'paper_id' through 'primary-subject';
# y_df: every remaining column, used as the label matrix for the split.
X_df = df.loc[:, 'paper_id':'primary-subject']
y_df = df.drop(columns=['paper_id', 'primary-subject'])

# Single multilabel-stratified 80/20 split with a fixed seed; the splitter
# yields positional row indices into df.
msss = MultilabelStratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    train_size=None,
    random_state=0,
)
train_index, test_index = next(msss.split(X_df, y_df))
print(len(train_index), len(test_index))



29951 7417


In [10]:
# Paper ids of each split, listed in the row order of df.iloc[train_index] /
# df.iloc[test_index] -- the same order in which the label rows are selected.
ids_train = list(df.iloc[train_index]['paper_id'])
ids_test = list(df.iloc[test_index]['paper_id'])

# Restrict the text corpus to the papers of each split.
ddf_train = ddf.loc[ddf['paper_id'].isin(ids_train)]
ddf_test = ddf.loc[ddf['paper_id'].isin(ids_test)]

# The isin() filter returns rows in the order they are stored in the parquet
# dataset, which is not guaranteed to be the order of ids_train / ids_test.
# Re-order the texts by paper_id so that row i of each feature matrix belongs
# to the i-th id of the split (and therefore to the i-th label row).
# .loc with a list also raises KeyError if any id has no text, instead of
# silently producing a shorter matrix.
text_train = ddf_train.compute().set_index('paper_id')['text'].loc[ids_train]
text_test = ddf_test.compute().set_index('paper_id')['text'].loc[ids_test]

X_train = tfidf_vect.transform(text_train)
X_test = tfidf_vect.transform(text_test)
print(X_train.shape, X_test.shape)

(29951, 40000) (7417, 40000)


In [11]:
# Select the label rows of each split by position and report their shapes.
y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
print(y_train.shape, y_test.shape)

(29951, 133) (7417, 133)


Classification:

In [12]:
%%time
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

model_modes = ['mnb', 'svc', 'lr']
models = {
    'lr': {'name': 'Logistic Regression ',
           'estimator': LogisticRegression(solver='sag')
    },
    'svc': {'name': 'Linear SVC         ',
           'estimator': LinearSVC()
    },
    'mnb': {'name': 'Multinomial NB     ',
           'estimator': MultinomialNB(fit_prior=True, class_prior=None)
    }
}

for model_mode in model_modes:
    print("="*20, "OneVsRest - %s" % models[model_mode]['name'], "="*20)
    classifier = OneVsRestClassifier(models[model_mode]['estimator'], n_jobs=-1)
    print("... training on %s samples" % X_train.shape[0])
    classifier.fit(X_train, y_train)
    print("... predicting on train set")
    print("\t\t score = %s" % classifier.score(X_train, y_train))
    prediction_train = classifier.predict(X_train)
    print("... predicting on test set")
    print("\t\t score = %s" % classifier.score(X_test, y_test))
    prediction_test = classifier.predict(X_test)
    # compare prediction_train-train ; compare prediction_test-test
    y_ntrain=y_train.to_numpy()
    y_ntest=y_test.to_numpy()
    subjects = list(y_train.columns)
    for i,subject in enumerate(subjects):
        print("\nsubject '%s':" % subject)
        print("- train:")
        print("%s\taccuracy = %s" %(confusion_matrix(y_ntrain[:,i],prediction_train[:,i]), accuracy_score(y_ntrain[:,i],prediction_train[:,i])))
        print("- test:")
        print("%s\taccuracy = %s" %(confusion_matrix(y_ntest[:,i],prediction_test[:,i]), accuracy_score(y_ntest[:,i],prediction_test[:,i])))
    print('\n\n')

... training on 29951 samples
... predicting on train set
		 score = 0.5058595706320324
... predicting on test set
		 score = 0.4894162060132129

subject 'astro-ph':
- train:
[[29946     0]
 [    5     0]]	accuracy = 0.9998330606657541
- test:
[[7416    0]
 [   1    0]]	accuracy = 0.9998651745988945

subject 'astro-ph.CO':
- train:
[[29945     0]
 [    6     0]]	accuracy = 0.9997996727989049
- test:
[[7416    0]
 [   1    0]]	accuracy = 0.9998651745988945

subject 'astro-ph.EP':
- train:
[[29949     0]
 [    2     0]]	accuracy = 0.9999332242663016
- test:
[[7416    0]
 [   1    0]]	accuracy = 0.9998651745988945

subject 'astro-ph.GA':
- train:
[[29944     0]
 [    7     0]]	accuracy = 0.9997662849320557
- test:
[[7415    0]
 [   2    0]]	accuracy = 0.9997303491977889

subject 'astro-ph.HE':
- train:
[[29950     0]
 [    1     0]]	accuracy = 0.9999666121331509
- test:
[[7417]]	accuracy = 1.0

subject 'astro-ph.IM':
- train:
[[29925     0]
 [   26     0]]	accuracy = 0.9991319154619211
- 

### New attempt with a simplified taxonomy:

In [13]:
# Load the 56-label variant of the dataset.
# NOTE: this rebinds `dataset_path` and `df`, replacing the frame loaded
# earlier from papers-subjects-dataset.pkl.
dataset_path = "../resources/papers-subjects-dataset-56.pkl"
df = pd.read_pickle(dataset_path)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,paper_id,astro-ph,cond-mat,gr-qc,math-ph,nlin,physics,quant-ph,math,cs.AI,...,cs.SC,cs.SY,q-bio,q-fin,stat.ML,stat,eess,econ,hep,nucl
0,41513,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,41587,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,61821,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,61822,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,61823,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37363,101219,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
37364,101220,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
37365,101221,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
37366,101222,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


Split into train and test sets:

In [14]:
# NOTE: train_index / test_index are not recomputed here; they are the
# positional indices produced by the stratified split earlier in the notebook
# and are applied to the newly loaded `df`.
# Paper ids of each split, listed in the row order of df.iloc[train_index] /
# df.iloc[test_index] -- the same order in which the label rows are selected.
ids_train = list(df.iloc[train_index]['paper_id'])
ids_test = list(df.iloc[test_index]['paper_id'])

# Restrict the text corpus to the papers of each split.
ddf_train = ddf.loc[ddf['paper_id'].isin(ids_train)]
ddf_test = ddf.loc[ddf['paper_id'].isin(ids_test)]

# The isin() filter returns rows in the order they are stored in the parquet
# dataset, which is not guaranteed to be the order of ids_train / ids_test.
# Re-order the texts by paper_id so that row i of each feature matrix belongs
# to the i-th id of the split (and therefore to the i-th label row).
# .loc with a list also raises KeyError if any id has no text, instead of
# silently producing a shorter matrix.
text_train = ddf_train.compute().set_index('paper_id')['text'].loc[ids_train]
text_test = ddf_test.compute().set_index('paper_id')['text'].loc[ids_test]

X_train = tfidf_vect.transform(text_train)
X_test = tfidf_vect.transform(text_test)
print(X_train.shape, X_test.shape)

(29951, 40000) (7417, 40000)


In [15]:
# Every column except 'paper_id' is a label; select the label rows of each
# split by position and report their shapes.
y_df = df.drop(columns=['paper_id'])
y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
print(y_train.shape, y_test.shape)

(29951, 56) (7417, 56)


Classification:

In [16]:
%%time
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

model_modes = ['mnb', 'svc', 'lr']
models = {
    'lr': {'name': 'Logistic Regression ',
           'estimator': LogisticRegression(solver='sag')
    },
    'svc': {'name': 'Linear SVC         ',
           'estimator': LinearSVC()
    },
    'mnb': {'name': 'Multinomial NB     ',
           'estimator': MultinomialNB(fit_prior=True, class_prior=None)
    }
}

for model_mode in model_modes:
    print("="*20, "OneVsRest - %s" % models[model_mode]['name'], "="*20)
    classifier = OneVsRestClassifier(models[model_mode]['estimator'], n_jobs=-1)
    print("... training on %s samples" % X_train.shape[0])
    classifier.fit(X_train, y_train)
    print("... predicting on train set")
    print("\t\t score = %s" % classifier.score(X_train, y_train))
    prediction_train = classifier.predict(X_train)
    print("... predicting on test set")
    print("\t\t score = %s" % classifier.score(X_test, y_test))
    prediction_test = classifier.predict(X_test)
    # compare prediction_train-train ; compare prediction_test-test
    y_ntrain=y_train.to_numpy()
    y_ntest=y_test.to_numpy()
    subjects = list(y_train.columns)
    for i,subject in enumerate(subjects):
        print("\nsubject '%s':" % subject)
        print("- train:")
        print("%s\taccuracy = %s" %(confusion_matrix(y_ntrain[:,i],prediction_train[:,i]), accuracy_score(y_ntrain[:,i],prediction_train[:,i])))
        print("- test:")
        print("%s\taccuracy = %s" %(confusion_matrix(y_ntest[:,i],prediction_test[:,i]), accuracy_score(y_ntest[:,i],prediction_test[:,i])))
    print('\n\n')

... training on 29951 samples
... predicting on train set
		 score = 0.5081967213114754
... predicting on test set
		 score = 0.4911689362275853

subject 'astro-ph':
- train:
[[29912     0]
 [   39     0]]	accuracy = 0.9986978731928817
- test:
[[7408    0]
 [   9    0]]	accuracy = 0.9987865713900499

subject 'cond-mat':
- train:
[[29831     0]
 [  120     0]]	accuracy = 0.9959934559780975
- test:
[[7393    0]
 [  24    0]]	accuracy = 0.9967641903734663

subject 'gr-qc':
- train:
[[29948     0]
 [    3     0]]	accuracy = 0.9998998363994525
- test:
[[7416    0]
 [   1    0]]	accuracy = 0.9998651745988945

subject 'math-ph':
- train:
[[29947     0]
 [    4     0]]	accuracy = 0.9998664485326032
- test:
[[7416    0]
 [   1    0]]	accuracy = 0.9998651745988945

subject 'nlin':
- train:
[[29932     0]
 [   19     0]]	accuracy = 0.9993656305298655
- test:
[[7413    0]
 [   4    0]]	accuracy = 0.9994606983955777

subject 'physics':
- train:
[[29694     1]
 [  256     0]]	accuracy = 0.9914193182