In [1]:
import pandas as pd
import sklearn
import numpy as np
import snappy
import fastparquet
import dask
import dask.dataframe as dd
import pickle

import logging
logger = logging.getLogger('distributed.worker')
logger.setLevel(logging.ERROR)

In [2]:
from dask.distributed import Client
client = Client(n_workers=1, threads_per_worker=4, processes=False, memory_limit='6GB')
client

0,1
Client  Scheduler: inproc://192.168.1.11/4832/1  Dashboard: http://192.168.1.11:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 6.00 GB


Load paper-text:

In [3]:
%%time
parquets_dir = "../resources/papers-k-textclean-parquets"
ddf = dd.read_parquet(parquets_dir, index=False, engine='fastparquet', columns=['paper_id', 'text'])
ddf.compute()

Wall time: 1.94 s


Unnamed: 0,paper_id,text
0,61826,discret restrict boltzmann machin guido montuf...
1,61836,knowledg matter import prior optim caglar gulc...
2,61852,deep learn detect robot grasp lenz honglak ash...
3,61907,number respons region deep feedforward network...
4,61911,avoid confus predictor inhibitor valu function...
...,...,...
1113,101196,model human categor natur imag deep featur rep...
1114,101198,oct deep cluster link discrimin model kmean ja...
1115,101216,beziergan automat gener smooth curv interpret ...
1116,101217,channel local block variant squeezeandexcit hu...


Feature Extraction:

In [4]:
#%%time
#from sklearn.feature_extraction.text import TfidfVectorizer
#
#corpus = ddf['text'].compute()
#tfidf_vect = TfidfVectorizer(max_features=40000, min_df=3, norm='l2', ngram_range=(1, 2))
#features = tfidf_vect.fit_transform(corpus).toarray()
#print(len(tfidf_vect.vocabulary_), features.shape)

40000 (15118, 40000)
Wall time: 2min 51s


In [5]:
## save vectorizer:
#vectorizer_path = "../resources/tdidf_bigram_k_vectorizer.pkl"
#with open(vectorizer_path, 'wb') as picklefile:
#    pickle.dump(tfidf_vect, picklefile)

In [4]:
# open vectorizer:
tfidf_vect = None
vectorizer_path = "../resources/tdidf_bigram_k_vectorizer.pkl"
with open(vectorizer_path, 'rb') as picklefile:
    tfidf_vect = pickle.load(picklefile)

Split in train and test set:

In [5]:
dataset_path = "../resources/papers-k-subjects-dataset.pkl"
df = pd.read_pickle(dataset_path)

X_df = df.loc[:, 'paper_id':'primary-subject']
y_df = df.drop(columns=['paper_id', 'primary-subject'], axis=1)

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=None, random_state=0)
train_index, test_index = next(msss.split(X_df, y_df))
print(len(train_index), len(test_index))



12088 3030


In [10]:
ids_train = list(df.iloc[train_index]['paper_id'])
ids_test = list(df.iloc[test_index]['paper_id'])

ddf_train = ddf.loc[ddf['paper_id'].isin(ids_train)]
ddf_test = ddf.loc[ddf['paper_id'].isin(ids_test)]

X_train = tfidf_vect.transform(ddf_train['text'])
X_test = tfidf_vect.transform(ddf_test['text'])
print(X_train.shape, X_test.shape)

(12088, 40000) (3030, 40000)


In [12]:
y_train = y_df.iloc[train_index]
y_test = y_df.iloc[test_index]
print(y_train.shape, y_test.shape)

(12088, 114) (3030, 114)


In [21]:
%%time
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

model_modes = ['mnb', 'svc', 'lr']
models = {
    'lr': {'name': 'Logistic Regression ',
           'estimator': LogisticRegression(solver='sag')
    },
    'svc': {'name': 'Linear SVC         ',
           'estimator': LinearSVC()
    },
    'mnb': {'name': 'Multinomial NB     ',
           'estimator': MultinomialNB(fit_prior=True, class_prior=None)
    }
}

for model_mode in model_modes:
    print("="*20, "OneVsRest - %s" % models[model_mode]['name'], "="*20)
    classifier = OneVsRestClassifier(models[model_mode]['estimator'], n_jobs=-1)
    print("... training on %s samples" % X_train.shape[0])
    classifier.fit(X_train, y_train)
    print("... predicting on train set")
    print("\t\t score = %s" % classifier.score(X_train, y_train))
    prediction_train = classifier.predict(X_train)
    print("... predicting on test set")
    print("\t\t score = %s" % classifier.score(X_test, y_test))
    prediction_test = classifier.predict(X_test)
    # compare prediction_train-train ; compare prediction_test-test
    y_ntrain=y_train.to_numpy()
    y_ntest=y_test.to_numpy()
    subjects = list(y_train.columns)
    for i,subject in enumerate(subjects):
        print("\nsubject '%s':" % subject)
        print("- train:")
        print("%s\taccuracy = %s" %(confusion_matrix(y_ntrain[:,i],prediction_train[:,i]), accuracy_score(y_ntrain[:,i],prediction_train[:,i])))
        print("- test:")
        print("%s\taccuracy = %s" %(confusion_matrix(y_ntest[:,i],prediction_test[:,i]), accuracy_score(y_ntest[:,i],prediction_test[:,i])))
    print('\n\n')

... training on 12088 samples
... predicting on train set
		 score = 0.5448378557246857
... predicting on test set
		 score = 0.5287128712871287

subject 'astro-ph':
- train:
[[12085     0]
 [    3     0]]	accuracy = 0.9997518199867638
- test:
[[3029    0]
 [   1    0]]	accuracy = 0.9996699669966996

subject 'astro-ph.CO':
- train:
[[12084     0]
 [    4     0]]	accuracy = 0.999669093315685
- test:
[[3029    0]
 [   1    0]]	accuracy = 0.9996699669966996

subject 'astro-ph.EP':
- train:
[[12087     0]
 [    1     0]]	accuracy = 0.9999172733289212
- test:
[[3030]]	accuracy = 1.0

subject 'astro-ph.GA':
- train:
[[12082     0]
 [    6     0]]	accuracy = 0.9995036399735274
- test:
[[3029    0]
 [   1    0]]	accuracy = 0.9996699669966996

subject 'astro-ph.HE':
- train:
[[12087     0]
 [    1     0]]	accuracy = 0.9999172733289212
- test:
[[3030]]	accuracy = 1.0

subject 'astro-ph.IM':
- train:
[[12071     0]
 [   17     0]]	accuracy = 0.9985936465916612
- test:
[[3026    0]
 [   4    0]]	a

### New attempt with a simplified taxonomy:

In [22]:
dataset_path = "../resources/papers-subjects-dataset-56.pkl"
df = pd.read_pickle(dataset_path)
df = df.loc[df['paper_id'].isin(list(ddf['paper_id']))]
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,paper_id,astro-ph,cond-mat,gr-qc,math-ph,nlin,physics,quant-ph,math,cs.AI,...,cs.SC,cs.SY,q-bio,q-fin,stat.ML,stat,eess,econ,hep,nucl
0,61826,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,61836,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,61852,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,61907,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,61911,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15113,101196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
15114,101198,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
15115,101216,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
15116,101217,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


Split in train and test set:

In [23]:
ids_train = list(df.iloc[train_index]['paper_id'])
ids_test = list(df.iloc[test_index]['paper_id'])

ddf_train = ddf.loc[ddf['paper_id'].isin(ids_train)]
ddf_test = ddf.loc[ddf['paper_id'].isin(ids_test)]

X_train = tfidf_vect.transform(ddf_train['text'])
X_test = tfidf_vect.transform(ddf_test['text'])
print(X_train.shape, X_test.shape)

(12088, 40000) (3030, 40000)


In [24]:
y_df = df.drop(columns=['paper_id'], axis=1)
y_train = y_df.iloc[train_index]
y_test = y_df.iloc[test_index]
print(y_train.shape, y_test.shape)

(12088, 56) (3030, 56)


Classification:

In [25]:
%%time
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

model_modes = ['mnb', 'svc', 'lr']
models = {
    'lr': {'name': 'Logistic Regression ',
           'estimator': LogisticRegression(solver='sag')
    },
    'svc': {'name': 'Linear SVC         ',
           'estimator': LinearSVC()
    },
    'mnb': {'name': 'Multinomial NB     ',
           'estimator': MultinomialNB(fit_prior=True, class_prior=None)
    }
}

for model_mode in model_modes:
    print("="*20, "OneVsRest - %s" % models[model_mode]['name'], "="*20)
    classifier = OneVsRestClassifier(models[model_mode]['estimator'], n_jobs=-1)
    print("... training on %s samples" % X_train.shape[0])
    classifier.fit(X_train, y_train)
    print("... predicting on train set")
    print("\t\t score = %s" % classifier.score(X_train, y_train))
    prediction_train = classifier.predict(X_train)
    print("... predicting on test set")
    print("\t\t score = %s" % classifier.score(X_test, y_test))
    prediction_test = classifier.predict(X_test)
    # compare prediction_train-train ; compare prediction_test-test
    y_ntrain=y_train.to_numpy()
    y_ntest=y_test.to_numpy()
    subjects = list(y_train.columns)
    for i,subject in enumerate(subjects):
        print("\nsubject '%s':" % subject)
        print("- train:")
        print("%s\taccuracy = %s" %(confusion_matrix(y_ntrain[:,i],prediction_train[:,i]), accuracy_score(y_ntrain[:,i],prediction_train[:,i])))
        print("- test:")
        print("%s\taccuracy = %s" %(confusion_matrix(y_ntest[:,i],prediction_test[:,i]), accuracy_score(y_ntest[:,i],prediction_test[:,i])))
    print('\n\n')

... training on 12088 samples
... predicting on train set
		 score = 0.5449205823957644
... predicting on test set
		 score = 0.5287128712871287

subject 'astro-ph':
- train:
[[12062     0]
 [   26     0]]	accuracy = 0.9978491065519524
- test:
[[3025    0]
 [   5    0]]	accuracy = 0.9983498349834984

subject 'cond-mat':
- train:
[[12067     0]
 [   21     0]]	accuracy = 0.9982627399073462
- test:
[[3025    0]
 [   5    0]]	accuracy = 0.9983498349834984

subject 'gr-qc':
- train:
[[12086     0]
 [    2     0]]	accuracy = 0.9998345466578424
- test:
[[3030]]	accuracy = 1.0

subject 'math-ph':
- train:
[[12087     0]
 [    1     0]]	accuracy = 0.9999172733289212
- test:
[[3030]]	accuracy = 1.0

subject 'nlin':
- train:
[[12082     0]
 [    6     0]]	accuracy = 0.9995036399735274
- test:
[[3029    0]
 [   1    0]]	accuracy = 0.9996699669966996

subject 'physics':
- train:
[[12002     0]
 [   86     0]]	accuracy = 0.992885506287227
- test:
[[3011    0]
 [  19    0]]	accuracy = 0.993729372937