In [1]:
import os
import numpy as np
import pandas as pd
import snappy
import fastparquet
import dask
import dask.dataframe as dd
import pickle

import logging
# Only let ERROR-level (and above) records through on the 'distributed.worker' logger.
logger = logging.getLogger('distributed.worker')
logger.setLevel(logging.ERROR)

## Open Data Set

In [2]:
from dask.distributed import Client

# Start an in-process cluster (processes=False): one worker, four threads,
# and a 6GB memory limit. The trailing expression displays the client summary.
cluster_options = {
    'n_workers': 1,
    'threads_per_worker': 4,
    'processes': False,
    'memory_limit': '6GB',
}
client = Client(**cluster_options)
client

0,1
Client  Scheduler: inproc://192.168.5.13/8764/1  Dashboard: http://192.168.5.13:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 6.00 GB


In [3]:
%%time
clean_parquets_dir = "../data/papers-textclean-parquets"
ddf = dd.read_parquet(clean_parquets_dir, engine='fastparquet', columns=['id_subsection', 'text_subsection'])

Wall time: 31.2 ms


##### Check Dataset:

In [4]:
%%time
# Materialise the whole Dask DataFrame as a pandas DataFrame to eyeball it.
# NOTE(review): this pulls every row into memory just for display;
# ddf.head() may be enough here — confirm.
ddf.compute()

Wall time: 5.11 s


Unnamed: 0,id_subsection,text_subsection
0,2535.1.1,paper address problem track diagnos complex sy...
1,2535.2.1,complex sophist current gener industri process...
2,2535.2.2,want monitor state system reliabl detect abnor...
3,2535.2.3,paper propos differ approach problem model com...
4,2535.2.4,express discret failur mode produc discontinuo...
...,...,...
19460,101223.24.3,thank attent mechan predict result interpret p...
19461,101223.24.4,believ attentionbas model use structur represe...
19462,101223.25.1,thank waldman develop codevec websit thank mil...
19463,101223.25.2,lead result receiv fund european union seventh...


Check for empty text_subsection (none found, so no rows need to be removed):

In [5]:
# Select and display the rows whose text_subsection is missing.
missing_text = ddf['text_subsection'].isna()
ddf[missing_text].compute()

Unnamed: 0,id_subsection,text_subsection


In [None]:
# Left disabled: the check above returned no rows with a missing text_subsection.
#ddf = ddf.dropna(subset=['text_subsection'])
#ddf[ddf['text_subsection'].isna()].compute()

Remove subsections used in Training Set:

In [6]:
# Load the pickled training set; its id_subsection column is used below to
# filter those sub-sections out of the data set.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
trainingset_path = "./resources/stemmed_training_set.pkl"
df_train = pd.read_pickle(trainingset_path)
df_train

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2549.1.1,Abstract,work machin learn extract focus distinct subpr...,N_PD
1,2549.2.1,Introduction,extract problem convert text newswir articl we...,N_PD
2,2549.2.2,Introduction,increas import brought attent kind automat doc...,N_PD
3,2549.2.3,Introduction,work focus learn approach requir linguist expl...,N_PD
4,2549.2.4,Introduction,time work integr led need special wrapper proc...,N_PD
...,...,...,...,...
132703,101131.17.2,C REPRODUCIBILITY,review confer paper iclr main task network rew...,N_PD
132711,101144.2.6,1. Introduction,approach preserv spatial spectral tempor struc...,PD
132712,101144.2.7,1. Introduction,increas effort automat detect phase start time...,PD
132713,101144.2.8,1. Introduction,sampl data descript data numer eeg read partic...,PD


In [7]:
%%time
ddf = ddf[~ddf['id_subsection'].isin(df_train['id_subsection'])]
ddf.compute()

Wall time: 6.72 s


Unnamed: 0,id_subsection,text_subsection
0,2535.1.1,paper address problem track diagnos complex sy...
1,2535.2.1,complex sophist current gener industri process...
2,2535.2.2,want monitor state system reliabl detect abnor...
3,2535.2.3,paper propos differ approach problem model com...
4,2535.2.4,express discret failur mode produc discontinuo...
...,...,...
19460,101223.24.3,thank attent mechan predict result interpret p...
19461,101223.24.4,believ attentionbas model use structur represe...
19462,101223.25.1,thank waldman develop codevec websit thank mil...
19463,101223.25.2,lead result receiv fund european union seventh...


Info:

In [8]:
# Number of sub-sections left in the data set (len() triggers a Dask computation).
print("Subsections in data set = %s" % len(ddf.id_subsection))

# The paper id is the part of id_subsection before the first '.'.
# Derive it with a pure function and let Dask deduplicate the result, instead of
# mutating a driver-side set from inside apply(): that side effect only reaches
# the notebook when the workers share its memory (in-process threads), and the
# previously declared int64 meta did not match the None the lambda returned.
# split('.', 1)[0] also keeps an id without any '.' intact, whereas
# x[:x.find('.')] silently dropped its last character in that case.
id_paper_set = set(
    ddf.id_subsection
    .map(lambda x: x.split('.', 1)[0], meta=('id_paper', 'object'))
    .unique()
    .compute()
)
print("Valid papers in data set = %s" % len(id_paper_set))

Subsections in data set = 3943815
Valid papers in data set = 47178


## Predict Data Set

Load the vectorizer and transform the documents into a document-term matrix:

In [9]:
# Unpickle the vectorizer that is used below to transform text_subsection.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
vectorizer_path = "./resources/tdidf_bigram_vectorizer.pkl"
with open(vectorizer_path, 'rb') as feature_extractor:
    vectorizer = pickle.load(feature_extractor)

In [10]:
%%time
# Transform the text_subsection column with the loaded vectorizer and display the result.
# NOTE(review): a Dask Series is passed in directly; presumably transform()
# iterates over it element by element — confirm.
X_to_predict = vectorizer.transform(ddf['text_subsection'])
X_to_predict

Wall time: 3min 23s


<3943815x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 110253030 stored elements in Compressed Sparse Row format>

In [11]:
# Display the shape of the transformed matrix.
X_to_predict.shape

(3943815, 40000)

Load classifiers and Predict Data Set:

In [14]:
# Classifier short names; each one maps to ./resources/tdidf_bigr-<name>.pkl.
model_mode = ['lr', 'svc', 'mnb']
# Predictions of each classifier, keyed by its short name.
y_pred = {}

for m in model_mode:
    classifier_path = "./resources/tdidf_bigr-"+m+".pkl"
    # NOTE: pickle.load can execute arbitrary code — only load trusted model files.
    with open(classifier_path, 'rb') as training_model:
        model = pickle.load(training_model)
    y_pred[m] = model.predict(X_to_predict)
    # Count the predictions equal to 1 with a vectorised comparison rather than
    # a Python-level loop over every element of the prediction array.
    n_pd = np.count_nonzero(y_pred[m] == 1)
    print("PD estimated with '%s': %s/%s" %(m, n_pd, y_pred[m].shape[0]))

PD estimated with 'lr': 13589/3943815
PD estimated with 'svc': 58791/3943815
PD estimated with 'mnb': 247/3943815


In [15]:
# Free memory: drop the notebook's reference to the transformed matrix now that
# the predictions are stored in y_pred.
del X_to_predict

Save Predictions:

In [16]:
# Materialise the id_subsection values as a plain Python list.
# NOTE(review): the predictions are paired with these ids by position when the
# result frame is built, so the rows must come back in the same order as the
# rows that were fed to the vectorizer — confirm.
ids = list(ddf['id_subsection'].values.compute())

In [18]:
# Assemble one column of ids plus one prediction column per classifier,
# in the order given by model_mode, and display the resulting frame.
data = {'id_subsection': ids}
data.update(('predict_id_by_' + m, y_pred[m]) for m in model_mode)

df_tmp = pd.DataFrame(data=data)
df_tmp

Unnamed: 0,id_subsection,predict_id_by_lr,predict_id_by_svc,predict_id_by_mnb
0,2535.1.1,0,0,0
1,2535.2.1,0,0,0
2,2535.2.2,0,0,0
3,2535.2.3,0,0,0
4,2535.2.4,0,0,0
...,...,...,...,...
3943810,101223.24.3,0,0,0
3943811,101223.24.4,0,0,0
3943812,101223.25.1,0,0,0
3943813,101223.25.2,0,0,0


In [19]:
# Attach the prediction columns to the text data by left-joining on id_subsection,
# then materialise the joined frame for display.
# NOTE(review): assumes id_subsection is unique; duplicate ids would multiply
# rows in the join — confirm.
ddf_res = dd.merge(ddf, df_tmp, on='id_subsection', how='left')
ddf_res.compute()

Unnamed: 0,id_subsection,text_subsection,predict_id_by_lr,predict_id_by_svc,predict_id_by_mnb
0,2535.1.1,paper address problem track diagnos complex sy...,0,0,0
1,2535.2.1,complex sophist current gener industri process...,0,0,0
2,2535.2.2,want monitor state system reliabl detect abnor...,0,0,0
3,2535.2.3,paper propos differ approach problem model com...,0,0,0
4,2535.2.4,express discret failur mode produc discontinuo...,0,0,0
...,...,...,...,...,...
18169,101223.24.3,thank attent mechan predict result interpret p...,0,0,0
18170,101223.24.4,believ attentionbas model use structur represe...,0,0,0
18171,101223.25.1,thank waldman develop codevec websit thank mil...,0,0,0
18172,101223.25.2,lead result receiv fund european union seventh...,0,0,0


In [20]:
%%time
predict_parquets_dir = "../data/papers-predicted-parquets"
dd.to_parquet(ddf_res, predict_parquets_dir, engine='fastparquet', compression='snappy')

Wall time: 1min 14s
