In [1]:
import os
import numpy as np
import pandas as pd
import snappy
import fastparquet
import dask
import dask.dataframe as dd
import pickle

import logging
# Keep cell output readable: only records at ERROR level or above from the
# 'distributed.worker' logger are let through.
logger = logging.getLogger('distributed.worker')
logger.setLevel(logging.ERROR)

## Open Data Set

In [2]:
# Start a local dask.distributed client for this notebook: one worker with
# four threads and a 4GB memory limit. processes=False is passed so no
# separate worker processes are requested.
from dask.distributed import Client
client = Client(n_workers=1, threads_per_worker=4, processes=False, memory_limit='4GB')
# Last expression of the cell: display the client summary.
client

0,1
Client  Scheduler: inproc://192.168.1.21/1031/1  Dashboard: http://192.168.1.21:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 4.00 GB


In [3]:
%%time
clean_parquets_dir = "../data/papers-textclean-parquets"
ddf = dd.read_parquet(clean_parquets_dir, engine='fastparquet', columns=['id_subsection', 'text_subsection'])

CPU times: user 15.1 ms, sys: 4.96 ms, total: 20 ms
Wall time: 125 ms


##### Check Dataset :

In [4]:
%%time
# Materialise the dask DataFrame as a pandas DataFrame so it can be inspected.
# NOTE(review): this computes every partition just for display; ddf.head()
# would be cheaper if a preview is all that is needed — confirm.
ddf.compute()

CPU times: user 4.37 s, sys: 1.3 s, total: 5.67 s
Wall time: 11.3 s


Unnamed: 0,id_subsection,text_subsection
0,2535.1.1,paper address problem track diagnos complex sy...
1,2535.2.1,complex sophist current gener industri process...
2,2535.2.2,want monitor state system reliabl detect abnor...
3,2535.2.3,paper propos differ approach problem model com...
4,2535.2.4,express discret failur mode produc discontinuo...
...,...,...
19460,101223.24.3,thank attent mechan predict result interpret p...
19461,101223.24.4,believ attentionbas model use structur represe...
19462,101223.25.1,thank waldman develop codevec websit thank mil...
19463,101223.25.2,lead result receiv fund european union seventh...


Check for empty text_subsection (the removal step in the next cell is currently commented out):

In [5]:
# Show the rows whose text_subsection is missing (NaN); an empty result
# means there is nothing to drop.
ddf[ddf['text_subsection'].isna()].compute()

Unnamed: 0,id_subsection,text_subsection


In [None]:
#ddf = ddf.dropna(subset=['text_subsection'])
#ddf[ddf['text_subsection'].isna()].compute()

Remove subsections used in Training Set:

In [7]:
# Load the training set as a pandas DataFrame; its id_subsection column is
# used afterwards to exclude those subsections from the data to predict.
# NOTE: pd.read_pickle unpickles the file, which can execute arbitrary code —
# only load pickle files from a trusted source.
trainingset_path = "./resources/stemmed_training_set.pkl"
df_train = pd.read_pickle(trainingset_path)
df_train

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2549.1.1,Abstract,work machin learn extract focus distinct subpr...,N_PD
1,2549.2.1,Introduction,extract problem convert text newswir articl we...,N_PD
2,2549.2.2,Introduction,increas import brought attent kind automat doc...,N_PD
3,2549.2.3,Introduction,work focus learn approach requir linguist expl...,N_PD
4,2549.2.4,Introduction,time work integr led need special wrapper proc...,N_PD
...,...,...,...,...
132703,101131.17.2,C REPRODUCIBILITY,review confer paper iclr main task network rew...,N_PD
132711,101144.2.6,1. Introduction,approach preserv spatial spectral tempor struc...,PD
132712,101144.2.7,1. Introduction,increas effort automat detect phase start time...,PD
132713,101144.2.8,1. Introduction,sampl data descript data numer eeg read partic...,PD


In [8]:
%%time
ddf = ddf[~ddf['id_subsection'].isin(df_train['id_subsection'])]
ddf.compute()

CPU times: user 11.4 s, sys: 2.88 s, total: 14.3 s
Wall time: 15.3 s


Unnamed: 0,id_subsection,text_subsection
0,2535.1.1,paper address problem track diagnos complex sy...
1,2535.2.1,complex sophist current gener industri process...
2,2535.2.2,want monitor state system reliabl detect abnor...
3,2535.2.3,paper propos differ approach problem model com...
4,2535.2.4,express discret failur mode produc discontinuo...
...,...,...
19460,101223.24.3,thank attent mechan predict result interpret p...
19461,101223.24.4,believ attentionbas model use structur represe...
19462,101223.25.1,thank waldman develop codevec websit thank mil...
19463,101223.25.2,lead result receiv fund european union seventh...


Info:

In [9]:
# Number of subsections left in the data set (len() on a dask Series
# triggers a computation).
print("Subsections in data set = %s" % len(ddf.id_subsection))

# The paper id is the part of id_subsection before the first '.'.
# Build the set from the values dask RETURNS instead of mutating a Python set
# from inside the mapped function: the side-effect approach only works while
# tasks run in the notebook's own process, and it had to declare an 'int64'
# meta for a function that actually returns None.
# str.partition keeps the whole id when it contains no '.', whereas the
# previous x[:x.find('.')] silently dropped the last character in that case.
paper_ids = ddf.id_subsection.map(
    lambda subsection_id: subsection_id.partition('.')[0],
    meta=('id_paper', 'object'),
)
id_paper_set = set(paper_ids.unique().compute())
print("Valid papers in data set = %s" % len(id_paper_set))

Subsections in data set = 3943815
Valid papers in data set = 47178


## Predict Data Set

Load vectorizer and Transform documents to document-term matrix:

In [None]:
# Restore the pickled vectorizer from disk.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files from a trusted source.
vectorizer_path = "./resources/tdidf_bigram_vectorizer.pkl"
with open(vectorizer_path, mode='rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [10]:
%%time
# Transform every subsection text into a row of the document-term matrix
# using the vectorizer loaded from disk.
# NOTE(review): rows follow the iteration order of ddf['text_subsection'];
# the ids gathered later for the prediction table are assumed to come out in
# the same order — TODO confirm.
X_to_predict = vectorizer.transform(ddf['text_subsection'])
X_to_predict

CPU times: user 2min 2s, sys: 2.76 s, total: 2min 4s
Wall time: 2min 5s


<3943815x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 64741374 stored elements in Compressed Sparse Row format>

In [11]:
# Sanity check: (number of rows, number of columns) of the document-term matrix.
X_to_predict.shape

(3943815, 40000)

Load classifier:

In [12]:
# Restore the pickled classifier from disk.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files from a trusted source.
classifier_path = "./resources/tdidf_bigr-lr.pkl"
with open(classifier_path, mode='rb') as model_file:
    model = pickle.load(model_file)

Predict Data Set:

In [13]:
%%time
# Predict one label per row of the document-term matrix with the loaded model.
y_pred = model.predict(X_to_predict)

CPU times: user 192 ms, sys: 24.4 ms, total: 217 ms
Wall time: 210 ms


In [14]:
# Report how many subsections received label 1 (reported as "PD") out of all
# predictions. The count is done by numpy on the whole array instead of a
# Python-level loop over millions of elements.
n_pd = int(np.count_nonzero(y_pred == 1))
print("PD estimated: %s/%s" % (n_pd, y_pred.shape[0]))

PD estimated: 12080/3943815


In [15]:
# free memory: drop the notebook's reference to the document-term matrix,
# which is no longer used once y_pred has been computed.
# NOTE: re-running this cell without re-creating X_to_predict raises NameError.
del X_to_predict

Save Prediction:

In [16]:
# Compute the id_subsection column and hold it as a plain Python list, so it
# can be paired with the predictions.
ids = list(ddf['id_subsection'].values.compute())

In [17]:
# Pair each subsection id with its predicted label. The two sequences are
# combined positionally, so they must be in the same row order.
df_tmp = pd.DataFrame({
    'id_subsection': ids,
    'label_predict_id': y_pred,
})
df_tmp

Unnamed: 0,id_subsection,label_predict_id
0,2535.1.1,0
1,2535.2.1,0
2,2535.2.2,0
3,2535.2.3,0
4,2535.2.4,0
...,...,...
3943810,101223.24.3,0
3943811,101223.24.4,0
3943812,101223.25.1,0
3943813,101223.25.2,0


In [18]:
# Attach the predicted label to every row of the dask DataFrame by
# left-joining the prediction table on id_subsection.
# NOTE(review): a join on id_subsection produces extra rows if an id occurs
# more than once on either side — presumably ids are unique; verify.
ddf_res = ddf.merge(df_tmp, how='left', on='id_subsection')
# NOTE(review): materialises the full merged frame just for display.
ddf_res.compute()



Unnamed: 0,id_subsection,text_subsection,label_predict_id
0,2535.1.1,paper address problem track complex system pro...,0
1,2535.2.1,complex current process grow need agent contro...,0
2,2535.2.2,want monitor state system detect behavior appr...,0
3,2535.2.3,paper differ approach problem model complex hy...,0
4,2535.2.4,express mode system behavior hybrid greater ra...,0
...,...,...,...
18169,101223.24.3,thank attent predict result interpret interest,0
18170,101223.24.4,model use represent code wide rang program pro...,0
18171,101223.25.1,thank develop thank use model sarkar fruit com...,0
18172,101223.25.2,lead result fund union seventh framework agree...,0


In [19]:
%%time
predict_parquets_dir = "/Users/maurorondina/Desktop/Tesi - Magistrale/data/papers-predict(PD-N_PD)-parquets"
dd.to_parquet(ddf_res, predict_parquets_dir, engine='fastparquet', compression='snappy')

CPU times: user 1min 11s, sys: 12.5 s, total: 1min 23s
Wall time: 1min 10s
