In [1]:
import pandas as pd
import sklearn
import numpy as np
import snappy
import fastparquet
import dask
import dask.dataframe as dd
import pickle

import logging
logger = logging.getLogger('distributed.worker')
logger.setLevel(logging.ERROR)

In [2]:
from dask.distributed import Client
# Start an in-process cluster (processes=False): a single worker running four
# threads, with a 6GB memory limit.
client = Client(n_workers=1, threads_per_worker=4, processes=False, memory_limit='6GB')
# Bare last expression so the notebook renders the client/cluster summary.
client

0,1
Client  Scheduler: inproc://192.168.1.21/14793/1  Dashboard: http://192.168.1.21:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 6.00 GB


Load the dataset with all analyzable papers:

In [3]:
%%time
# Directory with the text-cleaned paper subsections, stored as parquet files.
parquets_dir = "../../data/papers-textclean-parquets"
# index=False: no stored column is used as the dataframe index.
ddf = dd.read_parquet(parquets_dir, index=False, engine='fastparquet')
# Drop the '__null_dask_index__' column (presumably the placeholder index written with the data).
# NOTE(review): `columns=` already names the axis, so `axis=1` looks redundant —
# confirm against the installed dask version before removing it.
ddf = ddf.drop(columns = ['__null_dask_index__'], axis=1)
# Materialises the whole dataset in memory just to display it.
ddf.compute()

CPU times: user 7.78 s, sys: 2.06 s, total: 9.85 s
Wall time: 8.74 s


Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2535.1.1,Abstract,paper address problem track diagnos complex sy...,
1,2535.2.1,Introduction,complex sophist current gener industri process...,
2,2535.2.2,Introduction,want monitor state system reliabl detect abnor...,
3,2535.2.3,Introduction,paper propos differ approach problem model com...,
4,2535.2.4,Introduction,express discret failur mode produc discontinuo...,
...,...,...,...,...
19460,101223.24.3,9 CONCLUSION,thank attent mechan predict result interpret p...,
19461,101223.24.4,9 CONCLUSION,believ attentionbas model use structur represe...,
19462,101223.25.1,ACKNOWLEDGMENTS,thank waldman develop codevec websit thank mil...,
19463,101223.25.2,ACKNOWLEDGMENTS,lead result receiv fund european union seventh...,


Load the dataset with the predictions for the analyzable papers:

In [4]:
%%time
# Directory with the per-subsection model predictions, stored as parquet files.
predict_parquets_dir = "../../data/papers-predicted-parquets"
# index=False: no stored column is used as the dataframe index.
ddf_predicted = dd.read_parquet(predict_parquets_dir, index=False, engine='fastparquet')
# Drop the '__null_dask_index__' column (presumably the placeholder index written with the data).
# NOTE(review): `columns=` already names the axis, so `axis=1` looks redundant —
# confirm against the installed dask version before removing it.
ddf_predicted = ddf_predicted.drop(columns = ['__null_dask_index__'], axis=1)
# Materialises the whole dataset in memory just to display it.
ddf_predicted.compute()

CPU times: user 4.65 s, sys: 1.85 s, total: 6.5 s
Wall time: 5.13 s


Unnamed: 0,id_subsection,text_subsection,predict_id_by_lr,predict_id_by_svc,predict_id_by_mnb
0,2535.1.1,paper address problem track diagnos complex sy...,0,0,0
1,2535.2.1,complex sophist current gener industri process...,0,0,0
2,2535.2.2,want monitor state system reliabl detect abnor...,0,0,0
3,2535.2.3,paper propos differ approach problem model com...,0,0,0
4,2535.2.4,express discret failur mode produc discontinuo...,0,0,0
...,...,...,...,...,...
18169,101223.24.3,thank attent mechan predict result interpret p...,0,0,0
18170,101223.24.4,believ attentionbas model use structur represe...,0,0,0
18171,101223.25.1,thank waldman develop codevec websit thank mil...,0,0,0
18172,101223.25.2,lead result receiv fund european union seventh...,0,0,0


Merge the two datasets:

In [5]:
# Only the SVC prediction is carried over. The left join keeps every
# subsection of `ddf`, whether or not a prediction exists for it.
ddf_tmp = ddf_predicted[['id_subsection', 'predict_id_by_svc']]
ddf_res = ddf.merge(ddf_tmp, on='id_subsection', how='left')
ddf_res.compute()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,predict_id_by_svc
0,2535.2.6,Introduction,advantag use gener probabilist model dbn fault...,,0.0
1,2535.3.22,The framework,show dbn creat process node ft ft simpli add i...,,0.0
2,2535.4.2,Inference,build algorithm start classic kalman filter al...,,0.0
3,2536.5.2,Combining Action Theories,use fluent inhand hasitem account name item ro...,,0.0
4,2536.6.2,Combining Different Agents,money sendrec agent interact buy sell move blo...,,0.0
...,...,...,...,...,...
161707,101223.17.6,6 EVALUATION,follow work larg amount code duplic github alo...,,0.0
161708,101223.18.13,6.1 Quantitative Evaluation,amount data score model drop data score drop t...,,0.0
161709,101223.21.8,6.4 Qualitative Evaluation,open connect close disconnect key key valu val...,,0.0
161710,101223.23.12,8 RELATED WORK,work use distribut represent code element path...,,0.0


In [6]:
# The inputs of the merge are not used again; drop the references to them.
del ddf
del ddf_predicted
del ddf_tmp
# Distinct prediction values after the left join; NaN marks subsections
# that have no row in the predicted dataset.
ddf_res.predict_id_by_svc.unique().compute()

0    0.0
1    1.0
2    NaN
Name: predict_id_by_svc, dtype: float64

In [7]:
# Check on 'predict_id_by_svc': count the subsections left without a
# prediction after the merge, then how many of those carry a label.
missing_prediction = ddf_res.predict_id_by_svc.isna()
# `Series.str` is an accessor object, so the former `.str != None` test was a
# plain Python `True` and never filtered anything. Test the values instead.
# NOTE(review): if unlabelled rows hold '' rather than null, also exclude ''.
has_label = ~ddf_res.label_subsection.isna()
print(len(ddf_res.loc[missing_prediction].compute()))
print(len(ddf_res.loc[missing_prediction & has_label].compute()))

96380
96380


Find interesting subsections:

In [8]:
ddf_res['interesting_subsection'] = 0

In [9]:
# Lower-case keyword fragments identifying the paragraphs that matter for
# understanding the context of a paper. A paragraph is relevant when its
# name contains any of these fragments.
relevant_paragraphs = [
    'abstract',
    'introduction',
    'background',
    'preliminaries',
    'motiv',            # e.g. "motivations", "motivating example"
    'description',      # e.g. "model description"
    'overview',         # e.g. "system overview"
    'problem',          # e.g. "problem definition", "the ... Problem"
    'application',
    'scenario',
    'goal',             # e.g. "design goals"
    'discussion',
    # NOTE(review): matched as a substring, so 'work' also hits titles such
    # as "framework" or "network" — confirm this is intended.
    'work',             # e.g. "future work", "related work"
    'result',
    'conclusion',
    # 'experiment',
    # 'architecture',
    'domain',           # e.g "domain modelling"
]

def is_relevant_paragraph(paragraph_name, keywords=None):
    """Flag a paragraph whose name contains one of the relevant keywords.

    Args:
        paragraph_name: Paragraph title, matched case-insensitively. A value
            that is not a string (e.g. None/NaN for a missing title) is
            treated as not relevant instead of raising.
        keywords: Lower-case substrings to look for. Defaults to the
            notebook-level ``relevant_paragraphs`` list.

    Returns:
        1 if any keyword occurs in the lower-cased name, otherwise 0.
    """
    if not isinstance(paragraph_name, str):
        return 0
    if keywords is None:
        keywords = relevant_paragraphs
    # Lower-case once instead of once per keyword.
    name = paragraph_name.lower()
    return int(any(keyword in name for keyword in keywords))

# Flag each subsection by its paragraph title; `meta` declares the result
# to dask as an unnamed int64 series.
ddf_res['interesting_subsection'] = ddf_res['paragraph_name'].apply(
    is_relevant_paragraph,
    meta=(None, 'int64'),
)

In [10]:
print(len(ddf_res.loc[ddf_res.interesting_subsection == 1]))

1502182


In [11]:
# A subsection is also interesting when it is labelled 'PD' or when the SVC
# model predicted it as such (prediction == 1).
is_labelled_pd = ddf_res.label_subsection == 'PD'
is_predicted_pd = ddf_res.predict_id_by_svc == 1
condition = is_labelled_pd | is_predicted_pd
# Force the flag to 1 where the condition holds; other rows keep their value.
ddf_res['interesting_subsection'] = ddf_res['interesting_subsection'].mask(condition, 1)

In [12]:
print(len(ddf_res.loc[ddf_res.interesting_subsection == 1]))

1538402


Get all arXiv paper ids and keep only the subsections of arXiv papers:

In [13]:
# Listing of arXiv papers: each line starts with the paper id, followed by a
# double-tab separator and the rest of the record. Only the id is kept.
list_arxiv_paper = '../../data/LIST_PAPERS_arxiv.txt'
with open(list_arxiv_paper, 'r') as f:
    # Drop the line ending before splitting, so a line without the separator
    # does not keep a trailing newline that could never match an id.
    # Blank lines are skipped.
    arxiv_ids = [line.rstrip('\r\n').split('\t\t')[0] for line in f if line.strip()]

In [36]:
# The label/prediction columns were only needed to compute the flag.
ddf_arxiv = ddf_res.drop(columns = ['label_subsection','predict_id_by_svc'], axis=1)
# id_subsection has the form "<paper>.<paragraph>.<subsection>".
# Split-based extraction keeps an id without any dot whole; slicing on
# find()/rfind() would silently drop its last character (index -1).
ddf_arxiv['id_paper'] = ddf_arxiv.id_subsection.apply(lambda x: x.split('.', 1)[0], meta=(None, 'object'))
ddf_arxiv['id_paragraph'] = ddf_arxiv.id_subsection.apply(lambda x: x.rsplit('.', 1)[0], meta=(None, 'object'))
# Keep only the subsections whose paper id appears in the arXiv listing.
ddf_arxiv = ddf_arxiv.loc[ddf_arxiv.id_paper.isin(arxiv_ids)]
ddf_arxiv.compute()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,interesting_subsection,id_paper,id_paragraph
55408,41513.3.3,2 Data,imag depict robot process displac bottl away i...,0,41513,41513.3
55409,41513.4.2,3 Method,xt ut given initi goal state invers model prov...,0,41513,41513.4
55410,41513.6.2,3.2 Evaluation Procedure,succe achiev goal configur visual statist pair...,0,41513,41513.6
55411,41513.8.1,4 Results,robot task displac object initi imag configur ...,1,41513,41513.8
55412,41513.8.4,4 Results,poke object small distanc row depict exampl ro...,1,41513,41513.8
...,...,...,...,...,...,...
161707,101223.17.6,6 EVALUATION,follow work larg amount code duplic github alo...,0,101223,101223.17
161708,101223.18.13,6.1 Quantitative Evaluation,amount data score model drop data score drop t...,0,101223,101223.18
161709,101223.21.8,6.4 Qualitative Evaluation,open connect close disconnect key key valu val...,0,101223,101223.21
161710,101223.23.12,8 RELATED WORK,work use distribut represent code element path...,1,101223,101223.23


Keep only the paragraphs with at least one interesting subsection (`interesting_subsection == 1`):

In [51]:
# Number of interesting subsections per paragraph, gathered into pandas.
at_least_series = (
    ddf_arxiv
    .groupby(['id_paragraph'])
    .interesting_subsection
    .sum()
    .compute()
)
# Ids of the paragraphs containing at least one interesting subsection.
at_least_paragraph_ids = at_least_series.loc[at_least_series > 0].index.tolist()
len(at_least_paragraph_ids)

114064

In [52]:
# Keep every subsection (flagged or not) of the selected paragraphs, so each
# retained paragraph stays complete.
ddf_arxiv = ddf_arxiv.loc[ddf_arxiv.id_paragraph.isin(at_least_paragraph_ids)]
ddf_arxiv.compute()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,interesting_subsection,id_paper,id_paragraph
55411,41513.8.1,4 Results,robot task displac object initi imag configur ...,1,41513,41513.8
55412,41513.8.4,4 Results,poke object small distanc row depict exampl ro...,1,41513,41513.8
55413,41513.11.1,6 Discussion and Future Work,work propos learn intuit model physic interact...,1,41513,41513.11
82185,61823.1.1,Abstract,nonneg matrix factor ubiquit tool data analysi...,1,61823,61823.1
82186,61824.2.5,1 Introduction,paper provid mathemat analysi composit model s...,1,61824,61824.2
...,...,...,...,...,...,...
161702,101222.4.3,3 LEARNING DECISION GRAPHS,saw previou structur bs impos set independ con...,0,101222,101222.4
161703,101222.4.14,3 LEARNING DECISION GRAPHS,dijk denot number case xi par defin nabc follo...,0,101222,101222.4
161704,101223.8.17,2.1 Motivating Example,model base neural network model humaninterpret...,1,101223,101223.8
161710,101223.23.12,8 RELATED WORK,work use distribut represent code element path...,1,101223,101223.23


Sort by 'id_subsection' value to build the text for the paper (with only interesting paragraphs):

In [58]:
def adjust_id_subsection(id_subsection):
    """Zero-pad the paragraph and subsection parts of a dotted subsection id.

    "2535.1.1" becomes "2535.001.0001": the paragraph number is padded to
    3 digits and the subsection number to 4, so the ids of one paper sort
    in reading order as plain strings.

    NOTE(review): the paper number is left unpadded, so ids of different
    papers sort lexicographically ("100002" before "99998"), not
    numerically — confirm this is intended.
    """
    parts = id_subsection.split('.')
    paper = int(parts[0])
    paragraph = int(parts[1])
    subsection = int(parts[2])
    return "%d.%03d.%04d" % (paper, paragraph, subsection)

# The flag is no longer needed once the paragraphs have been selected.
ddf_text = ddf_arxiv.drop('interesting_subsection', axis=1)
# Rewrite the ids with zero-padded parts so that string order follows the
# numeric paragraph/subsection order within a paper.
ddf_text['id_subsection'] = ddf_text.id_subsection.apply(lambda x: adjust_id_subsection(x), meta=(None, 'object'))
ddf_text = ddf_text.set_index('id_subsection')
# Sort each partition by the new index.
# NOTE(review): set_index presumably already orders the rows by the index,
# which would make this pass redundant — confirm.
ddf_text = ddf_text.map_partitions(lambda x: x.sort_index())
ddf_text.compute()

Unnamed: 0_level_0,paragraph_name,text_subsection,id_paper,id_paragraph
id_subsection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100002.001.0001,Abstract,consid problem learn function comput children ...,100002,100002.1
100002.001.0002,Abstract,sens second step causal discoveri probabilist ...,100002,100002.1
100002.002.0001,1 Introduction,larg part literatur causal concern learn causa...,100002,100002.2
100002.002.0002,1 Introduction,start point paper consid done causal graph sys...,100002,100002.2
100002.002.0003,1 Introduction,known variabl function variabl precis function...,100002,100002.2
...,...,...,...,...
99998.010.0004,5 APPENDIX: BRIEF BACKGROUND,krizhevski deng complet review common layer,99998,99998.10
99998.010.0005,5 APPENDIX: BRIEF BACKGROUND,convolut layer defin input imag rhwc filter rm...,99998,99998.10
99998.010.0006,5 APPENDIX: BRIEF BACKGROUND,fulli connect layer defin wx bet input rnr wei...,99998,99998.10
99998.010.0007,5 APPENDIX: BRIEF BACKGROUND,typic batch normal layer written wy bet input ...,99998,99998.10


In [61]:
# Join the subsections of each paper with '|' as a visible separator, so the
# correctness of the reconstruction can be checked by eye.
# NOTE(review): relies on the rows of each group reaching `join` in index
# order — confirm this holds for dask's groupby-apply.
tmp = ddf_text.groupby('id_paper')['text_subsection'].apply('|'.join, meta=(None, 'object'))
tmp.compute()

id_paper
100036    paper present multimod biometr system fingerpr...
100075    random trial known ab test select polici contr...
100147    deep qnetwork returnbas reinforc learn promis ...
100161    studi problem learn polici demonstr combinator...
100208    present novel method compress deep convolut ne...
                                ...                        
99842     case combin classifi show product rule aris ma...
99920     propos novel dialogu model framework use binar...
99922     studi classif problem featur acquir cost goal ...
99932     challeng imag process task describ illpos line...
99988     model physic system learn molecular fingerprin...
Length: 21802, dtype: object

In [70]:
type(tmp)

dask.dataframe.core.Series

In [106]:
# Materialise the joined texts as a pandas DataFrame: one unnamed column (0),
# indexed by id_paper.
df_text = tmp.to_frame().compute()
df_text

Unnamed: 0_level_0,0
id_paper,Unnamed: 1_level_1
100036,paper present multimod biometr system fingerpr...
100075,random trial known ab test select polici contr...
100147,deep qnetwork returnbas reinforc learn promis ...
100161,studi problem learn polici demonstr combinator...
100208,present novel method compress deep convolut ne...
...,...
99842,case combin classifi show product rule aris ma...
99920,propos novel dialogu model framework use binar...
99922,studi classif problem featur acquir cost goal ...
99932,challeng imag process task describ illpos line...


In [107]:
# Turn the id_paper index into a regular column and give both columns their
# final names, in a single chained expression.
df_text = (
    df_text
    .reset_index()
    .rename(columns={'id_paper': 'paper_id', 0: 'text'})
)
df_text

Unnamed: 0,paper_id,text
0,100036,paper present multimod biometr system fingerpr...
1,100075,random trial known ab test select polici contr...
2,100147,deep qnetwork returnbas reinforc learn promis ...
3,100161,studi problem learn polici demonstr combinator...
4,100208,present novel method compress deep convolut ne...
...,...,...
21797,99842,case combin classifi show product rule aris ma...
21798,99920,propos novel dialogu model framework use binar...
21799,99922,studi classif problem featur acquir cost goal ...
21800,99932,challeng imag process task describ illpos line...


In [108]:
# Spot check on one paper (ok!): the subsections appear joined by '|'.
df_text.loc[df_text.paper_id == '100002', 'text'].squeeze()

'consid problem learn function comput children parent structur causal model underli causal graph identifi|sens second step causal discoveri probabilist approach estim function deriv natur myopic activ learn scheme identifi intervent optim inform unknown function jointli given previous observ data test deriv algorithm simpl exampl demonstr produc structur explor polici significantli improv unstructur baselin|larg part literatur causal concern learn causal graph system random variabl known causal discoveri causal infer problem motiv realist problem scienc biologist wish discov gene respons regul gene cell public health research wish certain habit popul influenc certain health outcom|start point paper consid done causal graph system variabl identifi|known variabl function variabl precis function relationship unknown understand causal relationship coars sens abl accur predict result intervent system possibl implic decis make|instanc suppos cell upregul downregul|reduc express lead decreas 

In [109]:
# Swap the '|' check separator for a plain space. regex=False matters here:
# as a regular expression, '|' would not be a literal pipe.
df_text['text'] = df_text['text'].str.replace('|', ' ', regex=False)
df_text.loc[df_text.paper_id == '100002', 'text'].squeeze()

'consid problem learn function comput children parent structur causal model underli causal graph identifi sens second step causal discoveri probabilist approach estim function deriv natur myopic activ learn scheme identifi intervent optim inform unknown function jointli given previous observ data test deriv algorithm simpl exampl demonstr produc structur explor polici significantli improv unstructur baselin larg part literatur causal concern learn causal graph system random variabl known causal discoveri causal infer problem motiv realist problem scienc biologist wish discov gene respons regul gene cell public health research wish certain habit popul influenc certain health outcom start point paper consid done causal graph system variabl identifi known variabl function variabl precis function relationship unknown understand causal relationship coars sens abl accur predict result intervent system possibl implic decis make instanc suppos cell upregul downregul reduc express lead decreas 

Save results:

In [110]:
# Destination of the final (paper_id, text) table.
dataset_path = "../resources/interesting_arxiv_papers_textclean.pkl"
# The pickle protocol is pinned to 4 rather than left to the default.
df_text.to_pickle(dataset_path, protocol=4)