In [1]:
import pandas as pd
import sklearn
import numpy as np
import snappy
import fastparquet
import dask
import dask.dataframe as dd
import pickle

import logging
# Restrict the 'distributed.worker' logger to records of level ERROR and above.
# NOTE(review): presumably done to keep dask worker warnings out of the cell outputs — confirm.
logger = logging.getLogger('distributed.worker')
logger.setLevel(logging.ERROR)

In [2]:
from dask.distributed import Client
# Start a local dask cluster inside this process (processes=False):
# one worker, four threads, 6GB memory limit.
client = Client(
    n_workers=1,
    threads_per_worker=4,
    processes=False,
    memory_limit='6GB',
)
# Last expression of the cell: render the client summary.
client

0,1
Client  Scheduler: inproc://192.168.1.21/837/1  Dashboard: http://192.168.1.21:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 6.00 GB


Load the dataset with all analyzable papers:

In [3]:
%%time
# Directory with the parquet files of the papers dataset.
parquets_dir = "../../data/papers-parquets"
# Build a lazy dask DataFrame with the fastparquet engine; index=False asks dask
# not to use any stored field as the index.
ddf = dd.read_parquet(parquets_dir, index=False, engine='fastparquet')
# NOTE(review): the drop below is left disabled; the displayed frame shows no
# '__null_dask_index__' column for this dataset — confirm it is not needed.
#ddf = ddf.drop(columns = ['__null_dask_index__'], axis=1)
# Materialise the whole dataset as a pandas DataFrame so that it is displayed.
ddf.compute()

CPU times: user 13.1 s, sys: 7.56 s, total: 20.7 s
Wall time: 27.6 s


Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2535.1.1,Abstract,\n This paper addresses the problem of track...,
1,2535.2.1,Introduction,\n The complexity and sophistication of the ...,
2,2535.2.2,Introduction,\n 2000)). We want to monitor the state of t...,
3,2535.2.3,Introduction,"\n In this paper, we propose a different app...",
4,2535.2.4,Introduction,\n which are expressed as discrete failure m...,
...,...,...,...,...
19460,101223.24.3,9 CONCLUSION,"\n Thanks to the attention mechanism, the pr...",
19461,101223.24.4,9 CONCLUSION,\n We believe that the attention-based model...,
19462,101223.25.1,ACKNOWLEDGMENTS,\n We would like to thank Guy Waldman for de...,
19463,101223.25.2,ACKNOWLEDGMENTS,\n The research leading to these results has...,


Load the dataset with the predictions for the analyzable papers:

In [4]:
%%time
# Directory with the parquet files that hold the per-subsection predictions.
predict_parquets_dir = "../../data/papers-predicted-parquets"
# Read lazily and discard the '__null_dask_index__' column in a single chain.
ddf_predicted = (
    dd.read_parquet(predict_parquets_dir, index=False, engine='fastparquet')
    .drop(columns = ['__null_dask_index__'], axis=1)
)
# Materialise the frame so that it is displayed.
ddf_predicted.compute()

CPU times: user 5.31 s, sys: 2.45 s, total: 7.76 s
Wall time: 8.69 s


Unnamed: 0,id_subsection,text_subsection,predict_id_by_lr,predict_id_by_svc,predict_id_by_mnb
0,2535.1.1,paper address problem track diagnos complex sy...,0,0,0
1,2535.2.1,complex sophist current gener industri process...,0,0,0
2,2535.2.2,want monitor state system reliabl detect abnor...,0,0,0
3,2535.2.3,paper propos differ approach problem model com...,0,0,0
4,2535.2.4,express discret failur mode produc discontinuo...,0,0,0
...,...,...,...,...,...
18169,101223.24.3,thank attent mechan predict result interpret p...,0,0,0
18170,101223.24.4,believ attentionbas model use structur represe...,0,0,0
18171,101223.25.1,thank waldman develop codevec websit thank mil...,0,0,0
18172,101223.25.2,lead result receiv fund european union seventh...,0,0,0


Merge the two datasets:

In [5]:
# Only the SVC prediction is carried over; 'id_subsection' is the join key.
ddf_tmp = ddf_predicted[['id_subsection', 'predict_id_by_svc']]
# Left join: every row of `ddf` is kept, rows without a match get NaN as prediction.
ddf_res = ddf.merge(ddf_tmp, on='id_subsection', how='left')
ddf_res.compute()



Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,predict_id_by_svc
0,2535.2.6,Introduction,\n There are several advantages to the use o...,,0.0
1,2535.3.22,The framework,\n Figure 2 shows a DBN created by this proc...,,0.0
2,2535.4.2,Inference,\n We therefore build our algorithm starting...,,0.0
3,2536.5.2,Combining Action Theories,\n We use two new fluents together with inHa...,,0.0
4,2536.6.2,Combining Different Agents,\n Money Send/Receive Agents Interactions Bu...,,0.0
...,...,...,...,...,...
161713,101223.17.6,6 EVALUATION,\n Following recent work which found a large...,,0.0
161714,101223.18.13,6.1 Quantitative Evaluation,"\n With 20% of the amounts of data, the F1 s...",,0.0
161715,101223.21.8,6.4 Qualitative Evaluation,\n A : B C : D open : connect close : discon...,,0.0
161716,101223.23.12,8 RELATED WORK,"\n In this work, we use distributed represen...",,0.0


In [6]:
# Remove the names of the frames that are not used after the merge.
del ddf, ddf_predicted, ddf_tmp
# Distinct values taken by the SVC prediction in the merged frame.
ddf_res['predict_id_by_svc'].unique().compute()

0    0.0
1    1.0
2    NaN
Name: predict_id_by_svc, dtype: float64

In [7]:
# check on 'predict_id_by_svc'
# First number: subsections without an SVC prediction.
# Second number: among those, the subsections that do carry a label.
# The label test uses .notnull(): comparing the `.str` accessor with None is
# always True and therefore filtered nothing.
missing_prediction = ddf_res.predict_id_by_svc.isna()
has_label = ddf_res.label_subsection.notnull()
# Summing a boolean mask counts the rows without materialising the filtered frame.
print(missing_prediction.sum().compute())
print((missing_prediction & has_label).sum().compute())

96567
96567


Find interesting subsections:

In [8]:
# Create the flag column with 0 for every subsection.
# NOTE(review): the next cell reassigns the whole column, so this initialisation looks redundant — confirm.
ddf_res['interesting_subsection'] = 0

In [9]:
# subsections in important paragraph (to understand the context)
# Lower-case keywords identifying the paragraphs that are important to
# understand the context of a paper. They are matched as substrings of the
# lower-cased paragraph title.
# NOTE(review): substring matching means that 'work' also matches titles such
# as "The framework" or "Neural networks" — confirm this is intended.
relevant_paragraphs = ['abstract',
                       'introduction',
                       'background',
                       'preliminaries',
                       'motiv',             # e.g. "motivations", "motivating example"
                       'description',       # e.g. "model description"
                       'overview',          # e.g. "system overview"
                       'problem',           # e.g. "problem definition", "the ... Problem"
                       'application',
                       'scenario',
                       'goal',              # e.g. "design goals"
                       'discussion',
                       'work',              # e.g. "future work", "related work"
                       'result',
                       'conclusion',
                       #'experiment',
                       #'architecture',
                       'domain',            # e.g "domain modelling"
                      ]

def is_relevant_paragraph(paragraph_name):
    """Tell whether a paragraph title contains one of the relevant keywords.

    Args:
        paragraph_name: title of the paragraph. Values that are not strings
            (e.g. NaN or None for a missing title) are treated as not relevant.

    Returns:
        1 if any entry of ``relevant_paragraphs`` is a substring of the
        lower-cased title, 0 otherwise.
    """
    # A missing title has no .lower(); report it as not relevant instead of failing.
    if not isinstance(paragraph_name, str):
        return 0
    # Lower-case once instead of once per keyword.
    lowered = paragraph_name.lower()
    return int(any(keyword in lowered for keyword in relevant_paragraphs))

# Flag every subsection according to the title of its paragraph (1 or 0).
ddf_res['interesting_subsection'] = ddf_res.paragraph_name.apply(
    is_relevant_paragraph,
    meta=(None, 'int64'),
)

In [10]:
# Number of rows currently flagged as interesting.
print(len(ddf_res.loc[ddf_res.interesting_subsection == 1]))

1502209


In [11]:
# subsection defined or predicted as 'PD'
condition = (ddf_res.label_subsection == 'PD') | (ddf_res.predict_id_by_svc == 1)
ddf_res['interesting_subsection'] = ddf_res['interesting_subsection'].mask(condition, 1)

In [12]:
# Number of flagged rows after adding the subsections labelled or predicted as 'PD'.
print(len(ddf_res.loc[ddf_res.interesting_subsection == 1]))

1538429


Get all arXiv paper ids and keep only the subsections of arXiv papers:

In [13]:
list_arxiv_paper = '../../data/LIST_PAPERS_arxiv.txt'
# The paper id is the first field of each line; fields are separated by two
# tab characters. The line terminator is removed before splitting so that a
# line without any separator still yields a clean id, and blank lines are
# skipped instead of producing junk entries.
with open(list_arxiv_paper, 'r') as f:
    arxiv_ids = [line.rstrip('\n').split('\t\t')[0] for line in f if line.strip()]

In [14]:
# The label and prediction columns are no longer needed once the flag is computed.
ddf_arxiv = ddf_res.drop(columns = ['label_subsection','predict_id_by_svc'], axis=1)
# Subsection ids are dot-separated, e.g. '41513.3.3' -> paper '41513', paragraph '41513.3'.
# split/rsplit with maxsplit=1 return the whole id when it contains no '.',
# whereas slicing with find()/rfind() would cut off its last character (-1).
ddf_arxiv['id_paper'] = ddf_arxiv.id_subsection.apply(lambda x: x.split('.', 1)[0], meta=(None, 'object'))
ddf_arxiv['id_paragraph'] = ddf_arxiv.id_subsection.apply(lambda x: x.rsplit('.', 1)[0], meta=(None, 'object'))
# Keep only the subsections whose paper id is in the arXiv list.
ddf_arxiv = ddf_arxiv.loc[ddf_arxiv.id_paper.isin(arxiv_ids)]
ddf_arxiv.compute()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,interesting_subsection,id_paper,id_paragraph
55410,41513.3.3,2 Data,\n 2 Figure 2: These images depict the robo...,0,41513,41513.3
55411,41513.4.2,3 Method,"\n x̂t+1 = F(xt, ut; Wfwd) (1) ˆ ut = G(xt, ...",0,41513,41513.4
55412,41513.6.2,3.2 Evaluation Procedure,\n succeeds at achieving the goal configurat...,0,41513,41513.6
55413,41513.8.1,4 Results,\n The robot was tasked to displace objects ...,1,41513,41513.8
55414,41513.8.4,4 Results,\n poking object by small distances). Row 3 ...,1,41513,41513.8
...,...,...,...,...,...,...
161713,101223.17.6,6 EVALUATION,\n Following recent work which found a large...,0,101223,101223.17
161714,101223.18.13,6.1 Quantitative Evaluation,"\n With 20% of the amounts of data, the F1 s...",0,101223,101223.18
161715,101223.21.8,6.4 Qualitative Evaluation,\n A : B C : D open : connect close : discon...,0,101223,101223.21
161716,101223.23.12,8 RELATED WORK,"\n In this work, we use distributed represen...",1,101223,101223.23


Keep only the paragraphs with at least one subsection where `interesting_subsection` = 1:

In [15]:
# Per paragraph, add up the flags of its subsections and bring the result into pandas.
flag_totals = ddf_arxiv.groupby(['id_paragraph'])['interesting_subsection'].sum()
at_least_series = flag_totals.compute()
# Ids of the paragraphs whose total is positive, i.e. with at least one flagged subsection.
positive_totals = at_least_series.loc[at_least_series > 0]
at_least_paragraph_ids = positive_totals.index.tolist()
len(at_least_paragraph_ids)

114064

In [16]:
# Keep every subsection that belongs to one of the selected paragraphs.
in_selected_paragraph = ddf_arxiv['id_paragraph'].isin(at_least_paragraph_ids)
ddf_arxiv = ddf_arxiv[in_selected_paragraph]
ddf_arxiv.compute()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,interesting_subsection,id_paper,id_paragraph
55413,41513.8.1,4 Results,\n The robot was tasked to displace objects ...,1,41513,41513.8
55414,41513.8.4,4 Results,\n poking object by small distances). Row 3 ...,1,41513,41513.8
55415,41513.11.1,6 Discussion and Future Work,\n In this work we propose to learn “intuiti...,1,41513,41513.11
82188,61823.1.1,Abstract,\n Nonnegative matrix factorization (NMF) ha...,1,61823,61823.1
82189,61824.2.5,1 Introduction,\n This paper provides a mathematical analys...,1,61824,61824.2
...,...,...,...,...,...,...
161708,101222.4.3,3 LEARNING DECISION GRAPHS,"\n As we saw in the previous section, the st...",0,101222,101222.4
161709,101222.4.14,3 LEARNING DECISION GRAPHS,"\n That is, Θ−1 (θ) = {i, j, k|Θ(i, j, k) = ...",0,101222,101222.4
161710,101223.8.17,2.1 Motivating Example,\n • Although our model is based on a neural...,1,101223,101223.8
161716,101223.23.12,8 RELATED WORK,"\n In this work, we use distributed represen...",1,101223,101223.23


Sort by 'id_subsection' value to build the text for the paper (with only interesting paragraphs):

In [17]:
def adjust_id_subsection(id_subsection):
    """Rewrite a 'paper.paragraph.subsection' id with zero-padded numbers.

    The paragraph number is padded to 3 digits and the subsection number to
    4 digits, e.g. '2535.1.1' -> '2535.001.0001'. The paper number is emitted
    without padding. Only the first three dot-separated parts are used.
    """
    numbers = [int(piece) for piece in id_subsection.split('.')[:3]]
    return "{}.{:03}.{:04}".format(*numbers)

# The flag column has served its purpose; the text frame does not carry it.
ddf_text = ddf_arxiv.drop('interesting_subsection', axis=1)
# Zero-pad the ids so that their string order follows the paragraph/subsection numbers.
ddf_text['id_subsection'] = ddf_text['id_subsection'].apply(adjust_id_subsection, meta=(None, 'object'))
# Index by the padded id, then sort each partition by that index.
ddf_text = (
    ddf_text
    .set_index('id_subsection')
    .map_partitions(lambda part: part.sort_index())
)
ddf_text.compute()

Unnamed: 0_level_0,paragraph_name,text_subsection,id_paper,id_paragraph
id_subsection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100002.001.0001,Abstract,\n We consider the problem of learning the f...,100002,100002.1
100002.001.0002,Abstract,\n This is in some sense the second step aft...,100002,100002.1
100002.002.0001,1 Introduction,\n Large parts of the literature on causalit...,100002,100002.2
100002.002.0002,1 Introduction,\n The starting point of this paper is to co...,100002,100002.2
100002.002.0003,1 Introduction,"\n That is, it is known which variables are ...",100002,100002.2
...,...,...,...,...
99998.010.0004,5 APPENDIX: BRIEF BACKGROUND,\n (2015); Krizhevsky et al. (2009); Deng et...,99998,99998.10
99998.010.0005,5 APPENDIX: BRIEF BACKGROUND,\n The convolution layer can be defined as ...,99998,99998.10
99998.010.0006,5 APPENDIX: BRIEF BACKGROUND,\n The fully connected layer is defined as ...,99998,99998.10
99998.010.0007,5 APPENDIX: BRIEF BACKGROUND,\n The typical batch normalization layer can...,99998,99998.10


In [20]:
# Show the subsections whose text is missing (NaN).
ddf_text.loc[ddf_text.text_subsection.isna()].compute()

Unnamed: 0_level_0,paragraph_name,text_subsection,id_paper,id_paragraph
id_subsection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64514.006.0032,5. Transformations of Deep Residual Networks,,64514,64514.6
64754.009.0020,3.3 The hardness of strong-minimality,,64754,64754.9
66189.005.0008,2.2. Neural networks 2.2.1. Single-layer neura...,,66189,66189.5
67100.008.0028,5.2 Proof of Theorem 6,,67100,67100.8
67100.009.0028,5.3 A Finite-Sample Generalization Result and ...,,67100,67100.9
67409.002.0003,1. Introduction,,67409,67409.2
67799.007.0013,A Stability results on time-varying linear sys...,,67799,67799.7
68118.012.0003,Results and Analysis,,68118,68118.12
69290.022.0007,A.3. Negative Result: Non Inverse-Degree Dynam...,,69290,69290.22
70793.005.0014,4. Experimental Results,,70793,70793.5


In [21]:
# Remove the subsections without text, then run the same query again:
# it is expected to return no rows.
ddf_text = ddf_text.dropna(subset=['text_subsection'])
ddf_text.loc[ddf_text.text_subsection.isna()].compute()

Unnamed: 0_level_0,paragraph_name,text_subsection,id_paper,id_paragraph
id_subsection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [22]:
# use '|' to check correctness of reconstrucion:
tmp = ddf_text.groupby('id_paper')['text_subsection'].apply('|'.join, meta=(None, 'object'))
tmp.compute()

id_paper
100036    \n   This paper presents a multimodal biometri...
100075    \n   Randomized trials, also known as A/B test...
100147    \n   The deep Q-network (DQN) and return-based...
100161    \n   We study the problem of learning a good s...
100208    \n   We present a novel method of compression ...
                                ...                        
99842     \n   for the case of combining classifiers. We...
99920     \n   We propose a novel dialogue modeling fram...
99922     \n   We study a classification problem where e...
99932     \n   Many challenging image processing tasks c...
99988     \n   Modeling physics system, learning molecul...
Length: 21802, dtype: object

In [23]:
# Inspect the type of `tmp` (the groupby result has not been materialised into pandas yet).
type(tmp)

dask.dataframe.core.Series

In [24]:
# Convert the joined series to a one-column frame and materialise it as a pandas DataFrame.
df_text = tmp.to_frame().compute()
df_text

Unnamed: 0_level_0,0
id_paper,Unnamed: 1_level_1
100036,\n This paper presents a multimodal biometri...
100075,"\n Randomized trials, also known as A/B test..."
100147,\n The deep Q-network (DQN) and return-based...
100161,\n We study the problem of learning a good s...
100208,\n We present a novel method of compression ...
...,...
99842,\n for the case of combining classifiers. We...
99920,\n We propose a novel dialogue modeling fram...
99922,\n We study a classification problem where e...
99932,\n Many challenging image processing tasks c...


In [25]:
# Turn the paper id from index into a column and give both columns their final names.
df_text = (
    df_text
    .reset_index()
    .rename(columns={'id_paper': 'paper_id', 0: 'text'})
)
df_text

Unnamed: 0,paper_id,text
0,100036,\n This paper presents a multimodal biometri...
1,100075,"\n Randomized trials, also known as A/B test..."
2,100147,\n The deep Q-network (DQN) and return-based...
3,100161,\n We study the problem of learning a good s...
4,100208,\n We present a novel method of compression ...
...,...,...
21797,99842,\n for the case of combining classifiers. We...
21798,99920,\n We propose a novel dialogue modeling fram...
21799,99922,\n We study a classification problem where e...
21800,99932,\n Many challenging image processing tasks c...


In [26]:
# check: ok!
df_text.loc[df_text.paper_id == '100002']['text'].squeeze()

'\n   We consider the problem of learning the functions computing children from parents in a Structural Causal Model once the underlying causal graph has been identified.\n  |\n   This is in some sense the second step after causal discovery. Taking a probabilistic approach to estimating these functions, we derive a natural myopic active learning scheme that identifies the intervention which is optimally informative about all of the unknown functions jointly, given previously observed data. We test the derived algorithms on simple examples, to demonstrate that they produce a structured exploration policy that significantly improves on unstructured base-lines.\n  |\n   Large parts of the literature on causality are concerned with learning the causal graph of a system of random variables [Spirtes et al., 2000, Tong and Koller, 2001, Eberhardt, 2010, Hyttinen et al., 2013, Mooij et al., 2016]. Also known as causal discovery or causal inference, this problem is motivated by realistic proble

In [27]:
# Swap the '|' marker for a blank to obtain the final text of each paper.
# NOTE(review): every '|' is replaced, including any that was already part of
# a subsection text before the join — confirm this is acceptable.
df_text['text'] = df_text['text'].map(lambda joined: joined.replace('|', ' '))
# Same paper as in the previous check, now without the markers.
df_text.loc[df_text['paper_id'] == '100002', 'text'].squeeze()

'\n   We consider the problem of learning the functions computing children from parents in a Structural Causal Model once the underlying causal graph has been identified.\n   \n   This is in some sense the second step after causal discovery. Taking a probabilistic approach to estimating these functions, we derive a natural myopic active learning scheme that identifies the intervention which is optimally informative about all of the unknown functions jointly, given previously observed data. We test the derived algorithms on simple examples, to demonstrate that they produce a structured exploration policy that significantly improves on unstructured base-lines.\n   \n   Large parts of the literature on causality are concerned with learning the causal graph of a system of random variables [Spirtes et al., 2000, Tong and Koller, 2001, Eberhardt, 2010, Hyttinen et al., 2013, Mooij et al., 2016]. Also known as causal discovery or causal inference, this problem is motivated by realistic proble

Save results:

In [28]:
# Persist the per-paper frame (columns 'paper_id' and 'text') as a pickle written with protocol 4.
dataset_path = "../resources/interesting_arxiv_papers.pkl"
df_text.to_pickle(dataset_path, protocol=4)