In [1]:
import os
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import snappy
import fastparquet
import dask
import dask.dataframe as dd
import nltk
import re

import logging
logger = logging.getLogger('distributed.worker')
logger.setLevel(logging.ERROR)

## Create Data Set
I save all the papers in parquets files.

In [None]:
def get_label(subsection):
    return "PD" if "label" in subsection.attrib and subsection.attrib["label"] == "Problem" else "N_PD"

In [None]:
%%time
paper_list = "../data/LIST_PAPERS.txt"
paper_line_list = []
with open(paper_list, 'r') as in_file:
    paper_line_list = in_file.readlines()

data_dir = "../data/papers"  # data/prova data/papers
parquets_dir = "../data/papers-parquets"
df = pd.DataFrame(columns=["id_subsection", "paragraph_name", "text_subsection", "label_subsection"])
count_papers = 0
count_parquets = 0
for paper_line in paper_line_list:
    info_paper = paper_line.split("\t\t")
    path_dir = os.path.join(data_dir, "paper_" + info_paper[0])
    if os.path.isdir(path_dir):
        analyzable_train_paper_path = os.path.join(path_dir, "paper_" + info_paper[0] + "_analyzable_train.xml")
        analyzable_paper_path = os.path.join(path_dir, "paper_" + info_paper[0] + "_analyzable.xml")
        path_to_analyze = None
        if not os.path.exists(analyzable_train_paper_path) and os.path.exists(analyzable_paper_path):
            path_to_analyze = analyzable_paper_path
        elif os.path.exists(analyzable_train_paper_path): #and os.path.exists(analyzable_paper_path):
            path_to_analyze = analyzable_train_paper_path
        if path_to_analyze is not None:
            # 1- parse xml
            tree = ET.parse(path_to_analyze)
            root = tree.getroot()
            # 2- get all subsections
            found_subsections = root.findall('.//subsection')
            added = False
            for subsection in found_subsections:
                if '.-1.' in subsection.attrib['id']:
                    continue
                else:
                    paragraph_id = subsection.attrib['id'][:subsection.attrib['id'].rfind('.')]
                    paragraph = root.find("./paragraph[@id='" + paragraph_id + "']")
                    if paragraph is None or 'reference' in paragraph.attrib['name'].lower():
                        continue
                    # add to dataset
                    df = df.append({"id_subsection": subsection.attrib['id'],
                                    "paragraph_name": paragraph.attrib['name'],
                                    "text_subsection": subsection.text,
                                    "label_subsection": np.nan if path_to_analyze == analyzable_paper_path
                                    else get_label(subsection)
                                    }, ignore_index=True)
                    added = True
            if added: count_papers += 1
    # save in parquet
    if count_papers == 2000: # 2 2000
        parquet_path = os.path.join(parquets_dir, 'papers-'+'{:0>2d}'.format(count_parquets)+'.snap.parquet')
        fastparquet.write(parquet_path, df, compression='snappy')
        print(parquet_path)
        count_parquets += 1
        count_papers = 0
        df = pd.DataFrame(columns=["id_subsection", "paragraph_name", "text_subsection", "label_subsection"])
# save in last parquet
if count_papers > 0:
    parquet_path = os.path.join(parquets_dir, 'papers-'+'{:0>2d}'.format(count_parquets)+'.snap.parquet')
    fastparquet.write(parquet_path, df, compression='snappy')
    print(parquet_path)
    count_parquets += 1

In [None]:
#df

In [None]:
print("Papers in dataset = %s" % ((count_parquets-1)*2000+count_papers)) # 2 2000

## Open Data Set

In [2]:
from dask.distributed import Client
client = Client(n_workers=1, threads_per_worker=4, processes=False, memory_limit='3GB')
client

0,1
Client  Scheduler: inproc://192.168.1.21/1302/1  Dashboard: http://192.168.1.21:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 3.00 GB


In [3]:
%%time
parquets_dir = "../data/papers-parquets"
ddf = dd.read_parquet(parquets_dir, index=False, engine='fastparquet')

CPU times: user 62.1 ms, sys: 24.8 ms, total: 86.9 ms
Wall time: 1.87 s


##### Check Data Set :

In [4]:
ddf.compute()
#ddf.columns
#ddf.dtypes
#ddf['label_subsection']
#ddf.loc[lambda df: df['id_subsection'].str.contains('2544')].compute()



Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2535.1.1,Abstract,\n This paper addresses the problem of track...,
1,2535.2.1,Introduction,\n The complexity and sophistication of the ...,
2,2535.2.2,Introduction,\n 2000)). We want to monitor the state of t...,
3,2535.2.3,Introduction,"\n In this paper, we propose a different app...",
4,2535.2.4,Introduction,\n which are expressed as discrete failure m...,
...,...,...,...,...
19460,101223.24.3,9 CONCLUSION,"\n Thanks to the attention mechanism, the pr...",
19461,101223.24.4,9 CONCLUSION,\n We believe that the attention-based model...,
19462,101223.25.1,ACKNOWLEDGMENTS,\n We would like to thank Guy Waldman for de...,
19463,101223.25.2,ACKNOWLEDGMENTS,\n The research leading to these results has...,


In [None]:
#ddf['label_subsection'].unique().compute()

In [5]:
ddf[ddf['text_subsection'].isna()].compute()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
17645,2870.4.8,An Exponential Schedule,,
132882,13129.6.4,5 Results & Discussion,,
137583,13196.5.10,4 Data Statistics and Analysis,,
137729,13199.5.13,"ber, 1964), a is set to 1 in experiments:",,
2654,13260.4.44,3 Experiments,,
...,...,...,...,...
61363,98852.14.26,A Missing Proofs,,
74084,99126.6.4,5 Results,,
101031,99626.10.8,3.1 Robust Certifiability for Arbitrary Distri...,,
101157,99626.18.27,5.3 Robust L1 Regression,,


In [6]:
ddf = ddf.dropna(subset=['text_subsection'])
ddf[ddf['text_subsection'].isna()].compute()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection


In [7]:
print("Subsections in dataset = %s" % len(ddf.id_subsection))
id_paragraph_set = set()
ddf.id_subsection.apply(lambda x: id_paragraph_set.add(x[:x.find('.')]), meta=(None, 'int64')).compute()
print("Valid papers in dataset = %s" % len(id_paragraph_set))

Subsections in dataset = 4040195
Valid papers in dataset = 48233


## Text Cleaning

In [None]:
#ddf_2554 = ddf.loc[lambda df: df['id_subsection'].str.contains('2544')]
#list(ddf_2554['text_subsection'])

In [8]:
def initial_text_cleaning(text):
    text = text.lower()                                             # transform to lowercase
    text = re.sub(r'\n', '', text)                                  # remove \n
    text = re.sub(r'(\(|\[|\{)[^(\)|\]|\})]*(\)|\]|\})', '', text)  # remove everything in parentheses
    text = re.sub(r'http(s)?:\/\/\S+', '', text)                    # remove url
    text = re.sub(r'[^a-z\s]', '', text)  #[^\w\s]                  # remove everything that is not a word (therefore also numbers and punctuation)
    text = re.sub(r'\b\w\b', '', text)                              # remove all single letters
    text = re.sub(r'\s{2,}', ' ', text).strip()                     # reformat spaces
    return text


# cleaning text of stop words
from nltk.corpus import stopwords

def remove_stopwords(text, stopwords):
    words = text.split()
    return ' '.join([w for w in words if w not in stopwords])

# cleaning text of nonsense words
from nltk.corpus import words
words_dictionary = set(words.words())
def remove_nonsensewords(text):
    words = text.split()
    return ' '.join([w for w in words if w in words_dictionary])


# stemming and lemmatization
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def stemming(text):
    words = text.split()
    return ' '.join([porter.stem(w) for w in words])

from nltk.stem.wordnet import WordNetLemmatizer
wordnet = WordNetLemmatizer()
def lemmatization(text):
    words = text.split()
    return ' '.join([wordnet.lemmatize(w) for w in words])

In [9]:
%%time
# text - cleaning:
ddf['text_subsection'] = ddf['text_subsection'].apply(initial_text_cleaning, meta=(None, 'object'))

# remove stop-words:
stopwords_file = "./resources/stopwords_list.txt"
stopwords_extended_list = stopwords.words('english')
with open(stopwords_file, 'r') as file:
    stopwords_extended_list.extend([line.replace('\n', '') for line in file.readlines()])
stopwords_extended_list.extend(['table', 'tab', 'figure', 'fig'])
stopwords = set(stopwords_extended_list)
ddf['text_subsection'] = ddf['text_subsection'].apply(lambda x: remove_stopwords(x, stopwords), meta=(None, 'object'))

## stemming and lemmatization:
ddf['text_subsection'] = ddf['text_subsection'].apply(stemming, meta=(None, 'object'))
#ddf['text_subsection'] = ddf['text_subsection'].apply(lemmatization, meta=(None, 'object'))

# remove nonsense-words:
#ddf['text_subsection'] = ddf['text_subsection'].apply(remove_nonsensewords, meta=(None, 'object'))

CPU times: user 23.9 ms, sys: 14 ms, total: 37.9 ms
Wall time: 164 ms


In [None]:
#ddf_2554 = ddf.loc[lambda df: df['id_subsection'].str.contains('2544')]
#list(ddf_2554['text_subsection'])

## Save text-clean data set

In [10]:
%%time
clean_parquets_dir = "../data/papers-textclean-parquets"
dd.to_parquet(ddf, clean_parquets_dir, engine='fastparquet', compression='snappy')

CPU times: user 54min 5s, sys: 54 s, total: 54min 59s
Wall time: 55min 52s


##### Reopen to check:

In [11]:
%%time
clean_parquets_dir = "../data/papers-textclean-parquets"
ddf2 = dd.read_parquet(clean_parquets_dir, engine='fastparquet')

CPU times: user 17.6 ms, sys: 11.3 ms, total: 28.9 ms
Wall time: 229 ms


In [12]:
ddf2.compute()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2535.1.1,Abstract,paper address problem track diagnos complex sy...,
1,2535.2.1,Introduction,complex sophist current gener industri process...,
2,2535.2.2,Introduction,want monitor state system reliabl detect abnor...,
3,2535.2.3,Introduction,paper propos differ approach problem model com...,
4,2535.2.4,Introduction,express discret failur mode produc discontinuo...,
...,...,...,...,...
19460,101223.24.3,9 CONCLUSION,thank attent mechan predict result interpret p...,
19461,101223.24.4,9 CONCLUSION,believ attentionbas model use structur represe...,
19462,101223.25.1,ACKNOWLEDGMENTS,thank waldman develop codevec websit thank mil...,
19463,101223.25.2,ACKNOWLEDGMENTS,lead result receiv fund european union seventh...,
