
Multiple improvements to results extraction #5

Merged
49 commits merged on Apr 28, 2020
Commits (49)
4c7c3f8
Allow using named docker volumes
mkardas Dec 10, 2019
c5dcee1
Make CRF and elastic setup optional
mkardas Dec 28, 2019
e21a3a0
Disable multiprocessing
Jan 10, 2020
b0d0dae
Return the top-k best proposals
Jan 16, 2020
d55768e
Extract tasks from paper's text
Jan 16, 2020
1726526
Refactor manual dictionaries
Jan 16, 2020
dec9aa2
Update semantic segmentation datasets synonyms
Jan 16, 2020
cdcd36d
Add default evidences for all datasets and metrics
Jan 17, 2020
d164644
Select best proposals from top-k
Jan 17, 2020
292a037
Load tables from files
Jan 23, 2020
f931cf6
Compute top-k recall
Jan 23, 2020
1d9fcff
Add pipeline caching utils
Jan 27, 2020
2e4bc46
Update evidences dictionaries
Jan 30, 2020
403a53c
Fix proposals caching
Jan 31, 2020
3149415
Support commas in numeric cells
Feb 2, 2020
650050e
Refactor logprobs computation
Feb 2, 2020
0a55394
Extract axis logprobs computation
Feb 2, 2020
53917e0
Cache axis logprobs
Feb 2, 2020
1378aa2
Add support for complementary metrics
Feb 5, 2020
f5a02fc
Fix transformed metric format
Feb 7, 2020
a6bfbed
Keep canonical taxonomy entries
Feb 11, 2020
6f3a3f3
Filter out stop words
Feb 11, 2020
7b1cb03
Fix single file source archives
Feb 11, 2020
b541c40
Better error message when source is missing
Feb 11, 2020
6991550
Better error message for withdrawn papers
Feb 11, 2020
01cb2ac
Better error message when conversion fails
Feb 11, 2020
aad880d
Normalize cells content
Feb 11, 2020
b2ed4f4
Return proposals for single-digit cells
Feb 11, 2020
298a938
Support negative numbers
Feb 12, 2020
d005f9c
Add text evidences for taxonomy entries
Feb 14, 2020
081bd5c
Predict tasks, datasets and metrics independently
Feb 17, 2020
b85a6e5
Add abstract and references contexts
Feb 18, 2020
7384a39
Add unit tests for metric value parsing
Feb 18, 2020
0a6ff72
Add independent proposal
Feb 19, 2020
b1f690a
Set fragment size
Feb 19, 2020
c989173
Small fixes
Feb 20, 2020
b535afd
Cache logprobs
Feb 20, 2020
d8caebf
Merge branch 'master' into push-api
Mar 2, 2020
0b308c2
Small adaptations
Mar 5, 2020
064b9ee
Use html5lib for html cleaning
Mar 5, 2020
6e6237a
Prepopulate references index
Mar 6, 2020
3e76133
Fixes to reference importing
Mar 9, 2020
6489719
Reference extraction
Mar 9, 2020
6038694
Fix mapping type name
Mar 10, 2020
0096e90
Do not require arxiv id
Mar 10, 2020
01cda5c
Fix reference key
Mar 10, 2020
80a31e5
Retry grobid request when service unavailable
Mar 11, 2020
7c22a57
Close GROBID connection after request
Mar 20, 2020
fdb06d0
Better support for complementary metrics
Apr 28, 2020
3 changes: 3 additions & 0 deletions .gitignore
@@ -98,3 +98,6 @@ venv.bak/
.mypy_cache/
.idea/*
.vscode/settings.json

# pytest
.pytest_cache
6 changes: 6 additions & 0 deletions README.md
@@ -34,3 +34,9 @@ To test the whole extraction on a single file run
```
make test
```

### Unit Tests

```
PYTHONPATH=. py.test
```
2 changes: 1 addition & 1 deletion environment.yml
@@ -13,7 +13,7 @@ dependencies:
- python=3.7.1
- pyahocorasick=1.4.0
- Unidecode=1.0.23
- elasticsearch-dsl=7.0.0
- elasticsearch-dsl=6.3.1
- ipython=7.5.0
- joblib=0.13.2
- python-magic=0.4.15
10 changes: 10 additions & 0 deletions extract_tables.py
@@ -276,6 +276,16 @@ def save_tables(data, outdir):
        json.dump(metadata, f)


def load_tables(path):
    path = Path(path)
    with open(path / "metadata.json", "r") as f:
        metadata = json.load(f)

    return [Table.from_file(
        path,
        table_metadata) for table_metadata in metadata]


def set_ids_by_labels(soup):
    captions = soup.select(".ltx_caption")
    for caption in captions:
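For orientation, a minimal sketch of how the new loader pairs with save_tables above, assuming extract_tables.py can be imported as a module; the output directory below is illustrative, not a path used by the project.

```
# a minimal sketch, assuming tables were saved earlier with save_tables(data, outdir);
# the directory name is illustrative
from extract_tables import load_tables

tables = load_tables("output/tables/1812.01187")
print(len(tables))  # Table objects rebuilt from metadata.json and the per-table files
```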
155 changes: 155 additions & 0 deletions init_references.py
@@ -0,0 +1,155 @@
import re
import json
from pathlib import Path
from collections import Counter
from sota_extractor2.data.elastic import Reference2
from elasticsearch_dsl import connections
from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore
from tqdm import tqdm
from elasticsearch.helpers import bulk
from elasticsearch_dsl.connections import connections
import http.client
import xml.etree.ElementTree as ET

# required for bulk saving
http.client._MAXHEADERS = 1000

connections.create_connection(hosts=['elasticsearch'], timeout=20)

papers_path = Path("/data/dblp/papers/papers-with-abstracts.json")


def read_pwc_papers(path):
    with open(path, "rt") as f:
        return json.load(f)


arxiv_url_re = re.compile(r"^(?:https?://(?:www.)?arxiv.org/(?:abs|pdf|e-print)/)?(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
arxiv_url_only_re = re.compile(r"^(?:https?://(?:www.)?arxiv.org/(?:abs|pdf|e-print)/)(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
pwc_url_re = re.compile(r"^(?:https?://(?:www.)?)paperswithcode.com/paper/(?P<slug>[^/]*)/?$")


def from_paper_dict(paper):
    authors = [PAuthor.from_fullname(a) for a in paper["authors"] if a.strip()]
    arxiv_id = None
    if paper["arxiv_id"]:
        arxiv_id = paper["arxiv_id"]
    elif paper["url_abs"]:
        m = arxiv_url_re.match(paper["url_abs"])
        if m:
            arxiv_id = m.group("arxiv_id")
    title = None
    if paper["title"]:
        title = paper["title"].rstrip(" .")
    slug = None
    if paper["paper_url"]:
        m = pwc_url_re.match(paper["paper_url"])
        if m:
            slug = m.group("slug")
    return PReference(
        title=title,
        authors=authors,
        ptr=paper["url_pdf"] or paper["url_abs"],
        arxiv_id=arxiv_id,
        pwc_slug=slug,
        date=paper["date"],
        orig_ref=f"{', '.join(paper['authors'])}. {paper['title']}.",
    )


def _text(elem): return "".join(elem.itertext())


def from_paper_elem(elem):
    authors_str = [_text(a).strip() for a in elem.findall("author")]
    authors_str = [s for s in authors_str if s]
    authors = [PAuthor.from_fullname(a) for a in authors_str]
    arxiv_id = None
    url = None
    for ee in elem.findall("ee"):
        if url is None or "oa" in ee.attrib:  # prefer open access urls
            url = _text(ee)
        m = arxiv_url_only_re.match(_text(ee))
        if m:
            url = _text(ee)  # prefer arxiv urls
            arxiv_id = m.group("arxiv_id")
            break
    title = None
    title_elem = elem.find("title")
    if title_elem is not None:
        title = _text(title_elem).rstrip(" .")
    return PReference(
        title=title,
        authors=authors,
        ptr=url,
        arxiv_id=arxiv_id,
        orig_ref=f"{', '.join(authors_str)}. {title}.",
    )


def merge_references(p_references, elastic_references):
    uids = Counter([p_ref.unique_id() for p_ref in p_references])
    for p_ref in tqdm(p_references):
        uid = p_ref.unique_id()
        # ignore papers with too common title
        # (often these are "Editorial", "Preface", "Letter")
        if uids[uid] > 5:
            continue
        e_ref = elastic_references.get(uid)
        if not e_ref:
            e_ref = Reference2.from_ref(p_ref)
            elastic_references[uid] = e_ref
        e_ref.add_ref(p_ref)


def save_all(docs):
    bulk(connections.get_connection(), (d.to_dict(True) for d in docs), chunk_size=500)


def get_elastic_references(unique_ids, chunk_size=1000):
    elastic_references = {}
    i = 0
    while i < len(unique_ids):
        ids = unique_ids[i:i+chunk_size]
        i += chunk_size
        elastic_references.update({
            uid: ref for uid, ref in zip(ids, Reference2.mget(ids))
            if ref
        })
    return elastic_references


def init_pwc():
    # read list of ML papers (titles, abstracts, arxiv ids, etc.)
    all_papers = read_pwc_papers(papers_path)

    # change dicts into PReferences
    p_references = [from_paper_dict(paper) for paper in all_papers]

    # keep references with valid ids
    p_references = [ref for ref in p_references if ref.unique_id()]

    all_ids = list(set(ref.unique_id() for ref in p_references))
    elastic_references = get_elastic_references(all_ids)
    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())


def init_dblp():
    dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-noent.xml"))
    #dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-small-noent.xml"))
    root = dblp_xml.getroot()
    p_references = [from_paper_elem(elem) for elem in root]
    p_references = [ref for ref in p_references if ref.unique_id()]

    all_ids = list(set(ref.unique_id() for ref in p_references))
    # todo: add references2 index initialization
    elastic_references = {}  # get_elastic_references(all_ids)

    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())


# Reference2._index.delete()
Reference2.init()
init_dblp()
init_pwc()
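For reference, a small sketch of the arxiv-id normalization performed by arxiv_url_re above; the pattern is copied verbatim from this file and the example URLs are illustrative.

```
# a minimal sketch of what arxiv_url_re accepts; the example URLs are illustrative
import re

arxiv_url_re = re.compile(r"^(?:https?://(?:www.)?arxiv.org/(?:abs|pdf|e-print)/)?(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")

for url in ["1812.01187",
            "https://arxiv.org/abs/1812.01187v2",
            "https://arxiv.org/pdf/1812.01187.pdf"]:
    m = arxiv_url_re.match(url)
    print(url, "->", m.group("arxiv_id") if m else None)  # all three yield 1812.01187
```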
8 changes: 6 additions & 2 deletions latex2html.sh
@@ -1,10 +1,11 @@
#!/usr/bin/env bash
OUTNAME="$1"
echo $OUTNAME
RO_SOURCE_DIR="/files/ro-source"
RO_SOURCE_DIR="${2:-/files/ro-source}"
SOURCE_DIR="/files/source"
OUTPUT_DIR="/files/htmls"
OUTPUT_DIR="${3:-/files/htmls}"

mkdir -p /files
cp -r "$RO_SOURCE_DIR" "$SOURCE_DIR"

# turn tikzpciture instances into comments
@@ -23,6 +24,9 @@ do
done

MAINTEX=$(python3 /files/guess_main.py "$SOURCE_DIR")
[ ! -f "$MAINTEX" ] && exit 1

timeout -s KILL 300 engrafo "$MAINTEX" /files/output

[ ! -f /files/output/index.html ] && exit 117
cp /files/output/index.html "$OUTPUT_DIR/$OUTNAME"
30 changes: 3 additions & 27 deletions parse_references.py
@@ -17,31 +17,7 @@
pc = PaperCollection.from_pickle("/mnt/efs/pwc/data/pc-small-noann.pkl")


def get_refstrings(p):
    paper = p.text if hasattr(p, 'text') else p
    if not hasattr(paper, 'fragments'):
        return
    fragments = paper.fragments
    ref_sec_started = False
    for f in reversed(fragments):
        if f.header.startswith('xxanchor-bib'):
            ref_sec_started = True
            yield f.text
        elif ref_sec_started:
            break  # the refsection is only at the end of paper


_ref_re = regex.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')
def extract_refs(p):
    for ref in get_refstrings(p):
        m = _ref_re.match(ref)
        if m:
            ref_id, ref_str = m.groups()
            yield {
                "paper_arxiv_id": p.arxiv_no_version,
                "ref_id": ref_id,
                "ref_str": ref_str.strip(r'\s')
            }


class PaperCollectionReferenceParser:
    def __init__(self):
@@ -52,13 +28,13 @@ def __init__(self):
    def parse_refs(self, p):
        for d in extract_refs(p):
            if not d["ref_id"].startswith("pwc-"):
                key = d["paper_arxiv_id"] + d["ref_id"]
                key = p.arxiv_no_version + d["ref_id"]
                if key not in self.cache:
                    new_id = self.refsdb.add_reference_string(d['ref_str'])
                    if new_id is not None:
                        new_id = "pwc-" + new_id
                    self.cache[key] = new_id
                if self.cache[key] and len(self.cache[key]) > 500:  # fix to self.cache to make the id compatible with elastic
                if self.cache[key] and len(self.cache[key]) > ID_LIMIT:  # fix to self.cache to make the id compatible with elastic
                    self.cache[key] = self.cache[key][:ID_LIMIT]
                yield d["ref_id"], self.cache[key]
        self.refsdb.sync()
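To show how the reworked parser is meant to be driven, a minimal sketch; iterating over the PaperCollection and printing the mapping are assumptions added for illustration, not part of this script.

```
# a minimal sketch, assuming pc is the PaperCollection loaded at the top of this script
# and that it can be iterated paper by paper
parser = PaperCollectionReferenceParser()
for p in pc:
    for ref_id, elastic_id in parser.parse_refs(p):
        # maps in-paper reference anchors to (length-limited) elasticsearch reference ids
        print(p.arxiv_no_version, ref_id, elastic_id)
```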
2 changes: 1 addition & 1 deletion sota_extractor2/config.py
@@ -14,7 +14,7 @@


elastic = dict(hosts=['localhost'], timeout=20)
grobid = dict(host='10.0.1.145')
grobid = dict(host='grobid')

arxiv = data/'arxiv'
htmls_raw = arxiv/'htmls'
45 changes: 43 additions & 2 deletions sota_extractor2/data/elastic.py
@@ -108,7 +108,11 @@ class Fragment(Document):
    )
    outer_headers = Text(analyzer=html_strip, )

    class Meta:
        doc_type = '_doc'

    class Index:
        doc_type = '_doc'
        name = 'paper-fragments'

    @classmethod
@@ -138,7 +142,11 @@ class Paper(Document):
        analyzer=html_strip
    )

    class Meta:
        doc_type = '_doc'

    class Index:
        doc_type = '_doc'
        name = 'papers'

    def to_json(self):
@@ -290,26 +298,42 @@ class Reference(Document):
    urls = Keyword()
    is_ml = Boolean()

    class Meta:
        doc_type = '_doc'

    class Index:
        doc_type = '_doc'
        name = 'references'

    def __repr__(self):
        return f"{self.title} / {self.authors}"


ID_LIMIT=480


class Author2(InnerDoc):
    forenames = Text(fields={'keyword': Keyword()})
    surname = Text(fields={'keyword': Keyword()})


class Reference2(Document):
    title = Text()
    authors = Text()
    authors = Object(Author2)

    idno = Keyword()
    date = Date()
    ptr = Keyword()

    arxiv_id = Keyword()
    pwc_slug = Keyword()
    orig_refs = Text()

    class Meta:
        doc_type = '_doc'

    class Index:
        doc_type = '_doc'
        name = 'references2'

    def add_ref(self, ref):
@@ -318,14 +342,15 @@ def add_ref(self, ref):
        # self.refs.append(asdict(ref))
        if ref.arxiv_id:
            self.arxiv_id = ref.arxiv_id
        if ref.pwc_slug:
            self.pwc_slug = ref.pwc_slug
        if ref.idno:
            if hasattr(ref.idno, 'values'):
                self.idno = ([None]+[v for v in ref.idno.values() if v.startswith("http")]).pop()
            elif isinstance(ref.idno, str):
                self.idno = ref.idno
        # if ref.date:
        #     self.date = ref.date
        self.date = None
        if ref.ptr:
            self.ptr = ref.ptr
        self.orig_refs = self.orig_refs if self.orig_refs else []
@@ -414,3 +439,19 @@ def display_fragment(f, cell_type="", display=True):
    if display:
        display_html(html)
    return html


def query_for_evidences(paper_id, values, topk=5, fragment_size=50):
    evidence_query = Fragment.search().highlight(
        'text', pre_tags="<b>", post_tags="</b>", fragment_size=fragment_size)

    query = {
        "query": ' '.join(values)
    }

    fragments = list(evidence_query
                     .filter('term', paper_id=paper_id)
                     .query('match', text=query)[:topk]
                     )

    return '\n'.join([' '.join(f.meta['highlight']['text']) for f in fragments])
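A sketch of how the new query_for_evidences helper might be called; the connection setup mirrors sota_extractor2.config, while the paper id and query values are illustrative.

```
# a minimal sketch, assuming an elasticsearch instance with the paper-fragments index is reachable;
# the paper id and query values are illustrative
from elasticsearch_dsl import connections
from sota_extractor2.data.elastic import query_for_evidences

connections.create_connection(hosts=['localhost'], timeout=20)
evidences = query_for_evidences("1812.01187", ["ImageNet", "top-1 accuracy"], topk=5)
print(evidences)  # newline-separated highlighted fragments mentioning the query terms
```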