Skip to content

Commit

Permalink
preprocess script fixed, tests added
Browse files Browse the repository at this point in the history
  • Loading branch information
jmansilla committed May 12, 2015
1 parent 92d84be commit ad740bc
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 8 deletions.
28 changes: 21 additions & 7 deletions iepy/data/db.py
Expand Up @@ -82,14 +82,27 @@ def get_raw_documents(self):
"""
return IEDocument.objects.filter(text='')

def get_documents_lacking_preprocess(self, step_or_steps):
    """Return the documents that still lack at least one of the given steps.

    ``step_or_steps`` may be a single ``PreProcessSteps`` member or a
    list/tuple of them. A document is selected when the ``<step>_done_at``
    field of *any* of the requested steps is null. Values that are not
    members of ``PreProcessSteps`` are ignored; when no valid step is left,
    an empty queryset is returned. Matching documents are ordered by ``id``.
    """
    # Q objects allow the per-step "is null" conditions to be OR-ed together.
    from django.db.models import Q

    if isinstance(step_or_steps, (list, tuple)):
        steps = step_or_steps
    else:
        steps = [step_or_steps]

    query = None
    for step in steps:
        if step not in PreProcessSteps:
            continue
        pending = Q(**{"%s_done_at__isnull" % step.name: True})
        query = pending if query is None else query | pending

    if query is None:
        # No recognised step was requested: nothing can be lacking.
        return IEDocument.objects.none()
    return IEDocument.objects.filter(query).order_by('id')


class TextSegmentManager(object):
Expand Down Expand Up @@ -233,7 +246,8 @@ def labels_for(cls, relation, evidences, conflict_solver=None):
logger.info("Getting labels from DB")
labels = EvidenceLabel.objects.filter(
relation=relation,
label__in=[EvidenceLabel.NORELATION, EvidenceLabel.YESRELATION, EvidenceLabel.NONSENSE],
label__in=[EvidenceLabel.NORELATION, EvidenceLabel.YESRELATION,
EvidenceLabel.NONSENSE],
labeled_by_machine=False
)
logger.info("Sorting labels them by evidence")
Expand Down
3 changes: 2 additions & 1 deletion iepy/instantiation/preprocess.py
Expand Up @@ -49,7 +49,8 @@ def start_preprocess(docs, increment_ner):
increment_ner = opts['--increment-ner']

dm = ParallelDocManager()
all_docs = dm.get_documents_lacking_preprocess(PreProcessSteps.segmentation)
all_docs = dm.get_documents_lacking_preprocess(
[PreProcessSteps.segmentation, PreProcessSteps.syntactic_parsing])

multiple_cores = opts.get('--multiple-cores')
split_in = opts.get("--split-in")
Expand Down
18 changes: 18 additions & 0 deletions tests/test_db_preprocess_administration.py
Expand Up @@ -195,6 +195,24 @@ def test_unsentenced_documents_are_filtered(self):
self.assertIn(doc2, unsentenced)
self.assertNotIn(doc3, unsentenced)

def test_can_get_both_unsegmented_or_unsynparsed_documents(self):
    """Asking for two steps returns documents lacking either of them."""

    def lacking_any():
        # Query again on every call so saved changes are taken into account.
        wanted = [PreProcessSteps.segmentation,
                  PreProcessSteps.syntactic_parsing]
        return list(self.manager.get_documents_lacking_preprocess(wanted))

    first = SentencedIEDocFactory(text='Something nice.')
    second = SentencedIEDocFactory(text='Something even nicer.')
    third = SentencedIEDocFactory(text='Some sentence. And some other. Indeed!')
    everything = [first, second, third]

    # Before any result is stored, every document is reported.
    self.assertEqual(lacking_any(), everything)

    # Storing a segmentation result alone does not remove a document from
    # the listing: syntactic parsing is still missing.
    for document in everything:
        document.set_segmentation_result([RawSegment(0, 3, None)])
        document.save()
    self.assertEqual(lacking_any(), everything)

    # Once the first document also has a parsing result, it drops out.
    first.set_syntactic_parsing_result(
        ["(ROOT (NP (JJ Hello) (NN world) (. .)))]"])
    first.save()
    self.assertEqual(lacking_any(), [second, third])


class TestDocumentSentenceIterator(TestCase):

Expand Down

0 comments on commit ad740bc

Please sign in to comment.