preprocess script fixed, tests added

machinalis · May 12, 2015 · ad740bc
1 parent 92d84be
commit ad740bc
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 8 deletions.
diff --git a/iepy/data/db.py b/iepy/data/db.py
@@ -82,14 +82,27 @@ def get_raw_documents(self):
         """
         return IEDocument.objects.filter(text='')
 
-    def get_documents_lacking_preprocess(self, step):
+    def get_documents_lacking_preprocess(self, step_or_steps):
         """Returns an iterator of documents that shall be processed on the given
         step."""
-        if step in PreProcessSteps:
-            flag_field_name = "%s_done_at" % step.name
-            query = {"%s__isnull" % flag_field_name: True}
-            return IEDocument.objects.filter(**query).order_by('id')
-        return IEDocument.objects.none()
+        from django.db.models import Q
+        if not isinstance(step_or_steps, (list, tuple)):
+            steps = [step_or_steps]
+        else:
+            steps = step_or_steps
+        query = None
+        for step in steps:
+            if step in PreProcessSteps:
+                flag_field_name = "%s_done_at" % step.name
+                q = Q(**{"%s__isnull" % flag_field_name: True})
+                if query is None:
+                    query = q
+                else:
+                    query = query | q
+        if query is not None:
+            return IEDocument.objects.filter(query).order_by('id')
+        else:
+            return IEDocument.objects.none()
 
 
 class TextSegmentManager(object):
@@ -233,7 +246,8 @@ def labels_for(cls, relation, evidences, conflict_solver=None):
         logger.info("Getting labels from DB")
         labels = EvidenceLabel.objects.filter(
             relation=relation,
-            label__in=[EvidenceLabel.NORELATION, EvidenceLabel.YESRELATION, EvidenceLabel.NONSENSE],
+            label__in=[EvidenceLabel.NORELATION, EvidenceLabel.YESRELATION,
+                       EvidenceLabel.NONSENSE],
             labeled_by_machine=False
         )
         logger.info("Sorting labels them by evidence")

diff --git a/iepy/instantiation/preprocess.py b/iepy/instantiation/preprocess.py
@@ -49,7 +49,8 @@ def start_preprocess(docs, increment_ner):
     increment_ner = opts['--increment-ner']
 
     dm = ParallelDocManager()
-    all_docs = dm.get_documents_lacking_preprocess(PreProcessSteps.segmentation)
+    all_docs = dm.get_documents_lacking_preprocess(
+        [PreProcessSteps.segmentation, PreProcessSteps.syntactic_parsing])
 
     multiple_cores = opts.get('--multiple-cores')
     split_in = opts.get("--split-in")

diff --git a/tests/test_db_preprocess_administration.py b/tests/test_db_preprocess_administration.py
@@ -195,6 +195,24 @@ def test_unsentenced_documents_are_filtered(self):
         self.assertIn(doc2, unsentenced)
         self.assertNotIn(doc3, unsentenced)
 
+    def test_can_get_both_unsegmented_or_unsynparsed_documents(self):
+        doc1 = SentencedIEDocFactory(text='Something nice.')
+        doc2 = SentencedIEDocFactory(text='Something even nicer.')
+        doc3 = SentencedIEDocFactory(text='Some sentence. And some other. Indeed!')
+        def filter():
+            return self.manager.get_documents_lacking_preprocess(
+                [PreProcessSteps.segmentation, PreProcessSteps.syntactic_parsing]
+            )
+        docs = [doc1, doc2, doc3]
+        self.assertEqual(list(filter()), docs)
+        for d in docs:
+            d.set_segmentation_result([RawSegment(0, 3, None)])
+            d.save()
+        self.assertEqual(list(filter()), docs)
+        doc1.set_syntactic_parsing_result(["(ROOT (NP (JJ Hello) (NN world) (. .)))]"])
+        doc1.save()
+        self.assertEqual(list(filter()), [doc2, doc3])
+
 
 class TestDocumentSentenceIterator(TestCase):