Permalink
Browse files

pipeline fixed

  • Loading branch information...
jmansilla committed Mar 20, 2016
1 parent 1f40586 commit d9a632bc45ab6d9cdd224634eb1e8dbf863d5a1f
Showing with 28 additions and 9 deletions.
  1. +12 −3 iepy/data/db.py
  2. +3 −0 iepy/preprocess/pipeline.py
  3. +13 −6 tests/test_preprocess_pipeline.py
View
@@ -38,6 +38,8 @@ class DocumentManager(object):
"""
### Basic administration and pre-process
def __init__(self, base_queryset=None):
    """Create a manager, optionally restricted to a given set of documents.

    base_queryset: stored as-is on the instance. ``_docs`` reads it to decide
    which documents the manager works on; when it is left as None, ``_docs``
    falls back to ``IEDocument.objects.all()``.
    """
    self.base_queryset = base_queryset
def create_document(self, identifier, text, metadata=None, update_mode=False):
"""Creates a new Document with text ready to be inserted on the
@@ -73,14 +75,21 @@ def create_document(self, identifier, text, metadata=None, update_mode=False):
return doc
def _docs(self):
    """Return the collection of documents this manager operates on.

    Returns ``self.base_queryset`` when one was supplied to the constructor,
    otherwise ``IEDocument.objects.all()``.
    """
    # Test against None rather than truthiness. base_queryset is presumably a
    # Django QuerySet (confirm against callers): evaluating one as a boolean
    # executes the query, and an empty-but-deliberate scope is falsy, which
    # would silently widen the manager to every document.
    if self.base_queryset is not None:
        return self.base_queryset
    return IEDocument.objects.all()
def __iter__(self):
    """Iterate over the documents returned by ``_docs()``."""
    # Route through _docs() so a configured base_queryset is honoured. The
    # earlier unconditional `IEDocument.objects.all()` return bypassed it and
    # left this line unreachable.
    return iter(self._docs())
def get_raw_documents(self):
    """Return the documents whose text field is empty.

    The result is ``_docs()`` narrowed with ``filter(text='')``, so it stays
    within the manager's base_queryset when one is configured.
    """
    # Filter on top of _docs() instead of IEDocument.objects directly; the
    # direct query ignored base_queryset and made this return unreachable.
    return self._docs().filter(text='')
def get_documents_lacking_preprocess(self, step_or_steps):
"""Returns an iterator of documents that shall be processed on the given
@@ -100,7 +109,7 @@ def get_documents_lacking_preprocess(self, step_or_steps):
else:
query = query | q
if query is not None:
return IEDocument.objects.filter(query).order_by('id')
return self._docs().filter(query).order_by('id')
else:
return IEDocument.objects.none()
@@ -25,7 +25,10 @@ def __init__(self, step_runners, documents_manager):
then that runner will be treated as the responsible for
accomplishing such a PreProcessStep.
"""
from iepy.data.db import DocumentManager # circular imports safety
self.step_runners = step_runners
if not isinstance(documents_manager, DocumentManager):
documents_manager = DocumentManager(documents_manager)
self.documents = documents_manager
def walk_document(self, doc):
@@ -6,10 +6,18 @@
from unittest import TestCase
from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
from iepy.data.db import DocumentManager
class TestPreProcessPipeline(TestCase):
def patch_object(self, *args, **kwargs):
    """Start a ``mock.patch.object`` patch and undo it at test cleanup.

    All arguments are forwarded unchanged to ``mock.patch.object``. Returns
    the object produced by starting the patch, with the patcher attached to
    it as ``.patcher``.
    """
    active_patcher = mock.patch.object(*args, **kwargs)
    replacement = active_patcher.start()
    # Keep a handle on the patcher so a test can reach it from the result.
    replacement.patcher = active_patcher
    # Registered with addCleanup so the patch is stopped when the test ends.
    self.addCleanup(active_patcher.stop)
    return replacement
def test_walk_document_applies_all_step_runners_to_the_given_doc(self):
step1_runner = mock.MagicMock()
step1_runner.side_effect = lambda x: x.call_order.append(1)
@@ -60,15 +68,14 @@ def test_process_step_in_batch_filter_docs_to_apply_if_has_attr_step(self):
step_runner = mock.MagicMock(step=PreProcessSteps.tokenization,
override=False, increment=False)
all_docs = [object() for i in range(5)]
docs_manager = mock.MagicMock()
docs_manager.__iter__.return_value = all_docs
docs_manager.get_documents_lacking_preprocess.side_effect = lambda x: all_docs[:2]
self.patch_object(DocumentManager, '__iter__', return_value=all_docs)
dm_get_docs = self.patch_object(DocumentManager, 'get_documents_lacking_preprocess',
return_value=all_docs[:2])
# Ok, docs manager has 5 docs, but get_documents_lacking_preprocess will return
# only 2 of them
p = PreProcessPipeline([step_runner], docs_manager)
p = PreProcessPipeline([step_runner], DocumentManager())
p.process_step_in_batch(step_runner)
docs_filter = docs_manager.get_documents_lacking_preprocess
docs_filter.assert_called_once_with(step_runner.step)
dm_get_docs.assert_called_once_with(step_runner.step)
self.assertNotEqual(step_runner.call_count, 5)
self.assertEqual(step_runner.call_count, 2)
self.assertEqual(step_runner.call_args_list, [mock.call(d) for d in all_docs[:2]])

0 comments on commit d9a632b

Please sign in to comment.