Skip to content

Commit

Permalink
pipeline fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
jmansilla committed Mar 20, 2016
1 parent 1f40586 commit d9a632b
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 9 deletions.
15 changes: 12 additions & 3 deletions iepy/data/db.py
Expand Up @@ -38,6 +38,8 @@ class DocumentManager(object):
"""

### Basic administration and pre-process
def __init__(self, base_queryset=None):
self.base_queryset = base_queryset

def create_document(self, identifier, text, metadata=None, update_mode=False):
"""Creates a new Document with text ready to be inserted on the
Expand Down Expand Up @@ -73,14 +75,21 @@ def create_document(self, identifier, text, metadata=None, update_mode=False):

return doc

def _docs(self):
if self.base_queryset:
docs = self.base_queryset
else:
docs = IEDocument.objects.all()
return docs

def __iter__(self):
return iter(IEDocument.objects.all())
return iter(self._docs())

def get_raw_documents(self):
    """Return the documents whose ``text`` field is the empty string.

    The result is ``self._docs().filter(text='')``, i.e. whatever the
    ``filter`` call of the underlying document collection returns.
    """
    # Only the ``_docs()``-based lookup is kept. The direct
    # ``IEDocument.objects.filter(text='')`` return that preceded it ran
    # first and left this one unreachable.
    return self._docs().filter(text='')

def get_documents_lacking_preprocess(self, step_or_steps):
"""Returns an iterator of documents that shall be processed on the given
Expand All @@ -100,7 +109,7 @@ def get_documents_lacking_preprocess(self, step_or_steps):
else:
query = query | q
if query is not None:
return IEDocument.objects.filter(query).order_by('id')
return self._docs().filter(query).order_by('id')
else:
return IEDocument.objects.none()

Expand Down
3 changes: 3 additions & 0 deletions iepy/preprocess/pipeline.py
Expand Up @@ -25,7 +25,10 @@ def __init__(self, step_runners, documents_manager):
then that runner will be treated as the responsible for
accomplishing such a PreProcessStep.
"""
from iepy.data.db import DocumentManager # circular imports safety
self.step_runners = step_runners
if not isinstance(documents_manager, DocumentManager):
documents_manager = DocumentManager(documents_manager)
self.documents = documents_manager

def walk_document(self, doc):
Expand Down
19 changes: 13 additions & 6 deletions tests/test_preprocess_pipeline.py
Expand Up @@ -6,10 +6,18 @@
from unittest import TestCase

from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
from iepy.data.db import DocumentManager


class TestPreProcessPipeline(TestCase):

def patch_object(self, *args, **kwargs):
    """Start a ``mock.patch.object`` patch and register its ``stop`` as a cleanup.

    All positional and keyword arguments are forwarded untouched to
    ``mock.patch.object``. The object returned by starting the patch is
    handed back to the caller, with the patcher itself attached to it as
    the ``patcher`` attribute.
    """
    active_patch = mock.patch.object(*args, **kwargs)
    replacement = active_patch.start()
    replacement.patcher = active_patch
    # Registered only after a successful start, so stop() is never called
    # on a patcher that was not started.
    self.addCleanup(active_patch.stop)
    return replacement

def test_walk_document_applies_all_step_runners_to_the_given_doc(self):
step1_runner = mock.MagicMock()
step1_runner.side_effect = lambda x: x.call_order.append(1)
Expand Down Expand Up @@ -60,15 +68,14 @@ def test_process_step_in_batch_filter_docs_to_apply_if_has_attr_step(self):
step_runner = mock.MagicMock(step=PreProcessSteps.tokenization,
override=False, increment=False)
all_docs = [object() for i in range(5)]
docs_manager = mock.MagicMock()
docs_manager.__iter__.return_value = all_docs
docs_manager.get_documents_lacking_preprocess.side_effect = lambda x: all_docs[:2]
self.patch_object(DocumentManager, '__iter__', return_value=all_docs)
dm_get_docs = self.patch_object(DocumentManager, 'get_documents_lacking_preprocess',
return_value=all_docs[:2])
# Ok, docs manager has 5 docs, but get_documents_lacking_preprocess will return
# only 2 of them
p = PreProcessPipeline([step_runner], docs_manager)
p = PreProcessPipeline([step_runner], DocumentManager())
p.process_step_in_batch(step_runner)
docs_filter = docs_manager.get_documents_lacking_preprocess
docs_filter.assert_called_once_with(step_runner.step)
dm_get_docs.assert_called_once_with(step_runner.step)
self.assertNotEqual(step_runner.call_count, 5)
self.assertEqual(step_runner.call_count, 2)
self.assertEqual(step_runner.call_args_list, [mock.call(d) for d in all_docs[:2]])
Expand Down

0 comments on commit d9a632b

Please sign in to comment.