Stubbing in tests for text extraction with Tika.

overview · Feb 26, 2015 · 806ad3a
1 parent d7bedb5
commit 806ad3a
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 0 deletions.
diff --git a/osp/corpus/test/utils/test_office_to_text.py b/osp/corpus/test/utils/test_office_to_text.py
@@ -0,0 +1,12 @@
+
+
+from osp.corpus.utils import office_to_text
+
+
+def test_extract_text():
+
+    """
+    Text should be extracted via Tika.
+    """
+
+    pass
diff --git a/osp/corpus/test/utils/test_pdf_to_text.py b/osp/corpus/test/utils/test_pdf_to_text.py
@@ -14,7 +14,10 @@ def test_extract_text():
 
     corpus = MockCorpus()
     corpus.add_segment('000')
+
+    # Create a PDF with 3 pages.
     handle = corpus.add_pdf('000', 'pdf', ['p1', 'p2', 'p3'])
 
+    # Should extract the text.
     pages = pdf_to_text(handle.name).split()
     assert pages == ['p1', 'p2', 'p3']