Skip to content

Commit

Permalink
Stubbing in tests for text extraction with Tika.
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmcclure committed Feb 26, 2015
1 parent d7bedb5 commit 806ad3a
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 0 deletions.
12 changes: 12 additions & 0 deletions osp/corpus/test/utils/test_office_to_text.py
@@ -0,0 +1,12 @@


from osp.corpus.utils import office_to_text


def test_extract_text():

    """
    Text should be extracted via Tika.

    Placeholder: the body is not implemented yet, so this test currently
    passes without exercising any extraction.
    """

    # Stub -- nothing is called or asserted here yet.
    pass
3 changes: 3 additions & 0 deletions osp/corpus/test/utils/test_pdf_to_text.py
Expand Up @@ -14,7 +14,10 @@ def test_extract_text():

corpus = MockCorpus()
corpus.add_segment('000')

# Create a PDF with 3 pages.
handle = corpus.add_pdf('000', 'pdf', ['p1', 'p2', 'p3'])

# Should extract the text.
pages = pdf_to_text(handle.name).split()
assert pages == ['p1', 'p2', 'p3']

0 comments on commit 806ad3a

Please sign in to comment.