Skip to content

Commit

Permalink
Revert "Workaround and test for #56 and euske/pdfminer#118"
Browse files Browse the repository at this point in the history
This reverts commit 7c31351.
  • Loading branch information
pombredanne committed Dec 7, 2015
1 parent bc98272 commit f87d2a8
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 50 deletions.
39 changes: 17 additions & 22 deletions src/textcode/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,35 +27,30 @@
import contextlib
from StringIO import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFException

from pdfminer.pdfdocument import PDFDocument
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

"""
Extracts text from a pdf file.
"""

def get_text_lines(location):
"""
Return a list of text lines extracted from a pdf file at location.
"""
extracted_text = StringIO()
laparams = LAParams()
with open(location, 'rb') as pdf_file:
try:
with contextlib.closing(PDFParser(pdf_file)) as parser:
document = PDFDocument(parser)
manager = PDFResourceManager()
with contextlib.closing(TextConverter(manager, extracted_text,
laparams=laparams)) as extractor:
interpreter = PDFPageInterpreter(manager, extractor)
pages = PDFPage.create_pages(document)
for page in pages:
interpreter.process_page(page)
extracted_text.seek(0)
lines = extracted_text.readlines()
except PDFException:
return []
with contextlib.closing(PDFParser(pdf_file)) as parser:
document = PDFDocument(parser)
manager = PDFResourceManager()
with contextlib.closing(TextConverter(manager, extracted_text,
laparams=laparams)) as extractor:
interpreter = PDFPageInterpreter(manager, extractor)
pages = PDFPage.create_pages(document)
for page in pages:
interpreter.process_page(page)
extracted_text.seek(0)
lines = extracted_text.readlines()
return lines
8 changes: 0 additions & 8 deletions tests/textcode/data/pdf/pdfminer_bug_118/README

This file was deleted.

Binary file removed tests/textcode/data/pdf/pdfminer_bug_118/faulty.pdf
Binary file not shown.
20 changes: 0 additions & 20 deletions tests/textcode/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,23 +64,3 @@ def get_text(location):
\x0c'''.splitlines(True)

self.assertEqual(expected, result)

def test_pdfminer_cant_parse_faulty_broadcom_doc(self):
# test for https://github.com/euske/pdfminer/issues/118
test_file = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdocument import PDFEncryptionError
with open(test_file,'rb') as inputfile:
parser=PDFParser(inputfile)
try:
PDFDocument(parser)
except PDFEncryptionError:
# PDFDocument(parser) should not raise here, of course; it will stop raising once upstream is fixed
pass

def test_get_text_lines_skip_parse_faulty_broadcom_doc(self):
# test for the get_text_lines workaround: the faulty PDF must yield an empty list of lines
test_file = self.get_test_loc('pdf/pdfminer_bug_118/faulty.pdf')
result = pdf.get_text_lines(test_file)
assert [] == result

0 comments on commit f87d2a8

Please sign in to comment.