diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e8f8cc..24c149a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,5 @@ +* Remove PDF functionality (PDFTableSet) - pdftables is no longer maintained + 0.15.2 (8 February 2017) * #165: detect ods types: boolean, currency, time and percentage. support repeated columns * #160: Correct spelling of separator in source diff --git a/Dockerfile b/Dockerfile index b682622..e77d800 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,5 +26,4 @@ WORKDIR /home/messytables COPY ./requirements-test.txt /home/messytables/ RUN pip install --user -r /home/messytables/requirements-test.txt RUN pip3 install --user -r /home/messytables/requirements-test.txt -RUN pip install --user pdftables COPY . /home/messytables/ diff --git a/doc/index.rst b/doc/index.rst index c5ed512..ac13688 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -144,19 +144,6 @@ tables is ill-defined. .. autoclass:: messytables.html.HTMLRowSet -PDF file support ----------------- - -The library supports PDF documents, using -`pdftables `_ to extract tables. - -Works only for PDFs which contain text information: somewhat erratic in quality. - -.. autoclass:: messytables.pdf.PDFTableSet - :members: tables - -.. 
autoclass:: messytables.pdf.PDFRowSet - ZIP file support ---------------- diff --git a/horror/simple.pdf b/horror/simple.pdf deleted file mode 100644 index 1877b9a..0000000 Binary files a/horror/simple.pdf and /dev/null differ diff --git a/messytables/__init__.py b/messytables/__init__.py index e2c03b9..8f27efb 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -19,7 +19,6 @@ from messytables.zip import ZIPTableSet from messytables.html import HTMLTableSet, HTMLRowSet -from messytables.pdf import PDFTableSet, PDFRowSet from messytables.any import any_tableset, AnyTableSet from messytables.jts import rowset_as_jts, headers_and_typed_as_jts diff --git a/messytables/any.py b/messytables/any.py index fd9dfc5..a6cd1ab 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,4 +1,4 @@ -from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet, +from messytables import (ZIPTableSet, CSVTableSet, XLSTableSet, HTMLTableSet, ODSTableSet) import messytables import re @@ -21,7 +21,6 @@ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheetapplication/zip': 'XLS', 'text/html': 'HTML', 'application/xml': 'HTML', # XHTML is often served as application-xml - 'application/pdf': 'PDF', 'text/plain': 'CSV', # could be TAB. 
'application/CDFV2-corrupt': 'XLS', 'application/CDFV2-unknown': 'XLS', @@ -38,8 +37,7 @@ def TABTableSet(fileobj): 'XLS': XLSTableSet, 'HTML': HTMLTableSet, 'CSV': CSVTableSet, - 'ODS': ODSTableSet, - 'PDF': PDFTableSet} + 'ODS': ODSTableSet} def clean_ext(filename): @@ -99,7 +97,6 @@ def guess_ext(ext): 'xlsx': 'XLS', 'htm': 'HTML', 'html': 'HTML', - 'pdf': 'PDF', 'xlt': 'XLS', # obscure Excel extensions taken from # http://en.wikipedia.org/wiki/List_of_Microsoft_Office_filename_extensions diff --git a/messytables/pdf.py b/messytables/pdf.py deleted file mode 100644 index 4f9052e..0000000 --- a/messytables/pdf.py +++ /dev/null @@ -1,97 +0,0 @@ -from messytables.core import RowSet, TableSet, Cell - -from messytables.types import StringType - -try: - from pdftables import get_tables -except ImportError as exc: - if "No module named" not in exc.args[0]: - raise - get_tables = None - - -class PDFCell(Cell): - - def __init__(self, pdftables_cell): - - self._cell = pdftables_cell - - if pdftables_cell.topleft: - w, h = pdftables_cell.size - self._properties = dict( - colspan=w, - rowspan=h, - ) - self.value = pdftables_cell.content - - else: - self._properties = {} - self.value = "" - - self.column = None - self.column_autogenerated = False - self.type = StringType() - - @property - def topleft(self): - return self._cell.topleft - - @property - def properties(self): - return self._properties - - -class PDFTableSet(TableSet): - """ - A TableSet from a PDF document. - """ - def __init__(self, fileobj=None, filename=None, **kw): - if get_tables is None: - raise ImportError("pdftables is not installed") - if filename is not None: - self.fh = open(filename, 'r') - elif fileobj is not None: - self.fh = fileobj - else: - raise TypeError('You must provide one of filename or fileobj') - self.raw_tables = get_tables(self.fh) - - def make_tables(self): - """ - Return a listing of tables (as PDFRowSets) in the table set. 
- """ - def table_name(table): - return "Table {0} of {1} on page {2} of {3}".format( - table.table_number_on_page, - table.total_tables_on_page, - table.page_number, - table.total_pages) - return [PDFRowSet(table_name(table), table) - for table in self.raw_tables] - - -class PDFRowSet(RowSet): - """ - A RowSet representing a PDF table. - """ - def __init__(self, name, table): - if get_tables is None: - raise ImportError("pdftables is not installed") - super(PDFRowSet, self).__init__() - self.name = name - self.table = table - self.meta = dict( - page_number=table.page_number + 1, - ) - - def raw(self, sample=False): - """ - Yield one row of cells at a time - """ - if hasattr(self.table, "cell_data"): - # New style of cell data. - for row in self.table.cell_data: - yield [PDFCell(pdf_cell) for pdf_cell in row] - else: - for row in self.table: - yield [Cell(pdf_cell) for pdf_cell in row] diff --git a/requirements-test.txt b/requirements-test.txt index 5fe995e..d3c62db 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,7 +1,6 @@ httpretty==0.8.6 nose==1.3.6 requests==2.20.0 -# pdftables==0.0.4 xlrd==0.9.3 python-magic==0.4.12 chardet==2.3.0 diff --git a/setup.py b/setup.py index 4f8f8ed..630a6cf 100644 --- a/setup.py +++ b/setup.py @@ -45,11 +45,10 @@ 'python-dateutil>=1.5.0', 'lxml>=3.2', 'requests', - 'six>=1.9', # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301 - 'html5lib', + 'six>=1.9', # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301 + 'html5lib', 'json-table-schema>=0.2, <=0.2.1' ], - extras_require={'pdf': ['pdftables>=0.0.4']}, tests_require=[], entry_points=\ """ diff --git a/test/test_any.py b/test/test_any.py index 1fbfe78..0aba8cd 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -4,7 +4,7 @@ from . 
import horror_fobj from nose.tools import assert_equal from nose.plugins.skip import SkipTest -from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, +from messytables import (any_tableset, XLSTableSet, ZIPTableSet, CSVTableSet, ODSTableSet, ReadError) @@ -17,17 +17,6 @@ 'tableset': XLSTableSet}, ] -# Special handling for PDFTables - skip if not installed -try: - import pdftables -except ImportError: - got_pdftables = False - suite.append({"filename": "simple.pdf", "tableset": False}) -else: - from messytables import PDFTableSet - got_pdftables = True - suite.append({"filename": "simple.pdf", "tableset": PDFTableSet}) - def test_simple(): for d in suite: diff --git a/test/test_read.py b/test/test_read.py index ec4dbdc..3e6e94c 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -14,7 +14,7 @@ from .shim26 import assert_is_instance, assert_greater_equal from messytables import (CSVTableSet, StringType, HTMLTableSet, - ZIPTableSet, XLSTableSet, XLSXTableSet, PDFTableSet, + ZIPTableSet, XLSTableSet, XLSXTableSet, ODSTableSet, headers_guess, headers_processor, offset_processor, DateType, FloatType, IntegerType, BoolType, rowset_as_jts, @@ -669,31 +669,3 @@ def test_html_table_name(self): assert_equal('Table 1 of 3', table_set.tables[0].name) assert_equal('Table 2 of 3', table_set.tables[1].name) assert_equal('Table 3 of 3', table_set.tables[2].name) - - -class ReadPdfTest(unittest.TestCase): - def setUp(self): - with horror_fobj('simple.pdf') as fh: - try: - PDFTableSet(fh) - except ImportError: - # Optional library isn't installed. Skip the tests. 
- raise SkipTest( - "pdftables is not installed, skipping PDF tests") - - def test_read_simple_pdf(self): - with horror_fobj('simple.pdf') as fh: - table_set = PDFTableSet(fh) - - assert_equal(1, len(list(table_set.tables))) - - (table,) = table_set.tables - rows = list(table) - - assert_greater_equal(len(rows), 1) - - def test_pdf_names(self): - with horror_fobj('simple.pdf') as fh: - table_set = PDFTableSet(fh) - assert_equal('Table 1 of 1 on page 1 of 1', - table_set.tables[0].name)