Merge pull request #186 from okfn/remove-pdf

Remove PDF functionality
okfn · Nov 13, 2019 · 3366a08
2 parents 83758c8 + ebfd1e2
commit 3366a08
Show file tree

Hide file tree

Showing 11 changed files with 8 additions and 162 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,5 @@
+* Remove PDF functionality (PDFTableSet) - pdftables is not maintained
+
 0.15.2 (8 February 2017)
 * #165: detect ods types: boolean, currency, time and percentage. support repeated columns
 * #160: Correct spelling of separator in source

diff --git a/Dockerfile b/Dockerfile
@@ -26,5 +26,4 @@ WORKDIR /home/messytables
 COPY ./requirements-test.txt /home/messytables/
 RUN pip install --user -r /home/messytables/requirements-test.txt
 RUN pip3 install --user -r /home/messytables/requirements-test.txt
-RUN pip install --user pdftables
 COPY . /home/messytables/
diff --git a/doc/index.rst b/doc/index.rst
@@ -144,19 +144,6 @@ tables is ill-defined.
 
 .. autoclass:: messytables.html.HTMLRowSet
 
-PDF file support
-----------------
-
-The library supports PDF documents, using
-`pdftables <https://pdftables.readthedocs.io>`_ to extract tables.
-
-Works only for PDFs which contain text information: somewhat erratic in quality.
-
-.. autoclass:: messytables.pdf.PDFTableSet
-  :members: tables
-
-.. autoclass:: messytables.pdf.PDFRowSet
-
 ZIP file support
 ----------------
 

diff --git a/horror/simple.pdf b/horror/simple.pdf
diff --git a/messytables/__init__.py b/messytables/__init__.py
@@ -19,7 +19,6 @@
 
 from messytables.zip import ZIPTableSet
 from messytables.html import HTMLTableSet, HTMLRowSet
-from messytables.pdf import PDFTableSet, PDFRowSet
 from messytables.any import any_tableset, AnyTableSet
 
 from messytables.jts import rowset_as_jts, headers_and_typed_as_jts
diff --git a/messytables/any.py b/messytables/any.py
@@ -1,4 +1,4 @@
-from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet,
+from messytables import (ZIPTableSet, CSVTableSet, XLSTableSet,
                          HTMLTableSet, ODSTableSet)
 import messytables
 import re
@@ -21,7 +21,6 @@
               'application/vnd.openxmlformats-officedocument.spreadsheetml.sheetapplication/zip': 'XLS',
               'text/html': 'HTML',
               'application/xml': 'HTML', # XHTML is often served as application-xml
-              'application/pdf': 'PDF',
               'text/plain': 'CSV',  # could be TAB.
               'application/CDFV2-corrupt': 'XLS',
               'application/CDFV2-unknown': 'XLS',
@@ -38,8 +37,7 @@ def TABTableSet(fileobj):
            'XLS': XLSTableSet,
            'HTML': HTMLTableSet,
            'CSV': CSVTableSet,
-           'ODS': ODSTableSet,
-           'PDF': PDFTableSet}
+           'ODS': ODSTableSet}
 
 
 def clean_ext(filename):
@@ -99,7 +97,6 @@ def guess_ext(ext):
               'xlsx': 'XLS',
               'htm': 'HTML',
               'html': 'HTML',
-              'pdf': 'PDF',
               'xlt': 'XLS',
                 # obscure Excel extensions taken from
                 # http://en.wikipedia.org/wiki/List_of_Microsoft_Office_filename_extensions

diff --git a/messytables/pdf.py b/messytables/pdf.py
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,7 +1,6 @@
 httpretty==0.8.6
 nose==1.3.6
 requests==2.20.0
-# pdftables==0.0.4
 xlrd==0.9.3
 python-magic==0.4.12
 chardet==2.3.0

diff --git a/setup.py b/setup.py
@@ -45,11 +45,10 @@
         'python-dateutil>=1.5.0',
         'lxml>=3.2',
         'requests',
-        'six>=1.9', # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301  
-        'html5lib',        
+        'six>=1.9', # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301
+        'html5lib',
         'json-table-schema>=0.2, <=0.2.1'
     ],
-    extras_require={'pdf': ['pdftables>=0.0.4']},
     tests_require=[],
     entry_points=\
     """

diff --git a/test/test_any.py b/test/test_any.py
@@ -4,7 +4,7 @@
 from . import horror_fobj
 from nose.tools import assert_equal
 from nose.plugins.skip import SkipTest
-from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet,
+from messytables import (any_tableset, XLSTableSet, ZIPTableSet,
                          CSVTableSet, ODSTableSet,
                          ReadError)
 
@@ -17,17 +17,6 @@
           'tableset': XLSTableSet},
          ]
 
-# Special handling for PDFTables - skip if not installed
-try:
-    import pdftables
-except ImportError:
-    got_pdftables = False
-    suite.append({"filename": "simple.pdf", "tableset": False})
-else:
-    from messytables import PDFTableSet
-    got_pdftables = True
-    suite.append({"filename": "simple.pdf", "tableset": PDFTableSet})
-
 
 def test_simple():
     for d in suite:

diff --git a/test/test_read.py b/test/test_read.py
@@ -14,7 +14,7 @@
     from .shim26 import assert_is_instance, assert_greater_equal
 
 from messytables import (CSVTableSet, StringType, HTMLTableSet,
-                         ZIPTableSet, XLSTableSet, XLSXTableSet, PDFTableSet,
+                         ZIPTableSet, XLSTableSet, XLSXTableSet,
                          ODSTableSet, headers_guess, headers_processor,
                          offset_processor, DateType, FloatType,
                          IntegerType, BoolType, rowset_as_jts,
@@ -669,31 +669,3 @@ def test_html_table_name(self):
         assert_equal('Table 1 of 3', table_set.tables[0].name)
         assert_equal('Table 2 of 3', table_set.tables[1].name)
         assert_equal('Table 3 of 3', table_set.tables[2].name)
-
-
-class ReadPdfTest(unittest.TestCase):
-    def setUp(self):
-        with horror_fobj('simple.pdf') as fh:
-            try:
-                PDFTableSet(fh)
-            except ImportError:
-                # Optional library isn't installed. Skip the tests.
-                raise SkipTest(
-                    "pdftables is not installed, skipping PDF tests")
-
-    def test_read_simple_pdf(self):
-        with horror_fobj('simple.pdf') as fh:
-            table_set = PDFTableSet(fh)
-
-        assert_equal(1, len(list(table_set.tables)))
-
-        (table,) = table_set.tables
-        rows = list(table)
-
-        assert_greater_equal(len(rows), 1)
-
-    def test_pdf_names(self):
-        with horror_fobj('simple.pdf') as fh:
-            table_set = PDFTableSet(fh)
-        assert_equal('Table 1 of 1 on page 1 of 1',
-                     table_set.tables[0].name)