Skip to content

Commit

Permalink
Merge pull request #186 from okfn/remove-pdf
Browse files: browse the repository at this point in the history
Remove PDF functionality
  • Loading branch information
David Read committed Nov 13, 2019
2 parents 83758c8 + ebfd1e2 commit 3366a08
Show file tree
Hide file tree
Showing 11 changed files with 8 additions and 162 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
* Remove PDF functionality (PDFTableSet) - pdftables is not maintained

0.15.2 (8 February 2017)
* #165: detect ods types: boolean, currency, time and percentage. support repeated columns
* #160: Correct spelling of separator in source
Expand Down
1 change: 0 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,4 @@ WORKDIR /home/messytables
COPY ./requirements-test.txt /home/messytables/
RUN pip install --user -r /home/messytables/requirements-test.txt
RUN pip3 install --user -r /home/messytables/requirements-test.txt
RUN pip install --user pdftables
COPY . /home/messytables/
13 changes: 0 additions & 13 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,19 +144,6 @@ tables is ill-defined.

.. autoclass:: messytables.html.HTMLRowSet

PDF file support
----------------

The library supports PDF documents, using
`pdftables <https://pdftables.readthedocs.io>`_ to extract tables.

Works only for PDFs which contain text information: somewhat erratic in quality.

.. autoclass:: messytables.pdf.PDFTableSet
:members: tables

.. autoclass:: messytables.pdf.PDFRowSet

ZIP file support
----------------

Expand Down
Binary file removed horror/simple.pdf
Binary file not shown.
1 change: 0 additions & 1 deletion messytables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

from messytables.zip import ZIPTableSet
from messytables.html import HTMLTableSet, HTMLRowSet
from messytables.pdf import PDFTableSet, PDFRowSet
from messytables.any import any_tableset, AnyTableSet

from messytables.jts import rowset_as_jts, headers_and_typed_as_jts
7 changes: 2 additions & 5 deletions messytables/any.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet,
from messytables import (ZIPTableSet, CSVTableSet, XLSTableSet,
HTMLTableSet, ODSTableSet)
import messytables
import re
Expand All @@ -21,7 +21,6 @@
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheetapplication/zip': 'XLS',
'text/html': 'HTML',
'application/xml': 'HTML', # XHTML is often served as application-xml
'application/pdf': 'PDF',
'text/plain': 'CSV', # could be TAB.
'application/CDFV2-corrupt': 'XLS',
'application/CDFV2-unknown': 'XLS',
Expand All @@ -38,8 +37,7 @@ def TABTableSet(fileobj):
'XLS': XLSTableSet,
'HTML': HTMLTableSet,
'CSV': CSVTableSet,
'ODS': ODSTableSet,
'PDF': PDFTableSet}
'ODS': ODSTableSet}


def clean_ext(filename):
Expand Down Expand Up @@ -99,7 +97,6 @@ def guess_ext(ext):
'xlsx': 'XLS',
'htm': 'HTML',
'html': 'HTML',
'pdf': 'PDF',
'xlt': 'XLS',
# obscure Excel extensions taken from
# http://en.wikipedia.org/wiki/List_of_Microsoft_Office_filename_extensions
Expand Down
97 changes: 0 additions & 97 deletions messytables/pdf.py

This file was deleted.

1 change: 0 additions & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
httpretty==0.8.6
nose==1.3.6
requests==2.20.0
# pdftables==0.0.4
xlrd==0.9.3
python-magic==0.4.12
chardet==2.3.0
Expand Down
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,10 @@
'python-dateutil>=1.5.0',
'lxml>=3.2',
'requests',
'six>=1.9', # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301
'html5lib',
'six>=1.9', # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301
'html5lib',
'json-table-schema>=0.2, <=0.2.1'
],
extras_require={'pdf': ['pdftables>=0.0.4']},
tests_require=[],
entry_points=\
"""
Expand Down
13 changes: 1 addition & 12 deletions test/test_any.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from . import horror_fobj
from nose.tools import assert_equal
from nose.plugins.skip import SkipTest
from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet,
from messytables import (any_tableset, XLSTableSet, ZIPTableSet,
CSVTableSet, ODSTableSet,
ReadError)

Expand All @@ -17,17 +17,6 @@
'tableset': XLSTableSet},
]

# Special handling for PDFTables - skip if not installed
try:
import pdftables
except ImportError:
got_pdftables = False
suite.append({"filename": "simple.pdf", "tableset": False})
else:
from messytables import PDFTableSet
got_pdftables = True
suite.append({"filename": "simple.pdf", "tableset": PDFTableSet})


def test_simple():
for d in suite:
Expand Down
30 changes: 1 addition & 29 deletions test/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .shim26 import assert_is_instance, assert_greater_equal

from messytables import (CSVTableSet, StringType, HTMLTableSet,
ZIPTableSet, XLSTableSet, XLSXTableSet, PDFTableSet,
ZIPTableSet, XLSTableSet, XLSXTableSet,
ODSTableSet, headers_guess, headers_processor,
offset_processor, DateType, FloatType,
IntegerType, BoolType, rowset_as_jts,
Expand Down Expand Up @@ -669,31 +669,3 @@ def test_html_table_name(self):
assert_equal('Table 1 of 3', table_set.tables[0].name)
assert_equal('Table 2 of 3', table_set.tables[1].name)
assert_equal('Table 3 of 3', table_set.tables[2].name)


class ReadPdfTest(unittest.TestCase):
def setUp(self):
with horror_fobj('simple.pdf') as fh:
try:
PDFTableSet(fh)
except ImportError:
# Optional library isn't installed. Skip the tests.
raise SkipTest(
"pdftables is not installed, skipping PDF tests")

def test_read_simple_pdf(self):
with horror_fobj('simple.pdf') as fh:
table_set = PDFTableSet(fh)

assert_equal(1, len(list(table_set.tables)))

(table,) = table_set.tables
rows = list(table)

assert_greater_equal(len(rows), 1)

def test_pdf_names(self):
with horror_fobj('simple.pdf') as fh:
table_set = PDFTableSet(fh)
assert_equal('Table 1 of 1 on page 1 of 1',
table_set.tables[0].name)

0 comments on commit 3366a08

Please sign in to comment.