okfn · pudo · Aug 6, 2015 · Aug 24, 2015 · Aug 24, 2015 · Aug 24, 2015
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,12 @@
 *.swp
 *.egg-info
 *.pyc
+*.eggs
 *.DS_Store
 */_build/*
 *.py~
 *.~lock.*#
+.coverage
+dist/*
+.tox/*
+pyenv3
diff --git a/.travis.yml b/.travis.yml
@@ -1,12 +1,12 @@
 language: python
 python:
-  - "2.6"
   - "2.7"
-  - "3.4"
+  - "3.5"
 install:
+  - pip install -U pip setuptools
   - pip install -e .
   - pip install -r requirements-test.txt
-  - pip install coveralls
+  - pip install coveralls nose coverage httpretty
 script: nosetests --with-coverage --cover-package=messytables
 after_success:
   - coveralls
diff --git a/Dockerfile b/Dockerfile
diff --git a/Makefile b/Makefile
@@ -1,10 +1,4 @@
-run:    build
-	@docker run \
-	    --rm \
-		-ti \
-	    messytables
+test:
+	nosetests --with-coverage --cover-package=messytables --cover-erase
 
-build:
-	@docker build -t messytables .
-
-.PHONY: run build
+.PHONY: run build test
diff --git a/README.md b/README.md
@@ -1,19 +1,11 @@
-# Parsing for messy tables
-
-[![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables)
-[![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master)
-[![Latest Version](https://pypip.in/version/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/)
-[![Downloads](https://pypip.in/download/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/)
-[![Supported Python versions](https://pypip.in/py_versions/messytables/badge.svg)](https://pypi.python.org/pypi/ckanserviceprovider/)
-[![Development Status](https://pypip.in/status/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/)
-[![License](https://pypip.in/license/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/)
+# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master)
 
 A library for dealing with messy tabular data in several formats, guessing types and detecting headers.
 
-See the documentation at: https://messytables.readthedocs.io
+See the full documentation at: https://messytables.readthedocs.org
 
 Find the package at: https://pypi.python.org/pypi/messytables
 
-See CONTRIBUTING.md for how to send patches, run tests.
+See ``CONTRIBUTING.md`` for how to send patches, run tests.
 
 **Contact**: Open Knowledge Labs - http://okfnlabs.org/contact/. We especially recommend the forum: http://discuss.okfn.org/category/open-knowledge-labs/
diff --git a/messytables/__init__.py b/messytables/__init__.py
@@ -1,25 +1,21 @@
 
 from messytables.util import offset_processor, null_processor
-from messytables.headers import headers_guess, headers_processor, headers_make_unique
+from messytables.headers import headers_guess, headers_processor
+from messytables.headers import headers_make_unique
 from messytables.types import type_guess, types_processor
-from messytables.types import StringType, IntegerType, FloatType, \
-        DecimalType, DateType, DateUtilType, BoolType
 from messytables.error import ReadError
 
-from messytables.core import Cell, TableSet, RowSet, seekable_stream
-from messytables.commas import CSVTableSet, CSVRowSet
+from messytables.buffered import seekable_stream
+from messytables.core import Cell, TableSet, RowSet
+from messytables.commas import CSVTableSet, CSVRowSet, TSVTableSet
 from messytables.ods import ODSTableSet, ODSRowSet
 from messytables.excel import XLSTableSet, XLSRowSet
-
-# XLSXTableSet has been deprecated and its functionality is now provided by
-# XLSTableSet. This is to retain backwards compatibility with anyone
-# constructing XLSXTableSet directly (rather than using any_tableset)
-XLSXTableSet = XLSTableSet
-XLSXRowSet = XLSRowSet
-
 from messytables.zip import ZIPTableSet
 from messytables.html import HTMLTableSet, HTMLRowSet
 from messytables.pdf import PDFTableSet, PDFRowSet
-from messytables.any import any_tableset, AnyTableSet
+from messytables.any import any_tableset
 
 from messytables.jts import rowset_as_jts, headers_and_typed_as_jts
+
+import warnings
+warnings.filterwarnings('ignore', "Coercing non-XML name")
diff --git a/messytables/any.py b/messytables/any.py
@@ -1,8 +1,10 @@
-from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet,
-                         HTMLTableSet, ODSTableSet)
-import messytables
 import re
 
+from messytables import ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet
+from messytables import HTMLTableSet, ODSTableSet, TSVTableSet
+from messytables.buffered import seekable_stream
+from messytables.error import ReadError
+
 
 MIMELOOKUP = {'application/x-zip-compressed': 'ZIP',
               'application/zip': 'ZIP',
@@ -24,14 +26,13 @@
               'application/pdf': 'PDF',
               'text/plain': 'CSV',  # could be TAB.
               'application/CDFV2-corrupt': 'XLS',
+              'application/CDFV2-unknown': 'XLS',
               'application/vnd.oasis.opendocument.spreadsheet': 'ODS',
               'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS',
               }
 
-def TABTableSet(fileobj):
-    return CSVTableSet(fileobj, delimiter='\t')
 
-parsers = {'TAB': TABTableSet,
+parsers = {'TAB': TSVTableSet,
            'ZIP': ZIPTableSet,
            'XLS': XLSTableSet,
            'HTML': HTMLTableSet,
@@ -61,9 +62,9 @@ def get_mime(fileobj):
     import magic
     # Since we need to peek the start of the stream, make sure we can
     # seek back later. If not, slurp in the contents into a StringIO.
-    fileobj = messytables.seekable_stream(fileobj)
+    fileobj = seekable_stream(fileobj)
     header = fileobj.read(4096)
-    mimetype = magic.from_buffer(header, mime=True).decode('utf-8')
+    mimetype = magic.from_buffer(header, mime=True)  # .decode('utf-8')
     fileobj.seek(0)
     if MIMELOOKUP.get(mimetype) == 'ZIP':
         # consider whether it's an Microsoft Office document
@@ -159,13 +160,6 @@ def any_tableset(fileobj, mimetype=None, extension='', auto_detect=True, **kw):
                     mimetype=magic_mime))
 
     if error:
-        raise messytables.ReadError('any: \n'.join(error))
+        raise ReadError('any: \n'.join(error))
     else:
-        raise messytables.ReadError("any: Did not attempt any detection.")
-
-
-class AnyTableSet:
-    '''Deprecated - use any_tableset instead.'''
-    @staticmethod
-    def from_fileobj(fileobj, mimetype=None, extension=None):
-        return any_tableset(fileobj, mimetype=mimetype, extension=extension)
+        raise ReadError("any: Did not attempt any detection.")
diff --git a/messytables/buffered.py b/messytables/buffered.py
@@ -0,0 +1,89 @@
+import io
+
+BUFFER_SIZE = 4096
+
+
+def seekable_stream(fileobj):
+    """Return ``fileobj`` if it can seek, else wrap it in a BufferedFile."""
+    try:
+        fileobj.seek(0)
+    except Exception:
+        # seek is missing or failed: wrap the stream in a BufferedFile,
+        # which keeps the beginning of the data so it can be re-read
+        return BufferedFile(fileobj)
+    return fileobj
+
+
+class BufferedFile(object):
+    """Keep the start of a forward-only stream so it can be re-read.
+
+    Reads are copied into a buffer until it holds ``buffer_size`` bytes or
+    more; later data is handed on but dropped, and going back to dropped
+    data raises BufferError."""
+
+    def __init__(self, fp, buffer_size=BUFFER_SIZE + 2):
+        self.data = io.BytesIO()  # copy of the start of the stream
+        self.fp = fp
+        self.offset = 0  # logical position, as reported by tell()
+        self.len = 0  # number of bytes held in self.data
+        self.fp_offset = 0  # number of bytes consumed from self.fp
+        self.buffer_size = buffer_size
+
+    def _next_line(self):
+        try:
+            return self.fp.readline()
+        except AttributeError:
+            return next(self.fp)
+
+    def _read(self, n):
+        return self.fp.read(n)
+
+    @property
+    def _buffer_full(self):
+        return self.len >= self.buffer_size
+
+    def _fetch(self, from_buffer, from_fp, error):
+        """Serve one read from the buffer if possible, else from fp."""
+        # bytes in [len, fp_offset) were consumed without being kept
+        if self.len <= self.offset < self.fp_offset:
+            raise BufferError(error)
+        if self.offset < self.len:
+            chunk = from_buffer()
+        else:
+            chunk = from_fp()
+            self.fp_offset += len(chunk)
+            if not self._buffer_full:
+                self.data.write(chunk)
+                self.len += len(chunk)
+        self.offset += len(chunk)
+        return chunk
+
+    def readline(self):
+        """Return the next line (cut short where the buffer ends mid-line)."""
+        return self._fetch(self.data.readline, self._next_line,
+                           'Line is not available anymore')
+
+    def read(self, n=-1):
+        """Return up to ``n`` bytes; negative or None means all the rest."""
+        if n is not None and n >= 0:
+            return self._fetch(lambda: self.data.read(n),
+                               lambda: self._read(n),
+                               'Data is not available anymore')
+        # complete read: any dropped bytes still ahead of us are fatal
+        if max(self.offset, self.len) < self.fp_offset:
+            raise BufferError('Data is not available anymore')
+        self.data.seek(min(self.offset, self.len))
+        head, tail = self.data.read(-1), self.fp.read(-1)
+        self.fp_offset += len(tail)
+        self.offset += len(head) + len(tail)
+        return head + tail
+
+    def tell(self):
+        return self.offset
+
+    def seek(self, offset):
+        if self.len <= offset < self.fp_offset:
+            raise BufferError('Cannot seek because data is not buffered here')
+        # keep the buffer cursor in step so reads and appends line up
+        self.data.seek(min(offset, self.len))
+        self.offset = offset