From 749c8f8bc9d8348afd7177fddcfa9dd942ab6feb Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 26 Mar 2012 14:01:32 -0400 Subject: [PATCH 1/9] ENH: Allow read_csv to take a URL --- pandas/io/parsers.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3028184936e72..71de6be277528 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,7 +20,10 @@ Parameters ---------- -filepath_or_buffer : string or file handle / StringIO +filepath_or_buffer : string or file handle / StringIO. The string could be + a URL. Valid URL schemes include http://, ftp://, and file://. For + file:// URLs, a host is expected. For instance, a local file could be + file://localhost/path/to/table.csv %s header : int, default 0 Row to use for the column labels of the parsed DataFrame @@ -80,7 +83,6 @@ %s """ % (_parser_params % _table_sep) - _fwf_widths = """\ colspecs : a list of pairs (tuples), giving the extents of the fixed-width fields of each line as half-open internals @@ -99,8 +101,25 @@ """ % (_parser_params % _fwf_widths) +def _is_url(url): + """ + Very naive check to see if url is an http, ftp, or file location. + """ + from urlparse import urlparse + parsed_url = urlparse(url) + if parsed_url.scheme in ['http','file', 'ftp']: + return True + else: + return False + + def _read(cls, filepath_or_buffer, kwds): "Generic reader of line files." + + if _is_url(filepath_or_buffer): + from urllib2 import urlopen + filepath_or_buffer = urlopen(filepath_or_buffer) + if hasattr(filepath_or_buffer, 'read'): f = filepath_or_buffer else: From 65701f35f092cd318aa1b89b6e9d78d6e997eb98 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 26 Mar 2012 14:03:00 -0400 Subject: [PATCH 2/9] TST: Add test data for URL io --- pandas/io/tests/salary.table | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 pandas/io/tests/salary.table diff --git a/pandas/io/tests/salary.table b/pandas/io/tests/salary.table new file mode 100644 index 0000000000000..090b53e5535b8 --- /dev/null +++ b/pandas/io/tests/salary.table @@ -0,0 +1,47 @@ +S X E M +13876 1 1 1 +11608 1 3 0 +18701 1 3 1 +11283 1 2 0 +11767 1 3 0 +20872 2 2 1 +11772 2 2 0 +10535 2 1 0 +12195 2 3 0 +12313 3 2 0 +14975 3 1 1 +21371 3 2 1 +19800 3 3 1 +11417 4 1 0 +20263 4 3 1 +13231 4 3 0 +12884 4 2 0 +13245 5 2 0 +13677 5 3 0 +15965 5 1 1 +12336 6 1 0 +21352 6 3 1 +13839 6 2 0 +22884 6 2 1 +16978 7 1 1 +14803 8 2 0 +17404 8 1 1 +22184 8 3 1 +13548 8 1 0 +14467 10 1 0 +15942 10 2 0 +23174 10 3 1 +23780 10 2 1 +25410 11 2 1 +14861 11 1 0 +16882 12 2 0 +24170 12 3 1 +15990 13 1 0 +26330 13 2 1 +17949 14 2 0 +25685 15 3 1 +27837 16 2 1 +18838 16 2 0 +17483 16 1 0 +19207 17 2 0 +19346 20 1 0 From dca72cc7e237085034830dd2df22d8a70d7efd28 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 26 Mar 2012 14:08:54 -0400 Subject: [PATCH 3/9] ENH: Allow https in url --- pandas/io/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 71de6be277528..392df471e07f0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -103,11 +103,11 @@ def _is_url(url): """ - Very naive check to see if url is an http, ftp, or file location. + Very naive check to see if url is an http(s), ftp, or file location. """ from urlparse import urlparse parsed_url = urlparse(url) - if parsed_url.scheme in ['http','file', 'ftp']: + if parsed_url.scheme in ['http','file', 'ftp', 'https']: return True else: return False From 60922a3b08121b6bf5f97d5519bd53b72a7203b9 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 26 Mar 2012 14:27:09 -0400 Subject: [PATCH 4/9] REF: Go ahead and import urlparse --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 392df471e07f0..ecc4f9bf413bb 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -4,6 +4,7 @@ from StringIO import StringIO import re from itertools import izip +from urlparse import urlparse import numpy as np @@ -105,7 +106,6 @@ def _is_url(url): """ Very naive check to see if url is an http(s), ftp, or file location. """ - from urlparse import urlparse parsed_url = urlparse(url) if parsed_url.scheme in ['http','file', 'ftp', 'https']: return True From 03e59a5343cefa0f1475691757dfbe7a39852a43 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 26 Mar 2012 14:27:52 -0400 Subject: [PATCH 5/9] ENH: Only give strings to _is_url --- pandas/io/parsers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ecc4f9bf413bb..33aee1c66a006 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -112,11 +112,10 @@ def _is_url(url): else: return False - def _read(cls, filepath_or_buffer, kwds): "Generic reader of line files." - if _is_url(filepath_or_buffer): + if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer): from urllib2 import urlopen filepath_or_buffer = urlopen(filepath_or_buffer) From f3788a024476e23b6895f04a585b9668ffb072b8 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Mon, 26 Mar 2012 14:29:28 -0400 Subject: [PATCH 6/9] TST: Add tests for read_table with URL --- pandas/io/tests/test_parsers.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index d9c7a60f6ec29..463672e23e00b 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -764,6 +764,18 @@ def test_fwf(self): df = read_fwf(StringIO(data3), colspecs=colspecs, delimiter='~', header=None) assert_frame_equal(df, expected) + def test_url(self): + # HTTP(S) + url = 'https://raw.github.com/jseabold/pandas/read-table-url/pandas/io/tests/salary.table' + url_table = read_table(url) + dirpath = curpath() + localtable = os.path.join(dirpath, 'salary.table') + local_table = read_table(localtable) + assert_frame_equal(url_table, local_table) + # FILE + url_table = read_table('file://localhost/'+localtable) + assert_frame_equal(url_table, local_table) + #TODO: ftp testing class TestParseSQL(unittest.TestCase): From 1d1292bb841689ee67a3b09ae787fa5ffca51ed1 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Thu, 29 Mar 2012 18:38:16 -0400 Subject: [PATCH 7/9] ENH: Py3 compatibility for reading URLs --- pandas/io/parsers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 33aee1c66a006..21d17afd2d8a0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,6 +13,7 @@ import datetime import pandas.core.common as com import pandas._tseries as lib +from pandas.util import py3compat from pandas.util.decorators import Appender @@ -118,7 +119,10 @@ def _read(cls, filepath_or_buffer, kwds): if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer): from urllib2 import urlopen filepath_or_buffer = urlopen(filepath_or_buffer) - + if py3compat.PY3: + from io import TextIOWrapper + filepath_or_buffer = TextIOWrapper(filepath_or_buffer, + encoding=encoding) if hasattr(filepath_or_buffer, 'read'): f = filepath_or_buffer else: From 0dcab6729f20b4389a4c6f8d8cd012b4c666a377 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Sat, 31 Mar 2012 14:55:24 -0400 Subject: [PATCH 8/9] ENH: Improve encoding error handling for URLs in Py3 --- pandas/io/parsers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 21d17afd2d8a0..f82a7a932e540 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -121,8 +121,13 @@ def _read(cls, filepath_or_buffer, kwds): filepath_or_buffer = urlopen(filepath_or_buffer) if py3compat.PY3: from io import TextIOWrapper + if encoding: + errors = 'strict' + else: + errors = 'replace' filepath_or_buffer = TextIOWrapper(filepath_or_buffer, - encoding=encoding) + encoding=encoding, + errors=errors) if hasattr(filepath_or_buffer, 'read'): f = filepath_or_buffer else: From 2e3f7f44d7c09b33a3403a9df7837ad7221a32fd Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Sat, 31 Mar 2012 21:13:32 +0100 Subject: [PATCH 9/9] Various fixes so test_url() passes on Python 3. --- pandas/core/common.py | 4 ++-- pandas/io/parsers.py | 6 +++--- setup.py | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 15cc0442d547f..be605d57e6e87 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -667,10 +667,10 @@ def _get_handle(path, mode, encoding=None): if py3compat.PY3: # pragma: no cover def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): # ignore encoding - return csv.reader(f, dialect=csv.excel, **kwds) + return csv.reader(f, dialect=dialect, **kwds) def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds): - return csv.writer(f, dialect=csv.excel, **kwds) + return csv.writer(f, dialect=dialect, **kwds) else: class UnicodeReader: """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f82a7a932e540..c4a136c8ce50c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -125,9 +125,9 @@ def _read(cls, filepath_or_buffer, kwds): errors = 'strict' else: errors = 'replace' - filepath_or_buffer = TextIOWrapper(filepath_or_buffer, - encoding=encoding, - errors=errors) + encoding = 'utf-8' + filepath_or_buffer = StringIO(filepath_or_buffer.read().decode(encoding, errors)) + if hasattr(filepath_or_buffer, 'read'): f = filepath_or_buffer else: diff --git a/setup.py b/setup.py index adc799ca23ef0..4251ef10de668 100755 --- a/setup.py +++ b/setup.py @@ -393,7 +393,8 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): ], package_data={'pandas.io' : ['tests/*.h5', 'tests/*.csv', - 'tests/*.xls'], + 'tests/*.xls', + 'tests/*.table'], 'pandas.tests' : ['data/*.pickle', 'data/*.csv'] },