Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

DOC: add doc for reading from DataFrame.to_html #3656

Merged
merged 2 commits into from May 21, 2013
Jump to file or symbol
Failed to load files and symbols.
+24 −12
Split
View
@@ -66,6 +66,7 @@ pandas 0.11.1
- ``melt`` now accepts the optional parameters ``var_name`` and ``value_name``
to specify custom column names of the returned DataFrame (GH3649_),
thanks @hoechenberger
+ - ``read_html`` no longer performs hard date conversion
**API Changes**
View
@@ -68,6 +68,21 @@ Enhancements
- ``pd.read_html()`` can now parse HTML strings, files or urls and return
DataFrames, courtesy of @cpcloud. (GH3477_, GH3605_, GH3606_, GH3616_).
It works with a *single* parser backend: BeautifulSoup4 + html5lib
+ - You can use ``pd.read_html()`` to read the output from ``DataFrame.to_html()`` like so:
+
+ .. ipython :: python
+
+ df = DataFrame({'a': range(3), 'b': list('abc')})
+ print df
+ html = df.to_html()
+ alist = pd.read_html(html, infer_types=True, index_col=0)
+ print df == alist[0]
+
+ Note that ``alist`` here is a Python ``list`` of DataFrames, so ``pd.read_html()`` and
+ ``DataFrame.to_html()`` are not inverses.
+
+ - ``pd.read_html()`` no longer performs hard conversion of date strings
+ (GH3656_).
- ``HDFStore``
@@ -211,3 +226,4 @@ on GitHub for a complete list.
.. _GH3616: https://github.com/pydata/pandas/issues/3616
.. _GH3605: https://github.com/pydata/pandas/issues/3605
.. _GH3606: https://github.com/pydata/pandas/issues/3606
+.. _GH3656: https://github.com/pydata/pandas/issues/3656
View
@@ -636,7 +636,6 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows):
# must be sequential since dates trump numbers if both args are given
if infer_types:
df = df.convert_objects(convert_numeric=True)
- df = df.convert_objects(convert_dates='coerce')
if index_col is not None:
cols = df.columns[index_col]
@@ -722,7 +721,7 @@ def _parse(parser, io, match, flavor, header, index_col, skiprows, infer_types,
def read_html(io, match='.+', flavor='html5lib', header=None, index_col=None,
- skiprows=None, infer_types=False, attrs=None):
+ skiprows=None, infer_types=True, attrs=None):
r"""Read an HTML table into a DataFrame.
Parameters
@@ -2,7 +2,6 @@
import re
from cStringIO import StringIO
from unittest import TestCase
-import collections
import numbers
from urllib2 import urlopen
from contextlib import closing
@@ -408,7 +407,7 @@ def try_remove_ws(x):
return x
df = self.run_read_html(self.banklist_data, 'Metcalf',
- attrs={'id': 'table'}, infer_types=True)[0]
+ attrs={'id': 'table'})[0]
ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'),
converters={'Updated Date': Timestamp,
'Closing Date': Timestamp})
@@ -431,7 +430,9 @@ def try_remove_ws(x):
'Hamilton Bank, NA', 'The Citizens Savings Bank']
dfnew = df.applymap(try_remove_ws).replace(old, new)
gtnew = ground_truth.applymap(try_remove_ws)
- assert_frame_equal(dfnew, gtnew)
+ converted = dfnew.convert_objects(convert_numeric=True)
+ assert_frame_equal(converted.convert_objects(convert_dates='coerce'),
+ gtnew)
@slow
def test_gold_canyon(self):
@@ -487,6 +488,3 @@ def test_lxml_finds_tbody():
url = ('http://ndb.nal.usda.gov/ndb/foods/show/1732?fg=&man=&'
'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam')
assert get_lxml_elements(url, 'tbody')
-
-
-
View
@@ -126,13 +126,13 @@ def assert_almost_equal(a, b, check_less_precise = False):
return assert_dict_equal(a, b)
if isinstance(a, basestring):
- assert a == b, "{0} != {1}".format(a, b)
+ assert a == b, "%s != %s" % (a, b)
return True
if isiterable(a):
np.testing.assert_(isiterable(b))
na, nb = len(a), len(b)
- assert na == nb, "{0} != {1}".format(na, nb)
+ assert na == nb, "%s != %s" % (na, nb)
if np.array_equal(a, b):
return True
@@ -154,8 +154,6 @@ def assert_almost_equal(a, b, check_less_precise = False):
if check_less_precise:
dtype_a = np.dtype(type(a))
dtype_b = np.dtype(type(b))
- if dtype_a.kind == 'i' and dtype_b == 'i':
- pass
if dtype_a.kind == 'f' and dtype_b == 'f':
if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4:
decimal = 3