diff --git a/doc/source/release.rst b/doc/source/release.rst index aa852c4416ade..3888d430b49de 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -184,6 +184,11 @@ Bug Fixes - Bug in ``sum`` of a ``timedelta64[ns]`` series (:issue:`6462`) - Bug in ``resample`` with a timezone and certain offsets (:issue:`6397`) - Bug in ``iat/iloc`` with duplicate indices on a Series (:issue:`6493`) +- Bug in ``read_html`` where nan's were incorrectly being used to indicate + missing values in text. Should use the empty string for consistency with the + rest of pandas (:issue:`5129`). +- Bug in ``read_html`` tests where redirected invalid URLs would make one test + fail (:issue:`6445`). pandas 0.13.1 ------------- diff --git a/pandas/io/html.py b/pandas/io/html.py index e60630204a8b9..4375d08abc37c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -579,8 +579,9 @@ def _expand_elements(body): lens_max = lens.max() not_max = lens[lens != lens_max] + empty = [''] for ind, length in iteritems(not_max): - body[ind] += [np.nan] * (lens_max - length) + body[ind] += empty * (lens_max - length) def _data_to_frame(data, header, index_col, skiprows, infer_types, @@ -760,15 +761,15 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, the table in the HTML. These are not checked for validity before being passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - + attrs = {'id': 'table'} - + is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. :: - + attrs = {'asdf': 'table'} - + is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 table attributes can be found `here diff --git a/pandas/io/tests/data/computer_sales_page.html b/pandas/io/tests/data/computer_sales_page.html new file mode 100644 index 0000000000000..ff2b031b58d64 --- /dev/null +++ b/pandas/io/tests/data/computer_sales_page.html @@ -0,0 +1,619 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 
 Three months ended
+April 30
 Six months ended
+April 30
 
 
 2013  2012  2013  2012  
 
 In millions
 

Net revenue:

             

Notebooks

 $3,718 $4,900 $7,846 $9,842 

Desktops

  3,103  3,827  6,424  7,033 

Workstations

  521  537  1,056  1,072 

Other

  242  206  462  415 
          

Personal Systems

  7,584  9,470  15,788  18,362 
          

Supplies

  4,122  4,060  8,015  8,139 

Commercial Hardware

  1,398  1,479  2,752  2,968 

Consumer Hardware

  561  593  1,240  1,283 
          

Printing

  6,081  6,132  12,007  12,390 
          

Printing and Personal Systems Group

  13,665  15,602  27,795  30,752 
          

Industry Standard Servers

  2,806  3,186  5,800  6,258 

Technology Services

  2,272  2,335  4,515  4,599 

Storage

  857  990  1,690  1,945 

Networking

  618  614  1,226  1,200 

Business Critical Systems

  266  421  572  826 
          

Enterprise Group

  6,819  7,546  13,803  14,828 
          

Infrastructure Technology Outsourcing

  3,721  3,954  7,457  7,934 

Application and Business Services

  2,278  2,535  4,461  4,926 
          

Enterprise Services

  5,999  6,489  11,918  12,860 
          

Software

  941  970  1,867  1,916 

HP Financial Services

  881  968  1,838  1,918 

Corporate Investments

  10  7  14  37 
          

Total segments

  28,315  31,582  57,235  62,311 
          

Eliminations of intersegment net revenue and other

  (733) (889) (1,294) (1,582)
          

Total HP consolidated net revenue

 $27,582 $30,693 $55,941 $60,729 
          
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 77c15a6c58657..3a7106fc6b4bb 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -22,6 +22,7 @@ from pandas.compat import map, zip, StringIO, string_types from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html +from pandas.parser import CParserError import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -143,7 +144,7 @@ def test_banklist(self): def test_spam_no_types(self): with tm.assert_produces_warning(FutureWarning): df1 = self.read_html(self.spam_data, '.*Water.*', - infer_types=False) + infer_types=False) with tm.assert_produces_warning(FutureWarning): df2 = self.read_html(self.spam_data, 'Unit', infer_types=False) @@ -305,8 +306,11 @@ def test_bad_url_protocol(self): @network def test_invalid_url(self): - with tm.assertRaises(URLError): - self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') + try: + with tm.assertRaises(URLError): + self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') + except ValueError as e: + tm.assert_equal(str(e), 'No tables found') @slow def test_file_url(self): @@ -581,6 +585,14 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_computer_sales_page(self): + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + with tm.assertRaisesRegexp(CParserError, r"Passed header=\[0,1\] are " + "too many rows for this multi_index " + "of columns"): + with tm.assert_produces_warning(FutureWarning): + self.read_html(data, infer_types=False, header=[0, 1]) + class TestReadHtmlLxml(tm.TestCase): @classmethod @@ -631,6 +643,12 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_computer_sales_page(self): + import pandas as pd + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + with tm.assert_produces_warning(FutureWarning): + self.read_html(data, infer_types=False, header=[0, 1]) + def test_invalid_flavor(): url = 'google.com'