
Exception with read_html and header #5129

Closed
cancan101 opened this issue Oct 6, 2013 · 23 comments · Fixed by #6447
Labels: Bug, IO Data, IO HTML

@cancan101 (Contributor)

In [5]: pd.read_html("http://pastebin.com/raw.php?i=7mAF0Ei6",infer_types=False, header=[0,1])[0]
---------------------------------------------------------------------------

----> 1 pd.read_html("http://pastebin.com/raw.php?i=7mAF0Ei6",infer_types=False, header=[0,1])[0]

/home/alex/git/pandas/pandas/io/html.pyc in read_html(io, match, flavor, header, index_col, skiprows, infer_types, attrs, parse_dates, tupleize_cols, thousands)
    838                          'data (you passed a negative value)')
    839     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
--> 840                   parse_dates, tupleize_cols, thousands, attrs)

/home/alex/git/pandas/pandas/io/html.pyc in _parse(flavor, io, match, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands, attrs)
    710     return [_data_to_frame(table, header, index_col, skiprows, infer_types,
    711                            parse_dates, tupleize_cols, thousands)
--> 712             for table in tables]
    713 
    714 

/home/alex/git/pandas/pandas/io/html.pyc in _data_to_frame(data, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands)
    600                     skiprows=_get_skiprows(skiprows),
    601                     parse_dates=parse_dates, tupleize_cols=tupleize_cols,
--> 602                     thousands=thousands)
    603     df = tp.read()
    604 

/home/alex/git/pandas/pandas/io/parsers.pyc in TextParser(*args, **kwds)
   1171     """
   1172     kwds['engine'] = 'python'
-> 1173     return TextFileReader(*args, **kwds)
   1174 
   1175 

/home/alex/git/pandas/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    481             self.options['has_index_names'] = kwds['has_index_names']
    482 
--> 483         self._make_engine(self.engine)
    484 
    485     def _get_options_with_defaults(self, engine):

/home/alex/git/pandas/pandas/io/parsers.pyc in _make_engine(self, engine)
    596             elif engine == 'python-fwf':
    597                 klass = FixedWidthFieldParser
--> 598             self._engine = klass(self.f, **self.options)
    599 
    600     def _failover_to_python(self):

/home/alex/git/pandas/pandas/io/parsers.pyc in __init__(self, f, **kwds)
   1294         if len(self.columns) > 1:
   1295             self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns(
-> 1296                 self.columns, self.index_names, self.col_names)
   1297         else:
   1298             self.columns = self.columns[0]

/home/alex/git/pandas/pandas/io/parsers.pyc in _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names)
    734         # if we find 'Unnamed' all of a single level, then our header was too long
    735         for n in range(len(columns[0])):
--> 736             if all([ 'Unnamed' in c[n] for c in columns ]):
    737                 raise _parser.CParserError("Passed header=[%s] are too many rows for this "
    738                                            "multi_index of columns" % ','.join([ str(x) for x in self.header ]))

TypeError: argument of type 'float' is not iterable
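
The failing membership test is easy to reproduce outside of pandas: the unnamed header cells come back as NaN (a float), and the in operator only works on strings and other containers. A minimal illustration:

c_n = float('nan')   # what an empty/unnamed header cell ends up as
'Unnamed' in c_n     # TypeError: argument of type 'float' is not iterable
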
@cpcloud (Member) commented Oct 6, 2013

easy fix. ... coming up

@cpcloud (Member) commented Oct 6, 2013

"bug" is that you can't call 'Unnamed' in c[n] if c[n] is nan

ghost assigned cpcloud on Oct 6, 2013
@cpcloud (Member) commented Oct 6, 2013

hm k not so easy :p

@cpcloud (Member) commented Oct 6, 2013

@cancan101 For now do this:

df = read_html("http://pastebin.com/raw.php?i=7mAF0Ei6",
               infer_types=False, header=[0, 1],
               tupleize_cols=True)[0]

@cancan101 (Contributor, Author)

@cpcloud Did you push new code?

Otherwise, assuming you meant pd and not self, I get the same error:

In [31]: df = pd.read_html("http://pastebin.com/raw.php?i=7mAF0Ei6",
                    infer_types=False, header=[0, 1],
                    tupleize_cols=True)[0]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-31-23ec51a3b3fd> in <module>()
      1 df = pd.read_html("http://pastebin.com/raw.php?i=7mAF0Ei6",
      2                     infer_types=False, header=[0, 1],
----> 3                     tupleize_cols=True)[0]

/home/alex/git/pandas/pandas/io/html.py in read_html(io, match, flavor, header, index_col, skiprows, infer_types, attrs, parse_dates, tupleize_cols, thousands)
    882                          'data (you passed a negative value)')
    883     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
--> 884                   parse_dates, tupleize_cols, thousands, attrs)

/home/alex/git/pandas/pandas/io/html.py in _parse(flavor, io, match, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands, attrs)
    754     return [_data_to_frame(table, header, index_col, skiprows, infer_types,
    755                            parse_dates, tupleize_cols, thousands)
--> 756             for table in tables]
    757 
    758 

/home/alex/git/pandas/pandas/io/html.py in _data_to_frame(data, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands)
    643                     skiprows=_get_skiprows(skiprows),
    644                     parse_dates=parse_dates, tupleize_cols=tupleize_cols,
--> 645                     thousands=thousands)
    646     df = tp.read()
    647 

/home/alex/git/pandas/pandas/io/parsers.py in TextParser(*args, **kwds)
   1171     """
   1172     kwds['engine'] = 'python'
-> 1173     return TextFileReader(*args, **kwds)
   1174 
   1175 

/home/alex/git/pandas/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    481             self.options['has_index_names'] = kwds['has_index_names']
    482 
--> 483         self._make_engine(self.engine)
    484 
    485     def _get_options_with_defaults(self, engine):

/home/alex/git/pandas/pandas/io/parsers.py in _make_engine(self, engine)
    596             elif engine == 'python-fwf':
    597                 klass = FixedWidthFieldParser
--> 598             self._engine = klass(self.f, **self.options)
    599 
    600     def _failover_to_python(self):

/home/alex/git/pandas/pandas/io/parsers.py in __init__(self, f, **kwds)
   1294         if len(self.columns) > 1:
   1295             self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns(
-> 1296                 self.columns, self.index_names, self.col_names)
   1297         else:
   1298             self.columns = self.columns[0]

/home/alex/git/pandas/pandas/io/parsers.py in _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names)
    734         # if we find 'Unnamed' all of a single level, then our header was too long
    735         for n in range(len(columns[0])):
--> 736             if all([ 'Unnamed' in c[n] for c in columns ]):
    737                 raise _parser.CParserError("Passed header=[%s] are too many rows for this "
    738                                            "multi_index of columns" % ','.join([ str(x) for x in self.header ]))

TypeError: argument of type 'float' is not iterable

@cpcloud (Member) commented Oct 6, 2013

oh whoops ... sorry, that was with a local fix :) my bad

@cancan101 (Contributor, Author)

@cpcloud Not sure if this is related: #5107

@cpcloud (Member) commented Oct 7, 2013

Maybe ... for some of these you might just have to play around with the params and do the parsing manually afterwards. These tables are really horrible (great for testing!)
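
One way to do that manual parsing, as a rough sketch (not verified against this particular table; it assumes the first two parsed rows are the header rows):

import pandas as pd

url = "http://pastebin.com/raw.php?i=7mAF0Ei6"
raw = pd.read_html(url, header=None)[0]        # read without header inference

header_rows = raw.iloc[:2].fillna('')          # the two rows that should form the header
body = raw.iloc[2:].reset_index(drop=True)     # the actual data
body.columns = pd.MultiIndex.from_arrays(
    [header_rows.iloc[0], header_rows.iloc[1]])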

@cancan101 (Contributor, Author)

Agreed about changing params. Still, a StopIteration seems pretty bad.

@cancan101 (Contributor, Author)

@cpcloud Whatever came of that local fix?

@jreback (Contributor) commented Oct 16, 2013

@cpcloud ?

@cpcloud (Member) commented Oct 16, 2013

I can get to it this weekend

@jreback (Contributor) commented Oct 16, 2013

np :)

@jreback (Contributor) commented Oct 23, 2013

@cpcloud ?

@jreback (Contributor) commented Oct 27, 2013

bumping to 0.14

@cpcloud (Member) commented Oct 27, 2013

pushing to 0.14 ... running out of time

@cpcloud (Member) commented Jan 17, 2014

@cancan101 This still an issue?

@cancan101 (Contributor, Author)

Yes, I still get the same exception.

@cpcloud (Member) commented Jan 17, 2014

OK thx

@cpcloud (Member) commented Feb 22, 2014

This might be the most annoying bug ever

@cpcloud (Member) commented Feb 22, 2014

@cancan101 What are you expecting header=[0, 1] to return in this case?

Looking at this table I can't really see how header=[0, 1] makes any sense.

@cancan101 (Contributor, Author)

Let me see if I can find a better example of the issue.

@cpcloud (Member) commented Feb 22, 2014

I think this example is fine. I'm just not sure exactly what to do here ... the problem is that lxml will drop some data even in recover=False mode and won't fail parsing, but the bs4 engine will fail because it doesn't drop data, and it throws the correct parser error.
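
For what it's worth, the flavor can be forced so that the stricter bs4 code path is exercised; given the behaviour described above, this should surface the "too many header rows" parser error rather than silently dropping cells (a sketch; it assumes BeautifulSoup 4 and html5lib are installed):

import pandas as pd

# Force the BeautifulSoup-based parser instead of lxml.
tables = pd.read_html("http://pastebin.com/raw.php?i=7mAF0Ei6",
                      flavor='bs4', header=[0, 1])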
