StopIteration error when trying to import specific columns from Excel file using parse_cols #9002

Closed
pybokeh opened this Issue Dec 4, 2014 · 9 comments

Comments

Projects
None yet
4 participants

pybokeh commented Dec 4, 2014

df = pd.read_excel(r'\\path_to_Excel_file', 
                   'Sheet1', parse_cols=['MODEL_YEAR'])

returns:

---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
<ipython-input-29-336e4fd9eea1> in <module>()
      4 
      5 df = pd.read_excel(r'\\path_to_Excel_file', 
----> 6                    'Sheet1', parse_cols=["MODEL_YEAR"])

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\excel.py in read_excel(io, sheetname, **kwds)
    125     engine = kwds.pop('engine', None)
    126 
--> 127     return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
    128 
    129 

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\excel.py in parse(self, sheetname, header, skiprows, skip_footer, index_col, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, convert_float, has_index_names, **kwds)
    236                                  skip_footer=skip_footer,
    237                                  convert_float=convert_float,
--> 238                                  **kwds)
    239 
    240     def _should_parse(self, i, parse_cols):

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\excel.py in _parse_excel(self, sheetname, header, skiprows, skip_footer, index_col, has_index_names, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, convert_float, **kwds)
    355                             skip_footer=skip_footer,
    356                             chunksize=chunksize,
--> 357                             **kwds)
    358 
    359         return parser.read()

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\parsers.py in TextParser(*args, **kwds)
   1285     """
   1286     kwds['engine'] = 'python'
-> 1287     return TextFileReader(*args, **kwds)
   1288 
   1289 

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
    555             self.options['has_index_names'] = kwds['has_index_names']
    556 
--> 557         self._make_engine(self.engine)
    558 
    559     def _get_options_with_defaults(self, engine):

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
    698             elif engine == 'python-fwf':
    699                 klass = FixedWidthFieldParser
--> 700             self._engine = klass(self.f, **self.options)
    701 
    702     def _failover_to_python(self):

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\parsers.py in __init__(self, f, **kwds)
   1392         # infer column indices from self.usecols if is is specified.
   1393         self._col_indices = None
-> 1394         self.columns, self.num_original_columns = self._infer_columns()
   1395 
   1396         # Now self.columns has the set of columns that we will process.

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\parsers.py in _infer_columns(self)
   1607             columns = []
   1608             for level, hr in enumerate(header):
-> 1609                 line = self._buffered_line()
   1610 
   1611                 while self.line_pos <= hr:

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\parsers.py in _buffered_line(self)
   1734             return self.buf[0]
   1735         else:
-> 1736             return self._next_line()
   1737 
   1738     def _empty(self, line):

D:\Miniconda3\envs\notebook\lib\site-packages\pandas\io\parsers.py in _next_line(self)
   1758                             break
   1759                 except IndexError:
-> 1760                     raise StopIteration
   1761         else:
   1762             while self.pos in self.skiprows:

StopIteration: 

Documentation says the following:
parse_cols : int or list, default None

  • If None then parse all columns
  • If int then indicates last column to be parsed
  • If list of ints then indicates list of column numbers to be parsed
  • If string then indicates comma separated list of column names and column ranges (e.g. “A:E” or “A,C,E:F”)

Also did:

df = pd.read_excel(r'\\path_to_Excel_file', 
                   'Sheet1', parse_cols="MODEL_YEAR")

But still got the same error.

Using Windows 7 with Miniconda3
conda list output:
beautiful-soup            4.3.2                    py34_0
beautifulsoup4            4.3.2                     <pip>
brewer2mpl                1.4.1                     <pip>
dateutil                  2.1                      py34_2
ggplot                    0.6.5                     <pip>
ipython                   2.3.1                    py34_0
jdcal                     1.0                      py34_0
jinja2                    2.7.3                    py34_1
lxml                      3.4.0                    py34_0
markupsafe                0.23                     py34_0
matplotlib                1.4.2                np19py34_0
numexpr                   2.3.1                np19py34_0
numpy                     1.9.1                    py34_0
openpyxl                  2.0.2                    py34_0
pandas                    0.15.1               np19py34_0
patsy                     0.3.0                np19py34_0
pip                       1.5.6                    py34_0
pycrypto                  2.6.1                    py34_2
pygments                  2.0.1                    py34_0
pyodbc                    3.0.7                    py34_0
pyparsing                 2.0.1                    py34_0
pyqt                      4.10.4                   py34_0
pyreadline                2.0                      py34_0
python                    3.4.2                         0
python-dateutil           2.1                       <pip>
pytz                      2014.9                   py34_0
pywin32                   219                      py34_0
pyzmq                     14.4.1                   py34_0
requests                  2.4.3                    py34_0
scipy                     0.14.0               np19py34_0
seaborn                   0.5.0                     <pip>
setuptools                7.0                      py34_0
six                       1.8.0                    py34_0
statsmodels               0.6.0                np19py34_0
tornado                   4.0.2                    py34_0
xlrd                      0.9.3                    py34_0
Contributor

jreback commented Dec 4, 2014

create a csv and try to parse, then show what you have.

pybokeh commented Dec 5, 2014

This worked using usecols parameter:
df = pd.read_csv(r'D:\temp\four_years.csv', sep=',', usecols=['DTF','MTF'])

Not sure how this is relevant to my read_excel problem.

Also, not sure why we have parse_cols for read_excel() versus usecols for read_csv(), but I see there is already an issue created for parameter name inconsistency.

Contributor

jreback commented Dec 5, 2014

it could be a bug. Interested in looking at it?

@jreback jreback added Bug Excel labels Dec 5, 2014

jreback added this to the 0.16.0 milestone Dec 5, 2014

pybokeh commented Dec 5, 2014

@jreback Sorry, wish I could. But admittedly, my Python knowledge is not that great. Been using Python for a few years as a data analyst, but I have not personally created a large project or debugged a large project before.

@jreback jreback modified the milestone: 0.16.0, Next Major Release Mar 6, 2015

Just FYI I am having exactly the same problem with pandas 0.17.1

Contributor

BranYang commented Feb 24, 2016

@pybokeh @cardosan Do we have an excel file to reproduce this bug?

Contributor

BranYang commented Feb 26, 2016

@cardosan Sorry but the file you uploaded cannot reproduce this bug on master now. Could you please verify the bug using you own environment and paste your results here?

import pandas as pd
df = pd.read_csv("010215_BAU45_SUMMARY_GIU_tot_dem_codes.xls")
pd.__version__

Here is my results running on most recent master code (up to commit fe584e7)

In [1]: import pandas as pd

In [2]: df=pd.read_excel("C:/D/tmp/010215_BAU45_SUMMARY_GIU_tot_dem_codes.xls")
# It successfully completed.
In [3]: pd.__version__
Out[3]: '0.18.0rc1+47.gfe584e7'

And using 0.17.1 on MY computer

In [1]: import pandas as pd

In [2]: df=pd.read_excel("C:/D/tmp/010215_BAU45_SUMMARY_GIU_tot_dem_codes.xls")
# It successfully completed on my computer
In [3]: pd.__version__
Out[3]: '0.17.1'

@jreback jreback modified the milestone: 0.18.0, Next Major Release Feb 27, 2016

jreback closed this in 78d671f Feb 27, 2016

The prob is not when I just read the file but when explicitly call parse_cols, see below

In [5]: df=pd.read_excel("/mnt/D092A60B92A5F65E/DATA/Work/FORMIT/WP2/simulations_from_wp2/01_02_15/010215_BAU45_SUMMARY_GIU_tot_dem_codes.xls",parse_cols='country')
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/parsers.py in _next_line(self)
   1804                 try:
-> 1805                     line = self._check_comments([self.data[self.pos]])[0]
   1806                     self.pos += 1

IndexError: list index out of range

During handling of the above exception, another exception occurred:

StopIteration                             Traceback (most recent call last)
<ipython-input-5-7fbe79d06135> in <module>()
----> 1 df=pd.read_excel("/mnt/D092A60B92A5F65E/DATA/Work/FORMIT/WP2/simulations_from_wp2/01_02_15/010215_BAU45_SUMMARY_GIU_tot_dem_codes.xls",parse_cols='country')

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/excel.py in read_excel(io, sheetname, header, skiprows, skip_footer, index_col, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, has_index_names, converters, engine, **kwds)
    168         date_parser=date_parser, na_values=na_values, thousands=thousands,
    169         convert_float=convert_float, has_index_names=has_index_names,
--> 170         skip_footer=skip_footer, converters=converters, **kwds)
    171 
    172 class ExcelFile(object):

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/excel.py in _parse_excel(self, sheetname, header, skiprows, skip_footer, index_col, has_index_names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, verbose, **kwds)
    434                                 skiprows=skiprows,
    435                                 skip_footer=skip_footer,
--> 436                                 **kwds)
    437 
    438             output[asheetname] = parser.read()

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/parsers.py in TextParser(*args, **kwds)
   1344     """
   1345     kwds['engine'] = 'python'
-> 1346     return TextFileReader(*args, **kwds)
   1347 
   1348 

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    588             self.options['has_index_names'] = kwds['has_index_names']
    589 
--> 590         self._make_engine(self.engine)
    591 
    592     def _get_options_with_defaults(self, engine):

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    735             elif engine == 'python-fwf':
    736                 klass = FixedWidthFieldParser
--> 737             self._engine = klass(self.f, **self.options)
    738 
    739     def _failover_to_python(self):

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/parsers.py in __init__(self, f, **kwds)
   1450         # infer column indices from self.usecols if is is specified.
   1451         self._col_indices = None
-> 1452         self.columns, self.num_original_columns = self._infer_columns()
   1453 
   1454         # Now self.columns has the set of columns that we will process.

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/parsers.py in _infer_columns(self)
   1664             columns = []
   1665             for level, hr in enumerate(header):
-> 1666                 line = self._buffered_line()
   1667 
   1668                 while self.line_pos <= hr:

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/parsers.py in _buffered_line(self)
   1791             return self.buf[0]
   1792         else:
-> 1793             return self._next_line()
   1794 
   1795     def _empty(self, line):

/home/giuseppec/miniconda3/envs/bw2_py3/lib/python3.4/site-packages/pandas/io/parsers.py in _next_line(self)
   1815                             break
   1816                 except IndexError:
-> 1817                     raise StopIteration
   1818         else:
   1819             while self.pos in self.skiprows:

StopIteration: 

In [6]: pd.__version__
Out[6]: '0.17.1'


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment