Skip to content

BUG: ValueError: Found non-unique column index when using read_csv with the pyarrow engine on a CSV that has duplicate columns #52408

@tfr2003

Description

@tfr2003

I got this error message:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[9], line 1
----> 1 ff=pd.read_csv("CSVWO/2022-December.csv",engine="pyarrow")

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/parsers/readers.py:912, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    899 kwds_defaults = _refine_defaults_read(
    900     dialect,
    901     delimiter,
   (...)
    908     dtype_backend=dtype_backend,
    909 )
    910 kwds.update(kwds_defaults)
--> 912 return _read(filepath_or_buffer, kwds)

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/parsers/readers.py:583, in _read(filepath_or_buffer, kwds)
    580     return parser
    582 with parser:
--> 583     return parser.read(nrows)

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1692, in TextFileReader.read(self, nrows)
   1689 if self.engine == "pyarrow":
   1690     try:
   1691         # error: "ParserBase" has no attribute "read"
-> 1692         df = self._engine.read()  # type: ignore[attr-defined]
   1693     except Exception:
   1694         self.close()

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/parsers/arrow_parser_wrapper.py:163, in ArrowParserWrapper.read(self)
    161     frame = table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
    162 else:
--> 163     frame = table.to_pandas()
    164 return self._finalize_pandas_output(frame)

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyarrow/array.pxi:830, in pyarrow.lib._PandasConvertible.to_pandas()

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyarrow/table.pxi:3990, in pyarrow.lib.Table._to_pandas()

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyarrow/pandas_compat.py:819, in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
    816     ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper)
    818 _check_data_column_metadata_consistency(all_columns)
--> 819 columns = _deserialize_column_index(table, all_columns, column_indexes)
    820 blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
    822 axes = [columns, index]

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyarrow/pandas_compat.py:938, in _deserialize_column_index(block_table, all_columns, column_indexes)
    935     columns = _reconstruct_columns_from_metadata(columns, column_indexes)
    937 # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
--> 938 columns = _flatten_single_level_multiindex(columns)
    940 return columns

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyarrow/pandas_compat.py:1185, in _flatten_single_level_multiindex(index)
   1183     # Cheaply check that we do not somehow have duplicate column names
   1184     if not index.is_unique:
-> 1185         raise ValueError('Found non-unique column index')
   1187     return pd.Index(
   1188         [levels[_label] if _label != -1 else None for _label in labels],
   1189         dtype=dtype,
   1190         name=index.names[0]
   1191     )
   1192 return index

ValueError: Found non-unique column index

This happens when I try to use `read_csv(path, engine="pyarrow")`.

Thanks

Metadata

Metadata

Assignees

No one assigned

    Labels

    Arrow (pyarrow functionality) · IO CSV (read_csv, to_csv) · Upstream issue (Issue related to pandas dependency)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions