Description
Code Sample, a copy-pastable example if possible
import pandas as pd
data = pd.read_excel('pandas_example.xlsx', sheet_name=0)
data = data.astype({'A': 'int32', 'B': 'object'})
data.to_parquet('example.parquet')
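For readers without the Excel file, the following self-contained sketch should reproduce the same error. The data here is hypothetical; the key assumption is that column B holds mostly integers with an occasional string such as 'Z31', so the parquet engine's type inference (which samples the leading values) settles on INT64 before it hits the string.
import pandas as pd

# Hypothetical data: column B is dtype 'object' because it mixes
# integers with one stray string, mirroring the Excel sheet above.
data = pd.DataFrame({
    'A': list(range(101)),
    'B': list(range(100)) + ['Z31'],
})
data = data.astype({'A': 'int32', 'B': 'object'})

# Fails: the engine infers INT64 from the leading integers and then
# cannot convert 'Z31'.
data.to_parquet('example.parquet', engine='fastparquet')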
Problem description
to_parquet tries to convert an object column to int64. This happens with either engine, but is most clearly seen when using data.to_parquet('example.parquet', engine='fastparquet'):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_column(f, data, selement, compression)
447 if selement.type == parquet_thrift.Type.INT64:
--> 448 data = data.astype(int)
449 elif selement.type == parquet_thrift.Type.BOOLEAN:
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
176 kwargs[new_arg_name] = new_arg_value
--> 177 return func(*args, **kwargs)
178 return wrapper
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
4996 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4997 **kwargs)
4998 return self._constructor(new_data).__finalize__(self)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
3713 def astype(self, dtype, **kwargs):
-> 3714 return self.apply('astype', dtype=dtype, **kwargs)
3715
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3580 kwargs['mgr'] = self
-> 3581 applied = getattr(b, f)(**kwargs)
3582 result_blocks = _extend_blocks(applied, result_blocks)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
574 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 575 **kwargs)
576
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
663 # _astype_nansafe works fine with 1-d only
--> 664 values = astype_nansafe(values.ravel(), dtype, copy=True)
665 values = values.reshape(self.shape)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
708 if np.issubdtype(dtype.type, np.integer):
--> 709 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
710
pandas\_libs\lib.pyx in pandas._libs.lib.astype_intsafe()
pandas/_libs/src\util.pxd in util.set_value_at_unsafe()
ValueError: invalid literal for int() with base 10: 'Z31'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-17-6bc14a88da64> in <module>()
----> 1 data.to_parquet('example.parquet', engine='fastparquet')
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\core\frame.py in to_parquet(self, fname, engine, compression, **kwargs)
1940 from pandas.io.parquet import to_parquet
1941 to_parquet(self, fname, engine,
-> 1942 compression=compression, **kwargs)
1943
1944 @Substitution(header='Write out the column names. If a list of strings '
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\io\parquet.py in to_parquet(df, path, engine, compression, **kwargs)
255 """
256 impl = get_engine(engine)
--> 257 return impl.write(df, path, compression=compression, **kwargs)
258
259
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\pandas\io\parquet.py in write(self, df, path, compression, **kwargs)
216 with catch_warnings(record=True):
217 self.api.write(path, df,
--> 218 compression=compression, **kwargs)
219
220 def read(self, path, columns=None, **kwargs):
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times)
846 if file_scheme == 'simple':
847 write_simple(filename, data, fmd, row_group_offsets,
--> 848 compression, open_with, has_nulls, append)
849 elif file_scheme in ['hive', 'drill']:
850 if append:
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_simple(fn, data, fmd, row_group_offsets, compression, open_with, has_nulls, append)
715 else None)
716 rg = make_row_group(f, data[start:end], fmd.schema,
--> 717 compression=compression)
718 if rg is not None:
719 fmd.row_groups.append(rg)
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in make_row_group(f, data, schema, compression)
612 comp = compression
613 chunk = write_column(f, data[column.name], column,
--> 614 compression=comp)
615 rg.columns.append(chunk)
616 rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in
~\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\fastparquet\writer.py in write_column(f, data, selement, compression)
453 raise ValueError('Error converting column "%s" to primitive '
454 'type %s. Original error: '
--> 455 '%s' % (data.name, t, e))
456
457 else:
ValueError: Error converting column "B" to primitive type INT64. Original error: invalid literal for int() with base 10: 'Z31'
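A quick diagnostic (not part of the original report) confirms that the column genuinely holds mixed Python types rather than a single type mislabeled:
# Count the Python types stored in the object column; seeing both
# <class 'int'> and <class 'str'> confirms the mixed-type column.
print(data['B'].map(type).value_counts())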
You can also see that it is a mixed-type column issue if you use to_csv and read_csv to round-trip the data through a CSV file instead - you get the following warning on import:
C:\Users\I347500\AppData\Local\Continuum\miniconda3\envs\work\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
Specifying the dtype option on import solves the issue, but it is inconvenient that there is no way to set column types after loading the data. It is also strange that to_parquet tries to infer column types instead of using the dtypes reported by .dtypes or .info().
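As a sketch of that import-time workaround (assuming the data was round-tripped to a hypothetical example.csv with to_csv):
import pandas as pd

# Force column B to a single type at parse time, so the frame never
# holds mixed values in the first place.
data = pd.read_csv('example.csv', dtype={'B': str})
data.to_parquet('example.parquet')  # B is written as a string column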
Expected Output
to_parquet writes the parquet file using the dtypes as specified.
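In the meantime, one workaround sketch (assuming a uniformly-string column is acceptable) is to convert the values themselves rather than just relabeling the dtype; astype(str) stringifies each element, whereas astype('object') leaves the mixed values untouched:
# Convert every element of B to str; unlike astype('object'), this
# changes the stored values, so parquet type inference sees one type.
data['B'] = data['B'].astype(str)
data.to_parquet('example.parquet', engine='fastparquet')  # succeeds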
Output of pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 3.6.5.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 61 Stepping 4, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None
pandas: 0.23.0
pytest: 3.5.1
pip: 10.0.1
setuptools: 39.1.0
Cython: None
numpy: 1.14.3
scipy: 1.1.0
pyarrow: 0.9.0
xarray: None
IPython: 6.4.0
sphinx: None
patsy: 0.5.0
dateutil: 2.7.3
pytz: 2018.4
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.2.2
openpyxl: None
xlrd: 1.1.0
xlwt: 1.3.0
xlsxwriter: 1.0.4
lxml: None
bs4: None
html5lib: 0.9999999
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: 0.1.5
pandas_gbq: None
pandas_datareader: None