Skip to content

Commit

Permalink
PERF: Improve performance of StataReader (#25780)
Browse files Browse the repository at this point in the history
Improve performance of StataReader when converting columns
with missing values

xref #25772
  • Loading branch information
bashtage authored and jreback committed Mar 20, 2019
1 parent 4c21e5c commit 85c3f82
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 11 deletions.
16 changes: 13 additions & 3 deletions asv_bench/benchmarks/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ class Stata(BaseIO):

def setup(self, convert_dates):
self.fname = '__test__.dta'
N = 100000
C = 5
N = self.N = 100000
C = self.C = 5
self.df = DataFrame(np.random.randn(N, C),
columns=['float{}'.format(i) for i in range(C)],
index=date_range('20000101', periods=N, freq='H'))
self.df['object'] = tm.makeStringIndex(N)
self.df['object'] = tm.makeStringIndex(self.N)
self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min,
np.iinfo(np.int8).max - 27, N)
self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min,
Expand All @@ -36,4 +36,14 @@ def time_write_stata(self, convert_dates):
self.df.to_stata(self.fname, self.convert_dates)


class StataMissing(Stata):
    """Stata benchmark variant whose frame also carries columns with NaNs."""

    def setup(self, convert_dates):
        # Let the parent build the base frame (self.df), row count (self.N)
        # and target file name (self.fname) before extending them.
        super(StataMissing, self).setup(convert_dates)
        for col_number in range(10):
            values = np.random.randn(self.N)
            # Turn every negative draw into a missing value.
            values[values < 0] = np.nan
            self.df['missing_{0}'.format(col_number)] = values
        # Write the extended frame out to self.fname.
        # NOTE(review): self.convert_dates is presumably set by the parent
        # setup outside the visible hunk -- confirm.
        self.df.to_stata(self.fname, self.convert_dates)


from ..pandas_vb_common import setup # noqa: F401
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ I/O
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
-
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)


Plotting
Expand Down
19 changes: 13 additions & 6 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
from pandas.core.dtypes.common import (
ensure_object, is_categorical_dtype, is_datetime64_dtype)

from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
from pandas import (
DatetimeIndex, compat, concat, isna, to_datetime, to_timedelta)
from pandas.core.arrays import Categorical
from pandas.core.base import StringMixin
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -1572,7 +1573,7 @@ def read(self, nrows=None, convert_dates=None,
data = DataFrame.from_dict(OrderedDict(data_formatted))
del data_formatted

self._do_convert_missing(data, convert_missing)
data = self._do_convert_missing(data, convert_missing)

if convert_dates:
cols = np.where(lmap(lambda x: any(x.startswith(fmt)
Expand Down Expand Up @@ -1616,7 +1617,7 @@ def read(self, nrows=None, convert_dates=None,

def _do_convert_missing(self, data, convert_missing):
# Check for missing values, and replace if found

replacements = {}
for i, colname in enumerate(data):
fmt = self.typlist[i]
if fmt not in self.VALID_RANGE:
Expand Down Expand Up @@ -1646,8 +1647,14 @@ def _do_convert_missing(self, data, convert_missing):
dtype = np.float64
replacement = Series(series, dtype=dtype)
replacement[missing] = np.nan

data[colname] = replacement
replacements[colname] = replacement
if replacements:
columns = data.columns
replacements = DataFrame(replacements)
data = concat([data.drop(replacements.columns, 1),
replacements], 1)
data = data[columns]
return data

def _insert_strls(self, data):
if not hasattr(self, 'GSO') or len(self.GSO) == 0:
Expand Down Expand Up @@ -1712,7 +1719,7 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
except ValueError:
vc = Series(categories).value_counts()
repeats = list(vc.index[vc > 1])
repeats = '\n' + '-' * 80 + '\n'.join(repeats)
repeats = '-' * 80 + '\n' + '\n'.join(repeats)
raise ValueError('Value labels for column {col} are not '
'unique. The repeated labels are:\n'
'{repeats}'
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,7 +1311,7 @@ def test_unsupported_datetype(self):
def test_repeated_column_labels(self):
# GH 13923
msg = (r"Value labels for column ethnicsn are not unique\. The"
r" repeated labels are:\n\n-+wolof")
r" repeated labels are:\n-+\nwolof")
with pytest.raises(ValueError, match=msg):
read_stata(self.dta23, convert_categoricals=True)

Expand Down

0 comments on commit 85c3f82

Please sign in to comment.