Skip to content

Commit

Permalink
BUG: more consistent na_values #1657
Browse files Browse the repository at this point in the history
  • Loading branch information
changhiskhan committed Aug 19, 2012
1 parent 27c4c96 commit d9abf68
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 10 deletions.
6 changes: 3 additions & 3 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ data into a DataFrame object. They can take a number of arguments:
- ``names``: List of column names to use. If passed, header will be
implicitly set to None.
- ``na_values``: optional list of strings to recognize as NaN (missing
values), in addition to a default set. If you pass an empty list or an
empty list for a particular column, no values (including empty strings)
will be considered NA
values), either in addition to or in lieu of the default set.
- ``keep_default_na``: whether to include the default set of missing values
in addition to the ones specified in ``na_values``
- ``parse_dates``: if True then index will be parsed as dates
(False by default). You can specify more complicated options to parse
a subset of columns or a combination of columns into a single date column
Expand Down
31 changes: 25 additions & 6 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ class DateConversionError(Exception):
na_values : list-like or dict, default None
Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values
keep_default_na : bool, default True
If na_values are specified and keep_default_na is False, the default NaN
values are overridden; otherwise na_values are appended to the default set
parse_dates : boolean, list of ints or names, list of lists, or dict
True -> try parsing all columns
[1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column
Expand Down Expand Up @@ -199,6 +202,7 @@ def read_csv(filepath_or_buffer,
names=None,
skiprows=None,
na_values=None,
keep_default_na=True,
thousands=None,
comment=None,
parse_dates=False,
Expand All @@ -218,7 +222,8 @@ def read_csv(filepath_or_buffer,
sep=sep, dialect=dialect,
header=header, index_col=index_col,
names=names, skiprows=skiprows,
na_values=na_values, thousands=thousands,
na_values=na_values, keep_default_na=keep_default_na,
thousands=thousands,
comment=comment, parse_dates=parse_dates,
keep_date_col=keep_date_col,
dayfirst=dayfirst, date_parser=date_parser,
Expand All @@ -244,6 +249,7 @@ def read_table(filepath_or_buffer,
names=None,
skiprows=None,
na_values=None,
keep_default_na=True,
thousands=None,
comment=None,
parse_dates=False,
Expand All @@ -263,7 +269,8 @@ def read_table(filepath_or_buffer,
sep=sep, dialect=dialect,
header=header, index_col=index_col,
names=names, skiprows=skiprows,
na_values=na_values, thousands=thousands,
na_values=na_values, keep_default_na=keep_default_na,
thousands=thousands,
comment=comment, parse_dates=parse_dates,
keep_date_col=keep_date_col,
dayfirst=dayfirst, date_parser=date_parser,
Expand Down Expand Up @@ -292,6 +299,7 @@ def read_fwf(filepath_or_buffer,
names=None,
skiprows=None,
na_values=None,
keep_default_na=True,
thousands=None,
comment=None,
parse_dates=False,
Expand All @@ -311,7 +319,8 @@ def read_fwf(filepath_or_buffer,
colspecs=colspecs, widths=widths,
header=header, index_col=index_col,
names=names, skiprows=skiprows,
na_values=na_values, thousands=thousands,
na_values=na_values, keep_default_na=keep_default_na,
thousands=thousands,
comment=comment, parse_dates=parse_dates,
keep_date_col=keep_date_col,
dayfirst=dayfirst, date_parser=date_parser,
Expand Down Expand Up @@ -407,6 +416,7 @@ class TextParser(object):
Column or columns to use as the (possibly hierarchical) index
na_values : iterable, default None
Custom NA values
keep_default_na : bool, default True
If True, the default NA values are included in addition to na_values
thousands : str, default None
Thousands separator
comment : str, default None
Expand All @@ -425,7 +435,8 @@ class TextParser(object):
"""

def __init__(self, f, delimiter=None, dialect=None, names=None, header=0,
index_col=None, na_values=None, thousands=None,
index_col=None, na_values=None, keep_default_na=True,
thousands=None,
comment=None, parse_dates=False, keep_date_col=False,
date_parser=None, dayfirst=False,
chunksize=None, skiprows=None, skip_footer=0, converters=None,
Expand Down Expand Up @@ -467,12 +478,20 @@ def __init__(self, f, delimiter=None, dialect=None, names=None, header=0,

assert(self.skip_footer >= 0)

if na_values is None:
self.keep_default_na = keep_default_na
if na_values is None and keep_default_na:
self.na_values = _NA_VALUES
elif isinstance(na_values, dict):
if keep_default_na:
for k, v in na_values.iteritems():
v = set(list(v)) | _NA_VALUES
na_values[k] = v
self.na_values = na_values
else:
self.na_values = set(list(na_values)) | _NA_VALUES
na_values = set(list(na_values))
if keep_default_na:
na_values = na_values | _NA_VALUES
self.na_values = na_values

self.thousands = thousands
self.comment = comment
Expand Down
17 changes: 16 additions & 1 deletion pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,28 @@ def test_empty_string(self):
np.nan, 'seven']})
assert_frame_equal(xp.reindex(columns=df.columns), df)

df = read_csv(StringIO(data), na_values={'One': [], 'Three': []})
df = read_csv(StringIO(data), na_values={'One': [], 'Three': []},
keep_default_na=False)
xp = DataFrame({'One' : ['a', 'b', '', 'd', 'e', 'nan', 'g'],
'Two' : [1,2,3,4,5,6,7],
'Three' : ['one', 'two', 'three', 'nan', 'five',
'', 'seven']})
assert_frame_equal(xp.reindex(columns=df.columns), df)

df = read_csv(StringIO(data), na_values=['a'], keep_default_na=False)
xp = DataFrame({'One' : [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
'Two' : [1, 2, 3, 4, 5, 6, 7],
'Three' : ['one', 'two', 'three', 'nan', 'five', '',
'seven']})
assert_frame_equal(xp.reindex(columns=df.columns), df)

df = read_csv(StringIO(data), na_values={'One': [], 'Three': []})
xp = DataFrame({'One' : ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
'Two' : [1,2,3,4,5,6,7],
'Three' : ['one', 'two', 'three', np.nan, 'five',
np.nan, 'seven']})
assert_frame_equal(xp.reindex(columns=df.columns), df)


def test_read_csv(self):
pass
Expand Down

0 comments on commit d9abf68

Please sign in to comment.