Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include missing data count in pd.DataFrame.describe() #26102

Closed
Closed
Changes from 1 commit
Commits
File filter...
Filter file types
Jump to…
Jump to file or symbol
Failed to load files and symbols.

Always

Just for now

@@ -412,8 +412,7 @@ Other

- Improved :class:`Timestamp` type checking in various datetime functions to prevent exceptions when using a subclassed `datetime` (:issue:`25851`)
- Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`)
-
-
- :meth:`DataFrame.describe` now includes a ``missing`` row, reporting the number of missing values in each numeric column, as one of the summary statistics (:issue:`21689`)


.. _whatsnew_0.250.contributors:
@@ -9802,9 +9802,10 @@ def describe(self, percentiles=None, include=None, exclude=None):

def describe_numeric_1d(series):
stat_index = (['count', 'mean', 'std', 'min'] +
formatted_percentiles + ['max'])
formatted_percentiles + ['max', 'missing'])
d = ([series.count(), series.mean(), series.std(), series.min()] +

This comment has been minimized.

Copy link
@jreback

jreback Apr 28, 2019

Contributor

should be the first arg

series.quantile(percentiles).tolist() + [series.max()])
series.quantile(percentiles).tolist() + [series.max(),
series.isna().sum()])
return pd.Series(d, index=stat_index, name=series.name)

def describe_categorical_1d(data):
@@ -505,6 +505,19 @@ def test_corrwith_kendall(self):
# ---------------------------------------------------------------------
# Describe

def test_missing_describe(self):
    """Check that describe() emits a trailing 'missing' row per column.

    The frame has one NaN in ``col1`` and none in ``col2``, so the
    expected 'missing' entries are 1 and 0 respectively.
    """
    stat_labels = ['count', 'mean', 'std', 'min',
                   '25%', '50%', '75%', 'max', 'missing']

    # Expected summary values, listed in the same order as stat_labels.
    col1_stats = [1, 1, np.nan, 1, 1, 1, 1, 1, 1]
    col2_stats = [2, 3.5, 0.707107, 3, 3.25, 3.5, 3.75, 4, 0]
    expected = pd.DataFrame({'col1': col1_stats, 'col2': col2_stats},
                            index=stat_labels)

    frame = pd.DataFrame(data={'col1': [1, np.nan], 'col2': [3, 4]})
    result = frame.describe()

    tm.assert_frame_equal(result, expected)

def test_bool_describe_in_mixed_frame(self):
df = DataFrame({
'string_data': ['a', 'b', 'c', 'd', 'e'],
@@ -516,9 +529,9 @@ def test_bool_describe_in_mixed_frame(self):
# Boolean and string data are not.
result = df.describe()
expected = DataFrame({'int_data': [5, 30, df.int_data.std(),
10, 20, 30, 40, 50]},
10, 20, 30, 40, 50, 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'])
'50%', '75%', 'max', 'missing'])
tm.assert_frame_equal(result, expected)

# Top value is a boolean value that is False
@@ -546,9 +559,9 @@ def test_describe_bool_frame(self):
})
result = df.describe()
expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1,
2, 3, 4]},
2, 3, 4, 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'])
'50%', '75%', 'max', 'missing'])
tm.assert_frame_equal(result, expected)

df = pd.DataFrame({
@@ -605,11 +618,11 @@ def test_describe_categorical_columns(self):
categories=['int1', 'int2', 'obj'],
ordered=True, name='XXX')
expected = DataFrame({'int1': [5, 30, df.int1.std(),
10, 20, 30, 40, 50],
10, 20, 30, 40, 50, 0],
'int2': [5, 30, df.int2.std(),
10, 20, 30, 40, 50]},
10, 20, 30, 40, 50, 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'],
'50%', '75%', 'max', 'missing'],
columns=exp_columns)
tm.assert_frame_equal(result, expected)
tm.assert_categorical_equal(result.columns.values,
@@ -627,11 +640,11 @@ def test_describe_datetime_columns(self):
exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
freq='MS', tz='US/Eastern', name='XXX')
expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
10, 20, 30, 40, 50],
10, 20, 30, 40, 50, 0],
1: [5, 30, df.iloc[:, 1].std(),
10, 20, 30, 40, 50]},
10, 20, 30, 40, 50, 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'])
'50%', '75%', 'max', 'missing'])
expected.columns = exp_columns
tm.assert_frame_equal(result, expected)
assert result.columns.freq == 'MS'
@@ -649,29 +662,30 @@ def test_describe_timedelta_values(self):
pd.Timedelta('2 days'),
pd.Timedelta('3 days'),
pd.Timedelta('4 days'),
pd.Timedelta('5 days')],
pd.Timedelta('5 days'), 0],
't2': [5, pd.Timedelta('3 hours'),
df.iloc[:, 1].std(),
pd.Timedelta('1 hours'),
pd.Timedelta('2 hours'),
pd.Timedelta('3 hours'),
pd.Timedelta('4 hours'),
pd.Timedelta('5 hours')]},
pd.Timedelta('5 hours'), 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'])
'50%', '75%', 'max', 'missing'])

result = df.describe()
tm.assert_frame_equal(result, expected)

exp_repr = (" t1 t2\n"
"count 5 5\n"
"mean 3 days 00:00:00 0 days 03:00:00\n"
"std 1 days 13:56:50.394919 0 days 01:34:52.099788\n"
"min 1 days 00:00:00 0 days 01:00:00\n"
"25% 2 days 00:00:00 0 days 02:00:00\n"
"50% 3 days 00:00:00 0 days 03:00:00\n"
"75% 4 days 00:00:00 0 days 04:00:00\n"
"max 5 days 00:00:00 0 days 05:00:00")
exp_repr = (" t1 t2\n"
"count 5 5\n"
"mean 3 days 00:00:00 0 days 03:00:00\n"
"std 1 days 13:56:50.394919 0 days 01:34:52.099788\n"
"min 1 days 00:00:00 0 days 01:00:00\n"
"25% 2 days 00:00:00 0 days 02:00:00\n"
"50% 3 days 00:00:00 0 days 03:00:00\n"
"75% 4 days 00:00:00 0 days 04:00:00\n"
"max 5 days 00:00:00 0 days 05:00:00\n"
"missing 0 0")
assert repr(result) == exp_repr

def test_describe_tz_values(self, tz_naive_fixture):
@@ -684,14 +698,15 @@ def test_describe_tz_values(self, tz_naive_fixture):
df = pd.DataFrame({'s1': s1, 's2': s2})

expected = DataFrame({'s1': [5, np.nan, np.nan, np.nan, np.nan, np.nan,
2, 1.581139, 0, 1, 2, 3, 4],
2, 1.581139, 0, 1, 2, 3, 4, 0],
's2': [5, 5, s2.value_counts().index[0], 1,
start.tz_localize(tz),
end.tz_localize(tz), np.nan, np.nan,
np.nan, np.nan, np.nan, np.nan, np.nan]},
np.nan, np.nan, np.nan, np.nan, np.nan,
np.nan]},
index=['count', 'unique', 'top', 'freq', 'first',
'last', 'mean', 'std', 'min', '25%', '50%',
'75%', 'max']
'75%', 'max', 'missing']
)
result = df.describe(include='all')
tm.assert_frame_equal(result, expected)
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.