pandas-dev · alexander-ponomaroff · Apr 15, 2019 · Apr 27, 2019 · Apr 27, 2019 · Apr 27, 2019
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -412,8 +412,7 @@ Other
 
 - Improved :class:`Timestamp` type checking in various datetime functions to prevent exceptions when using a subclassed `datetime` (:issue:`25851`)
 - Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`)
--
--
+- :meth:`DataFrame.describe` now includes a ``missing`` row reporting the number of missing values as one of the summary statistics for numeric data (:issue:`21689`)
 
 
 .. _whatsnew_0.250.contributors:

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -9802,9 +9802,10 @@ def describe(self, percentiles=None, include=None, exclude=None):
 
         def describe_numeric_1d(series):
             stat_index = (['count', 'mean', 'std', 'min'] +
-                          formatted_percentiles + ['max'])
+                          formatted_percentiles + ['max', 'missing'])
             d = ([series.count(), series.mean(), series.std(), series.min()] +
-                 series.quantile(percentiles).tolist() + [series.max()])
+                 series.quantile(percentiles).tolist() + [series.max(),
+                 series.isna().sum()])
             return pd.Series(d, index=stat_index, name=series.name)
 
         def describe_categorical_1d(data):

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -505,6 +505,19 @@ def test_corrwith_kendall(self):
     # ---------------------------------------------------------------------
     # Describe
 
+    def test_missing_describe(self):
+        df = pd.DataFrame(data={'col1': [1, np.nan],
+                                'col2': [3, 4]})
+        result = df.describe()
+
+        expected = pd.DataFrame({'col1': [1, 1, np.nan, 1, 1, 1, 1, 1, 1],
+                                 'col2': [2, 3.5, 0.707107, 3, 3.25, 3.5,
+                                          3.75, 4, 0]},
+                                index=['count', 'mean', 'std', 'min', '25%',
+                                       '50%', '75%', 'max', 'missing'])
+
+        tm.assert_frame_equal(result, expected)
+
     def test_bool_describe_in_mixed_frame(self):
         df = DataFrame({
             'string_data': ['a', 'b', 'c', 'd', 'e'],
@@ -516,9 +529,9 @@ def test_bool_describe_in_mixed_frame(self):
         # Boolean and string data are not.
         result = df.describe()
         expected = DataFrame({'int_data': [5, 30, df.int_data.std(),
-                                           10, 20, 30, 40, 50]},
+                                           10, 20, 30, 40, 50, 0]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
+                                    '50%', '75%', 'max', 'missing'])
         tm.assert_frame_equal(result, expected)
 
         # Top value is a boolean value that is False
@@ -546,9 +559,9 @@ def test_describe_bool_frame(self):
         })
         result = df.describe()
         expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1,
-                                           2, 3, 4]},
+                                           2, 3, 4, 0]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
+                                    '50%', '75%', 'max', 'missing'])
         tm.assert_frame_equal(result, expected)
 
         df = pd.DataFrame({
@@ -605,11 +618,11 @@ def test_describe_categorical_columns(self):
                                           categories=['int1', 'int2', 'obj'],
                                           ordered=True, name='XXX')
         expected = DataFrame({'int1': [5, 30, df.int1.std(),
-                                       10, 20, 30, 40, 50],
+                                       10, 20, 30, 40, 50, 0],
                               'int2': [5, 30, df.int2.std(),
-                                       10, 20, 30, 40, 50]},
+                                       10, 20, 30, 40, 50, 0]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'],
+                                    '50%', '75%', 'max', 'missing'],
                              columns=exp_columns)
         tm.assert_frame_equal(result, expected)
         tm.assert_categorical_equal(result.columns.values,
@@ -627,11 +640,11 @@ def test_describe_datetime_columns(self):
         exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
                                        freq='MS', tz='US/Eastern', name='XXX')
         expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
-                                  10, 20, 30, 40, 50],
+                                  10, 20, 30, 40, 50, 0],
                               1: [5, 30, df.iloc[:, 1].std(),
-                                  10, 20, 30, 40, 50]},
+                                  10, 20, 30, 40, 50, 0]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
+                                    '50%', '75%', 'max', 'missing'])
         expected.columns = exp_columns
         tm.assert_frame_equal(result, expected)
         assert result.columns.freq == 'MS'
@@ -649,29 +662,30 @@ def test_describe_timedelta_values(self):
                                      pd.Timedelta('2 days'),
                                      pd.Timedelta('3 days'),
                                      pd.Timedelta('4 days'),
-                                     pd.Timedelta('5 days')],
+                                     pd.Timedelta('5 days'), 0],
                               't2': [5, pd.Timedelta('3 hours'),
                                      df.iloc[:, 1].std(),
                                      pd.Timedelta('1 hours'),
                                      pd.Timedelta('2 hours'),
                                      pd.Timedelta('3 hours'),
                                      pd.Timedelta('4 hours'),
-                                     pd.Timedelta('5 hours')]},
+                                     pd.Timedelta('5 hours'), 0]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
+                                    '50%', '75%', 'max', 'missing'])
 
         result = df.describe()
         tm.assert_frame_equal(result, expected)
 
-        exp_repr = ("                           t1                      t2\n"
-                    "count                       5                       5\n"
-                    "mean          3 days 00:00:00         0 days 03:00:00\n"
-                    "std    1 days 13:56:50.394919  0 days 01:34:52.099788\n"
-                    "min           1 days 00:00:00         0 days 01:00:00\n"
-                    "25%           2 days 00:00:00         0 days 02:00:00\n"
-                    "50%           3 days 00:00:00         0 days 03:00:00\n"
-                    "75%           4 days 00:00:00         0 days 04:00:00\n"
-                    "max           5 days 00:00:00         0 days 05:00:00")
+        exp_repr = ("                             t1                      t2\n"
+                    "count                         5                       5\n"
+                    "mean            3 days 00:00:00         0 days 03:00:00\n"
+                    "std      1 days 13:56:50.394919  0 days 01:34:52.099788\n"
+                    "min             1 days 00:00:00         0 days 01:00:00\n"
+                    "25%             2 days 00:00:00         0 days 02:00:00\n"
+                    "50%             3 days 00:00:00         0 days 03:00:00\n"
+                    "75%             4 days 00:00:00         0 days 04:00:00\n"
+                    "max             5 days 00:00:00         0 days 05:00:00\n"
+                    "missing                       0                       0")
         assert repr(result) == exp_repr
 
     def test_describe_tz_values(self, tz_naive_fixture):
@@ -684,14 +698,15 @@ def test_describe_tz_values(self, tz_naive_fixture):
         df = pd.DataFrame({'s1': s1, 's2': s2})
 
         expected = DataFrame({'s1': [5, np.nan, np.nan, np.nan, np.nan, np.nan,
-                                     2, 1.581139, 0, 1, 2, 3, 4],
+                                     2, 1.581139, 0, 1, 2, 3, 4, 0],
                               's2': [5, 5, s2.value_counts().index[0], 1,
                                      start.tz_localize(tz),
                                      end.tz_localize(tz), np.nan, np.nan,
-                                     np.nan, np.nan, np.nan, np.nan, np.nan]},
+                                     np.nan, np.nan, np.nan, np.nan, np.nan,
+                                     np.nan]},
                              index=['count', 'unique', 'top', 'freq', 'first',
                                     'last', 'mean', 'std', 'min', '25%', '50%',
-                                    '75%', 'max']
+                                    '75%', 'max', 'missing']
                              )
         result = df.describe(include='all')
         tm.assert_frame_equal(result, expected)