pandas-dev · alexander-ponomaroff · Apr 15, 2019 · Apr 27, 2019 · Apr 27, 2019 · Apr 27, 2019
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -420,7 +420,7 @@ Other
 ^^^^^
 
 - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`)
-
+- :meth:`DataFrame.describe` now includes ``size`` as one of the summary statistics for numeric columns (:issue:`21689`)
 
 .. _whatsnew_0.250.contributors:
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -9804,9 +9804,10 @@ def describe(self, percentiles=None, include=None, exclude=None):
 
         def describe_numeric_1d(series):
             stat_index = (['count', 'mean', 'std', 'min'] +
-                          formatted_percentiles + ['max'])
+                          formatted_percentiles + ['max', 'size'])
             d = ([series.count(), series.mean(), series.std(), series.min()] +
-                 series.quantile(percentiles).tolist() + [series.max()])
+                 series.quantile(percentiles).tolist() + [series.max(),
+                 series.size])  # unlike count(), size includes NaN
             return pd.Series(d, index=stat_index, name=series.name)
 
         def describe_categorical_1d(data):

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -505,6 +505,19 @@ def test_corrwith_kendall(self):
     # ---------------------------------------------------------------------
     # Describe
 
+    def test_missing_describe(self):  # GH 21689
+        df = pd.DataFrame(data={'col1': [1, np.nan],
+                                'col2': [3, 4]})
+        result = df.describe()  # col1's NaN lowers count to 1; size stays 2
+
+        expected = pd.DataFrame({'col1': [1, 1, np.nan, 1, 1, 1, 1, 1, 2],
+                                 'col2': [2, 3.5, 0.707107, 3, 3.25, 3.5,
+                                          3.75, 4, 2]},
+                                index=['count', 'mean', 'std', 'min', '25%',
+                                       '50%', '75%', 'max', 'size'])
+
+        tm.assert_frame_equal(result, expected)
+
     def test_bool_describe_in_mixed_frame(self):
         df = DataFrame({
             'string_data': ['a', 'b', 'c', 'd', 'e'],
@@ -516,9 +529,9 @@ def test_bool_describe_in_mixed_frame(self):
         # Boolean and string data are not.
         result = df.describe()
         expected = DataFrame({'int_data': [5, 30, df.int_data.std(),
-                                           10, 20, 30, 40, 50]},
+                                           10, 20, 30, 40, 50, 5]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
+                                    '50%', '75%', 'max', 'size'])
         tm.assert_frame_equal(result, expected)
 
         # Top value is a boolean value that is False
@@ -546,9 +559,9 @@ def test_describe_bool_frame(self):
         })
         result = df.describe()
         expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1,
-                                           2, 3, 4]},
+                                           2, 3, 4, 5]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
+                                    '50%', '75%', 'max', 'size'])
         tm.assert_frame_equal(result, expected)
 
         df = pd.DataFrame({
@@ -605,11 +618,11 @@ def test_describe_categorical_columns(self):
                                           categories=['int1', 'int2', 'obj'],
                                           ordered=True, name='XXX')
         expected = DataFrame({'int1': [5, 30, df.int1.std(),
-                                       10, 20, 30, 40, 50],
+                                       10, 20, 30, 40, 50, 5],
                               'int2': [5, 30, df.int2.std(),
-                                       10, 20, 30, 40, 50]},
+                                       10, 20, 30, 40, 50, 5]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'],
+                                    '50%', '75%', 'max', 'size'],
                              columns=exp_columns)
         tm.assert_frame_equal(result, expected)
         tm.assert_categorical_equal(result.columns.values,
@@ -627,11 +640,11 @@ def test_describe_datetime_columns(self):
         exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
                                        freq='MS', tz='US/Eastern', name='XXX')
         expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
-                                  10, 20, 30, 40, 50],
+                                  10, 20, 30, 40, 50, 5],
                               1: [5, 30, df.iloc[:, 1].std(),
-                                  10, 20, 30, 40, 50]},
+                                  10, 20, 30, 40, 50, 5]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
+                                    '50%', '75%', 'max', 'size'])
         expected.columns = exp_columns
         tm.assert_frame_equal(result, expected)
         assert result.columns.freq == 'MS'
@@ -649,16 +662,16 @@ def test_describe_timedelta_values(self):
                                      pd.Timedelta('2 days'),
                                      pd.Timedelta('3 days'),
                                      pd.Timedelta('4 days'),
-                                     pd.Timedelta('5 days')],
+                                     pd.Timedelta('5 days'), 5],
                               't2': [5, pd.Timedelta('3 hours'),
                                      df.iloc[:, 1].std(),
                                      pd.Timedelta('1 hours'),
                                      pd.Timedelta('2 hours'),
                                      pd.Timedelta('3 hours'),
                                      pd.Timedelta('4 hours'),
-                                     pd.Timedelta('5 hours')]},
+                                     pd.Timedelta('5 hours'), 5]},
                              index=['count', 'mean', 'std', 'min', '25%',
-                                    '50%', '75%', 'max'])
+                                    '50%', '75%', 'max', 'size'])
 
         result = df.describe()
         tm.assert_frame_equal(result, expected)
@@ -671,7 +684,8 @@ def test_describe_timedelta_values(self):
                     "25%           2 days 00:00:00         0 days 02:00:00\n"
                     "50%           3 days 00:00:00         0 days 03:00:00\n"
                     "75%           4 days 00:00:00         0 days 04:00:00\n"
-                    "max           5 days 00:00:00         0 days 05:00:00")
+                    "max           5 days 00:00:00         0 days 05:00:00\n"
+                    "size                        5                       5")
         assert repr(result) == exp_repr
 
     def test_describe_tz_values(self, tz_naive_fixture):
@@ -684,14 +698,15 @@ def test_describe_tz_values(self, tz_naive_fixture):
         df = pd.DataFrame({'s1': s1, 's2': s2})
 
         expected = DataFrame({'s1': [5, np.nan, np.nan, np.nan, np.nan, np.nan,
-                                     2, 1.581139, 0, 1, 2, 3, 4],
+                                     2, 1.581139, 0, 1, 2, 3, 4, 5],
                               's2': [5, 5, s2.value_counts().index[0], 1,
                                      start.tz_localize(tz),
                                      end.tz_localize(tz), np.nan, np.nan,
-                                     np.nan, np.nan, np.nan, np.nan, np.nan]},
+                                     np.nan, np.nan, np.nan, np.nan, np.nan,
+                                     np.nan]},
                              index=['count', 'unique', 'top', 'freq', 'first',
                                     'last', 'mean', 'std', 'min', '25%', '50%',
-                                    '75%', 'max']
+                                    '75%', 'max', 'size']
                              )
         result = df.describe(include='all')
         tm.assert_frame_equal(result, expected)
-Original file line number
+Diff line change
@@ Expand Up / @@ -420,7 +420,7 @@ Other @@
     ^^^^^
     - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`)
+    - :meth:`DataFrame.describe` now includes ``size`` as one of the summary statistics for numeric columns (:issue:`21689`)
     .. _whatsnew_0.250.contributors:
@@ Expand Down @@