DataFrame.append preserves columns dtype if possible

pandas-dev · Jan 1, 2018 · 6bb6860 · 6bb6860
1 parent 0e3c797
commit 6bb6860
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 6 deletions.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -145,6 +145,7 @@ Other Enhancements
 - ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method.
   Previously, calls to ``pipe`` were diverted to  the ``mean`` method (:issue:`17905`).
 - :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`).
+- :meth:`DataFrame.append` now preserves the type of the calling dataframe's columns, when possible (:issue:`18359`).
 
 .. _whatsnew_0230.api_breaking:
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5039,8 +5039,12 @@ def append(self, other, ignore_index=False, verify_integrity=False):
                 # index name will be reset
                 index = Index([other.name], name=self.index.name)
 
-            combined_columns = self.columns.tolist() + self.columns.union(
-                other.index).difference(self.columns).tolist()
+            idx_diff = other.index.difference(self.columns)
+            try:
+                combined_columns = self.columns.append(idx_diff)
+            except TypeError:
+                lst = self.columns.tolist()
+                combined_columns = Index(lst).append(idx_diff)
             other = other.reindex(combined_columns, copy=False)
             other = DataFrame(other.values.reshape((1, len(other))),
                               index=index,

diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -1,5 +1,6 @@
 from warnings import catch_warnings
 
+import datetime as dt
 import dateutil
 import numpy as np
 from numpy.random import randn
@@ -820,11 +821,59 @@ def test_append_preserve_index_name(self):
         result = df1.append(df2)
         assert result.index.name == 'A'
 
+    @pytest.mark.parametrize("df_columns", [
+        pd.RangeIndex(3),
+        pd.CategoricalIndex('A B C'.split()),
+        pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]),
+        pd.IntervalIndex.from_breaks([0, 1, 2, 3]),
+        pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0),
+                          dt.datetime(2013, 1, 3, 6, 10),
+                          dt.datetime(2013, 1, 3, 7, 12)]),
+        pd.Index([1, 2, 3]),
+    ])
+    def test_append_same_columns_type(self, df_columns):
+        # GH18359
+
+        # ser.index is a plain pd.Index, so the result of df.append(ser) should
+        # have pd.Index columns (not possible for IntervalIndex and MultiIndex)
+        if not isinstance(df_columns, (pd.IntervalIndex, pd.MultiIndex)):
+            df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
+            ser = pd.Series([7], index=['a'], name=2)
+            result = df.append(ser)
+            idx_diff = ser.index.difference(df_columns)
+            combined_columns = Index(df_columns.tolist()).append(idx_diff)
+            expected = pd.DataFrame([[1., 2., 3., np.nan],
+                                     [4, 5, 6, np.nan],
+                                     [np.nan, np.nan, np.nan, 7]],
+                                    index=[0, 1, 2],
+                                    columns=combined_columns)
+            assert_frame_equal(result, expected)
+
+        # df wider than ser
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
+        ser_index = df_columns[:2]
+        ser = pd.Series([7, 8], index=ser_index, name=2)
+        result = df.append(ser)
+        expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]],
+                                index=[0, 1, 2],
+                                columns=df_columns)
+        assert_frame_equal(result, expected)
+
+        # ser wider than df
+        ser_index = df_columns
+        df_columns = df_columns[:2]
+        df = pd.DataFrame([[1, 2], [4, 5]], columns=df_columns)
+        ser = pd.Series([7, 8, 9], index=ser_index, name=2)
+        result = df.append(ser)
+        expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
+                                index=[0, 1, 2],
+                                columns=ser_index)
+        assert_frame_equal(result, expected)
+
     def test_append_dtype_coerce(self):
 
         # GH 4993
         # appending with datetime will incorrectly convert datetime64
-        import datetime as dt
         from pandas import NaT
 
         df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -1457,12 +1457,14 @@ def test_crosstab_normalize(self):
                                           index=pd.Index([1, 2, 'All'],
                                                          name='a',
                                                          dtype='object'),
-                                          columns=pd.Index([3, 4], name='b'))
+                                          columns=pd.Index([3, 4], name='b',
+                                                           dtype='object'))
         col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
                                           index=pd.Index([1, 2], name='a',
                                                          dtype='object'),
                                           columns=pd.Index([3, 4, 'All'],
-                                                           name='b'))
+                                                           name='b',
+                                                           dtype='object'))
 
         all_normal_margins = pd.DataFrame([[0.2, 0, 0.2],
                                            [0.2, 0.6, 0.8],
@@ -1471,7 +1473,8 @@ def test_crosstab_normalize(self):
                                                          name='a',
                                                          dtype='object'),
                                           columns=pd.Index([3, 4, 'All'],
-                                                           name='b'))
+                                                           name='b',
+                                                           dtype='object'))
         tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
                                           margins=True), row_normal_margins)
         tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',