Skip to content

Commit

Permalink
Merge pull request #3043 from jreback/combine_3016
Browse files Browse the repository at this point in the history
BUG: frame combine_first where non-specified values could cause dtype changes (#3041)
  • Loading branch information
jreback committed Mar 14, 2013
2 parents 1fe657a + 899c147 commit 4b22372
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 4 deletions.
2 changes: 2 additions & 0 deletions RELEASE.rst
Expand Up @@ -146,6 +146,7 @@ pandas 0.11.0
- Bug in DataFrame column insertion when the column creation fails, existing frame is left in
an irrecoverable state (GH3010_)
- Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_)
- Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_)
- Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from
other values), (GH2850_)
- Unstack of a frame with no nans would always cause dtype upcasting (GH2929_)
Expand Down Expand Up @@ -180,6 +181,7 @@ pandas 0.11.0
.. _GH3010: https://github.com/pydata/pandas/issues/3010
.. _GH3012: https://github.com/pydata/pandas/issues/3012
.. _GH3029: https://github.com/pydata/pandas/issues/3029
.. _GH3041: https://github.com/pydata/pandas/issues/3041


pandas 0.10.1
Expand Down
17 changes: 13 additions & 4 deletions pandas/core/frame.py
Expand Up @@ -3723,7 +3723,7 @@ def _compare(a, b):
return self._constructor(data=new_data, index=self.index,
columns=self.columns, copy=False)

def combine(self, other, func, fill_value=None):
def combine(self, other, func, fill_value=None, overwrite=True):
"""
Add two DataFrame objects and do not propagate NaN values, so if for a
(column, time) one frame is missing a value, it will default to the
Expand All @@ -3734,6 +3734,8 @@ def combine(self, other, func, fill_value=None):
other : DataFrame
func : function
fill_value : scalar value
overwrite : boolean, default True
If True then overwrite values for common keys in the calling frame
Returns
-------
Expand All @@ -3760,9 +3762,16 @@ def combine(self, other, func, fill_value=None):
series = this[col].values
otherSeries = other[col].values

this_mask = isnull(series)
other_mask = isnull(otherSeries)

# don't overwrite columns unecessarily
# DO propogate if this column is not in the intersection
if not overwrite and other_mask.all():
result[col] = this[col].copy()
continue

if do_fill:
this_mask = isnull(series)
other_mask = isnull(otherSeries)
series = series.copy()
otherSeries = otherSeries.copy()
series[this_mask] = fill_value
Expand Down Expand Up @@ -3798,7 +3807,7 @@ def combine_first(self, other):
combined : DataFrame
"""
combiner = lambda x, y: np.where(isnull(x), y, x)
return self.combine(other, combiner)
return self.combine(other, combiner, overwrite=False)

def update(self, other, join='left', overwrite=True, filter_func=None,
raise_conflict=False):
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/test_frame.py
Expand Up @@ -7248,6 +7248,30 @@ def test_combine_first_mixed_bug(self):
combined = frame1.combine_first(frame2)
self.assertEqual(len(combined.columns), 5)

# gh 3016 (same as in update)
df = DataFrame([[1.,2.,False, True],[4.,5.,True,False]],
columns=['A','B','bool1','bool2'])

other = DataFrame([[45,45]],index=[0],columns=['A','B'])
result = df.combine_first(other)
assert_frame_equal(result, df)

df.ix[0,'A'] = np.nan
result = df.combine_first(other)
df.ix[0,'A'] = 45
assert_frame_equal(result, df)

# doc example
df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
'B' : [np.nan, 2., 3., np.nan, 6.]})

df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
'B' : [np.nan, np.nan, 3., 4., 6., 8.]})

result = df1.combine_first(df2)
expected = DataFrame({ 'A' : [1,2,3,5,3,7.], 'B' : [np.nan,2,3,4,6,8] })
assert_frame_equal(result,expected)

def test_update(self):
df = DataFrame([[1.5, nan, 3.],
[1.5, nan, 3.],
Expand Down

0 comments on commit 4b22372

Please sign in to comment.