Skip to content

Commit

Permalink
Allow merging on object / non-object column (#21681)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored and jreback committed Jan 3, 2019
1 parent c08de6b commit 819418e
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 45 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Expand Up @@ -432,6 +432,7 @@ Backwards incompatible API changes
- The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`)
- Incorrectly passing a :class:`DatetimeIndex` to :meth:`MultiIndex.from_tuples`, rather than a sequence of tuples, now raises a ``TypeError`` rather than a ``ValueError`` (:issue:`24024`)
- :func:`pd.offsets.generate_range` argument ``time_rule`` has been removed; use ``offset`` instead (:issue:`24157`)
- In 0.23.x, pandas would raise a ``ValueError`` on a merge of a numeric column (e.g. ``int`` dtyped column) and an ``object`` dtyped column (:issue:`9780`). We have re-enabled the ability to merge ``object`` and other dtypes; pandas will still raise on a merge between a numeric and an ``object`` dtyped column that is composed only of strings (:issue:`21681`)

Percentage change on groupby
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
64 changes: 44 additions & 20 deletions pandas/core/reshape/merge.py
Expand Up @@ -20,7 +20,7 @@
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
needs_i8_conversion)
is_object_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import isnull, na_value_for_dtype

from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
Expand Down Expand Up @@ -901,6 +901,8 @@ def _maybe_coerce_merge_keys(self):

lk_is_cat = is_categorical_dtype(lk)
rk_is_cat = is_categorical_dtype(rk)
lk_is_object = is_object_dtype(lk)
rk_is_object = is_object_dtype(rk)

# if either left or right is a categorical
# then they must match exactly in categories & ordered
Expand All @@ -925,7 +927,7 @@ def _maybe_coerce_merge_keys(self):
# the same, then proceed
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
if lk.dtype.kind == rk.dtype.kind:
pass
continue

# check whether ints and floats
elif is_integer_dtype(rk) and is_float_dtype(lk):
Expand All @@ -934,29 +936,49 @@ def _maybe_coerce_merge_keys(self):
'columns where the float values '
'are not equal to their int '
'representation', UserWarning)
continue

elif is_float_dtype(rk) and is_integer_dtype(lk):
if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
warnings.warn('You are merging on int and float '
'columns where the float values '
'are not equal to their int '
'representation', UserWarning)
continue

# let's infer and see if we are ok
elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
pass
continue

# Check if we are trying to merge on obviously
# incompatible dtypes GH 9780, GH 15800

# boolean values are considered as numeric, but are still allowed
# to be merged on object boolean values
elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk))
and not is_numeric_dtype(rk)):
raise ValueError(msg)
elif (not is_numeric_dtype(lk)
and (is_numeric_dtype(rk) and not is_bool_dtype(rk))):
raise ValueError(msg)
# bool values are coerced to object
elif ((lk_is_object and is_bool_dtype(rk)) or
(is_bool_dtype(lk) and rk_is_object)):
pass

# object values are allowed to be merged
elif ((lk_is_object and is_numeric_dtype(rk)) or
(is_numeric_dtype(lk) and rk_is_object)):
inferred_left = lib.infer_dtype(lk)
inferred_right = lib.infer_dtype(rk)
bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']

# inferred bool
if (inferred_left in bool_types and
inferred_right in bool_types):
pass

# unless we are merging non-string-like with string-like
elif ((inferred_left in string_types and
inferred_right not in string_types) or
(inferred_right in string_types and
inferred_left not in string_types)):
raise ValueError(msg)

# datetimelikes must match exactly
elif is_datetimelike(lk) and not is_datetimelike(rk):
raise ValueError(msg)
elif not is_datetimelike(lk) and is_datetimelike(rk):
Expand All @@ -966,22 +988,24 @@ def _maybe_coerce_merge_keys(self):
elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
raise ValueError(msg)

elif lk_is_object and rk_is_object:
continue

# Houston, we have a problem!
# let's coerce to object if the dtypes aren't
# categorical, otherwise coerce to the category
# dtype. If we coerced categories to object,
# then we would lose type information on some
# columns, and end up trying to merge
# incompatible dtypes. See GH 16900.
else:
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(
**{name: self.right[name].astype(typ)})
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(
**{name: self.right[name].astype(typ)})

def _validate_specification(self):
# Hm, any way to make this logic less complicated??
Expand Down
55 changes: 30 additions & 25 deletions pandas/tests/reshape/merge/test_merge.py
Expand Up @@ -926,10 +926,6 @@ class TestMergeDtypes(object):
@pytest.mark.parametrize('right_vals', [
['foo', 'bar'],
Series(['foo', 'bar']).astype('category'),
[1, 2],
[1.0, 2.0],
Series([1, 2], dtype='uint64'),
Series([1, 2], dtype='int32')
])
def test_different(self, right_vals):

Expand All @@ -944,22 +940,8 @@ def test_different(self, right_vals):
# GH 9780
# We allow merging on object and categorical cols and cast
# categorical cols to object
if (is_categorical_dtype(right['A'].dtype) or
is_object_dtype(right['A'].dtype)):
result = pd.merge(left, right, on='A')
assert is_object_dtype(result.A.dtype)

# GH 9780
# We raise for merging on object col and int/float col and
# merging on categorical col and int/float col
else:
msg = ("You are trying to merge on "
"{lk_dtype} and {rk_dtype} columns. "
"If you wish to proceed you should use "
"pd.concat".format(lk_dtype=left['A'].dtype,
rk_dtype=right['A'].dtype))
with pytest.raises(ValueError, match=msg):
pd.merge(left, right, on='A')
result = pd.merge(left, right, on='A')
assert is_object_dtype(result.A.dtype)

@pytest.mark.parametrize('d1', [np.int64, np.int32,
np.int16, np.int8, np.uint8])
Expand Down Expand Up @@ -1058,6 +1040,33 @@ def test_merge_incompat_infer_boolean_object(self):
assert_frame_equal(result, expected)

@pytest.mark.parametrize('df1_vals, df2_vals', [
# merge on category coerces to object
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
# do not infer
([0, 1], pd.Series([False, True], dtype=object)),
([0, 1], pd.Series([False, True], dtype=bool)),
])
def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
# these are explicitly allowed incompat merges, that pass thru
# the result type is dependent on if the values on the rhs are
# inferred, otherwise these will be coerced to object

df1 = DataFrame({'A': df1_vals})
df2 = DataFrame({'A': df2_vals})

result = pd.merge(df1, df2, on=['A'])
assert is_object_dtype(result.A.dtype)
result = pd.merge(df2, df1, on=['A'])
assert is_object_dtype(result.A.dtype)

@pytest.mark.parametrize('df1_vals, df2_vals', [
# do not infer to numeric
(Series([1, 2], dtype='uint64'), ["a", "b", "c"]),
(Series([1, 2], dtype='int32'), ["a", "b", "c"]),
([0, 1, 2], ["0", "1", "2"]),
([0.0, 1.0, 2.0], ["0", "1", "2"]),
([0, 1, 2], [u"0", u"1", u"2"]),
Expand All @@ -1067,12 +1076,8 @@ def test_merge_incompat_infer_boolean_object(self):
(pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
(pd.date_range('20130101', periods=3),
pd.date_range('20130101', periods=3, tz='US/Eastern')),
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
# TODO ([0, 1], pd.Series([False, True], dtype=bool)),
([0, 1], pd.Series([False, True], dtype=object))
])
def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
# GH 9780, GH 15800
# Raise a ValueError when a user tries to merge on
# dtypes that are incompatible (e.g., obj and int/float)
Expand Down

0 comments on commit 819418e

Please sign in to comment.