
BUG: Regression in merging Categorical and object dtypes (GH9426) #9597


Merged
merged 3 commits on Mar 6, 2015
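Summary: this PR fixes GH 9426, a regression in merging when one side carries a category dtype column. The fix teaches the dtype-promotion path (_maybe_promote in pandas/core/common.py and the join-unit dtype property in pandas/core/internals.py) to leave categorical dtypes alone instead of routing them through np.dtype, which numpy cannot interpret. The sketch below mirrors the new test added in pandas/tests/test_categorical.py; the frame and column values are illustrative, and it assumes a reasonably recent pandas:

import pandas as pd

# Frames mirroring the new test (names are illustrative):
left = pd.DataFrame({'a': ['f'] * 5, 'b': ['g'] * 5})
right = pd.DataFrame({'c': list('abcde'), 'd': ['null'] * 5})

# Baseline: plain object-object merge.
expected = pd.merge(left, right, how='left', left_on='b', right_on='c')

# Same merge with one side categorical; before this fix the categorical
# dtype could end up inside np.dtype() during promotion and fail.  The new
# test asserts the result matches the object-object baseline.
cright = right.copy()
cright['d'] = cright['d'].astype('category')
result = pd.merge(left, cright, how='left', left_on='b', right_on='c')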
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.16.0.txt
@@ -521,7 +521,7 @@ Bug Fixes


- ``SparseSeries`` and ``SparsePanel`` now accept zero argument constructors (same as their non-sparse counterparts) (:issue:`9272`).

- Regression in merging Categoricals and object dtypes (:issue:`9426`)
- Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`)
- Bug in groupby MultiIndex with missing pair (:issue:`9049`, :issue:`9344`)
- Fixed bug in ``Series.groupby`` where grouping on ``MultiIndex`` levels would ignore the sort argument (:issue:`9444`)
4 changes: 3 additions & 1 deletion pandas/core/common.py
@@ -1146,7 +1146,9 @@ def _maybe_promote(dtype, fill_value=np.nan):
         dtype = np.object_
 
     # in case we have a string that looked like a number
-    if issubclass(np.dtype(dtype).type, compat.string_types):
+    if is_categorical_dtype(dtype):
+        dtype = dtype
+    elif issubclass(np.dtype(dtype).type, compat.string_types):
         dtype = np.object_
 
     return dtype, fill_value
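The new branch matters because _maybe_promote decides which dtype can hold the fill value used for unmatched rows, and the string fallback immediately calls np.dtype(dtype); numpy has no notion of pandas' categorical dtype, so a categorical must be passed through untouched. A minimal illustration of that constraint, assuming a recent pandas/numpy (not part of the patch):

import numpy as np
import pandas as pd

cat_dtype = pd.Series(['a', 'b'], dtype='category').dtype

try:
    np.dtype(cat_dtype)  # numpy cannot interpret pandas' categorical dtype
except TypeError as exc:
    print('np.dtype rejected it:', exc)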
5 changes: 3 additions & 2 deletions pandas/core/internals.py
@@ -4327,8 +4327,9 @@ def dtype(self):
         if not self.needs_filling:
             return self.block.dtype
         else:
-            return np.dtype(com._maybe_promote(self.block.dtype,
-                                               self.block.fill_value)[0])
+            return com._get_dtype(com._maybe_promote(self.block.dtype,
+                                                     self.block.fill_value)[0])
+
         return self._dtype
 
     @cache_readonly
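The internals change is the same idea on the concatenation side: when a join unit needs filling, the promoted dtype used to be forced through np.dtype, which fails for categoricals, so it is now resolved with com._get_dtype instead. A rough user-level way to exercise this needs_filling path (illustrative frames, recent pandas assumed):

import pandas as pd

left = pd.DataFrame({'b': ['g', 'h']})
right = pd.DataFrame({'c': ['g'], 'd': pd.Categorical(['x'])})

# The unmatched 'h' key forces the right-hand block to be reindexed and
# filled with missing values, i.e. the needs_filling branch above.
out = pd.merge(left, right, how='left', left_on='b', right_on='c')
print(out)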
56 changes: 28 additions & 28 deletions pandas/io/tests/test_data.py
@@ -112,7 +112,7 @@ def test_get_multi2(self):
 
             # sanity checking
 
-            assert np.issubdtype(result.dtype, np.floating)
+            self.assertTrue(np.issubdtype(result.dtype, np.floating))
             result = pan.Open.ix['Jan-15-12':'Jan-20-12']
             self.assertEqual((4, 3), result.shape)
             assert_n_failed_equals_n_null_columns(w, result)
@@ -121,11 +121,11 @@ def test_get_multi2(self):
     def test_dtypes(self):
         #GH3995, #GH8980
         data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13')
-        assert np.issubdtype(data.Open.dtype, np.number)
-        assert np.issubdtype(data.Close.dtype, np.number)
-        assert np.issubdtype(data.Low.dtype, np.number)
-        assert np.issubdtype(data.High.dtype, np.number)
-        assert np.issubdtype(data.Volume.dtype, np.number)
+        self.assertTrue(np.issubdtype(data.Open.dtype, np.number))
+        self.assertTrue(np.issubdtype(data.Close.dtype, np.number))
+        self.assertTrue(np.issubdtype(data.Low.dtype, np.number))
+        self.assertTrue(np.issubdtype(data.High.dtype, np.number))
+        self.assertTrue(np.issubdtype(data.Volume.dtype, np.number))
 
     @network
     def test_unicode_date(self):
@@ -183,15 +183,15 @@ def test_get_components_dow_jones(self):
         raise nose.SkipTest('unreliable test, receive partial components back for dow_jones')
 
         df = web.get_components_yahoo('^DJI') #Dow Jones
-        assert isinstance(df, pd.DataFrame)
+        self.assertIsInstance(df, pd.DataFrame)
         self.assertEqual(len(df), 30)
 
     @network
     def test_get_components_dax(self):
         raise nose.SkipTest('unreliable test, receive partial components back for dax')
 
         df = web.get_components_yahoo('^GDAXI') #DAX
-        assert isinstance(df, pd.DataFrame)
+        self.assertIsInstance(df, pd.DataFrame)
         self.assertEqual(len(df), 30)
         self.assertEqual(df[df.name.str.contains('adidas', case=False)].index,
                          'ADS.DE')
@@ -202,13 +202,13 @@ def test_get_components_nasdaq_100(self):
         raise nose.SkipTest('unreliable test, receive partial components back for nasdaq_100')
 
         df = web.get_components_yahoo('^NDX') #NASDAQ-100
-        assert isinstance(df, pd.DataFrame)
+        self.assertIsInstance(df, pd.DataFrame)
 
         if len(df) > 1:
             # Usual culprits, should be around for a while
-            assert 'AAPL' in df.index
-            assert 'GOOG' in df.index
-            assert 'AMZN' in df.index
+            self.assertTrue('AAPL' in df.index)
+            self.assertTrue('GOOG' in df.index)
+            self.assertTrue('AMZN' in df.index)
         else:
             expected = DataFrame({'exchange': 'N/A', 'name': '@^NDX'},
                                  index=['@^NDX'])
@@ -256,7 +256,7 @@ def test_get_data_multiple_symbols_two_dates(self):
         self.assertEqual(len(result), 3)
 
         # sanity checking
-        assert np.issubdtype(result.dtype, np.floating)
+        self.assertTrue(np.issubdtype(result.dtype, np.floating))
 
         expected = np.array([[18.99, 28.4, 25.18],
                              [18.58, 28.31, 25.13],
@@ -276,7 +276,7 @@ def test_get_date_ret_index(self):
             self.assertEqual(result, 1.0)
 
         # sanity checking
-        assert np.issubdtype(pan.values.dtype, np.floating)
+        self.assertTrue(np.issubdtype(pan.values.dtype, np.floating))
 
 
 class TestYahooOptions(tm.TestCase):
@@ -383,26 +383,26 @@ def test_get_underlying_price(self):
             quote_price = options_object._underlying_price_from_root(root)
         except RemoteDataError as e:
             raise nose.SkipTest(e)
-        self.assert_(isinstance(quote_price, float))
+        self.assertIsInstance(quote_price, float)
 
     def test_sample_page_price_quote_time1(self):
         #Tests the weekend quote time format
         price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html1)
-        self.assert_(isinstance(price, (int, float, complex)))
-        self.assert_(isinstance(quote_time, (datetime, Timestamp)))
+        self.assertIsInstance(price, (int, float, complex))
+        self.assertIsInstance(quote_time, (datetime, Timestamp))
 
     def test_chop(self):
         #regression test for #7625
         self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan)
         chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100)
-        self.assert_(isinstance(chopped, DataFrame))
+        self.assertIsInstance(chopped, DataFrame)
         self.assertTrue(len(chopped) > 1)
 
     def test_chop_out_of_strike_range(self):
         #regression test for #7625
         self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan)
         chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100000)
-        self.assert_(isinstance(chopped, DataFrame))
+        self.assertIsInstance(chopped, DataFrame)
         self.assertTrue(len(chopped) > 1)
 
 
@@ -411,8 +411,8 @@ def test_sample_page_price_quote_time2(self):
         #Tests the EDT page format
         #regression test for #8741
         price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html2)
-        self.assert_(isinstance(price, (int, float, complex)))
-        self.assert_(isinstance(quote_time, (datetime, Timestamp)))
+        self.assertIsInstance(price, (int, float, complex))
+        self.assertIsInstance(quote_time, (datetime, Timestamp))
 
     @network
     def test_sample_page_chg_float(self):
@@ -452,26 +452,26 @@ def test_is_s3_url(self):
     @network
     def test_read_yahoo(self):
         gs = DataReader("GS", "yahoo")
-        assert isinstance(gs, DataFrame)
+        self.assertIsInstance(gs, DataFrame)
 
     @network
     def test_read_google(self):
         gs = DataReader("GS", "google")
-        assert isinstance(gs, DataFrame)
+        self.assertIsInstance(gs, DataFrame)
 
     @network
     def test_read_fred(self):
         vix = DataReader("VIXCLS", "fred")
-        assert isinstance(vix, DataFrame)
+        self.assertIsInstance(vix, DataFrame)
 
     @network
     def test_read_famafrench(self):
         for name in ("F-F_Research_Data_Factors",
                      "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3",
                      "F-F_ST_Reversal_Factor", "F-F_Momentum_Factor"):
             ff = DataReader(name, "famafrench")
-            assert ff
-            assert isinstance(ff, dict)
+            self.assertTrue(ff is not None)
+            self.assertIsInstance(ff, dict)
 
 
 class TestFred(tm.TestCase):
@@ -498,7 +498,7 @@ def test_fred_nan(self):
         start = datetime(2010, 1, 1)
         end = datetime(2013, 1, 27)
         df = web.DataReader("DFII5", "fred", start, end)
-        assert pd.isnull(df.ix['2010-01-01'][0])
+        self.assertTrue(pd.isnull(df.ix['2010-01-01'][0]))
 
     @network
     def test_fred_parts(self):
@@ -510,7 +510,7 @@ def test_fred_parts(self):
         self.assertEqual(df.ix['2010-05-01'][0], 217.23)
 
         t = df.CPIAUCSL.values
-        assert np.issubdtype(t.dtype, np.floating)
+        self.assertTrue(np.issubdtype(t.dtype, np.floating))
         self.assertEqual(t.shape, (37,))
 
     @network
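The test_data.py churn above is mechanical: bare assert statements become unittest/TestCase helpers, which still run under python -O and report the offending values on failure. A standalone example of the pattern (not part of the patch):

import unittest

class ExampleAssertions(unittest.TestCase):
    def test_helpers(self):
        value = 3.0
        # "assert value > 0" disappears under python -O and reports little
        # context; the TestCase helpers always execute and show the value.
        self.assertTrue(value > 0)
        self.assertIsInstance(value, float)

if __name__ == '__main__':
    unittest.main()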
33 changes: 33 additions & 0 deletions pandas/tests/test_categorical.py
@@ -2423,6 +2423,39 @@ def f():
             df.append(df_wrong_categories)
         self.assertRaises(ValueError, f)
 
+
+    def test_merge(self):
+        # GH 9426
+
+        right = DataFrame({'c': {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'},
+                           'd': {0: 'null', 1: 'null', 2: 'null', 3: 'null', 4: 'null'}})
+        left = DataFrame({'a': {0: 'f', 1: 'f', 2: 'f', 3: 'f', 4: 'f'},
+                          'b': {0: 'g', 1: 'g', 2: 'g', 3: 'g', 4: 'g'}})
+        df = pd.merge(left, right, how='left', left_on='b', right_on='c')
+
+        # object-object
+        expected = df.copy()
+
+        # object-cat
+        cright = right.copy()
+        cright['d'] = cright['d'].astype('category')
+        result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
+        tm.assert_frame_equal(result, expected)
+
+        # cat-object
+        cleft = left.copy()
+        cleft['b'] = cleft['b'].astype('category')
+        result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
+        tm.assert_frame_equal(result, expected)
+
+        # cat-cat
+        cright = right.copy()
+        cright['d'] = cright['d'].astype('category')
+        cleft = left.copy()
+        cleft['b'] = cleft['b'].astype('category')
+        result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
+        tm.assert_frame_equal(result, expected)
+
     def test_na_actions(self):
 
         cat = pd.Categorical([1,2,3,np.nan], categories=[1,2,3])
2 changes: 1 addition & 1 deletion pandas/tseries/tests/test_tslib.py
@@ -167,7 +167,7 @@ def test_repr(self):

         # dateutil zone change (only matters for repr)
         import dateutil
-        if dateutil.__version__ >= LooseVersion('2.3'):
+        if dateutil.__version__ >= LooseVersion('2.3') and dateutil.__version__ <= LooseVersion('2.4'):
             timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']
         else:
             timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/America/Los_Angeles']
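The tslib tweak narrows the dateutil gate: the 'dateutil/US/Pacific' spelling is only expected for dateutil 2.3 through 2.4, while other versions keep 'dateutil/America/Los_Angeles'. A standalone sketch of the same version check (illustrative only; assumes distutils.version is still importable, as it was for the pandas of this era, and the 2.3-2.4 window is taken from the diff, not re-verified):

from distutils.version import LooseVersion

import dateutil

# Pick the expected zone spelling based on the installed dateutil, mirroring
# the gated expectation in the test above.
if LooseVersion('2.3') <= LooseVersion(dateutil.__version__) <= LooseVersion('2.4'):
    expected_zone = 'dateutil/US/Pacific'
else:
    expected_zone = 'dateutil/America/Los_Angeles'

print(expected_zone)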