From 385ca3e42e752d3fd3c4f1d5478e100d14876eed Mon Sep 17 00:00:00 2001 From: Dr-Irv Date: Wed, 22 Feb 2017 17:17:40 -0500 Subject: [PATCH 1/4] BUG: GH #12223, GH #15262. Allow ints for names in MultiIndex --- doc/source/whatsnew/v0.20.0.txt | 3 ++- pandas/core/frame.py | 6 +++--- pandas/core/groupby.py | 6 +++--- pandas/core/reshape.py | 3 ++- pandas/formats/format.py | 2 +- pandas/indexes/base.py | 5 +++++ pandas/indexes/multi.py | 7 ++++--- pandas/io/sql.py | 2 +- pandas/tests/frame/test_combine_concat.py | 17 +++++++++++++++++ pandas/util/doctools.py | 6 +++--- 10 files changed, 41 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e65276fe51fe8..5e2a6a6a127b2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -185,11 +185,12 @@ Other enhancements - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) -- ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) +- ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`)<<<<<<< f4edb053e17e51e8c2bed7c16755c4f7f3222117 - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. 
(:issue:`15403`) - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`) - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. +- Using numerical names in ``MultiIndex`` causes less errors. (:issue:`12223`) (:issue:`15262`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bfef2cfbd0d51..ce3481fc17c5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2876,7 +2876,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, names = [x for x in self.index.names] if isinstance(self.index, MultiIndex): for i in range(self.index.nlevels): - arrays.append(self.index.get_level_values(i)) + arrays.append(self.index._get_level_values(i)) else: arrays.append(self.index) @@ -2886,9 +2886,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False, # append all but the last column so we don't have to modify # the end of this loop for n in range(col.nlevels - 1): - arrays.append(col.get_level_values(n)) + arrays.append(col._get_level_values(n)) - level = col.get_level_values(col.nlevels - 1) + level = col._get_level_values(col.nlevels - 1) names.extend(col.names) elif isinstance(col, Series): level = col._values diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0b3fcba1c1ba5..831ca3886773e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -291,8 +291,8 @@ def _set_grouper(self, obj, sort=False): # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - ax = Index(ax.get_level_values( - level), name=ax.names[level]) + ax = Index(ax._get_level_values(level), + name=ax.names[level]) else: if level not in (0, ax.name): @@ -761,7 +761,7 @@ def 
_index_with_as_index(self, b): gp = self.grouper levels = chain((gp.levels[i][gp.labels[i][b]] for i in range(len(gp.groupings))), - (original.get_level_values(i)[b] + (original._get_level_values(i)[b] for i in range(original.nlevels))) new = MultiIndex.from_arrays(list(levels)) new.names = gp.names + original.names diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 5fc0d590a6885..87cb088c2e91e 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -811,7 +811,8 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index - mdata[col] = np.asanyarray(frame.columns.get_level_values(i)).repeat(N) + mdata[col] = np.asanyarray(frame.columns + ._get_level_values(i)).repeat(N) return DataFrame(mdata, columns=mcolumns) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 6b235b5e1bc33..4c081770e0125 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1566,7 +1566,7 @@ def _save_header(self): if isinstance(index_label, list) and len(index_label) > 1: col_line.extend([''] * (len(index_label) - 1)) - col_line.extend(columns.get_level_values(i)) + col_line.extend(columns._get_level_values(i)) writer.writerow(col_line) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index f1f37622b2a74..9e2c796e38ca5 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2352,6 +2352,11 @@ def get_level_values(self, level): self._validate_index_level(level) return self + def _get_level_values(self, num): + # Used to mirror implementation for MultiIndex + # GH #10461 + return self.get_level_values(num) + _index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. 
The indexer should be then used as an input to ndarray.take to align the diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 18e1da7303d6d..4e721a06eef2c 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -966,7 +966,8 @@ def to_frame(self, index=True): """ from pandas import DataFrame - result = DataFrame({(name or level): self.get_level_values(level) + result = DataFrame({(name or level): + self._get_level_values(level) for name, level in zip(self.names, range(len(self.levels)))}, copy=False) @@ -1301,8 +1302,8 @@ def append(self, other): for o in other): arrays = [] for i in range(self.nlevels): - label = self.get_level_values(i) - appended = [o.get_level_values(i) for o in other] + label = self._get_level_values(i) + appended = [o._get_level_values(i) for o in other] arrays.append(label.append(appended)) return MultiIndex.from_arrays(arrays, names=self.names) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index bace43e785dff..2ab642b3af0c7 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -749,7 +749,7 @@ def _get_column_names_and_types(self, dtype_mapper): if self.index is not None: for i, idx_label in enumerate(self.index): idx_type = dtype_mapper( - self.frame.index.get_level_values(i)) + self.frame.index._get_level_values(i)) column_names_and_types.append((text_type(idx_label), idx_type, True)) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index eed4d6261d6e8..d8ca2a5043308 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -422,6 +422,23 @@ def test_concat_axis_parameter(self): with assertRaisesRegexp(ValueError, 'No axis named'): pd.concat([series1, series2], axis='something') + def test_concat_numerical_names(self): + # #15262 # #12223 + df = pd.DataFrame({'col': range(9)}, + index=(pd.MultiIndex + .from_product([['A0', 'A1', 'A2'], + ['B0', 'B1', 'B2']], + names=[1, 2]))) + result = 
pd.concat((df.iloc[:2, :], df.iloc[-2:, :])) + expected = pd.DataFrame({'col': [0, 1, 7, 8]}, + dtype='int32', + index=pd.MultiIndex.from_tuples([('A0', 'B0'), + ('A0', 'B1'), + ('A2', 'B1'), + ('A2', 'B2')], + names=[1, 2])) + tm.assert_frame_equal(result, expected) + class TestDataFrameCombineFirst(tm.TestCase, TestData): diff --git a/pandas/util/doctools.py b/pandas/util/doctools.py index 62dcba1405581..6df6444aeafab 100644 --- a/pandas/util/doctools.py +++ b/pandas/util/doctools.py @@ -113,12 +113,12 @@ def _insert_index(self, data): else: for i in range(idx_nlevels): data.insert(i, 'Index{0}'.format(i), - data.index.get_level_values(i)) + data.index._get_level_values(i)) col_nlevels = data.columns.nlevels if col_nlevels > 1: - col = data.columns.get_level_values(0) - values = [data.columns.get_level_values(i).values + col = data.columns._get_level_values(0) + values = [data.columns._get_level_values(i).values for i in range(1, col_nlevels)] col_df = pd.DataFrame(values) data.columns = col_df.columns From 89350689fdb934b49e173035c29157947082f8d9 Mon Sep 17 00:00:00 2001 From: Dr-Irv Date: Thu, 23 Feb 2017 10:12:45 -0500 Subject: [PATCH 2/4] resolve conflicts --- pandas/indexes/multi.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 4e721a06eef2c..d01edad10a9d1 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -846,7 +846,7 @@ def _try_mi(k): raise InvalidIndexError(key) - def _get_level_values(self, level): + def _get_level_values(self, level, copy=True): """ Return vector of label values for requested level, equal to the length of the index @@ -856,6 +856,7 @@ def _get_level_values(self, level): Parameters ---------- level : int level + copy : bool whether copy of results should be done Returns ------- @@ -866,7 +867,11 @@ def _get_level_values(self, level): labels = self.labels[level] filled = algos.take_1d(unique._values, labels, 
fill_value=unique._na_value) - return filled + if copy: + values = unique._shallow_copy(filled) + else: + values = filled + return values def get_level_values(self, level): """ @@ -882,7 +887,7 @@ def get_level_values(self, level): values : Index """ level = self._get_level_number(level) - values = self._get_level_values(level) + values = self._get_level_values(level, copy=False) return self.levels[level]._shallow_copy(values) def format(self, space=2, sparsify=None, adjoin=True, names=False, From 10667a3b9b3f8157f48057a027f7e9dadb5cdca2 Mon Sep 17 00:00:00 2001 From: Dr-Irv Date: Thu, 23 Feb 2017 12:08:51 -0500 Subject: [PATCH 3/4] Fix types for test --- pandas/tests/frame/test_combine_concat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index d8ca2a5043308..6f06a55ad065e 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -425,6 +425,7 @@ def test_concat_axis_parameter(self): def test_concat_numerical_names(self): # #15262 # #12223 df = pd.DataFrame({'col': range(9)}, + dtype='int32', index=(pd.MultiIndex .from_product([['A0', 'A1', 'A2'], ['B0', 'B1', 'B2']], From 15d843307a23c169f4f34eb3ca1fa52c036354bc Mon Sep 17 00:00:00 2001 From: Dr-Irv Date: Thu, 23 Feb 2017 15:48:36 -0500 Subject: [PATCH 4/4] Address jreback comments --- doc/source/whatsnew/v0.20.0.txt | 4 ++-- pandas/indexes/base.py | 17 +++++++++++++++-- pandas/indexes/multi.py | 14 +++++--------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 5e2a6a6a127b2..04423f7016c57 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -185,12 +185,12 @@ Other enhancements - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). 
- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) -- ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`)<<<<<<< f4edb053e17e51e8c2bed7c16755c4f7f3222117 +- ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`) - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. -- Using numerical names in ``MultiIndex`` causes less errors. (:issue:`12223`) (:issue:`15262`) +- Fixed issue when using ``pd.concat`` that affected ``MultiIndex`` output formatting when names of index were int (:issue:`12223`, :issue:`15262`) .. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 9e2c796e38ca5..6b29c320f98b5 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2353,8 +2353,21 @@ def get_level_values(self, level): return self def _get_level_values(self, num): - # Used to mirror implementation for MultiIndex - # GH #10461 + """ + Return vector of label values for requested level, equal to the length + of the index + + **this is an internal method** + + Parameters + ---------- + num : int + + Returns + ------- + values : Index + """ + # Needed to address discussion in GH #10461 return self.get_level_values(num) _index_shared_docs['get_indexer'] = """ diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index d01edad10a9d1..809f758311e2d 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -684,7 +684,7 @@ def is_monotonic_increasing(self): """ # reversed() because lexsort() wants the most significant key last. 
- values = [self._get_level_values(i) + values = [self._get_level_values(i).values for i in reversed(range(len(self.levels)))] try: sort_order = np.lexsort(values) @@ -846,7 +846,7 @@ def _try_mi(k): raise InvalidIndexError(key) - def _get_level_values(self, level, copy=True): + def _get_level_values(self, level): """ Return vector of label values for requested level, equal to the length of the index @@ -856,7 +856,6 @@ def _get_level_values(self, level, copy=True): Parameters ---------- level : int level - copy : bool whether copy of results should be done Returns ------- @@ -867,10 +866,7 @@ def _get_level_values(self, level, copy=True): labels = self.labels[level] filled = algos.take_1d(unique._values, labels, fill_value=unique._na_value) - if copy: - values = unique._shallow_copy(filled) - else: - values = filled + values = unique._shallow_copy(filled) return values def get_level_values(self, level): @@ -887,8 +883,8 @@ def get_level_values(self, level): values : Index """ level = self._get_level_number(level) - values = self._get_level_values(level, copy=False) - return self.levels[level]._shallow_copy(values) + values = self._get_level_values(level) + return values def format(self, space=2, sparsify=None, adjoin=True, names=False, na_rep=None, formatter=None):