Join GitHub today
GitHub is home to over 20 million developers working together to host and review code, manage projects, and build software together.
BUG/PERF: Sort mixed-int in Py3, fix Index.difference #13514
Conversation
pijucha
and 1 other
commented on an outdated diff
Jun 26, 2016
| @@ -1923,14 +1924,38 @@ def difference(self, other): | ||
| other, result_name = self._convert_can_do_setop(other) | ||
| - theDiff = sorted(set(self) - set(other)) | ||
| - return Index(theDiff, name=result_name) | ||
| + this = self._get_unique_index() | ||
| + | ||
| + # In what follows, get_indexer doesn't treat NaN's specially, | ||
| + # so it would break the existing behavior, e.g.: | ||
| + # `Index([1, nan]).difference(Index[nan]) == Index([nan, 1])`. | ||
| + # Thus, dropping NaN's from `other` is a hack for backward compat. | ||
| + # We do it here instead of manipulating `the_diff` later. | ||
| + # (Check for MultiIndex is to get around a .hasnans exception) | ||
| + dropna = not isinstance(self, ABCMultiIndex) and \ | ||
| + not isinstance(other, ABCMultiIndex) and \ | ||
| + self.hasnans and other.hasnans | ||
| + other = other._get_unique_index(dropna=dropna) |
pijucha
Contributor
|
pijucha
and 1 other
commented on an outdated diff
Jun 26, 2016
| + """ | ||
| + def sort_mixed(values): | ||
| + # order ints before strings, safe in py3 | ||
| + str_pos = np.array([isinstance(x, string_types) for x in values], | ||
| + dtype=bool) | ||
| + nums = np.sort(values[~str_pos]) | ||
| + strs = np.sort(values[str_pos]) | ||
| + return com._ensure_object(np.concatenate([nums, strs])) | ||
| + | ||
| + sorter = None | ||
| + try: | ||
| + sorter = values.argsort() | ||
| + ordered = values.take(sorter) | ||
| + except: | ||
| + # unorderable in py3 if mixed str/int | ||
| + ordered = sort_mixed(values) |
pijucha
Contributor
|
jreback
commented on the diff
Jun 26, 2016
| @@ -142,6 +142,71 @@ def isin(comps, values): | ||
| return f(comps, values) | ||
| +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): | ||
| + """ | ||
| + Sort ``values`` and reorder corresponding ``labels``. | ||
| + ``values`` should be unique if ``labels`` is not None. | ||
| + Safe for use with mixed types (int, str), orders ints before strs. | ||
| + |
|
|
jreback
commented on an outdated diff
Jun 26, 2016
| + nums = np.sort(values[~str_pos]) | ||
| + strs = np.sort(values[str_pos]) | ||
| + return com._ensure_object(np.concatenate([nums, strs])) | ||
| + | ||
| + sorter = None | ||
| + try: | ||
| + sorter = values.argsort() | ||
| + ordered = values.take(sorter) | ||
| + except: | ||
| + # unorderable in py3 if mixed str/int | ||
| + ordered = sort_mixed(values) | ||
| + | ||
| + if labels is None: | ||
| + return ordered | ||
| + | ||
| + if not assume_unique and len(values) != len(set(values)): |
|
|
jreback
commented on an outdated diff
Jun 26, 2016
| + ------- | ||
| + ordered : ndarray | ||
| + Sorted ``values`` | ||
| + new_labels : ndarray | ||
| + Reordered ``labels``; returned when ``labels`` is not None | ||
| + """ | ||
| + def sort_mixed(values): | ||
| + # order ints before strings, safe in py3 | ||
| + str_pos = np.array([isinstance(x, string_types) for x in values], | ||
| + dtype=bool) | ||
| + nums = np.sort(values[~str_pos]) | ||
| + strs = np.sort(values[str_pos]) | ||
| + return com._ensure_object(np.concatenate([nums, strs])) | ||
| + | ||
| + sorter = None | ||
| + try: |
jreback
Contributor
|
jreback
commented on an outdated diff
Jun 26, 2016
| + new_labels : ndarray | ||
| + Reordered ``labels``; returned when ``labels`` is not None | ||
| + """ | ||
| + def sort_mixed(values): | ||
| + # order ints before strings, safe in py3 | ||
| + str_pos = np.array([isinstance(x, string_types) for x in values], | ||
| + dtype=bool) | ||
| + nums = np.sort(values[~str_pos]) | ||
| + strs = np.sort(values[str_pos]) | ||
| + return com._ensure_object(np.concatenate([nums, strs])) | ||
| + | ||
| + sorter = None | ||
| + try: | ||
| + sorter = values.argsort() | ||
| + ordered = values.take(sorter) | ||
| + except: |
|
|
jreback
commented on the diff
Jun 26, 2016
| - t.map_locations(com._ensure_object(uniques)) | ||
| - | ||
| - # order ints before strings | ||
| - ordered = np.concatenate([ | ||
| - np.sort(np.array([e for i, e in enumerate(uniques) if f(e)], | ||
| - dtype=object)) for f in | ||
| - [lambda x: not isinstance(x, string_types), | ||
| - lambda x: isinstance(x, string_types)]]) | ||
| - sorter = com._ensure_platform_int(t.lookup( | ||
| - com._ensure_object(ordered))) | ||
| - | ||
| - reverse_indexer = np.empty(len(sorter), dtype=np.int_) | ||
| - reverse_indexer.put(sorter, np.arange(len(sorter))) | ||
| - | ||
| - mask = labels < 0 | ||
| - labels = reverse_indexer.take(labels) |
|
|
jreback
commented on the diff
Jun 26, 2016
jreback
commented on an outdated diff
Jun 26, 2016
| @@ -1923,14 +1924,38 @@ def difference(self, other): | ||
| other, result_name = self._convert_can_do_setop(other) | ||
| - theDiff = sorted(set(self) - set(other)) | ||
| - return Index(theDiff, name=result_name) | ||
| + this = self._get_unique_index() | ||
| + | ||
| + # In what follows, get_indexer doesn't treat NaN's specially, | ||
| + # so it would break the existing behavior, e.g.: | ||
| + # `Index([1, nan]).difference(Index[nan]) == Index([nan, 1])`. | ||
| + # Thus, dropping NaN's from `other` is a hack for backward compat. | ||
| + # We do it here instead of manipulating `the_diff` later. | ||
| + # (Check for MultiIndex is to get around a .hasnans exception) | ||
| + dropna = not isinstance(self, ABCMultiIndex) and \ | ||
| + not isinstance(other, ABCMultiIndex) and \ | ||
| + self.hasnans and other.hasnans |
jreback
Contributor
|
jreback
commented on an outdated diff
Jun 26, 2016
| + # We do it here instead of manipulating `the_diff` later. | ||
| + # (Check for MultiIndex is to get around a .hasnans exception) | ||
| + dropna = not isinstance(self, ABCMultiIndex) and \ | ||
| + not isinstance(other, ABCMultiIndex) and \ | ||
| + self.hasnans and other.hasnans | ||
| + other = other._get_unique_index(dropna=dropna) | ||
| + | ||
| + indexer = this.get_indexer(other) | ||
| + indexer = indexer.take((indexer != -1).nonzero()[0]) | ||
| + | ||
| + label_diff = np.setdiff1d(np.arange(this.size), indexer, | ||
| + assume_unique=True) | ||
| + the_diff = this.values.take(label_diff) | ||
| + try: | ||
| + the_diff = algos.safe_sort(the_diff) | ||
| + except: |
|
|
jreback
commented on an outdated diff
Jun 26, 2016
| + ``values`` should be unique if ``labels`` is not None. | ||
| + Safe for use with mixed types (int, str), orders ints before strs. | ||
| + | ||
| + Parameters | ||
| + ---------- | ||
| + values : ndarray (1-d) | ||
| + Sequence; must be unique if ``labels`` is not None. | ||
| + labels : ndarray (1-d) | ||
| + Indices to ``values`` | ||
| + na_sentinel : int, default -1 | ||
| + Value in ``labels`` to mark "not found". | ||
| + Ignored when ``labels`` is None. | ||
| + assume_unique : bool, default False | ||
| + When True, doesn't check for uniqueness of ``values``. | ||
| + Ignored when ``labels`` is None. | ||
| + |
|
|
jreback
commented on the diff
Jun 26, 2016
| + values : ndarray (1-d) | ||
| + Sequence; must be unique if ``labels`` is not None. | ||
| + labels : ndarray (1-d) | ||
| + Indices to ``values`` | ||
| + na_sentinel : int, default -1 | ||
| + Value in ``labels`` to mark "not found". | ||
| + Ignored when ``labels`` is None. | ||
| + assume_unique : bool, default False | ||
| + When True, doesn't check for uniqueness of ``values``. | ||
| + Ignored when ``labels`` is None. | ||
| + | ||
| + Returns | ||
| + ------- | ||
| + ordered : ndarray | ||
| + Sorted ``values`` | ||
| + new_labels : ndarray |
jreback
Contributor
|
jreback
commented on an outdated diff
Jun 26, 2016
| @@ -1967,8 +1992,33 @@ def symmetric_difference(self, other, result_name=None): | ||
| if result_name is None: | ||
| result_name = result_name_update | ||
| - the_diff = sorted(set((self.difference(other)). | ||
| - union(other.difference(self)))) | ||
| + this = self._get_unique_index() | ||
| + | ||
| + # Dropping NaN's is a hack for NaN's backward compatibility | ||
| + # (see the comment in `Index.difference`). | ||
| + dropna = not isinstance(self, ABCMultiIndex) and \ |
jreback
Contributor
|
jreback
commented on the diff
Jun 26, 2016
jreback
and 1 other
commented on an outdated diff
Jun 26, 2016
| @@ -738,6 +742,13 @@ def test_difference(self): | ||
| self.assertEqual(len(result), 0) | ||
| self.assertEqual(result.name, first.name) | ||
| + # mixed, GH 13432 | ||
| + idx1 = Index([0, 1, 'A', 'B']) |
jreback
Contributor
|
jreback
commented on an outdated diff
Jun 26, 2016
| @@ -3169,6 +3169,12 @@ def test_groupby_nonstring_columns(self): | ||
| expected = df.groupby(df[0]).mean() | ||
| assert_frame_equal(result, expected) | ||
| + def test_groupby_mixed_type_columns(self): | ||
| + # GH 13432, unorderable types in py3 | ||
| + df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) | ||
| + df.groupby('A').first() |
|
|
jreback
commented on an outdated diff
Jun 26, 2016
| @@ -548,6 +548,16 @@ def test_join_many_non_unique_index(self): | ||
| assert_frame_equal(inner, left) | ||
| assert_frame_equal(inner, right) | ||
| + def test_join_mixed_non_unique_index(self): | ||
| + # GH 12814, unorderable types in py3 with a non-unique index | ||
| + df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) | ||
| + df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) | ||
| + df1.join(df2) | ||
| + |
|
|
jreback
commented on the diff
Jun 26, 2016
| @@ -1202,16 +1202,11 @@ def _sort_labels(uniques, left, right): | ||
| # tuplesafe | ||
| uniques = Index(uniques).values | ||
| - sorter = uniques.argsort() | ||
| + l = len(left) | ||
| + labels = np.concatenate([left, right]) |
jreback
Contributor
|
|
xref #13504 |
|
@jreback Thanks for the comments. |
jreback
added Indexing Dtypes Compat Performance
labels
Jun 27, 2016
codecov-io
commented
Jul 3, 2016
•
Current coverage is 84.39%
@@ master #13514 diff @@
==========================================
Files 142 142
Lines 51223 51278 +55
Methods 0 0
Messages 0 0
Branches 0 0
==========================================
+ Hits 43224 43276 +52
- Misses 7999 8002 +3
Partials 0 0
|
pijucha
commented on the diff
Jul 3, 2016
| @@ -1977,6 +2009,36 @@ def symmetric_difference(self, other, result_name=None): | ||
| sym_diff = deprecate('sym_diff', symmetric_difference) | ||
| + def _get_unique_index(self, dropna=False): | ||
| + """ | ||
| + Returns an index containing unique values. | ||
| + | ||
| + Parameters | ||
| + ---------- | ||
| + dropna : bool | ||
| + If True, NaN values are dropped. |
pijucha
Contributor
|
pijucha
and 1 other
commented on an outdated diff
Jul 3, 2016
| @@ -48,7 +48,8 @@ def setUp(self): | ||
| catIndex=tm.makeCategoricalIndex(100), | ||
| empty=Index([]), | ||
| tuples=MultiIndex.from_tuples(lzip( | ||
| - ['foo', 'bar', 'baz'], [1, 2, 3]))) | ||
| + ['foo', 'bar', 'baz'], [1, 2, 3])), | ||
| + mixedIndex=Index([0, 'a', 1, 'b', 2, 'c'])) |
pijucha
Contributor
|
jreback
commented on an outdated diff
Jul 3, 2016
| @@ -412,6 +412,32 @@ Furthermore: | ||
| - Passing duplicated ``percentiles`` will now raise a ``ValueError``. | ||
| - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) | ||
| +.. _whatsnew_0182.api.difference: | ||
| + | ||
| +``Index.difference`` and ``Index.symmetric_difference`` will now, more consistently, treat ``NaN`` values as any other values. | ||
|
|
jreback
commented on an outdated diff
Jul 3, 2016
| @@ -412,6 +412,32 @@ Furthermore: | ||
| - Passing duplicated ``percentiles`` will now raise a ``ValueError``. | ||
| - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) | ||
| +.. _whatsnew_0182.api.difference: | ||
| + | ||
| +``Index.difference`` and ``Index.symmetric_difference`` will now, more consistently, treat ``NaN`` values as any other values. | ||
| + | ||
| +.. ipython:: python |
jreback
Contributor
|
jreback
commented on an outdated diff
Jul 3, 2016
| @@ -444,7 +470,7 @@ Performance Improvements | ||
| - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) | ||
| - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) | ||
| - | ||
| +- Improved performance of ``Index.difference`` (:issue:`12044`) | ||
|
|
jreback
and 1 other
commented on an outdated diff
Jul 3, 2016
| @@ -527,3 +553,5 @@ Bug Fixes | ||
| - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) | ||
| +- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) | ||
| +- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`), (:issue:`12814`) |
|
|
jreback
commented on the diff
Jul 3, 2016
| @@ -142,6 +142,103 @@ def isin(comps, values): | ||
| return f(comps, values) | ||
| +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): | ||
| + """ | ||
| + Sort ``values`` and reorder corresponding ``labels``. | ||
| + ``values`` should be unique if ``labels`` is not None. | ||
| + Safe for use with mixed types (int, str), orders ints before strs. | ||
| + | ||
| + .. versionadded:: 0.18.2 | ||
| + | ||
| + Parameters | ||
| + ---------- | ||
| + values : list-like | ||
| + Sequence; must be unique if ``labels`` is not None. |
|
|
jreback
commented on the diff
Jul 3, 2016
| + ``values`` should be unique if ``labels`` is not None. | ||
| + Safe for use with mixed types (int, str), orders ints before strs. | ||
| + | ||
| + .. versionadded:: 0.18.2 | ||
| + | ||
| + Parameters | ||
| + ---------- | ||
| + values : list-like | ||
| + Sequence; must be unique if ``labels`` is not None. | ||
| + labels : list_like | ||
| + Indices to ``values``. All out of bound indices are treated as | ||
| + "not found" and will be masked with ``na_sentinel``. | ||
| + na_sentinel : int, default -1 | ||
| + Value in ``labels`` to mark "not found". | ||
| + Ignored when ``labels`` is None. | ||
| + assume_unique : bool, default False |
pijucha
Contributor
|
jreback
and 1 other
commented on an outdated diff
Jul 3, 2016
| + ordered = values.take(sorter) | ||
| + except TypeError: | ||
| + # try this anyway | ||
| + ordered = sort_mixed(values) | ||
| + | ||
| + # labels: | ||
| + | ||
| + if labels is None: | ||
| + return ordered | ||
| + | ||
| + if not com.is_list_like(labels): | ||
| + raise TypeError("Only list-like objects or None are allowed to be" | ||
| + "passed to safe_sort as labels") | ||
| + labels = com._ensure_platform_int(np.asarray(labels)) | ||
| + | ||
| + if not assume_unique and len(values) != len(set(values)): |
jreback
Contributor
|
jreback
commented on the diff
Jul 3, 2016
| + if self.is_unique and not dropna: | ||
| + return self | ||
| + | ||
| + values = self.values | ||
| + | ||
| + if not self.is_unique: | ||
| + values = self.unique() | ||
| + | ||
| + if dropna: | ||
| + try: | ||
| + if self.hasnans: | ||
| + values = values[~isnull(values)] | ||
| + except NotImplementedError: | ||
| + pass | ||
| + | ||
| + return self._shallow_copy(values) |
jreback
Contributor
|
jreback
commented on an outdated diff
Jul 3, 2016
| @@ -287,12 +287,58 @@ def test_duplicates(self): | ||
| self.assertEqual(result.name, 'foo') | ||
| self.assert_index_equal(result, Index([ind[0]], name='foo')) | ||
| + def test_get_unique_index(self): | ||
| + for ind in self.indices.values(): | ||
| + | ||
| + if not len(ind): | ||
| + continue | ||
| + | ||
| + idx = ind[[0] * 5] | ||
| + # A workaround for MultiIndex |
jreback
Contributor
|
jreback
and 1 other
commented on an outdated diff
Jul 3, 2016
| @@ -325,6 +372,11 @@ def test_argsort(self): | ||
| def test_numpy_argsort(self): | ||
| for k, ind in self.indices.items(): | ||
| + | ||
| + # 'mixedIndex' unorderable in Python3 | ||
| + if k in ['mixedIndex']: |
jreback
Contributor
|
|
I just noticed that Also, I was thinking about MultiIndex #13504. I see two possible solutions:
|
|
ok @pijucha looks good. minor doc updates. ping when green. other changes leave to other PR's |
pijucha
and 1 other
commented on an outdated diff
Jul 8, 2016
| @@ -1615,6 +1632,122 @@ def test_string_index_repr(self): | ||
| self.assertEqual(coerce(idx), expected) | ||
| + def test_mixed_int_index(self): |
pijucha
Contributor
|
pijucha
commented on the diff
Jul 8, 2016
| + s3 = s1 * s2 | ||
| + self.assertEqual(s3.index.name, 'mario') | ||
| + | ||
| + # test_union_base | ||
| + first = ind[3:] | ||
| + second = ind[:5] | ||
| + | ||
| + if PY3: | ||
| + with tm.assert_produces_warning(RuntimeWarning): | ||
| + # unorderable types | ||
| + result = first.union(second) | ||
| + expected = Index(['b', 2, 'c', 0, 'a', 1]) | ||
| + self.assert_index_equal(result, expected) | ||
| + else: | ||
| + result = first.union(second) | ||
| + expected = Index(['b', 2, 'c', 0, 'a', 1]) |
pijucha
Contributor
|
|
@jreback Done |
pijucha
referenced
this pull request
Jul 8, 2016
Closed
BUG: Fix bug with symmetric difference of two equal MultiIndexes GH12490 #13504
|
@pijucha pls rebase, merging lots of stuff. ping when green. I think this lgtm. |
jreback
added this to the
0.19.0
milestone
Jul 10, 2016
jreback
commented on an outdated diff
Jul 10, 2016
| @@ -412,6 +412,32 @@ Furthermore: | ||
| - Passing duplicated ``percentiles`` will now raise a ``ValueError``. | ||
| - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) | ||
| +.. _whatsnew_0182.api.difference: |
|
|
|
@jreback Updated |
|
@jreback I see several merges have been made in the meantime. Should I rebase again? |
|
yes |
|
@jreback Rebased, tests green. |
jreback
commented on an outdated diff
Jul 14, 2016
| @@ -446,6 +446,32 @@ Furthermore: | ||
| - Passing duplicated ``percentiles`` will now raise a ``ValueError``. | ||
| - Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) | ||
| +.. _whatsnew_0190.api.difference: |
|
|
jreback
commented on an outdated diff
Jul 14, 2016
| @@ -163,6 +163,104 @@ def isin(comps, values): | ||
| return f(comps, values) | ||
| +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): | ||
| + """ | ||
| + Sort ``values`` and reorder corresponding ``labels``. | ||
| + ``values`` should be unique if ``labels`` is not None. | ||
| + Safe for use with mixed types (int, str), orders ints before strs. | ||
| + | ||
| + .. versionadded:: 0.18.2 |
|
|
jreback
and 1 other
commented on an outdated diff
Jul 14, 2016
| result = idx1.symmetric_difference(idx2) | ||
| - # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) | ||
| + expected = Index([0.0, 2.0, 3.0]) | ||
| + tm.assert_index_equal(result, expected) | ||
| + | ||
| + # GH #6444, sorting of nans. Make sure the number of nans is right | ||
| + # and the correct non-nan values are there. punt on sorting. | ||
| + result = idx1.symmetric_difference(idx3) | ||
| + # expected = Index([0.0, 2.0, 3.0, np.nan]) |
pijucha
Contributor
|
jreback
commented on an outdated diff
Jul 14, 2016
| + result = ind.argsort() | ||
| + else: | ||
| + result = ind.argsort() | ||
| + expected = np.array(ind).argsort() | ||
| + tm.assert_numpy_array_equal(result, expected, check_dtype=False) | ||
| + | ||
| + # test_numpy_argsort | ||
| + if PY3: | ||
| + with tm.assertRaisesRegexp(TypeError, "unorderable types"): | ||
| + result = np.argsort(ind) | ||
| + else: | ||
| + result = np.argsort(ind) | ||
| + expected = ind.argsort() | ||
| + tm.assert_numpy_array_equal(result, expected) | ||
| + | ||
| + # test_copy_name |
|
|
jreback
commented on an outdated diff
Jul 14, 2016
| + self.assertTrue(ind.equals(first)) | ||
| + | ||
| + self.assertEqual(first.name, 'mario') | ||
| + self.assertEqual(second.name, 'mario') | ||
| + | ||
| + s1 = Series(2, index=first) | ||
| + s2 = Series(3, index=second[:-1]) | ||
| + if PY3: | ||
| + with tm.assert_produces_warning(RuntimeWarning): | ||
| + # unorderable types | ||
| + s3 = s1 * s2 | ||
| + else: | ||
| + s3 = s1 * s2 | ||
| + self.assertEqual(s3.index.name, 'mario') | ||
| + | ||
| + # test_union_base |
jreback
Contributor
|
jreback
commented on the diff
Jul 14, 2016
| @@ -1877,6 +1877,15 @@ def test_duplicate_meta_data(self): | ||
| self.assertTrue(idx.has_duplicates) | ||
| self.assertEqual(idx.drop_duplicates().names, idx.names) | ||
| + def test_get_unique_index(self): | ||
| + idx = self.index[[0, 1, 0, 1, 1, 0, 0]] |
pijucha
Contributor
|
|
@pijucha looks really good. just some tests splitting, minor corrections. ping on green. |
|
@jreback OK, cleaned some tests. It's green. |
jreback
and 1 other
commented on an outdated diff
Jul 15, 2016
| Parameters | ||
| ---------- | ||
| - other : Index or array-like | ||
| + other: Index or array-like |
jreback
Contributor
|
jreback
commented on the diff
Jul 15, 2016
| + expected = Index([0, 1, 'a']) | ||
| + self.assert_index_equal(result, expected) | ||
| + | ||
| + def test_symmetric_difference(self): | ||
| + # (same results for py2 and py3 but sortedness not tested elsewhere) | ||
| + idx = self.create_index() | ||
| + first = idx[:4] | ||
| + second = idx[3:] | ||
| + | ||
| + result = first.symmetric_difference(second) | ||
| + expected = Index([0, 1, 2, 'a', 'c']) | ||
| + self.assert_index_equal(result, expected) | ||
| + | ||
| + def test_logical_compat(self): | ||
| + idx = self.create_index() | ||
| + self.assertEqual(idx.all(), idx.values.all()) |
|
|
|
minor doc change. ping on green. |
|
some failures on windows. we ultimately test on appveyor.com (you just need a free account), but its very slow. so easiest to spin up a vm if you can. these are prob because you need to use
|
|
OK, I think I've found the problem but need some time to test it on windows. |
pijucha
commented on the diff
Jul 17, 2016
| - reverse_indexer = np.empty(len(sorter), dtype=np.int64) | ||
| - reverse_indexer.put(sorter, np.arange(len(sorter))) | ||
| - | ||
| - new_left = reverse_indexer.take(_ensure_platform_int(left)) | ||
| - np.putmask(new_left, left == -1, -1) | ||
| - | ||
| - new_right = reverse_indexer.take(_ensure_platform_int(right)) | ||
| - np.putmask(new_right, right == -1, -1) | ||
| + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) | ||
| + new_labels = _ensure_int64(new_labels) |
pijucha
Contributor
|
|
@jreback The fix was straightforward and it should be ok now. However, I'm getting these two errors on windows (64bit) - also on the master branch, so they're independent of this PR. Either I messed up the windows setup or it's not being tested thoroughly.
|
|
2nd is already fixed |
|
Ah, then it's probably ok. I tested 0.18.1+202.g0a70b5f on windows. |
|
Yes, everything is all right. |
|
is #13432 completely closed by this? if not pls make checkboxes in the top (and tick off the things that are done and leave open things that are not). |
jreback
closed this
in b225cac
Jul 19, 2016
|
thanks @pijucha pretty awesome! |
|
thanks! |
pijucha
deleted the
pijucha:setop13432 branch
Jul 19, 2016
|
@pijucha this is failing on windows / py2.7 (3 is clean), and only windows: https://ci.appveyor.com/project/jreback/pandas-465/build/1.0.762/job/ghokkwmadwog983y not sure what is going on |
|
@jreback According to test_categorical.py, numpy (>= 1.10) should sort a mixed int-datetime array. But it doesn't:
In [3]: arr = np.array([1, 2, datetime.now(), 0, 3], dtype='O')
In [4]: np.sort(arr)
/home/users/piotr/workspace/pandas-pijucha/pandas_dev_python2/lib/python2.7/site-packages/numpy/core/fromnumeric.py:825: RuntimeWarning: tp_compare didn't return -1 or -2 for exception
a.sort(axis, kind, order)
Out[4]: array([1, 2, datetime.datetime(2016, 7, 19, 9, 49, 28, 214675), 0, 3], dtype=object)
In [6]: np.__version__
Out[6]: '1.11.0'
IPython probably interferes here because in pure python2.7 I'm getting
In the old code in ordered = [np.sort(np.array([e for e in arr if f(e)], dtype=object))
for f in [lambda x: True, lambda x: False]]I haven't yet caught it precisely but it looks as if it sometimes swallowed an exception. (New code in It looks to me that |
|
ok can u make a new issue about this? (pretty much copy your above comment) |
|
Sure, i will. |
pijucha commentedJun 26, 2016
git diff upstream/master | flake8 --diff
safe_sort to safely sort mixed-integer arrays in Python3.
in order to:
non-unique indexes (issue with sorting mixed-ints, #12814)
arguments was a named empty Index (#13432)
Benchmarks (for index_object only):