Skip to content

Commit

Permalink
ENH: added rsplit to StringMethods
Browse files Browse the repository at this point in the history
  • Loading branch information
mortada committed Jun 8, 2015
1 parent 342c91b commit bc66f43
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 9 deletions.
1 change: 1 addition & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ strings and apply several methods to it. These can be accessed like
Series.str.slice
Series.str.slice_replace
Series.str.split
Series.str.rsplit
Series.str.startswith
Series.str.strip
Series.str.swapcase
Expand Down
14 changes: 14 additions & 0 deletions doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ Easy to expand this to return a DataFrame using ``expand``.
s2.str.split('_', expand=True)
It is also possible to limit the number of splits:

.. ipython:: python
s2.str.split('_', expand=True, n=1)
``rsplit`` is similar to ``split`` except it works in the reverse direction,
i.e., from the end of the string to the beginning of the string:

.. ipython:: python
s2.str.rsplit('_', expand=True, n=1)
Methods like ``replace`` and ``findall`` take `regular expressions
<https://docs.python.org/2/library/re.html>`__, too:

Expand Down Expand Up @@ -239,6 +252,7 @@ Method Summary

:meth:`~Series.str.cat`,Concatenate strings
:meth:`~Series.str.split`,Split strings on delimiter
:meth:`~Series.str.rsplit`,Split strings on delimiter working from the end of the string
:meth:`~Series.str.get`,Index into each element (retrieve i-th element)
:meth:`~Series.str.join`,Join strings in each element of the Series with passed separator
:meth:`~Series.str.contains`,Return boolean array if each string contains pattern/regex
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.16.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ See the :ref:`documentation <basics.pipe>` for more. (:issue:`10129`)
.. _magrittr: https://github.com/smbache/magrittr
.. _R: http://www.r-project.org

- Added ``rsplit`` to Index/Series StringMethods (:issue:`10303`)

.. _whatsnew_0162.enhancements.other:

Other enhancements
Expand Down
34 changes: 34 additions & 0 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,35 @@ def str_split(arr, pat=None, n=None):
return res


def str_rsplit(arr, pat=None, n=None):
    """
    Split each string in the Series/Index by the given delimiter
    string, scanning from the end of the string towards the front.
    Equivalent to :meth:`str.rsplit`.

    .. versionadded:: 0.16.2

    Parameters
    ----------
    pat : string, default None
        Separator to split on. If None, splits on whitespace.
        The value is handed to :meth:`str.rsplit` unchanged.
    n : int, default -1 (all)
        None, 0 and -1 will be interpreted as return all splits
    expand : bool, default False
        * If True, return DataFrame/MultiIndex expanding dimensionality.
        * If False, return Series/Index.

    Returns
    -------
    split : Series/Index or DataFrame/MultiIndex of objects
    """
    # NOTE(review): `expand` is not a parameter of this function; it is
    # presumably documented here because the accessor method reuses this
    # docstring -- confirm against the `copy` decorator.

    # str.rsplit uses -1 for "no limit"; None and 0 are mapped onto it.
    max_splits = -1 if (n is None or n == 0) else n

    def _rsplit_one(value):
        return value.rsplit(pat, max_splits)

    return _na_map(_rsplit_one, arr)


def str_slice(arr, start=None, stop=None, step=None):
"""
Slice substrings from each element in the Series/Index
Expand Down Expand Up @@ -1115,6 +1144,11 @@ def split(self, pat=None, n=-1, expand=False):
result = str_split(self.series, pat, n=n)
return self._wrap_result_expand(result, expand=expand)

@copy(str_rsplit)
def rsplit(self, pat=None, n=-1, expand=False):
    # No docstring is written here: the decorator is applied with
    # str_rsplit, presumably to reuse its documentation -- confirm
    # against `copy`.
    return self._wrap_result_expand(
        str_rsplit(self.series, pat, n=n), expand=expand)

_shared_docs['str_partition'] = ("""
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
containing the part before the separator, the separator itself,
Expand Down
131 changes: 122 additions & 9 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,7 @@ def test_empty_str_methods(self):
tm.assert_series_equal(empty_str, empty.str.pad(42))
tm.assert_series_equal(empty_str, empty.str.center(42))
tm.assert_series_equal(empty_list, empty.str.split('a'))
tm.assert_series_equal(empty_list, empty.str.rsplit('a'))
tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False))
tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False))
tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
Expand Down Expand Up @@ -1212,15 +1213,15 @@ def test_split(self):
# mixed
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
None, 1, 2.])
rs = mixed.str.split('_')
xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
result = mixed.str.split('_')
exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
NA, NA, NA])
tm.assert_isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
tm.assert_isinstance(result, Series)
tm.assert_almost_equal(result, exp)

rs = mixed.str.split('_', expand=False)
tm.assert_isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
result = mixed.str.split('_', expand=False)
tm.assert_isinstance(result, Series)
tm.assert_almost_equal(result, exp)

# unicode
values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
Expand All @@ -1234,12 +1235,75 @@ def test_split(self):
result = values.str.split('_', expand=False)
tm.assert_series_equal(result, exp)

# regex split
values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
result = values.str.split('[,_]')
exp = Series([[u('a'), u('b'), u('c')],
[u('c'), u('d'), u('e')], NA,
[u('f'), u('g'), u('h')]])
tm.assert_series_equal(result, exp)

def test_rsplit(self):
    # single-character separator
    ser = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
    got = ser.str.rsplit('_')
    want = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
    tm.assert_series_equal(got, want)

    # more than one char; same expectation with and without expand=False
    ser = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
    for kwargs in ({}, {'expand': False}):
        got = ser.str.rsplit('__', **kwargs)
        tm.assert_series_equal(got, want)

    # mixed: every non-string element is expected to come back as NA
    mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
                    None, 1, 2.])
    want = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
                   NA, NA, NA])
    for kwargs in ({}, {'expand': False}):
        got = mixed.str.rsplit('_', **kwargs)
        tm.assert_isinstance(got, Series)
        tm.assert_almost_equal(got, want)

    # unicode
    ser = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
    want = Series([[u(ch) for ch in 'abc'],
                   [u(ch) for ch in 'cde'],
                   NA,
                   [u(ch) for ch in 'fgh']])
    for kwargs in ({}, {'expand': False}):
        got = ser.str.rsplit('_', **kwargs)
        tm.assert_series_equal(got, want)

    # regex split is not supported by rsplit: the pattern is taken
    # literally, so each string comes back as a one-element list
    ser = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
    got = ser.str.rsplit('[,_]')
    want = Series([[u('a,b_c')],
                   [u('c_d,e')],
                   NA,
                   [u('f,g,h')]])
    tm.assert_series_equal(got, want)

    # setting max number of splits, make sure it's from reverse
    ser = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
    got = ser.str.rsplit('_', n=1)
    want = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']])
    tm.assert_series_equal(got, want)

def test_split_noargs(self):
    # #1859
    # With no arguments both split and rsplit break on whitespace, so the
    # second element must yield the same two tokens either way.
    s = Series(['Wes McKinney', 'Travis Oliphant'])
    expected = ['Travis', 'Oliphant']

    # The split() result was previously asserted twice (once against an
    # inline literal, once against `expected`); a single check suffices.
    result = s.str.split()
    self.assertEqual(result[1], expected)

    result = s.str.rsplit()
    self.assertEqual(result[1], expected)

def test_split_maxsplit(self):
# re.split 0, str.split -1
Expand Down Expand Up @@ -1348,6 +1412,55 @@ def test_split_to_multiindex_expand(self):
with tm.assertRaisesRegexp(ValueError, "expand must be"):
idx.str.split('_', return_type="some_invalid_type")

def test_rsplit_to_dataframe_expand(self):
    # separator absent: a single column labelled 0 is expected
    ser = Series(['nosplit', 'alsonosplit'])
    frame = ser.str.rsplit('_', expand=True)
    wanted = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
    tm.assert_frame_equal(frame, wanted)

    ser = Series(['some_equal_splits', 'with_no_nans'])

    # no limit, then n=2: both are expected to give three columns
    for kwargs in ({}, {'n': 2}):
        frame = ser.str.rsplit('_', expand=True, **kwargs)
        wanted = DataFrame({0: ['some', 'with'],
                            1: ['equal', 'no'],
                            2: ['splits', 'nans']})
        tm.assert_frame_equal(frame, wanted)

    # n=1: only the last separator is used
    frame = ser.str.rsplit('_', expand=True, n=1)
    wanted = DataFrame({0: ['some_equal', 'with_no'],
                        1: ['splits', 'nans']})
    tm.assert_frame_equal(frame, wanted)

    # a custom index on the input must carry over to the result
    labels = ['preserve', 'me']
    ser = Series(['some_splits', 'with_index'], index=labels)
    frame = ser.str.rsplit('_', expand=True)
    wanted = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
                       index=['preserve', 'me'])
    tm.assert_frame_equal(frame, wanted)

def test_rsplit_to_multiindex_expand(self):
    def check(actual, wanted, depth):
        # labels must match and the result must report `depth` levels
        tm.assert_index_equal(actual, wanted)
        self.assertEqual(actual.nlevels, depth)

    # separator absent: one level expected
    flat = Index(['nosplit', 'alsonosplit'])
    check(flat.str.rsplit('_', expand=True),
          Index([np.array(['nosplit']), np.array(['alsonosplit'])]),
          1)

    # no limit: every separator is used, three levels expected
    labels = Index(['some_equal_splits', 'with_no_nans'])
    check(labels.str.rsplit('_', expand=True),
          MultiIndex.from_tuples([('some', 'equal', 'splits'),
                                  ('with', 'no', 'nans')]),
          3)

    # n=1: only the last separator is used, two levels expected
    labels = Index(['some_equal_splits', 'with_no_nans'])
    check(labels.str.rsplit('_', expand=True, n=1),
          MultiIndex.from_tuples([('some_equal', 'splits'),
                                  ('with_no', 'nans')]),
          2)

def test_partition_series(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])

Expand Down

0 comments on commit bc66f43

Please sign in to comment.