Skip to content

Commit

Permalink
ENH/BUG: str.extractall doesn't support index
Browse files Browse the repository at this point in the history
closes #10008

Author: sinhrks <sinhrks@gmail.com>

Closes #13156 from sinhrks/str_extractall and squashes the following commits:

ed854ef [sinhrks] ENH/BUG: str.extractall doesn't support index
  • Loading branch information
sinhrks authored and jreback committed May 13, 2016
1 parent 00d4ec3 commit 82f54bd
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 17 deletions.
13 changes: 12 additions & 1 deletion doc/source/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ Unlike ``extract`` (which returns only the first match),

.. ipython:: python
s = pd.Series(["a1a2", "b1", "c1"], ["A", "B", "C"])
s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
s
two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'
s.str.extract(two_groups, expand=True)
Expand Down Expand Up @@ -313,6 +313,17 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as
extractall_result
extractall_result.xs(0, level="match")
``Index`` also supports ``.str.extractall``. It returns the same ``DataFrame`` that
``Series.str.extractall`` returns for a ``Series`` with a default index (starting from 0).

.. versionadded:: 0.18.2

.. ipython:: python
pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups)
Testing for Strings that Match or Contain a Pattern
---------------------------------------------------
Expand Down
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ Other enhancements

- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour is still to raise a ``NonExistentTimeError`` (:issue:`13057`)

- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) <text.extractall>` (:issue:`10008`, :issue:`13156`)

.. ipython:: python

idx = pd.Index(["a1a2", "b1", "c1"])
idx.str.extractall("[ab](?P<digit>\d)")

.. _whatsnew_0182.api:

Expand Down Expand Up @@ -120,6 +125,7 @@ Bug Fixes



- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)


- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`)
Expand Down
38 changes: 24 additions & 14 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pandas.core.algorithms import take_1d
import pandas.compat as compat
from pandas.core.base import AccessorProperty, NoNewAttributesMixin
from pandas.types import api as gt
from pandas.util.decorators import Appender, deprecate_kwarg
import re
import pandas.lib as lib
Expand Down Expand Up @@ -148,12 +149,10 @@ def _na_map(f, arr, na_result=np.nan, dtype=object):


def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
from pandas.core.series import Series

if not len(arr):
return np.ndarray(0, dtype=dtype)

if isinstance(arr, Series):
if isinstance(arr, gt.ABCSeries):
arr = arr.values
if not isinstance(arr, np.ndarray):
arr = np.asarray(arr, dtype=object)
Expand Down Expand Up @@ -687,33 +686,42 @@ def str_extractall(arr, pat, flags=0):
C 0 NaN 1
"""
from pandas import DataFrame, MultiIndex

regex = re.compile(pat, flags=flags)
# the regex must contain capture groups.
if regex.groups == 0:
raise ValueError("pattern contains no capture groups")

if isinstance(arr, gt.ABCIndex):
arr = arr.to_series().reset_index(drop=True)

names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
columns = [names.get(1 + i, i) for i in range(regex.groups)]
match_list = []
index_list = []
is_mi = arr.index.nlevels > 1

for subject_key, subject in arr.iteritems():
if isinstance(subject, compat.string_types):
try:
key_list = list(subject_key)
except TypeError:
key_list = [subject_key]

if not is_mi:
subject_key = (subject_key, )

for match_i, match_tuple in enumerate(regex.findall(subject)):
na_tuple = [
np.NaN if group == "" else group for group in match_tuple]
na_tuple = [np.NaN if group == "" else group
for group in match_tuple]
match_list.append(na_tuple)
result_key = tuple(key_list + [match_i])
result_key = tuple(subject_key + (match_i, ))
index_list.append(result_key)

if 0 < len(index_list):
from pandas import MultiIndex
index = MultiIndex.from_tuples(
index_list, names=arr.index.names + ["match"])
else:
index = None
result = DataFrame(match_list, index, columns)
result = arr._constructor_expanddim(match_list, index=index,
columns=columns)
return result


Expand Down Expand Up @@ -1804,9 +1812,9 @@ class StringAccessorMixin(object):

# string methods
def _make_str_accessor(self):
from pandas.core.series import Series
from pandas.core.index import Index
if (isinstance(self, Series) and

if (isinstance(self, gt.ABCSeries) and
not ((is_categorical_dtype(self.dtype) and
is_object_dtype(self.values.categories)) or
(is_object_dtype(self.dtype)))):
Expand All @@ -1819,6 +1827,8 @@ def _make_str_accessor(self):
"values, which use np.object_ dtype in "
"pandas")
elif isinstance(self, Index):
# can't use ABCIndex to exclude non-str

This comment has been minimized.

Copy link
@jbrockmendel

jbrockmendel Jul 14, 2017

Member

Why does isinstance(self, ABCIndex) fail here? Are there cases where checking against ABCFoo will be unreliable?

This comment has been minimized.

Copy link
@jreback

jreback Jul 14, 2017

Contributor

because ABCIndex only detects that this is an Index object, not that its inferred type is a string type

though I don't really see a problem with this falling through (as the inferred_type check will catch this anyhow)


# see src/inference.pyx which can contain string values
allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
if self.inferred_type not in allowed_types:
Expand Down
28 changes: 26 additions & 2 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,6 +982,30 @@ def test_extractall_no_matches(self):
"second"])
tm.assert_frame_equal(r, e)

def test_extractall_stringindex(self):
s = Series(["a1a2", "b1", "c1"], name='xxx')
res = s.str.extractall("[ab](?P<digit>\d)")
exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
names=[None, 'match'])
exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
tm.assert_frame_equal(res, exp)

# index should return the same result as the default index without name
# thus index.name doesn't affect to the result
for idx in [Index(["a1a2", "b1", "c1"]),
Index(["a1a2", "b1", "c1"], name='xxx')]:

res = idx.str.extractall("[ab](?P<digit>\d)")
tm.assert_frame_equal(res, exp)

s = Series(["a1a2", "b1", "c1"], name='s_name',
index=Index(["XX", "yy", "zz"], name='idx_name'))
res = s.str.extractall("[ab](?P<digit>\d)")
exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)],
names=["idx_name", 'match'])
exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
tm.assert_frame_equal(res, exp)

def test_extractall_errors(self):
# Does not make sense to use extractall with a regex that has
# no capture groups. (it returns DataFrame with one column for
Expand All @@ -991,8 +1015,8 @@ def test_extractall_errors(self):
s.str.extractall(r'[a-z]')

def test_extract_index_one_two_groups(self):
s = Series(
['a3', 'b3', 'd4c2'], ["A3", "B3", "D4"], name='series_name')
s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"],
name='series_name')
r = s.index.str.extract(r'([A-Z])', expand=True)
e = DataFrame(['A', "B", "D"])
tm.assert_frame_equal(r, e)
Expand Down

0 comments on commit 82f54bd

Please sign in to comment.