ENH/BUG: str.extractall doesn't support index

closes #10008 Author: sinhrks <sinhrks@gmail.com> Closes #13156 from sinhrks/str_extractall and squashes the following commits: ed854ef [sinhrks] ENH/BUG: str.extractall doesn't support index
pandas-dev · May 13, 2016 · 82f54bd · jbrockmendel · Jul 14, 2017 · jreback
1 parent 00d4ec3
commit 82f54bd
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 17 deletions.
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -281,7 +281,7 @@ Unlike ``extract`` (which returns only the first match),
 
 .. ipython:: python
 
-   s = pd.Series(["a1a2", "b1", "c1"], ["A", "B", "C"])
+   s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    s
    two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'
    s.str.extract(two_groups, expand=True)
@@ -313,6 +313,17 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as
    extractall_result
    extractall_result.xs(0, level="match")
 
+``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` with the
+same result as ``Series.str.extractall`` on a ``Series`` with a default index (starting from 0).
+
+.. versionadded:: 0.18.2
+
+.. ipython:: python
+
+   pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
+
+   pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups)
+
 
 Testing for Strings that Match or Contain a Pattern
 ---------------------------------------------------

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -31,7 +31,12 @@ Other enhancements
 
 - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raise a ``NonExistentTimeError`` (:issue:`13057`)
 
+- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) <text.extractall>` (:issue:`10008`, :issue:`13156`)
 
+  .. ipython:: python
+
+     idx = pd.Index(["a1a2", "b1", "c1"])
+     idx.str.extractall("[ab](?P<digit>\d)")
 
 .. _whatsnew_0182.api:
 
@@ -120,6 +125,7 @@ Bug Fixes
 
 
 
+- Bug in ``Series.str.extractall()`` where a ``Series`` with a ``str`` index raises ``ValueError`` (:issue:`13156`)
 
 
 - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -8,6 +8,7 @@
 from pandas.core.algorithms import take_1d
 import pandas.compat as compat
 from pandas.core.base import AccessorProperty, NoNewAttributesMixin
+from pandas.types import api as gt
 from pandas.util.decorators import Appender, deprecate_kwarg
 import re
 import pandas.lib as lib
@@ -148,12 +149,10 @@ def _na_map(f, arr, na_result=np.nan, dtype=object):
 
 
 def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
-    from pandas.core.series import Series
-
     if not len(arr):
         return np.ndarray(0, dtype=dtype)
 
-    if isinstance(arr, Series):
+    if isinstance(arr, gt.ABCSeries):
         arr = arr.values
     if not isinstance(arr, np.ndarray):
         arr = np.asarray(arr, dtype=object)
@@ -687,33 +686,42 @@ def str_extractall(arr, pat, flags=0):
     C 0        NaN     1
 
     """
-    from pandas import DataFrame, MultiIndex
+
     regex = re.compile(pat, flags=flags)
     # the regex must contain capture groups.
     if regex.groups == 0:
         raise ValueError("pattern contains no capture groups")
+
+    if isinstance(arr, gt.ABCIndex):
+        arr = arr.to_series().reset_index(drop=True)
+
     names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
     columns = [names.get(1 + i, i) for i in range(regex.groups)]
     match_list = []
     index_list = []
+    is_mi = arr.index.nlevels > 1
+
     for subject_key, subject in arr.iteritems():
         if isinstance(subject, compat.string_types):
-            try:
-                key_list = list(subject_key)
-            except TypeError:
-                key_list = [subject_key]
+
+            if not is_mi:
+                subject_key = (subject_key, )
+
             for match_i, match_tuple in enumerate(regex.findall(subject)):
-                na_tuple = [
-                    np.NaN if group == "" else group for group in match_tuple]
+                na_tuple = [np.NaN if group == "" else group
+                            for group in match_tuple]
                 match_list.append(na_tuple)
-                result_key = tuple(key_list + [match_i])
+                result_key = tuple(subject_key + (match_i, ))
                 index_list.append(result_key)
+
     if 0 < len(index_list):
+        from pandas import MultiIndex
         index = MultiIndex.from_tuples(
             index_list, names=arr.index.names + ["match"])
     else:
         index = None
-    result = DataFrame(match_list, index, columns)
+    result = arr._constructor_expanddim(match_list, index=index,
+                                        columns=columns)
     return result
 
 
@@ -1804,9 +1812,9 @@ class StringAccessorMixin(object):
 
     # string methods
     def _make_str_accessor(self):
-        from pandas.core.series import Series
         from pandas.core.index import Index
-        if (isinstance(self, Series) and
+
+        if (isinstance(self, gt.ABCSeries) and
                 not ((is_categorical_dtype(self.dtype) and
                       is_object_dtype(self.values.categories)) or
                      (is_object_dtype(self.dtype)))):
@@ -1819,6 +1827,8 @@ def _make_str_accessor(self):
                                  "values, which use np.object_ dtype in "
                                  "pandas")
         elif isinstance(self, Index):
+            # can't use ABCIndex to exclude non-str
+
             # see src/inference.pyx which can contain string values
             allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
             if self.inferred_type not in allowed_types:

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -982,6 +982,30 @@ def test_extractall_no_matches(self):
                                "second"])
         tm.assert_frame_equal(r, e)
 
+    def test_extractall_stringindex(self):
+        s = Series(["a1a2", "b1", "c1"], name='xxx')
+        res = s.str.extractall("[ab](?P<digit>\d)")
+        exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
+                                         names=[None, 'match'])
+        exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
+        tm.assert_frame_equal(res, exp)
+
+        # Index should return the same result as a Series with the default
+        # index and no name, thus index.name doesn't affect the result
+        for idx in [Index(["a1a2", "b1", "c1"]),
+                    Index(["a1a2", "b1", "c1"], name='xxx')]:
+
+            res = idx.str.extractall("[ab](?P<digit>\d)")
+            tm.assert_frame_equal(res, exp)
+
+        s = Series(["a1a2", "b1", "c1"], name='s_name',
+                   index=Index(["XX", "yy", "zz"], name='idx_name'))
+        res = s.str.extractall("[ab](?P<digit>\d)")
+        exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)],
+                                         names=["idx_name", 'match'])
+        exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
+        tm.assert_frame_equal(res, exp)
+
     def test_extractall_errors(self):
         # Does not make sense to use extractall with a regex that has
         # no capture groups. (it returns DataFrame with one column for
@@ -991,8 +1015,8 @@ def test_extractall_errors(self):
             s.str.extractall(r'[a-z]')
 
     def test_extract_index_one_two_groups(self):
-        s = Series(
-            ['a3', 'b3', 'd4c2'], ["A3", "B3", "D4"], name='series_name')
+        s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"],
+                   name='series_name')
         r = s.index.str.extract(r'([A-Z])', expand=True)
         e = DataFrame(['A', "B", "D"])
         tm.assert_frame_equal(r, e)