Add splitdown function (#430)

* Added splitdown function (resolves #386). Allows the use of a regex to split a field into new rows.
* Added unit test for splitdown function. All tests pass for py26, py27 and py36.
petl-developers · Aug 6, 2019 · d819ceb · d819ceb
2 parents 014746c + 07b69dc
commit d819ceb
Show file tree

Hide file tree

Showing 2 changed files with 110 additions and 1 deletion.
diff --git a/petl/test/transform/test_regex.py b/petl/test/transform/test_regex.py
@@ -6,7 +6,7 @@
 
 
 from petl.test.helpers import ieq, eq_
-from petl.transform.regex import capture, split, search, searchcomplement
+from petl.transform.regex import capture, split, search, searchcomplement, splitdown
 from petl.transform.basics import TransformError
 
 
@@ -245,4 +245,35 @@ def test_search_unicode():
     ieq(expect, actual)
 
 
+def test_splitdown():
+    """splitdown() yields one output row per piece of the split field."""
+
+    tbl = ((u'name', u'roles'),
+           (u'Jane Doe', u'president,engineer,tailor,lawyer'),
+           (u'John Doe', u'rocket scientist,optometrist,chef,knight,sailor'))
+
+    actual = splitdown(tbl, 'roles', ',')
+    expect = ((u'name', u'roles'),
+              (u'Jane Doe', u'president'),
+              (u'Jane Doe', u'engineer'),
+              (u'Jane Doe', u'tailor'),
+              (u'Jane Doe', u'lawyer'),
+              (u'John Doe', u'rocket scientist'),
+              (u'John Doe', u'optometrist'),
+              (u'John Doe', u'chef'),
+              (u'John Doe', u'knight'),
+              (u'John Doe', u'sailor'))
+
+    ieq(expect, actual)
+    # Compared a second time so that a repeat pass over the returned view
+    # is exercised as well as the first.
+    ieq(expect, actual)
+
+
 # TODO test sub()
+
diff --git a/petl/transform/regex.py b/petl/transform/regex.py
@@ -380,3 +380,81 @@ def searchcomplement(table, *args, **kwargs):
 
 
 Table.searchcomplement = searchcomplement
+
+
+def splitdown(table, field, pattern, maxsplit=0, flags=0):
+    """
+    Split a field into multiple rows using a regular expression. E.g.:
+
+        >>> import petl as etl
+        >>> table1 = [['name', 'roles'],
+        ...           ['Jane Doe', 'president,engineer,tailor,lawyer'],
+        ...           ['John Doe', 'rocket scientist,optometrist,chef,knight,sailor']]
+        >>> table2 = etl.splitdown(table1, 'roles', ',')
+        >>> table2
+        +------------+--------------------+
+        | name       | roles              |
+        +============+====================+
+        | 'Jane Doe' | 'president'        |
+        +------------+--------------------+
+        | 'Jane Doe' | 'engineer'         |
+        +------------+--------------------+
+        | 'Jane Doe' | 'tailor'           |
+        +------------+--------------------+
+        | 'Jane Doe' | 'lawyer'           |
+        +------------+--------------------+
+        | 'John Doe' | 'rocket scientist' |
+        +------------+--------------------+
+        | 'John Doe' | 'optometrist'      |
+        +------------+--------------------+
+        | 'John Doe' | 'chef'             |
+        +------------+--------------------+
+        | 'John Doe' | 'knight'           |
+        +------------+--------------------+
+        | 'John Doe' | 'sailor'           |
+        +------------+--------------------+
+
+    Each input row produces one output row per piece of the split value;
+    the remaining fields are copied unchanged into every such row.
+
+    The `field` argument may be a field name or an integer column index.
+    The `pattern` and `flags` arguments are compiled with
+    :func:`re.compile`, and `maxsplit` is passed to the compiled pattern's
+    ``split()`` method, so the default of 0 places no limit on the number
+    of splits made per value.
+
+    """
+
+    return SplitDownView(table, field, pattern, maxsplit, flags)
+
+
+# Also expose the function as a method on table objects.
+Table.splitdown = splitdown
+
+
+class SplitDownView(Table):
+    """Table view whose rows are computed by :func:`itersplitdown`."""
+
+    def __init__(self, table, field, pattern, maxsplit=0, flags=0):
+        # Only the arguments are recorded here; the source table is not
+        # read until the view is iterated.
+        self.table, self.field, self.pattern = table, field, pattern
+        self.maxsplit, self.flags = maxsplit, flags
+
+    def __iter__(self):
+        # A new generator is created on every call.
+        return itersplitdown(
+            table=self.table,
+            field=self.field,
+            pattern=self.pattern,
+            maxsplit=self.maxsplit,
+            flags=self.flags,
+        )
+
+
+def itersplitdown(table, field, pattern, maxsplit, flags):
+    """
+    Generate the rows of a split-down table: first the header, then, for
+    each input row, one output row per piece obtained by splitting the
+    value of the target field with the regular expression `pattern`.
+
+    The `field` argument is a field name or an integer column index;
+    negative indices count back from the last header column. An
+    `ArgumentError` is raised if `field` identifies no header column.
+    An empty input table yields nothing.
+
+    """
+
+    prog = re.compile(pattern, flags)
+    it = iter(table)
+    try:
+        hdr = next(it)
+    except StopIteration:
+        # No header means nothing to emit. Without this guard the
+        # StopIteration would escape the generator, which PEP 479 turns
+        # into a RuntimeError.
+        return
+    flds = list(map(text_type, hdr))
+    ncols = len(hdr)
+
+    if isinstance(field, int) and -ncols <= field < ncols:
+        # Normalise negative indices: the row builder below compares the
+        # target against non-negative positions, so an index such as -1
+        # would otherwise never match and rows would be emitted unsplit.
+        field_index = field % ncols
+    elif field in flds:
+        field_index = flds.index(field)
+    else:
+        raise ArgumentError('field invalid: must be either field name or index')
+
+    yield tuple(hdr)
+
+    indices = range(ncols)
+    for row in it:
+        value = row[field_index]
+        for v in prog.split(value, maxsplit):
+            # Copy the row, substituting the current piece at the target
+            # position; output rows are as wide as the header.
+            yield tuple(v if i == field_index else row[i] for i in indices)
+