Skip to content

Commit

Permalink
Add splitdown function (#430)
Browse files Browse the repository at this point in the history
* Added splitdown function (resolves #386).

Allows the use of a regex to split a field into new rows.

* Added unit test for splitdown function.

All tests pass for py26, py27 and py36.
  • Loading branch information
alimanfoo committed Aug 6, 2019
2 parents 014746c + 07b69dc commit d819ceb
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 1 deletion.
33 changes: 32 additions & 1 deletion petl/test/transform/test_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


from petl.test.helpers import ieq, eq_
from petl.transform.regex import capture, split, search, searchcomplement
from petl.transform.regex import capture, split, search, searchcomplement, splitdown
from petl.transform.basics import TransformError


Expand Down Expand Up @@ -245,4 +245,35 @@ def test_search_unicode():
ieq(expect, actual)


def test_splitdown():
    """splitdown() should emit one output row per piece of the split field."""

    tbl = ((u'name', u'roles'),
           (u'Jane Doe', u'president,engineer,tailor,lawyer'),
           (u'John Doe', u'rocket scientist,optometrist,chef,knight,sailor'))

    actual = splitdown(tbl, 'roles', ',')
    expect = ((u'name', u'roles'),
              (u'Jane Doe', u'president'),
              (u'Jane Doe', u'engineer'),
              (u'Jane Doe', u'tailor'),
              (u'Jane Doe', u'lawyer'),
              (u'John Doe', u'rocket scientist'),
              (u'John Doe', u'optometrist'),
              (u'John Doe', u'chef'),
              (u'John Doe', u'knight'),
              (u'John Doe', u'sailor'))

    ieq(expect, actual)
    # Second pass: the result must give the same rows when iterated again.
    ieq(expect, actual)


# TODO test sub()

78 changes: 78 additions & 0 deletions petl/transform/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,3 +380,81 @@ def searchcomplement(table, *args, **kwargs):


Table.searchcomplement = searchcomplement


def splitdown(table, field, pattern, maxsplit=0, flags=0):
    """
    Split a field into multiple rows using a regular expression. E.g.:

        >>> import petl as etl
        >>> table1 = [['name', 'roles'],
        ...           ['Jane Doe', 'president,engineer,tailor,lawyer'],
        ...           ['John Doe', 'rocket scientist,optometrist,chef,knight,sailor']]
        >>> table2 = etl.splitdown(table1, 'roles', ',')
        >>> table2.lookall()
        +------------+--------------------+
        | name       | roles              |
        +============+====================+
        | 'Jane Doe' | 'president'        |
        +------------+--------------------+
        | 'Jane Doe' | 'engineer'         |
        +------------+--------------------+
        | 'Jane Doe' | 'tailor'           |
        +------------+--------------------+
        | 'Jane Doe' | 'lawyer'           |
        +------------+--------------------+
        | 'John Doe' | 'rocket scientist' |
        +------------+--------------------+
        | 'John Doe' | 'optometrist'      |
        +------------+--------------------+
        | 'John Doe' | 'chef'             |
        +------------+--------------------+
        | 'John Doe' | 'knight'           |
        +------------+--------------------+
        | 'John Doe' | 'sailor'           |
        +------------+--------------------+

    The `field` argument is either a field name or a field index. The
    `pattern` and `flags` arguments are used to compile the regular
    expression, and `maxsplit` is passed to its ``split()`` method (the
    default of 0 places no limit on the number of splits).

    Every piece produced by the split becomes its own output row, with the
    other field values copied from the input row.

    """

    return SplitDownView(table, field, pattern, maxsplit, flags)


# Also attach splitdown to Table, so it can be called as table.splitdown(...).
Table.splitdown = splitdown


class SplitDownView(Table):
    """
    Table view that splits one field of each row into several rows.

    The constructor only records its arguments; each call to ``__iter__``
    builds a fresh row iterator from them via ``itersplitdown``.
    """

    def __init__(self, table, field, pattern, maxsplit=0, flags=0):
        # Source table and the field (name or index) whose values are split.
        self.table, self.field = table, field
        # Regular expression settings used for the split.
        self.pattern, self.maxsplit, self.flags = pattern, maxsplit, flags

    def __iter__(self):
        split_args = (
            self.table,
            self.field,
            self.pattern,
            self.maxsplit,
            self.flags,
        )
        return itersplitdown(*split_args)


def itersplitdown(table, field, pattern, maxsplit, flags):
    """
    Yield the header of `table`, then one row per piece of the split field.

    :param table: iterable of rows whose first row is the header
    :param field: field name, or integer index into the header; negative
        indexes count from the end of the header, as with sequence indexing
    :param pattern: regular expression to split on, compiled with `flags`
    :param maxsplit: maximum number of splits per value (0 means no limit)
    :param flags: flags passed to ``re.compile``
    :raises ArgumentError: if `field` is neither a header field name nor an
        in-range index

    An empty table (no header row) yields nothing.
    """

    prog = re.compile(pattern, flags)
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # No header row at all. Return explicitly: a StopIteration escaping
        # from a generator body is turned into RuntimeError on Python 3.7+.
        return
    flds = list(map(text_type, hdr))
    width = len(hdr)

    if isinstance(field, int) and -width <= field < width:
        # Normalise negative indexes so the positional comparison below can
        # match; otherwise the field would never be replaced.
        field_index = field % width
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or index')

    yield tuple(hdr)

    # Column positions are the same for every row, so build them once.
    columns = range(width)
    for row in it:
        value = row[field_index]
        for v in prog.split(value, maxsplit):
            yield tuple(v if i == field_index else row[i] for i in columns)

0 comments on commit d819ceb

Please sign in to comment.