Skip to content

Commit

Permalink
Merge pull request #157 from mailgun/brendan/fallback_last_word
Browse files Browse the repository at this point in the history
Optionally fallback to parsing last word of addresses
  • Loading branch information
b0d0nne11 committed May 10, 2017
2 parents 89b911d + d5a2233 commit 66507ba
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 27 deletions.
47 changes: 39 additions & 8 deletions flanker/addresslib/address.py
Expand Up @@ -56,11 +56,15 @@


@metrics_wrapper()
def parse(address, addr_spec_only=False, metrics=False):
def parse(address, addr_spec_only=False, strict=False, metrics=False):
"""
Given a string, returns a scalar object representing a single full
mailbox (display name and addr-spec), addr-spec, or a url.
If parsing the entire string fails and strict is not set to True, fall back
to trying to parse the last word only and assume everything else is the
display name.
Returns an Address object and optionally metrics on processing
time if requested.
Expand Down Expand Up @@ -102,9 +106,33 @@ def parse(address, addr_spec_only=False, metrics=False):
retval = _lift_parser_result(parser.parse(address.strip(), lexer=lexer.clone()))
mtimes['parsing'] = time() - bstart
except (LexError, YaccError, SyntaxError):
retval = None
mtimes['parsing'] = time() - bstart

if retval is None and not strict:
try:
bstart = time()

addr_parts = address.split(' ')
addr_spec = addr_parts[-1]
display_name = ' '.join(addr_parts[0:-1])

retval = _lift_parser_result(parser.parse(addr_spec, lexer=lexer.clone()))
retval._display_name = display_name
if isinstance(retval._display_name, str):
retval._display_name = retval._display_name.decode('utf-8')

mtimes['parsing'] += time() - bstart

log.warning('Relaxed parsing matched address: %s',
address.decode('utf-8', 'replace'))
except (LexError, YaccError, SyntaxError):
retval = None
mtimes['parsing'] += time() - bstart

if retval is None:
log.warning('Failed to parse address: %s',
address.decode('utf-8', 'replace'))
return None, mtimes

return retval, mtimes

Expand Down Expand Up @@ -163,6 +191,10 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False):
delimiter (comma (,) or semi-colon (;)), returns an AddressList object
(an iterable list representing parsed email addresses and urls).
Given a list of email addresses, the strict parameter is passed to the
parse call for each element. Given a string, the strict parameter is
ignored.
The parser can return a list of parsed addresses or a tuple containing
the parsed and unparsed portions. The parser also returns the parsing
time metrics if requested.
Expand All @@ -183,9 +215,6 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False):
>>> address.parse_list('A <a@b>, D <d@e>, http://localhost')
[A <a@b>, D <d@e>, http://localhost]
"""
if strict:
log.warning('strict parsing has been removed, ignoring')

mtimes = {'parsing': 0}

if not address_list:
Expand All @@ -197,7 +226,7 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False):
parsed, unparsed = AddressList(), []
for address in address_list:
if isinstance(address, basestring):
retval, metrics = parse(address, metrics=True)
retval, metrics = parse(address, strict=strict, metrics=True)
mtimes['parsing'] += metrics['parsing']
if retval:
parsed.append(retval)
Expand All @@ -214,6 +243,8 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False):
log.warning('address list exceeds maximum length of %s', MAX_ADDRESS_LIST_LENGTH)
parsed, unparsed = AddressList(), [address_list]
elif isinstance(address_list, basestring):
if not strict:
log.info('relaxed parsing is not available for discrete lists, ignoring')
retval, metrics = parse_discrete_list(address_list, metrics=True)
mtimes['parsing'] += metrics['parsing']
if retval:
Expand Down Expand Up @@ -264,7 +295,7 @@ def validate_address(addr_spec, metrics=False):

# run parser against address
bstart = time()
paddr = parse('@'.join(addr_parts), addr_spec_only=True)
paddr = parse('@'.join(addr_parts), addr_spec_only=True, strict=True)
mtimes['parsing'] = time() - bstart
if paddr is None:
log.warning('failed parse check for %s', addr_spec)
Expand Down Expand Up @@ -318,7 +349,7 @@ def validate_list(addr_list, as_tuple=False, metrics=False):

# parse addresses
bstart = time()
parsed_addresses, unparseable = parse_list(addr_list, as_tuple=True)
parsed_addresses, unparseable = parse_list(addr_list, strict=True, as_tuple=True)
mtimes['parsing'] = time() - bstart

plist = AddressList()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -4,7 +4,7 @@


setup(name='flanker',
version='0.6.11',
version='0.6.12',
description='Mailgun Parsing Tools',
long_description=open('README.rst').read(),
classifiers=[],
Expand Down
16 changes: 16 additions & 0 deletions tests/addresslib/address_test.py
Expand Up @@ -258,3 +258,19 @@ def test_requires_non_ascii():
def test_contains_domain_literal():
    """
    contains_domain_literal() is False for a plain host name and True for
    a bracketed domain such as [1.2.3.4].
    """
    cases = (
        ('foo@bar.com', False),
        ('foo@[1.2.3.4]', True),
    )
    for addr_spec, is_literal in cases:
        eq_(EmailAddress(None, addr_spec).contains_domain_literal(), is_literal)


def test_parse_relaxed():
    """
    Check parse() with its default (non-strict) settings on single addresses.

    Each entry pairs the expected unicode rendering with the raw input.
    Angle-bracket inputs are listed next to bare "words addr-spec" inputs
    so the renderings of both forms can be compared side by side.
    """
    expectations = [
        (u'foo <foo@bar.com>', 'foo <foo@bar.com>'),
        (u'foo <foo@bar.com>', 'foo foo@bar.com'),
        (u'foo <foo@bar.com>', 'foo (comment) <foo@bar.com>'),
        (u'"foo (comment)" <foo@bar.com>', 'foo (comment) foo@bar.com'),
        (u'"not@valid" <foo@bar.com>', 'not@valid <foo@bar.com>'),
        (u'"not@valid" <foo@bar.com>', 'not@valid foo@bar.com'),
        (u'Маруся <мария@example.com>', 'Маруся мария@example.com'),
    ]
    for rendered, raw in expectations:
        eq_(rendered, parse(raw).to_unicode())


def test_parse_list_relaxed():
    """
    Check parse_list() on a list of strings with strict left at its default:
    every element must come back with the expected unicode rendering.
    """
    raw_addresses = [
        'foo <foo@bar.com>',
        'foo foo@bar.com',
        'not@valid <foo@bar.com>',
    ]
    wanted = [
        'foo <foo@bar.com>',
        'foo <foo@bar.com>',
        '"not@valid" <foo@bar.com>',
    ]
    rendered = []
    for parsed in parse_list(raw_addresses):
        rendered.append(parsed.to_unicode())
    eq_(wanted, rendered)
8 changes: 4 additions & 4 deletions tests/addresslib/external_dataset_test.py
Expand Up @@ -22,7 +22,7 @@ def test_mailbox_valid_set():
if match:
continue

mbox = address.parse(line)
mbox = address.parse(line, strict=True)
assert_not_equal(mbox, None)

def test_mailbox_invalid_set():
Expand All @@ -37,7 +37,7 @@ def test_mailbox_invalid_set():
if match:
continue

mbox = address.parse(line)
mbox = address.parse(line, strict=True)
assert_equal(mbox, None)

def test_url_valid_set():
Expand All @@ -52,7 +52,7 @@ def test_url_valid_set():
if match:
continue

mbox = address.parse(line)
mbox = address.parse(line, strict=True)
assert_not_equal(mbox, None)

def test_url_invalid_set():
Expand All @@ -67,5 +67,5 @@ def test_url_invalid_set():
if match:
continue

mbox = address.parse(line)
mbox = address.parse(line, strict=True)
assert_equal(mbox, None)
22 changes: 11 additions & 11 deletions tests/addresslib/parser_address_list_test.py
Expand Up @@ -16,7 +16,7 @@ def powerset(iterable):

@nottest
def run_test(string, expected_mlist):
mlist = parse_list(string)
mlist = parse_list(string, strict=True)
assert_equal(mlist, expected_mlist)


Expand All @@ -36,7 +36,7 @@ def test_sanity():

def test_simple_valid():
s = '''http://foo.com:8080; "Ev K." <ev@host.com>, "Alex K" <alex@yahoo.net>, "Tom, S" <"tom+[a]"@s.com>'''
addrs = parse_list(s)
addrs = parse_list(s, strict=True)

assert_equal(4, len(addrs))

Expand All @@ -61,7 +61,7 @@ def test_simple_valid():


s = '''"Allan G\'o" <allan@example.com>, "Os Wi" <oswi@example.com>'''
addrs = parse_list(s)
addrs = parse_list(s, strict=True)

assert_equal(2, len(addrs))

Expand All @@ -77,7 +77,7 @@ def test_simple_valid():


s = u'''I am also A <a@HOST.com>, Zeka <EV@host.coM> ;Gonzalo Bañuelos<gonz@host.com>'''
addrs = parse_list(s)
addrs = parse_list(s, strict=True)

assert_equal(3, len(addrs))

Expand All @@ -98,7 +98,7 @@ def test_simple_valid():


s = r'''"Escaped" <"\e\s\c\a\p\e\d"@sld.com>; http://userid:password@example.com:8080, "Dmitry" <my|'`!#_~%$&{}?^+-*@host.com>'''
addrs = parse_list(s)
addrs = parse_list(s, strict=True)

assert_equal(3, len(addrs))

Expand All @@ -118,7 +118,7 @@ def test_simple_valid():


s = "http://foo.com/blah_blah_(wikipedia)"
addrs = parse_list(s)
addrs = parse_list(s, strict=True)

assert_equal(1, len(addrs))

Expand All @@ -128,7 +128,7 @@ def test_simple_valid():


s = "Sasha Klizhentas <klizhentas@gmail.com>"
addrs = parse_list(s)
addrs = parse_list(s, strict=True)

assert_equal(1, len(addrs))

Expand All @@ -139,7 +139,7 @@ def test_simple_valid():


s = "admin@mailgunhq.com,lift@example.com"
addrs = parse_list(s)
addrs = parse_list(s, strict=True)

assert_equal(2, len(addrs))

Expand All @@ -156,13 +156,13 @@ def test_simple_valid():

def test_simple_invalid():
s = '''httd://foo.com:8080\r\n; "Ev K." <ev@ host.com>\n "Alex K" alex@ , "Tom, S" "tom+[" a]"@s.com'''
assert_equal(AddressList(), parse_list(s))
assert_equal(AddressList(), parse_list(s, strict=True))

s = ""
assert_equal(AddressList(), parse_list(s))
assert_equal(AddressList(), parse_list(s, strict=True))

s = "crap"
assert_equal(AddressList(), parse_list(s))
assert_equal(AddressList(), parse_list(s, strict=True))


def test_endpoints():
Expand Down
6 changes: 3 additions & 3 deletions tests/addresslib/parser_mailbox_test.py
Expand Up @@ -22,19 +22,19 @@ def chunks(l, n):

@nottest
def run_full_mailbox_test(string, expected, full_spec=None):
mbox = address.parse(string)
mbox = address.parse(string, strict=True)
if mbox:
assert_equal(expected.display_name, mbox.display_name)
assert_equal(expected.address, mbox.address)
if full_spec:
assert_equal(full_spec, mbox.full_spec())
assert_equal(mbox, address.parse(mbox.to_unicode())) # check symmetry
assert_equal(mbox, address.parse(mbox.to_unicode(), strict=True)) # check symmetry
return
assert_equal(expected, mbox)

@nottest
def run_mailbox_test(string, expected_string):
mbox = address.parse(string)
mbox = address.parse(string, strict=True)
if mbox:
assert_equal(expected_string, mbox.address)
return
Expand Down

0 comments on commit 66507ba

Please sign in to comment.