diff --git a/flanker/addresslib/address.py b/flanker/addresslib/address.py index 1d092011..76c8c695 100644 --- a/flanker/addresslib/address.py +++ b/flanker/addresslib/address.py @@ -56,11 +56,15 @@ @metrics_wrapper() -def parse(address, addr_spec_only=False, metrics=False): +def parse(address, addr_spec_only=False, strict=False, metrics=False): """ Given a string, returns a scalar object representing a single full mailbox (display name and addr-spec), addr-spec, or a url. + If parsing the entire string fails and strict is not set to True, fall back + to trying to parse the last word only and assume everything else is the + display name. + Returns an Address object and optionally metrics on processing time if requested. @@ -102,9 +106,33 @@ def parse(address, addr_spec_only=False, metrics=False): retval = _lift_parser_result(parser.parse(address.strip(), lexer=lexer.clone())) mtimes['parsing'] = time() - bstart except (LexError, YaccError, SyntaxError): + retval = None + mtimes['parsing'] = time() - bstart + + if retval is None and not strict: + try: + bstart = time() + + addr_parts = address.split(' ') + addr_spec = addr_parts[-1] + display_name = ' '.join(addr_parts[0:-1]) + + retval = _lift_parser_result(parser.parse(addr_spec, lexer=lexer.clone())) + retval._display_name = display_name + if isinstance(retval._display_name, str): + retval._display_name = retval._display_name.decode('utf-8') + + mtimes['parsing'] += time() - bstart + + log.warning('Relaxed parsing matched address: %s', + address.decode('utf-8', 'replace')) + except (LexError, YaccError, SyntaxError): + retval = None + mtimes['parsing'] += time() - bstart + + if retval is None: log.warning('Failed to parse address: %s', address.decode('utf-8', 'replace')) - return None, mtimes return retval, mtimes @@ -163,6 +191,10 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False): delimiter (comma (,) or semi-colon (;)), returns an AddressList object (an iterable list representing parsed email addresses and urls). + Given a list of email addresses, the strict parameter is passed to the + parse call for each element. Given a string the strict parameter is + ignored. + The parser can return a list of parsed addresses or a tuple containing the parsed and unparsed portions. The parser also returns the parsing time metrics if requested. @@ -183,9 +215,6 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False): >>> address.parse_list('A , D , http://localhost') [A , D , http://localhost] """ - if strict: - log.warning('strict parsing has been removed, ignoring') - mtimes = {'parsing': 0} if not address_list: @@ -197,7 +226,7 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False): parsed, unparsed = AddressList(), [] for address in address_list: if isinstance(address, basestring): - retval, metrics = parse(address, metrics=True) + retval, metrics = parse(address, strict=strict, metrics=True) mtimes['parsing'] += metrics['parsing'] if retval: parsed.append(retval) @@ -214,6 +243,8 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False): log.warning('address list exceeds maximum length of %s', MAX_ADDRESS_LIST_LENGTH) parsed, unparsed = AddressList(), [address_list] elif isinstance(address_list, basestring): + if not strict: + log.info('relaxed parsing is not available for discrete lists, ignoring') retval, metrics = parse_discrete_list(address_list, metrics=True) mtimes['parsing'] += metrics['parsing'] if retval: @@ -264,7 +295,7 @@ def validate_address(addr_spec, metrics=False): # run parser against address bstart = time() - paddr = parse('@'.join(addr_parts), addr_spec_only=True) + paddr = parse('@'.join(addr_parts), addr_spec_only=True, strict=True) mtimes['parsing'] = time() - bstart if paddr is None: log.warning('failed parse check for %s', addr_spec) @@ -318,7 +349,7 @@ def validate_list(addr_list, as_tuple=False, metrics=False): # parse addresses bstart = time() - parsed_addresses, unparseable = parse_list(addr_list, as_tuple=True) + parsed_addresses, unparseable = parse_list(addr_list, strict=True, as_tuple=True) mtimes['parsing'] = time() - bstart plist = AddressList() diff --git a/setup.py b/setup.py index 7d843850..2b4337e7 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup(name='flanker', - version='0.6.11', + version='0.6.12', description='Mailgun Parsing Tools', long_description=open('README.rst').read(), classifiers=[], diff --git a/tests/addresslib/address_test.py b/tests/addresslib/address_test.py index 8950df15..3f501e57 100644 --- a/tests/addresslib/address_test.py +++ b/tests/addresslib/address_test.py @@ -258,3 +258,19 @@ def test_requires_non_ascii(): def test_contains_domain_literal(): eq_(EmailAddress(None, 'foo@bar.com').contains_domain_literal(), False) eq_(EmailAddress(None, 'foo@[1.2.3.4]').contains_domain_literal(), True) + + +def test_parse_relaxed(): + eq_(u'foo ', parse('foo ').to_unicode()) + eq_(u'foo ', parse('foo foo@bar.com').to_unicode()) + eq_(u'foo ', parse('foo (comment) ').to_unicode()) + eq_(u'"foo (comment)" ', parse('foo (comment) foo@bar.com').to_unicode()) + eq_(u'"not@valid" ', parse('not@valid ').to_unicode()) + eq_(u'"not@valid" ', parse('not@valid foo@bar.com').to_unicode()) + eq_(u'Маруся <мария@example.com>', parse('Маруся мария@example.com').to_unicode()) + + +def test_parse_list_relaxed(): + addr_list = ['foo ', 'foo foo@bar.com', 'not@valid '] + expected = ['foo ', 'foo ', '"not@valid" '] + eq_(expected, [addr.to_unicode() for addr in parse_list(addr_list)]) diff --git a/tests/addresslib/external_dataset_test.py b/tests/addresslib/external_dataset_test.py index e87d888b..ba242895 100644 --- a/tests/addresslib/external_dataset_test.py +++ b/tests/addresslib/external_dataset_test.py @@ -22,7 +22,7 @@ def test_mailbox_valid_set(): if match: continue - mbox = address.parse(line) + mbox = address.parse(line, strict=True) assert_not_equal(mbox, None) def test_mailbox_invalid_set(): @@ -37,7 +37,7 @@ def test_mailbox_invalid_set(): if match: continue - mbox = address.parse(line) + mbox = address.parse(line, strict=True) assert_equal(mbox, None) def test_url_valid_set(): @@ -52,7 +52,7 @@ def test_url_valid_set(): if match: continue - mbox = address.parse(line) + mbox = address.parse(line, strict=True) assert_not_equal(mbox, None) def test_url_invalid_set(): @@ -67,5 +67,5 @@ def test_url_invalid_set(): if match: continue - mbox = address.parse(line) + mbox = address.parse(line, strict=True) assert_equal(mbox, None) diff --git a/tests/addresslib/parser_address_list_test.py b/tests/addresslib/parser_address_list_test.py index d457c772..abf5f9c9 100644 --- a/tests/addresslib/parser_address_list_test.py +++ b/tests/addresslib/parser_address_list_test.py @@ -16,7 +16,7 @@ def powerset(iterable): @nottest def run_test(string, expected_mlist): - mlist = parse_list(string) + mlist = parse_list(string, strict=True) assert_equal(mlist, expected_mlist) @@ -36,7 +36,7 @@ def test_sanity(): def test_simple_valid(): s = '''http://foo.com:8080; "Ev K." , "Alex K" , "Tom, S" <"tom+[a]"@s.com>''' - addrs = parse_list(s) + addrs = parse_list(s, strict=True) assert_equal(4, len(addrs)) @@ -61,7 +61,7 @@ def test_simple_valid(): s = '''"Allan G\'o" , "Os Wi" ''' - addrs = parse_list(s) + addrs = parse_list(s, strict=True) assert_equal(2, len(addrs)) @@ -77,7 +77,7 @@ def test_simple_valid(): s = u'''I am also A , Zeka ;Gonzalo Bañuelos''' - addrs = parse_list(s) + addrs = parse_list(s, strict=True) assert_equal(3, len(addrs)) @@ -98,7 +98,7 @@ def test_simple_valid(): s = r'''"Escaped" <"\e\s\c\a\p\e\d"@sld.com>; http://userid:password@example.com:8080, "Dmitry" ''' - addrs = parse_list(s) + addrs = parse_list(s, strict=True) assert_equal(3, len(addrs)) @@ -118,7 +118,7 @@ def test_simple_valid(): s = "http://foo.com/blah_blah_(wikipedia)" - addrs = parse_list(s) + addrs = parse_list(s, strict=True) assert_equal(1, len(addrs)) @@ -128,7 +128,7 @@ def test_simple_valid(): s = "Sasha Klizhentas " - addrs = parse_list(s) + addrs = parse_list(s, strict=True) assert_equal(1, len(addrs)) @@ -139,7 +139,7 @@ def test_simple_valid(): s = "admin@mailgunhq.com,lift@example.com" - addrs = parse_list(s) + addrs = parse_list(s, strict=True) assert_equal(2, len(addrs)) @@ -156,13 +156,13 @@ def test_simple_valid(): def test_simple_invalid(): s = '''httd://foo.com:8080\r\n; "Ev K." \n "Alex K" alex@ , "Tom, S" "tom+[" a]"@s.com''' - assert_equal(AddressList(), parse_list(s)) + assert_equal(AddressList(), parse_list(s, strict=True)) s = "" - assert_equal(AddressList(), parse_list(s)) + assert_equal(AddressList(), parse_list(s, strict=True)) s = "crap" - assert_equal(AddressList(), parse_list(s)) + assert_equal(AddressList(), parse_list(s, strict=True)) def test_endpoints(): diff --git a/tests/addresslib/parser_mailbox_test.py b/tests/addresslib/parser_mailbox_test.py index 12400a11..78ea6805 100644 --- a/tests/addresslib/parser_mailbox_test.py +++ b/tests/addresslib/parser_mailbox_test.py @@ -22,19 +22,19 @@ def chunks(l, n): @nottest def run_full_mailbox_test(string, expected, full_spec=None): - mbox = address.parse(string) + mbox = address.parse(string, strict=True) if mbox: assert_equal(expected.display_name, mbox.display_name) assert_equal(expected.address, mbox.address) if full_spec: assert_equal(full_spec, mbox.full_spec()) - assert_equal(mbox, address.parse(mbox.to_unicode())) # check symmetry + assert_equal(mbox, address.parse(mbox.to_unicode(), strict=True)) # check symmetry return assert_equal(expected, mbox) @nottest def run_mailbox_test(string, expected_string): - mbox = address.parse(string) + mbox = address.parse(string, strict=True) if mbox: assert_equal(expected_string, mbox.address) return