diff --git a/flanker/addresslib/plugins/_tokenizer.py b/flanker/addresslib/plugins/_tokenizer.py index 625a7b42..b2f562d8 100644 --- a/flanker/addresslib/plugins/_tokenizer.py +++ b/flanker/addresslib/plugins/_tokenizer.py @@ -8,71 +8,8 @@ """ import re - import six -LBRACKET = '<' -AT_SYMBOL = '@' -RBRACKET = '>' -DQUOTE = '"' - -BAD_DOMAIN = re.compile(r''' # start or end - ^-|-$ # with - - ''', re.MULTILINE | re.VERBOSE) - -DELIMITER = re.compile(r''' - [,;][,;\s]* # delimiter - ''', re.MULTILINE | re.VERBOSE) - -WHITESPACE = re.compile(r''' - (\ |\t)+ # whitespace - ''', re.MULTILINE | re.VERBOSE) - -UNI_WHITE = re.compile(u''' - [ - \u0020\u00a0\u1680\u180e - \u2000-\u200a - \u2028\u202f\u205f\u3000 - ]* - ''', re.MULTILINE | re.VERBOSE | re.UNICODE) - -RELAX_ATOM = re.compile(r''' - ([^\s<>;,"]+) - ''', re.MULTILINE | re.VERBOSE) - -ATOM = re.compile(r''' - [A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+ # atext - ''', re.MULTILINE | re.VERBOSE) - -DOT_ATOM = re.compile(r''' - [A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+ # atext - (\.[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+)* # (dot atext)* - ''', re.MULTILINE | re.VERBOSE) - -UNI_ATOM = re.compile(r''' - ([^\s<>;,"]+) - ''', re.MULTILINE | re.VERBOSE | re.UNICODE) - -UNI_QSTR = re.compile(r''' - " - (?P([^"]+)) - " - ''', re.MULTILINE | re.VERBOSE | re.UNICODE) - -QSTRING = re.compile(r''' - " # dquote - (\s* # whitespace - ([\x21\x23-\x5b\x5d-\x7e] # qtext - | # or - \\[\x21-\x7e\t\ ]))* # quoted-pair - \s* # whitespace - " # dquote - ''', re.MULTILINE | re.VERBOSE) - -URL = re.compile(r''' - (?:http|https):// - [^\s<>{}|\^~\[\]`;,]+ - ''', re.MULTILINE | re.VERBOSE | re.UNICODE) class TokenStream(object): """ diff --git a/flanker/addresslib/plugins/aol.py b/flanker/addresslib/plugins/aol.py index 8ab1774c..62aa0642 100644 --- a/flanker/addresslib/plugins/aol.py +++ b/flanker/addresslib/plugins/aol.py @@ -20,26 +20,30 @@ ''' import re from flanker.addresslib.plugins._tokenizer import TokenStream +from flanker.addresslib._parser.lexer import 
_UNICODE_CHAR ALPHA = re.compile(r''' - [A-Za-z]+ - ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) NUMERIC = re.compile(r''' - [0-9]+ - ''', re.MULTILINE | re.VERBOSE) + ( [0-9] + )+ + ''', + re.MULTILINE | re.VERBOSE) ALPHANUM = re.compile(r''' - [A-Za-z0-9]+ - ''', re.MULTILINE | re.VERBOSE) - -DOT = re.compile(r''' - \. - ''', re.MULTILINE | re.VERBOSE) - -UNDERSCORE = re.compile(r''' - \_ - ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z0-9] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) + +DOT = re.compile(r'\.', re.MULTILINE | re.VERBOSE) +UNDERSCORE = re.compile(r'\_', re.MULTILINE | re.VERBOSE) AOL_UNMANAGED = ['verizon.net'] diff --git a/flanker/addresslib/plugins/gmail.py b/flanker/addresslib/plugins/gmail.py index c6dc61eb..1b5cfef1 100644 --- a/flanker/addresslib/plugins/gmail.py +++ b/flanker/addresslib/plugins/gmail.py @@ -29,23 +29,19 @@ ''' import re from flanker.addresslib.plugins._tokenizer import TokenStream -from flanker.addresslib.plugins._tokenizer import ATOM +from flanker.addresslib._parser.lexer import t_ATOM, _UNICODE_CHAR - -GMAIL_BASE = re.compile(r''' - [A-Za-z0-9\.]+ - ''', re.MULTILINE | re.VERBOSE) +ATOM = re.compile(t_ATOM, re.MULTILINE | re.VERBOSE) ALPHANUM = re.compile(r''' - [A-Za-z0-9]+ - ''', re.MULTILINE | re.VERBOSE) - -PLUS = re.compile(r''' - [\+] - ''', re.MULTILINE | re.VERBOSE) -DOT = re.compile(r''' - [\.] 
- ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z0-9] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) + +PLUS = re.compile(r'\+', re.MULTILINE | re.VERBOSE) +DOT = re.compile(r'\.', re.MULTILINE | re.VERBOSE) def validate(email_addr): diff --git a/flanker/addresslib/plugins/google.py b/flanker/addresslib/plugins/google.py index 310bce17..c862eb61 100644 --- a/flanker/addresslib/plugins/google.py +++ b/flanker/addresslib/plugins/google.py @@ -29,36 +29,33 @@ ''' import re from flanker.addresslib.plugins._tokenizer import TokenStream -from flanker.addresslib.plugins._tokenizer import ATOM +from flanker.addresslib._parser.lexer import t_ATOM, _UNICODE_CHAR +ATOM = re.compile(t_ATOM, re.MULTILINE | re.VERBOSE) -GOOGLE_BASE = re.compile(r''' - [A-Za-z0-9_\-'\.]+ - ''', re.MULTILINE | re.VERBOSE) +GOOGLE_BASE = re.compile(r''' + ( [A-Za-z0-9_\-'\.] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) ALPHANUM = re.compile(r''' - [A-Za-z0-9]+ - ''', re.MULTILINE | re.VERBOSE) - -UNDERSCORE = re.compile(r''' - [_]+ - ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z0-9] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) APOSTROPHES = re.compile(r''' - [']+ - ''', re.MULTILINE | re.VERBOSE) - -DASH = re.compile(r''' - [-]+ - ''', re.MULTILINE | re.VERBOSE) - -DOTS = re.compile(r''' - [.]+ - ''', re.MULTILINE | re.VERBOSE) - -PLUS = re.compile(r''' - [\+]+ - ''', re.MULTILINE | re.VERBOSE) + \' + ''', + re.MULTILINE | re.VERBOSE) + +UNDERSCORE = re.compile(r'\_', re.MULTILINE | re.VERBOSE) +DASH = re.compile(r'\-', re.MULTILINE | re.VERBOSE) +DOTS = re.compile(r'\.', re.MULTILINE | re.VERBOSE) +PLUS = re.compile(r'\+', re.MULTILINE | re.VERBOSE) def validate(email_addr): diff --git a/flanker/addresslib/plugins/hotmail.py b/flanker/addresslib/plugins/hotmail.py index cb9aab80..4377ced9 100644 --- a/flanker/addresslib/plugins/hotmail.py +++ 
b/flanker/addresslib/plugins/hotmail.py @@ -31,26 +31,30 @@ ''' import re from flanker.addresslib.plugins._tokenizer import TokenStream +from flanker.addresslib._parser.lexer import _UNICODE_CHAR HOTMAIL_PREFIX = re.compile(r''' - [A-Za-z0-9]+ - ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z0-9] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) HOTMAIL_BASE = re.compile(r''' - [A-Za-z0-9\.\-\_]+ - ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z0-9\.\-\_] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) HOTMAIL_SUFFIX = re.compile(r''' - [A-Za-z0-9\-\_]+ - ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z0-9\-\_] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) -PLUS = re.compile(r''' - \+ - ''', re.MULTILINE | re.VERBOSE) - -PERIODS = re.compile(r''' - \.{2,} - ''', re.MULTILINE | re.VERBOSE) +PLUS = re.compile(r'\+', re.MULTILINE | re.VERBOSE) def validate(email_addr): @@ -82,10 +86,6 @@ def validate(email_addr): if localpart.count('+') > 1: return False - # no consecutive periods (..) 
- if PERIODS.search(localpart): - return False - # grammar check retval = _validate(real_localpart) return retval diff --git a/flanker/addresslib/plugins/icloud.py b/flanker/addresslib/plugins/icloud.py index bfcd570b..176c4426 100644 --- a/flanker/addresslib/plugins/icloud.py +++ b/flanker/addresslib/plugins/icloud.py @@ -34,31 +34,32 @@ ''' import re from flanker.addresslib.plugins._tokenizer import TokenStream +from flanker.addresslib._parser.lexer import _UNICODE_CHAR -ALPHA = re.compile(r''' - [A-Za-z]+ - ''', re.MULTILINE | re.VERBOSE) +ALPHA = re.compile(r''' + ( [A-Za-z] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) ALPHANUM = re.compile(r''' - [A-Za-z0-9]+ - ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z0-9] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) -ICLOUD_PREFIX = re.compile(r''' - [A-Za-z]+ - ''', re.MULTILINE | re.VERBOSE) - ICLOUD_BASE = re.compile(r''' - [A-Za-z0-9\+]+ - ''', re.MULTILINE | re.VERBOSE) - -DOT = re.compile(r''' - \. 
- ''', re.MULTILINE | re.VERBOSE) + ( [A-Za-z0-9\+] + | {unicode_char} + )+ + '''.format(unicode_char=_UNICODE_CHAR), + re.MULTILINE | re.VERBOSE) -UNDERSCORE = re.compile(r''' - \_ - ''', re.MULTILINE | re.VERBOSE) +DOT = re.compile(r'\.', re.MULTILINE | re.VERBOSE) +UNDERSCORE = re.compile(r'\_', re.MULTILINE | re.VERBOSE) def validate(email_addr): @@ -97,7 +98,7 @@ def _validate(localpart): stream = TokenStream(localpart) # localpart must start with alpha - alpa = stream.get_token(ICLOUD_PREFIX) + alpa = stream.get_token(ALPHA) if alpa is None: return False diff --git a/flanker/addresslib/plugins/yahoo.py b/flanker/addresslib/plugins/yahoo.py index f77c73f9..1b89f101 100644 --- a/flanker/addresslib/plugins/yahoo.py +++ b/flanker/addresslib/plugins/yahoo.py @@ -43,30 +43,28 @@ import re from flanker.addresslib.plugins._tokenizer import TokenStream +from flanker.addresslib._parser.lexer import _UNICODE_CHAR ALPHA = re.compile(r''' - [A-Za-z]+ + ( [A-Za-z] + | {unicode_char} + )+ - ''', re.MULTILINE | re.VERBOSE) + '''.format(unicode_char=_UNICODE_CHAR), re.MULTILINE | re.VERBOSE) NUMERIC = re.compile(r''' - [0-9]+ + ( [0-9] + )+ ''', re.MULTILINE | re.VERBOSE) ALPHANUM = re.compile(r''' - [A-Za-z0-9]+ + ( [A-Za-z0-9] + | {unicode_char} + )+ - ''', re.MULTILINE | re.VERBOSE) + '''.format(unicode_char=_UNICODE_CHAR), re.MULTILINE | re.VERBOSE) -DOT = re.compile(r''' - \. - ''', re.MULTILINE | re.VERBOSE) - -UNDERSCORE = re.compile(r''' - \_ - ''', re.MULTILINE | re.VERBOSE) - -HYPHEN = re.compile(r''' - \- - ''', re.MULTILINE | re.VERBOSE) +DOT = re.compile(r'\.', re.MULTILINE | re.VERBOSE) +UNDERSCORE = re.compile(r'\_', re.MULTILINE | re.VERBOSE) +HYPHEN = re.compile(r'\-', re.MULTILINE | re.VERBOSE) YAHOO_MANAGED = ['yahoo.com', 'ymail.com', 'rocketmail.com']