Allow unicode characters in custom grammar checks
b0d0nne11 committed May 16, 2018
1 parent 3409e48 commit b4323ce
Showing 8 changed files with 104 additions and 190 deletions.
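
The net effect of the change set below is that the vendor-specific grammar plugins (aol, gmail, google, hotmail, icloud) stop rejecting non-ASCII characters in the local part. As a rough usage sketch of how these plugins are exercised — hedged, because address.validate_address also performs DNS/MX and mailbox checks that need network access, so actual results depend on the environment:

from flanker.addresslib import address

# validate_address runs the syntax parse, an MX lookup, and the
# provider-specific custom grammar check; it returns the parsed address
# on success or None on failure. The MX step needs network access, so
# treat this purely as an illustration.
print(address.validate_address(u'jose@gmail.com'))
print(address.validate_address(u'jos\u00e9@gmail.com'))  # unicode local part
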
84 changes: 0 additions & 84 deletions flanker/addresslib/plugins/_tokenizer.py
@@ -8,71 +8,8 @@
"""

import re

import six

LBRACKET = '<'
AT_SYMBOL = '@'
RBRACKET = '>'
DQUOTE = '"'

BAD_DOMAIN = re.compile(r''' # start or end
^-|-$ # with -
''', re.MULTILINE | re.VERBOSE)

DELIMITER = re.compile(r'''
[,;][,;\s]* # delimiter
''', re.MULTILINE | re.VERBOSE)

WHITESPACE = re.compile(r'''
(\ |\t)+ # whitespace
''', re.MULTILINE | re.VERBOSE)

UNI_WHITE = re.compile(u'''
[
\u0020\u00a0\u1680\u180e
\u2000-\u200a
\u2028\u202f\u205f\u3000
]*
''', re.MULTILINE | re.VERBOSE | re.UNICODE)

RELAX_ATOM = re.compile(r'''
([^\s<>;,"]+)
''', re.MULTILINE | re.VERBOSE)

ATOM = re.compile(r'''
[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+ # atext
''', re.MULTILINE | re.VERBOSE)

DOT_ATOM = re.compile(r'''
[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+ # atext
(\.[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+)* # (dot atext)*
''', re.MULTILINE | re.VERBOSE)

UNI_ATOM = re.compile(r'''
([^\s<>;,"]+)
''', re.MULTILINE | re.VERBOSE | re.UNICODE)

UNI_QSTR = re.compile(r'''
"
(?P<qstr>([^"]+))
"
''', re.MULTILINE | re.VERBOSE | re.UNICODE)

QSTRING = re.compile(r'''
" # dquote
(\s* # whitespace
([\x21\x23-\x5b\x5d-\x7e] # qtext
| # or
\\[\x21-\x7e\t\ ]))* # quoted-pair
\s* # whitespace
" # dquote
''', re.MULTILINE | re.VERBOSE)

URL = re.compile(r'''
(?:http|https)://
[^\s<>{}|\^~\[\]`;,]+
''', re.MULTILINE | re.VERBOSE | re.UNICODE)

class TokenStream(object):
"""
@@ -122,27 +59,6 @@ def end_of_stream(self):
return True
return False

def synchronize(self):
"""
Advances the stream to synchronize to the delimiter token. Used primarily
in relaxed mode parsing.
"""
start_pos = self.position
end_pos = len(self.stream)

match = DELIMITER.search(self.stream, self.position)
if match:
self.position = match.start()
end_pos = match.start()
else:
self.position = end_pos

skip = self.stream[start_pos:end_pos]
if skip.strip() == '':
return None

return skip

def peek(self, token=None):
"""
Peek at the stream to see what the next token is or peek for a
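
A side effect visible in the plugin diffs below: single-character tokens such as DOT and PLUS become plain strings instead of compiled regexes, which implies that TokenStream.get_token accepts either form. The real method is collapsed out of this hunk, so the following is only a sketch of that assumed dual behaviour, not flanker's actual implementation:

import re

class TokenStreamSketch(object):
    """Illustrative stand-in for flanker's TokenStream, not the real class."""

    def __init__(self, stream):
        self.stream = stream
        self.position = 0

    def get_token(self, token):
        # Compiled pattern: match at the current position and advance past it.
        if hasattr(token, 'match'):
            match = token.match(self.stream, self.position)
            if match is None:
                return None
            self.position = match.end()
            return match.group()
        # Literal string token such as '.', '+', or '_': compare and advance.
        if self.stream.startswith(token, self.position):
            self.position += len(token)
            return token
        return None

stream = TokenStreamSketch('foo.bar')
print(stream.get_token(re.compile(r'[a-z]+')))  # 'foo'
print(stream.get_token('.'))                    # '.'
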
32 changes: 18 additions & 14 deletions flanker/addresslib/plugins/aol.py
@@ -20,26 +20,30 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib._parser.lexer import _UNICODE_CHAR

ALPHA = re.compile(r'''
[A-Za-z]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

NUMERIC = re.compile(r'''
[0-9]+
''', re.MULTILINE | re.VERBOSE)
( [0-9]
)+
''',
re.MULTILINE | re.VERBOSE)

ALPHANUM = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)

DOT = re.compile(r'''
\.
''', re.MULTILINE | re.VERBOSE)

UNDERSCORE = re.compile(r'''
\_
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

DOT = '.'
UNDERSCORE = '_'

AOL_UNMANAGED = ['verizon.net']

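The recurring pattern in this commit is interpolating the lexer's _UNICODE_CHAR character-class fragment into each plugin's verbose regex via str.format. A small self-contained sketch of the technique, using a stand-in fragment since the exact value of _UNICODE_CHAR lives in flanker/addresslib/_parser/lexer.py and is not shown here:

import re

# Stand-in for flanker's _UNICODE_CHAR; the real fragment is defined in
# flanker.addresslib._parser.lexer and may differ from this assumption.
UNICODE_CHAR = u'[\u0080-\uffff]'

ALPHA = re.compile(u'''
    ( [A-Za-z]
    | {unicode_char}
    )+
    '''.format(unicode_char=UNICODE_CHAR),
    re.MULTILINE | re.VERBOSE | re.UNICODE)

print(bool(ALPHA.match(u'jose')))        # True: plain ASCII letters
print(bool(ALPHA.match(u'jos\u00e9')))   # True: the accented char now matches
print(bool(ALPHA.match(u'1234')))        # False: digits are not ALPHA
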
24 changes: 10 additions & 14 deletions flanker/addresslib/plugins/gmail.py
@@ -29,23 +29,19 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib.plugins._tokenizer import ATOM
from flanker.addresslib._parser.lexer import t_ATOM, _UNICODE_CHAR


GMAIL_BASE = re.compile(r'''
[A-Za-z0-9\.]+
''', re.MULTILINE | re.VERBOSE)
ATOM = re.compile(t_ATOM, re.MULTILINE | re.VERBOSE)

ALPHANUM = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)

PLUS = re.compile(r'''
[\+]
''', re.MULTILINE | re.VERBOSE)
DOT = re.compile(r'''
[\.]
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

PLUS = '+'
DOT = '.'


def validate(email_addr):
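
gmail.py now reuses the lexer's t_ATOM rule instead of keeping a private copy of the atom pattern. Assuming t_ATOM is exposed as a raw verbose-mode pattern string (the import above suggests that, but the lexer itself is not shown here), the reuse amounts to recompiling the shared fragment with the plugin's own flags:

import re

# Assumed stand-in for flanker.addresslib._parser.lexer.t_ATOM: a raw
# pattern string describing RFC 5322 atext; the real value may differ.
t_ATOM = r"[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+"

ATOM = re.compile(t_ATOM, re.MULTILINE | re.VERBOSE)

print(ATOM.match('first.last').group())   # 'first' (stops at the dot)
print(ATOM.match('"quoted"'))             # None: a dquote is not atext
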
54 changes: 26 additions & 28 deletions flanker/addresslib/plugins/google.py
@@ -29,36 +29,34 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib.plugins._tokenizer import ATOM
from flanker.addresslib._parser.lexer import t_ATOM, _UNICODE_CHAR

ATOM = re.compile(t_ATOM, re.MULTILINE | re.VERBOSE)

GOOGLE_BASE = re.compile(r'''
[A-Za-z0-9_\-'\.]+
''', re.MULTILINE | re.VERBOSE)
GOOGLE_BASE = re.compile(r'''
( [A-Za-z0-9_\-'\.]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

ALPHANUM = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

UNDERSCORE = re.compile(r'''
[_]+
''', re.MULTILINE | re.VERBOSE)
APOSTROPHE = re.compile(r'''
\'
''',
re.MULTILINE | re.VERBOSE)

APOSTROPHES = re.compile(r'''
[']+
''', re.MULTILINE | re.VERBOSE)
UNDERSCORE = re.compile(r'\_', re.MULTILINE | re.VERBOSE)
DASH = re.compile(r'\-', re.MULTILINE | re.VERBOSE)

DASH = re.compile(r'''
[-]+
''', re.MULTILINE | re.VERBOSE)

DOTS = re.compile(r'''
[.]+
''', re.MULTILINE | re.VERBOSE)

PLUS = re.compile(r'''
[\+]+
''', re.MULTILINE | re.VERBOSE)
DOTS = '.'
PLUS = '+'


def validate(email_addr):
@@ -80,21 +78,21 @@ def validate(email_addr):
# if only one character, must be alphanum, underscore (_), or apostrophe (')
if len(localpart) == 1 or l == 1:
if ALPHANUM.match(localpart) or UNDERSCORE.match(localpart) or \
APOSTROPHES.match(localpart):
APOSTROPHE.match(localpart):
return True
return False

# must start with: alphanum, underscore (_), dash (-), or apostrophes(')
# must start with: alphanum, underscore (_), dash (-), or apostrophe(')
if len(real_localpart) > 0:
if not ALPHANUM.match(real_localpart[0]) and not UNDERSCORE.match(real_localpart[0]) \
and not DASH.match(real_localpart[0]) and not APOSTROPHES.match(real_localpart[0]):
and not DASH.match(real_localpart[0]) and not APOSTROPHE.match(real_localpart[0]):
return False
else:
return False

# must end with: alphanum, underscore(_), dash(-), or apostrophes(')
# must end with: alphanum, underscore(_), dash(-), or apostrophe(')
if not ALPHANUM.match(real_localpart[-1]) and not UNDERSCORE.match(real_localpart[-1]) \
and not DASH.match(real_localpart[-1]) and not APOSTROPHES.match(real_localpart[-1]):
and not DASH.match(real_localpart[-1]) and not APOSTROPHE.match(real_localpart[-1]):
return False

# grammar check
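
The first/last-character rule above is easy to see in isolation: each check is a single-character .match() against ALPHANUM, UNDERSCORE, DASH, or APOSTROPHE. A small sketch under the same stand-in unicode fragment used earlier (the real grammar also checks the full GOOGLE_BASE token stream, which is not reproduced here):

import re

UNICODE_CHAR = u'[\u0080-\uffff]'  # assumed stand-in for _UNICODE_CHAR

ALPHANUM = re.compile(u'([A-Za-z0-9]|{0})+'.format(UNICODE_CHAR), re.UNICODE)
UNDERSCORE = re.compile(r'\_')
DASH = re.compile(r'\-')
APOSTROPHE = re.compile(r"\'")

def endpoints_ok(localpart):
    # First and last character must be alphanum, underscore, dash, or apostrophe.
    def ok(ch):
        return bool(ALPHANUM.match(ch) or UNDERSCORE.match(ch) or
                    DASH.match(ch) or APOSTROPHE.match(ch))
    return bool(localpart) and ok(localpart[0]) and ok(localpart[-1])

print(endpoints_ok(u"o'brien"))    # True
print(endpoints_ok(u'.dotted'))    # False: leading dot is rejected
print(endpoints_ok(u'jos\u00e9'))  # True: the unicode ending now passes
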
34 changes: 17 additions & 17 deletions flanker/addresslib/plugins/hotmail.py
@@ -31,26 +31,30 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib._parser.lexer import _UNICODE_CHAR

HOTMAIL_PREFIX = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

HOTMAIL_BASE = re.compile(r'''
[A-Za-z0-9\.\-\_]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9\.\-\_]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

HOTMAIL_SUFFIX = re.compile(r'''
[A-Za-z0-9\-\_]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9\-\_]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

PLUS = re.compile(r'''
\+
''', re.MULTILINE | re.VERBOSE)

PERIODS = re.compile(r'''
\.{2,}
''', re.MULTILINE | re.VERBOSE)
PLUS = '+'


def validate(email_addr):
@@ -82,10 +86,6 @@ def validate(email_addr):
if localpart.count('+') > 1:
return False

# no consecutive periods (..)
if PERIODS.search(localpart):
return False

# grammar check
retval = _validate(real_localpart)
return retval
39 changes: 20 additions & 19 deletions flanker/addresslib/plugins/icloud.py
@@ -34,31 +34,32 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib._parser.lexer import _UNICODE_CHAR

ALPHA = re.compile(r'''
[A-Za-z]+
''', re.MULTILINE | re.VERBOSE)
ALPHA = re.compile(r'''
( [A-Za-z]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

ALPHANUM = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)


ICLOUD_PREFIX = re.compile(r'''
[A-Za-z]+
''', re.MULTILINE | re.VERBOSE)

ICLOUD_BASE = re.compile(r'''
[A-Za-z0-9\+]+
''', re.MULTILINE | re.VERBOSE)

DOT = re.compile(r'''
\.
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9\+]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

UNDERSCORE = re.compile(r'''
\_
''', re.MULTILINE | re.VERBOSE)
DOT = '.'
UNDERSCORE = '_'


def validate(email_addr):
@@ -97,7 +98,7 @@ def _validate(localpart):
stream = TokenStream(localpart)

# localpart must start with alpha
alpa = stream.get_token(ICLOUD_PREFIX)
alpa = stream.get_token(ALPHA)
if alpa is None:
return False

