Merge pull request #81 from rillian/warnings
Fix remaining travis warnings
ageorgou committed Jun 25, 2019
2 parents c32960b + 6959cd9 commit 61cc9e5
Showing 3 changed files with 91 additions and 50 deletions.
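A note on what was being warned about: the bulk of this diff removes backslashes before characters like &, #, ! and = that either trigger Python's invalid-escape DeprecationWarning in plain string literals (reported by pycodestyle as W605) or are simply redundant noise in raw regexes. That this is the class of Travis warning being silenced is an assumption based on the pattern of the changes. A minimal sketch of the behaviour on CPython 3.6+:

import warnings

warnings.simplefilter("always")

# Compiling source that contains an unrecognised escape in a plain
# (non-raw) string literal warns on Python 3.6+:
compile("pattern = '\\='", "<demo>", "exec")
# -> DeprecationWarning: invalid escape sequence '\='

# The fixes used throughout this commit: drop the needless backslash,
# or keep genuinely special characters escaped inside a raw string.
compile("pattern = '='", "<demo>", "exec")     # no warning
compile(r"pattern = r'\.'", "<demo>", "exec")  # raw string: no warning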
14 changes: 4 additions & 10 deletions .travis.yml
@@ -38,13 +38,6 @@ before_install:
jython -c "print ''";
jython -c "import sys; print sys.version"
fi
-if [[ $TRAVIS_OS_NAME == 'osx' ]]; then
-brew update
-brew upgrade
-brew upgrade python
-brew install python3
-python3 --version
-fi
install:
- |
@@ -55,7 +48,8 @@ install:
fi
$PIP install wheel
$PIP install setuptools
-$PIP install ply pep8 mako
+$PIP install ply mako
+$PIP install pycodestyle
if [ "$MYPYTHON" != "jython" ]; then
$PIP install --upgrade pytest pytest-cov codecov
fi
@@ -67,12 +61,12 @@ script:
$MYPYTHON -c "from pyoracc import _generate_parsetab; _generate_parsetab()"
echo "Running tests"
if [ "$MYPYTHON" == "jython" ]; then
-py.test
+pytest
else
pytest --cov=pyoracc
fi
-- pep8 --exclude=parsetab.py .
+- pycodestyle --exclude=parsetab.py

after_success:
- |
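Aside on the pep8 to pycodestyle switch above: the pep8 package was renamed to pycodestyle upstream, and the old entry point prints a deprecation notice on every run, presumably one of the remaining Travis warnings. For reference, a hedged sketch of the equivalent check driven from Python, assuming pycodestyle's documented StyleGuide API (it mirrors the old pep8 module):

import pycodestyle

# Rough programmatic equivalent of `pycodestyle --exclude=parsetab.py`;
# StyleGuide and check_files carry over from the old pep8 API.
style = pycodestyle.StyleGuide(exclude=["parsetab.py"])
report = style.check_files(["."])
print("%d style issue(s) found" % report.total_errors)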
80 changes: 40 additions & 40 deletions pyoracc/atf/common/atflex.py
@@ -69,16 +69,16 @@ def resolve_keyword(self, value, source, fallback=None, extra=None):

states = AtfLexicon.STATES

-t_AMPERSAND = r'\&'
-t_HASH = r'\#'
-t_EXCLAIM = r'\!'
+t_AMPERSAND = r'&'
+t_HASH = r'#'
+t_EXCLAIM = r'!'
t_QUERY = r'\?'
t_STAR = r'\*'
t_DOLLAR = r'\$'
-t_MINUS = r'\-'
-t_FROM = r'\<\<'
-t_TO = r'\>\>'
-t_COMMA = r'\,'
+t_MINUS = r'-'
+t_FROM = r'<<'
+t_TO = r'>>'
+t_COMMA = r','
t_PARBAR = r'\|\|'

t_INITIAL_transctrl_PARENTHETICALID = r'\([^\n\r]*\)'
@@ -88,22 +88,22 @@ def t_INITIAL_transctrl_WHITESPACE(self, t):
# NO TOKEN

def t_MULTILINGUAL(self, t):
-r'\=\='
+r'=='
t.lexer.push_state("text")
return t

def t_EQUALBRACE(self, t):
-r'^\=\{'
+r'^=\{'
t.lexer.push_state('text')
return t

def t_EQUALS(self, t):
-r'\='
+r'='
t.lexer.push_state('flagged')
return t

def t_INITIAL_parallel_labeled_COMMENT(self, t):
-r'^\#+(?![a-zA-Z]+\:)'
+r'^#+(?![a-zA-Z]+:)'
# Negative lookahead to veto protocols as comments
t.lexer.push_state('absorb')
return t
@@ -121,7 +121,7 @@ def t_NEWLINE(self, t):
return t

def t_INITIAL_parallel_labeled_ATID(self, t):
-r'^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
+r'^@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
t.value = t.value[1:]
t.lexpos += 1
t.type = self.resolve_keyword(t.value,
@@ -171,13 +171,13 @@ def t_INITIAL_parallel_labeled_ATID(self, t):
return t

def t_labeled_OPENR(self, t):
-r'\@\('
+r'@\('
t.lexer.push_state("para")
t.lexer.push_state("transctrl")
return t

def t_INITIAL_parallel_labeled_HASHID(self, t):
-r'\#[a-zA-Z][a-zA-Z0-9\[\]]+\:'
+r'#[a-zA-Z][a-zA-Z0-9\[\]]+:'
# Note that \:? absorbs a trailing colon in protocol keywords
t.value = t.value[1:-1]
t.lexpos += 1
@@ -213,19 +213,19 @@ def t_INITIAL_parallel_labeled_HASHID(self, t):
return t

def t_LINELABEL(self, t):
-r'^[^\ \t\n]*\.'
+r'^[^ \t\n]*\.'
t.value = t.value[:-1]
t.lexer.push_state('text')
return t

def t_score_SCORELABEL(self, t):
-r'^[^.:\ \t\#][^.:\ \t]*\:'
+r'^[^.: \t#][^.: \t]*:'
t.value = t.value[:-1]
t.lexer.push_state('text')
return t

def t_ID(self, t):
-u'[a-zA-Z0-9][a-zA-Z\'\u2019\xb4\/\.0-9\:\-\[\]_\u2080-\u2089]*'
+r'[a-zA-Z0-9][a-zA-Z0-9/.:_\-\[\]' u'\'\u2019\xb4\u2080-\u2089]*'
t.value = t.value.replace(u'\u2019', "'")
t.value = t.value.replace(u'\xb4', "'")
t.type = self.resolve_keyword(t.value,
@@ -271,7 +271,7 @@ def t_flagged_text_lemmatize_transctrl_nonequals_absorb_NEWLINE(self, t):
# Unicode 2032 is PRIME
# All of these could be used as prime
def t_transctrl_ID(self, t):
-u'[a-zA-Z0-9][a-zA-Z\'\u2019\u2032\u02CA\xb4\/\.0-9\:\-\[\]_' \
+r'[a-zA-Z0-9][a-zA-Z0-9/.:_\-\[\]' u'\'\u2019\u2032\u02CA\xb4' \
u'\u2080-\u2089]*'
t.value = t.value.replace(u'\u2019', "'")
t.value = t.value.replace(u'\u2032', "'")
@@ -306,7 +306,7 @@ def t_transctrl_ID(self, t):
t_parallel_QUERY = r'\?'

def t_parallel_LINELABEL(self, t):
-r'^([^\.\ \t]*)\.[\ \t]*'
+r'^([^. \t]*)\.[ \t]*'
t.value = t.value.strip(" \t.")
return t

@@ -315,7 +315,7 @@ def t_parallel_labeled_DOLLAR(self, t):
t.lexer.push_state("absorb")
return t

-t_transctrl_MINUS = r'\-\ '
+t_transctrl_MINUS = r'- '

def t_transctrl_CLOSER(self, t):
r'\)'
@@ -347,12 +347,12 @@ def t_labeled_NEWLINE(self, t):
# Flag characters (#! etc ) don't apply in translations
# But reference anchors ^1^ etc do.
# lines beginning with a space are continuations
-white = r'[\ \t]*'
+white = r'[ \t]*'
# translation_regex1 and translation_regex2 are identical apart from the
# fact that the first character may not be a ?
# We are looking for a string that does not start with ?; it may include
# newlines if they are followed by whitespace.
-translation_regex1 = r'([^\?\^\n\r]|([\n\r](?=[ \t])))'
+translation_regex1 = r'([^?\^\n\r]|([\n\r](?=[ \t])))'
translation_regex2 = r'([^\^\n\r]|([\n\r](?=[ \t])))*'
translation_regex = white + translation_regex1 + translation_regex2 + white

@@ -366,7 +366,7 @@ def t_parallel_interlinear_ID(self, t):
return t

def t_parallel_labeled_AMPERSAND(self, t):
-r'\&'
+r'&'
# New document, so leave translation state
t.lexer.pop_state()
return t
@@ -383,9 +383,9 @@ def t_parallel_labeled_AMPERSAND(self, t):
# Used for states where only flag# characters! and ^1^ references
# Are separately tokenised

-nonflagnonwhite = r'[^\ \t\#\!\^\*\?\n\r\=]'
-internalonly = r'[^\n\^\r\=]'
-nonflag = r'[^\ \t\#\!\^\*\?\n\r\=]'
+nonflagnonwhite = r'[^ \t#!\^*?\n\r=]'
+internalonly = r'[^\n\^\r=]'
+nonflag = r'[^ \t#!\^*?\n\r=]'
many_int_then_nonflag = '(' + internalonly + '*' + nonflag + '+' + ')'
many_nonflag = nonflag + '*'
intern_or_nonflg = '(' + many_int_then_nonflag + '|' + many_nonflag + ')'
@@ -399,17 +399,17 @@ def t_flagged_ID(self, t):
t.value = t.value.strip()
return t

-t_flagged_HASH = r'\#'
-t_flagged_EXCLAIM = r'\!'
+t_flagged_HASH = r'#'
+t_flagged_EXCLAIM = r'!'
t_flagged_QUERY = r'\?'
t_flagged_STAR = r'\*'
-t_flagged_parallel_para_HAT = r'[\ \t]*\^[\ \t]*'
-t_flagged_EQUALS = r'\='
+t_flagged_parallel_para_HAT = r'[ \t]*\^[ \t]*'
+t_flagged_EQUALS = r'='
# --- Rules for paragraph state----------------------------------
# Free text, ended by double new line

terminates_para = \
"(\#|\@[^i][^\{]|\&|\Z|(^[0-9]+[\'\u2019\u2032\u02CA\xb4]?\.))"
r'(#|@[^i][^{]|&|\Z|(^[0-9]+' u'[\'\u2019\u2032\u02CA\xb4]?\\.))'

@lex.TOKEN(r'([^\^\n\r]|(\r?\n(?!\s*\r?\n)(?!' +
terminates_para + ')))+')
@@ -428,9 +428,9 @@ def t_para_NEWLINE(self, t):
# BUT, exceptionally to fix existing bugs in active members of corpus,
# it is also ended by an @label or an @(), or a new document,
# Or a linelabel, or the end of the stream. Importantly it does not end
-# by @i{xxx} which is used for un translated words.
-# and these tokens are not absorbed by this token
-# Translation paragraph state is ended by a double newline
+# by @i{xxx} which is used for untranslated words.
+# Those tokens are not absorbed by this token.
+# Translation paragraph state is ended by a double newline.
@lex.TOKEN(r'\r?\n(?=' + terminates_para + ')')
def t_para_MAGICNEWLINE(self, t):
t.lexer.lineno += t.value.count("\n")
@@ -441,11 +441,11 @@ def t_para_MAGICNEWLINE(self, t):
# --- RULES FOR THE nonequals STATE -----
# Absorb everything except an equals
def t_nonequals_ID(self, t):
-r'[^\=\n\r]+'
+r'[^=\n\r]+'
t.value = t.value.strip()
return t

-t_nonequals_EQUALS = r'\='
+t_nonequals_EQUALS = r'='

# --- RULES FOR THE absorb STATE -----
# Absorb everything
@@ -455,15 +455,15 @@ def t_absorb_ID(self, t):
return t

# --- RULES FOR THE text STATE ----
-t_text_ID = r'[^\ \t \n\r]+'
+t_text_ID = r'[^ \t\n\r]+'

def t_text_SPACE(self, t):
-r'[\ \t]'
+r'[ \t]'
# No token generated

# --- RULES FOR THE lemmatize STATE
-t_lemmatize_ID = r'[^\;\n\r]+'
-t_lemmatize_SEMICOLON = r'\;[\ \t]*'
+t_lemmatize_ID = r'[^;\n\r]+'
+t_lemmatize_SEMICOLON = r';[ \t]*'

# Error handling rule
def t_ANY_error(self, t):
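Context for why these regexes live in bare t_NAME strings and method docstrings: PLY builds the tokenizer by reflecting over every t_ prefixed definition, so each literal is read both by the Python compiler (where stray backslashes warn) and by the re module. A minimal self-contained sketch of the convention, with illustrative token names rather than pyoracc's actual lexicon:

import ply.lex as lex

tokens = ("EQUALS", "ID")

# A plain t_NAME string is that token's regex; punctuation with no
# special meaning to re, such as '=', needs no backslash.
t_EQUALS = r"="
t_ignore = " \t"

def t_ID(t):
    r"[a-zA-Z0-9]+"
    # Function rules take their regex from the docstring and can
    # post-process t.value before returning the token.
    return t

def t_error(t):
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input("foo = bar")
print([tok.type for tok in lexer])  # ['ID', 'EQUALS', 'ID']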
47 changes: 47 additions & 0 deletions pyoracc/test/atf/test_atflexer.py
@@ -574,6 +574,19 @@ def test_hash_note_UPPERCASE(self):
"LINELABEL"] + ["ID"] * 6 + ["NEWLINE", "NOTE", "ID", "NEWLINE"]
)

+    def test_hash_note_multiline(self):
+        # Notes can be free text until a double-newline.
+        line = "a-šar _saḫar.ḫi.a_ bu-bu-su-nu"
+        self.compare_tokens(
+            "1. " + line + "\n" +
+            "#note: Does this combine with the next line?\n"
+            "It should.\n\n",
+            ["LINELABEL"] + ["ID"] * len(line.split()) + ["NEWLINE"] +
+            ["NOTE", "ID", "NEWLINE"],
+            ['1'] + line.split() +
+            [None, None, "Does this combine with the next line?\nIt should."]
+        )

def test_open_text_with_dots(self):
# This must not come out as a linelabel of Hello.
self.compare_tokens(
@@ -896,6 +909,40 @@ def test_note_ended_by_strucuture(self):
["REVERSE"]
)

+    def compare_note_ended_by_line(self, line_label):
+        'Helper for Note para state termination.'
+        # Sample text.
+        line1 = u"a-šar _saḫar.ḫi.a_ bu-bu-su-nu"
+        line2 = u"a-kal-ši-na ṭi-id-di"
+        # Generate the successive line numbers in the same style.
+        label1 = line_label
+        next_label = int(label1[:1]) + 1
+        if _pyversion() == 2:
+            label2 = unicode(next_label) + label1[1:]
+        else:
+            label2 = str(next_label) + label1[1:]
+        self.compare_tokens(
+            label1 + ". " + line1 + "\n" +
+            "#note: Does this combine with the next line?\n" +
+            label2 + ". " + line2 + "\n",
+            ["LINELABEL"] + ["ID"] * len(line1.split()) + ["NEWLINE"] +
+            ["NOTE", "ID", "NEWLINE"] +
+            ["LINELABEL"] + ["ID"] * len(line2.split()) + ["NEWLINE"],
+            [label1] + line1.split() +
+            [None, None, "Does this combine with the next line?", None] +
+            [label2] + line2.split() + [None]
+        )
+
+    def test_note_ended_by_line(self):
+        'Notes can be free text until the next line label.'
+        for label in ["1",
+                      "2'",
+                      u"3\u2019",
+                      u"4\u2032",
+                      u"5\u02CA",
+                      u"6\xb4"]:
+            self.compare_note_ended_by_line(label)

def test_milestone(self):
self.compare_tokens(
"@tablet\n" +
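The new tests rely on the suite's compare_tokens helper, which is defined elsewhere in test_atflexer.py and not shown in this diff. A hedged sketch of its assumed shape (the lexer class name, the .lexer attribute, and the convention that None skips a value check are inferences from the calls above, not the actual implementation):

from pyoracc.atf.common.atflex import AtfLexer

def compare_tokens(content, expected_types, expected_values=None):
    # Tokenise the ATF content and compare token types in order.
    lexer = AtfLexer().lexer
    lexer.input(content)
    tokens = list(lexer)
    assert [tok.type for tok in tokens] == list(expected_types)
    if expected_values is not None:
        # A None entry (e.g. for NEWLINE) skips that token's value check.
        for tok, expected in zip(tokens, expected_values):
            if expected is not None:
                assert tok.value == expected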
