Work in progress (Part Deux)

jdlorimer · Jan 10, 2019 · 6640223 · 6640223
1 parent d7fb084
commit 6640223
Show file tree

Hide file tree

Showing 22 changed files with 424 additions and 495 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 *~
 .coverage
+.hypothesis/
 .mypy_cache/
 .prospector.yaml
 .python-version

diff --git a/.travis.yml b/.travis.yml
@@ -7,7 +7,7 @@ before_install:
   - pip install --upgrade pytest
 
 install:
-  - pip install pytest-cov python-coveralls
+  - pip install hypothesis pytest-cov python-coveralls
 
 before_script:
   - export PYTHONPATH=.

diff --git a/chinese/behavior.py b/chinese/behavior.py
@@ -18,13 +18,13 @@
 
 from .bopomofo import bopomofo
 from .color import colorize, colorize_dict, colorize_fuse
-from .hanzi import separate_chars, silhouette, simplify, traditional
+from .hanzi import get_silhouette, get_simp, get_trad, split_hanzi
 from .main import config, dictionary
 from .ruby import hide_ruby, ruby
 from .sound import no_sound, sound
-from .transcribe import accentuate, no_tone, separate_trans, transcribe
+from .transcribe import accentuate, no_tone, split_transcript, transcribe
 from .translate import translate
-from .util import cleanup, get_first, has_field, hide, set_all
+from .util import cleanup, erase_fields, get_first, has_field, hide, set_all
 
 
 def get_classifier(hanzi, note):
@@ -95,14 +95,14 @@ def fill_all_defs(hanzi, note):
 
 
 def fill_silhouette(hanzi, note):
-    m = silhouette(hanzi)
+    m = get_silhouette(hanzi)
     set_all(config['fields']['silhouette'], note, to=m)
 
 
-def format_transcription(note):
+def format_transcript(note):
     t = colorize(
         accentuate(
-            separate_trans(
+            split_transcript(
                 cleanup(get_first(config['fields']['transcription'], note))
             )
         )
@@ -111,12 +111,12 @@ def format_transcription(note):
     set_all(config['fields']['transcription'], note, to=t)
 
 
-def fill_transcription(hanzi, note):
+def fill_transcript(hanzi, note):
     n_filled = 0
-    separated = separate_chars(hanzi)
+    separated = split_hanzi(hanzi)
 
     for key, target, func, only_one in [
-        ('transcription', None, format_transcription, True),
+        ('transcription', None, format_transcript, True),
         ('pinyin', 'Pinyin', format_pinyin, True),
         ('pinyinTaiwan', 'Pinyin (Taiwan)', format_taiwan_pinyin, True),
         ('cantonese', 'Cantonese', format_cantonese, False),
@@ -136,7 +136,7 @@ def fill_transcription(hanzi, note):
 def format_pinyin(note):
     t = colorize(
         accentuate(
-            separate_trans(
+            split_transcript(
                 cleanup(get_first(config['fields']['pinyin'], note)), True
             )
         )
@@ -148,7 +148,7 @@ def format_pinyin(note):
 def format_taiwan_pinyin(note):
     t = colorize(
         accentuate(
-            separate_trans(
+            split_transcript(
                 cleanup(get_first(config['fields']['pinyinTaiwan'], note)),
                 True,
             )
@@ -160,7 +160,7 @@ def format_taiwan_pinyin(note):
 
 def format_cantonese(note):
     t = colorize(
-        separate_trans(cleanup(get_first(config['fields']['cantonese'], note)))
+        split_transcript(cleanup(get_first(config['fields']['cantonese'], note)))
     )
     t = hide(t, no_tone(t))
     set_all(config['fields']['cantonese'], note, to=t)
@@ -173,7 +173,7 @@ def fill_bopomofo(hanzi, note):
         syllables = cleanup(field).split()
         n_filled = 0
     else:
-        syllables = transcribe(separate_chars(hanzi), 'Bopomofo')
+        syllables = transcribe(split_hanzi(hanzi), 'Bopomofo')
         n_filled = 1
 
     text = colorize(syllables)
@@ -216,7 +216,7 @@ def fill_simp(hanzi, note):
     if not get_first(config['fields']['simplified'], note) == '':
         return
 
-    s = simplify(hanzi)
+    s = get_simp(hanzi)
     if s is not None and s != hanzi:
         set_all(config['fields']['simplified'], note, to=s)
     else:
@@ -227,7 +227,7 @@ def fill_trad(hanzi, note):
     if not get_first(config['fields']['traditional'], note) == '':
         return
 
-    t = traditional(hanzi)
+    t = get_trad(hanzi)
     if t is not None and t != hanzi:
         set_all(config['fields']['traditional'], note, to=t)
     else:
@@ -278,11 +278,6 @@ def fill_all_rubies(hanzi, note):
         set_all(config['fields'][ruby_field], note, to=rubified)
 
 
-def erase_fields(note):
-    for f in config['fields'].values():
-        set_all(f, note, to='')
-
-
 def update_fields(note, focus_field, fields):
     if 'addon' in note.model():
         model = note.model()['addon']
@@ -308,7 +303,7 @@ def update_fields(note, focus_field, fields):
     elif focus_field in config['fields']['hanzi']:
         if copy[focus_field]:
             fill_all_defs(hanzi, copy)
-            fill_transcription(hanzi, copy)
+            fill_transcript(hanzi, copy)
             fill_color(hanzi, copy)
             fill_sound(hanzi, copy)
             fill_simp(hanzi, copy)
@@ -318,7 +313,7 @@ def update_fields(note, focus_field, fields):
         else:
             erase_fields(copy)
     elif focus_field in config['fields']['transcription']:
-        format_transcription(copy)
+        format_transcript(copy)
         fill_color(hanzi, copy)
         fill_all_rubies(hanzi, copy)
     elif focus_field in config['fields']['pinyin']:

diff --git a/chinese/color.py b/chinese/color.py
@@ -18,10 +18,17 @@
 
 from re import IGNORECASE, sub
 
-from .consts import pinyin_regex, half_ruby_regex, ruby_regex
-from .hanzi import separate_chars
+from .consts import (
+    COLOR_RUBY_TEMPLATE,
+    COLOR_TEMPLATE,
+    pinyin_regex,
+    half_ruby_regex,
+    HANZI_RANGE,
+    ruby_regex,
+)
+from .hanzi import split_hanzi
 from .sound import extract_sound_tags
-from .transcribe import accentuate, separate_trans, tone_number
+from .transcribe import tone_number, sanitize_transcript
 from .util import align, cleanup, is_punc, no_color
 
 
@@ -64,41 +71,7 @@ def repl(p):
     return ' '.join(colorized)
 
 
-def colorize_fuse(chars, trans, ruby=False):
-    """Colorize hanzi based on pinyin tone.
-
-    If ruby=True, then annotate hanzi with pinyin.
-    """
-
-    standard_fmt = '<span class="tone{tone}">{chars}</span>'
-    ruby_fmt = (
-        '<span class="tone{tone}"><ruby>{chars}<rt>{trans}</rt></ruby></span>'
-    )
-
-    chars = separate_chars(cleanup(chars), grouped=False)
-    trans = sanitize_pinyin(trans)
-    text = ''
-
-    for c, t in align(chars, trans):
-        if c is None or t is None:
-            continue
-        if is_punc(c) and is_punc(t):
-            text += c
-            continue
-        if ruby:
-            text += ruby_fmt.format(tone=tone_number(t), chars=c, trans=t)
-        else:
-            text += standard_fmt.format(tone=tone_number(t), chars=c)
-
-    return text
-
-
 def colorize_dict(text):
-    """Colorize text in the form: 你好[ni3 hao].
-
-    As used in the local dictionaries.
-    """
-
     def _sub(p):
         s = ''
         hanzi = p.group(1)
@@ -115,10 +88,25 @@ def _sub(p):
 
         return s
 
-    return sub(r'([\u3400-\u9fff|]+)\[(.*?)\]', _sub, text)
+    return sub(r'([\%s|]+)\[(.*?)\]' % HANZI_RANGE, _sub, text)
 
 
-def sanitize_pinyin(pinyin, grouped=False):
-    return ' '.join(
-        accentuate(separate_trans(cleanup(no_color(pinyin)), grouped))
-    ).split()
+def colorize_fuse(chars, transcript, ruby=False):
+    chars = split_hanzi(cleanup(chars), grouped=False)
+    transcript = sanitize_transcript(transcript)
+    colorized = ''
+
+    for c, t in align(chars, transcript):
+        if c is None or t is None:
+            continue
+        if is_punc(c) and is_punc(t):
+            colorized += c
+            continue
+        if ruby:
+            colorized += COLOR_RUBY_TEMPLATE.format(
+                tone=tone_number(t), chars=c, transcript=t
+            )
+        else:
+            colorized += COLOR_TEMPLATE.format(tone=tone_number(t), chars=c)
+
+    return colorized
diff --git a/chinese/config.json b/chinese/config.json
@@ -8,9 +8,9 @@
     "transcription": "Pinyin",
     "fields": {
         "hanzi": [
+            "Hanzi",
             "Chinese",
             "Expression",
-            "Hanzi",
             "中文",
             "汉字",
             "漢字"
@@ -31,8 +31,8 @@
             "英语"
         ],
         "german": [
-            "Deutsch",
             "German",
+            "Deutsch",
             "德文",
             "德語",
             "德语"
@@ -48,15 +48,13 @@
             "Reading"
         ],
         "pinyin": [
-            "PY",
             "Pinyin",
             "大陆拼音",
             "大陸拼音",
             "拼音"
         ],
         "pinyinTaiwan": [
-            "PYTW",
-            "PinyinTW",
+            "Pinyin (Taiwan)",
             "台湾拼音",
             "台灣拼音",
             "臺灣拼音"
@@ -76,97 +74,67 @@
         ],
         "bopomofo": [
             "Bopomofo",
+            "Zhuyin",
             "ㄅㄆㄇㄈ",
             "注音符号",
             "注音符號",
             "註音符號"
         ],
         "sound": [
-            "Audio",
             "Sound",
-            "Spoken",
+            "Audio",
             "声音",
             "聲音"
         ],
         "mandarinSound": [
+            "Sound (Mandarin)",
             "Sound - Mandarin"
         ],
         "cantoneseSound": [
+            "Sound (Cantonese)",
             "Sound - Cantonese"
         ],
         "simplified": [
-            "Simp",
-            "Simp.",
             "Simplified",
             "简体",
             "简体字",
-            "简化",
             "简化字",
-            "簡化",
             "簡化字",
-            "簡體",
             "簡體字"
         ],
         "traditional": [
-            "Trad",
-            "Trad.",
             "Traditional",
-            "繁体",
             "繁体字",
-            "繁體",
             "繁體字"
         ],
         "classifier": [
             "Classifier",
-            "MW",
-            "Mean Word",
-            "Mean",
             "Measure Word",
             "量詞",
             "量词"
         ],
         "alternative": [
             "Also Written",
-            "Alt",
             "Alternative"
         ],
         "color": [
             "Color",
-            "Colored Hanzi",
-            "Colour",
-            "Coloured Hanzi",
             "彩色"
         ],
-        "colorPinyin": [
-            "ColorPY",
-            "ColourPY"
-        ],
-        "colorPinyinTaiwan": [
-            "ColorPYTW",
-            "ColourPYTW"
-        ],
-        "colorCantonese": [
-            "ColorCANT",
-            "ColourCANT"
-        ],
-        "colorBopomofo": [
-            "ColorBPMF",
-            "ColourBPMF"
-        ],
         "ruby": [
             "Ruby"
         ],
         "rubyPinyin": [
-            "RubyPY"
+            "Ruby (Pinyin)"
         ],
         "rubyPinyinTaiwan": [
-            "RubyPYTW"
-        ],
-        "rubyCantonese": [
-            "RubyCANT"
+            "Ruby (Taiwan Pinyin)"
         ],
         "rubyBopomofo": [
-            "RubyBPMF"
+            "Ruby (Bopomofo)"
+        ],
+        "rubyCantonese": [
+            "Ruby (Cantonese)"
         ],
         "silhouette": [
             "Silhouette"