Permalink
Browse files

Add support for multiple variants.

Also refactor the unit tests to be easier to type/read.

Also add more tests.
  • Loading branch information...
sjl committed Sep 13, 2012
1 parent 4fd4aa1 commit 6e419957532c1e07736dcccafdf5f905de591e85
Showing with 118 additions and 108 deletions.
  1. +39 −13 unilangs/bcp47/parser.py
  2. +79 −95 unilangs/tests/test_bcp47_parsing.py
View
@@ -8,13 +8,15 @@
from pprint import pprint
+# Exceptions
class InvalidLanguageException(Exception):
    """Signals a language code that was rejected as invalid.

    The parser's error text for this case reads "Invalid primary language",
    and the test suite expects this exception for codes such as 'Cyrl' or
    '419' that do not start with a known language.
    """
class MalformedLanguageCodeException(Exception):
    """Signals a language code whose structure could not be parsed.

    The test suite expects this exception for codes such as 'en-cursive-us',
    where a chunk after the language matches none of the subtag registries.
    """
+# Convenience Functions
def _split_at(pred, seq):
"""Split the given sequence into a number of sequences as a generator.
@@ -50,6 +52,7 @@ def _split_at(pred, seq):
yield next
+# Parsing codes
def _next_chunk(code):
"""Split a chunk off of the given code, return (chunk, rest)."""
return code.split('-', 1) if '-' in code else (code, '')
@@ -90,17 +93,43 @@ def _parse_extensions(code):
return results
+def _parse_subtag(next, code, reg):
+    """Try to consume the chunk `next` as a subtag found in registry `reg`.
+
+    Returns a 3-tuple (entry, next, code).  When `next` is a key of `reg`,
+    entry is reg[next] and a fresh chunk is split off of `code`.  Otherwise
+    entry is None and `next` and `code` are handed back untouched.
+
+    """
+    if next not in reg:
+        return None, next, code
+
+    entry = reg[next]
+    following, remainder = _next_chunk(code)
+    return entry, following, remainder
+
+def _parse_variants(next, code):
+    """Consume consecutive variant subtags, starting with the chunk `next`.
+
+    Returns (variants, next, code): the list of VARIANT_SUBTAGS entries that
+    matched, in order (possibly empty), then the first chunk that did not
+    match and the code that is still unparsed.
+
+    """
+    found = []
+
+    while True:
+        entry, next, code = _parse_subtag(next, code, VARIANT_SUBTAGS)
+        if not entry:
+            # Stop at the first chunk that yields no (truthy) registry entry.
+            break
+        found.append(entry)
+
+    return found, next, code
+
def _parse_code(code):
"""Parse a BCP47 language code into its constituent parts.
A BCP47 language code looks like this:
language-extlang-script-region-variant-extension-privateuse
- Every one of those except for language is optional.
+ Every one of those except for language is optional. Multiple variant tags
+ can appear, as well as any number of extensions.
A dictionary of parts will be returned, with keys of 'language', 'extlang',
etc and values of the entries in the BCP47 registry.
+
+ The variant portion will be returned with a key of 'variants' and value of
+ a list of registry entries (possibly empty).
+
+ The extension portion will be returned as a list of tuples of (code,
+ [data...]). For example, "en-x-foo-bar" would have an 'extensions' value
+ of [('x', ['foo', 'bar'])].
Note that this function only validates the structure of the language code.
It doesn't look at the semantic values of each piece and check for things
@@ -109,7 +138,7 @@ def _parse_code(code):
"""
code = code.lower()
result = {'language': None, 'extlang': None, 'script': None, 'region': None,
- 'variant': None, 'grandfathered': None, 'extensions': []}
+ 'variants': [], 'grandfathered': None, 'extensions': []}
# Grandfathered tags take precedence over everything.
if code in GRANDFATHERED_SUBTAGS:
@@ -132,18 +161,10 @@ def _parse_code(code):
"Invalid primary language '%s'!" % language)
# Parse the rest of the subtags, in order.
- def _parse_subtag(next, code, reg):
- if next in reg:
- st = reg[next]
- next, code = _next_chunk(code)
- return st, next, code
- else:
- return None, next, code
-
result['extlang'], next, code = _parse_subtag(next, code, EXTLANG_SUBTAGS)
- result['script'], next, code = _parse_subtag(next, code, SCRIPT_SUBTAGS)
- result['region'], next, code = _parse_subtag(next, code, REGION_SUBTAGS)
- result['variant'], next, code = _parse_subtag(next, code, VARIANT_SUBTAGS)
+ result['script'], next, code = _parse_subtag(next, code, SCRIPT_SUBTAGS)
+ result['region'], next, code = _parse_subtag(next, code, REGION_SUBTAGS)
+ result['variants'], next, code = _parse_variants(next, code)
if next and not code:
raise MalformedLanguageCodeException(
@@ -157,6 +178,8 @@ def _parse_subtag(next, code, reg):
return result
+
+# Validating parsed codes
def _validate(l):
"""Validated that the parsed language dict makes sense."""
@@ -166,9 +189,12 @@ def _validate(l):
return l
+
+# Public API
def parse_code(bcp47_language_code):
    """Parse a BCP47 language code and validate the result.

    The code is split into its parts by _parse_code; the resulting dictionary
    is then passed through _validate, whose return value is returned.

    """
    parsed = _parse_code(bcp47_language_code)
    return _validate(parsed)
+
def _t(lc):
print '=' * 60
pprint(parse_code(lc))
@@ -16,51 +16,44 @@ def assertInvalid(self, code):
return self.assertRaises(InvalidLanguageException,
lambda: parse_code(code))
+    def assertTagDesc(self, subtag_dict, tag, desc):
+        """Assert that a registry entry has the given subtag and description.
+
+        The entry's 'subtag' is lowercased before it is compared to `tag`,
+        and only the first item of its 'description' is compared to `desc`.
+
+        """
+        actual_tag = subtag_dict['subtag'].lower()
+        self.assertEqual(actual_tag, tag)
+
+        first_description = subtag_dict['description'][0]
+        self.assertEqual(first_description, desc)
+
+    def assertNil(self, language_dict, fields):
+        """Assert that each of the named fields of a parsed code is unset.
+
+        'variants' and 'extensions' are expected to be empty lists; every
+        other field is expected to be None.
+
+        """
+        list_valued = ('variants', 'extensions')
+
+        for name in fields:
+            actual = language_dict[name]
+
+            if name not in list_valued:
+                self.assertIsNone(actual)
+                continue
+
+            self.assertEqual(actual, [])
+
def test_grandfathered(self):
p = parse_code('i-klingon')
self.assertEqual(p['grandfathered']['tag'], 'i-klingon')
- self.assertEqual(p['grandfathered']['preferred-value'], 'tlh')
self.assertEqual(p['grandfathered']['description'][0], 'Klingon')
- self.assertIsNone(p['language'])
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['script'])
- self.assertIsNone(p['region'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
+ self.assertNil(p, ['language', 'extlang', 'script', 'region',
+ 'variants', 'extensions'])
p = parse_code('art-lojban')
self.assertEqual(p['grandfathered']['tag'], 'art-lojban')
- self.assertEqual(p['grandfathered']['preferred-value'], 'jbo')
self.assertEqual(p['grandfathered']['description'][0], 'Lojban')
- self.assertIsNone(p['language'])
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['script'])
- self.assertIsNone(p['region'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
+ self.assertNil(p, ['language', 'extlang', 'script', 'region',
+ 'variants', 'extensions'])
def test_bare_language(self):
# Bare, simple language codes should parse fine.
p = parse_code('en')
- self.assertEqual(p['language']['subtag'], 'en')
- self.assertEqual(p['language']['description'][0], 'English')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['script'])
- self.assertIsNone(p['region'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'en', 'English')
+ self.assertNil(p, ['extlang', 'script', 'region', 'variants',
+ 'extensions', 'grandfathered'])
p = parse_code('de')
- self.assertEqual(p['language']['subtag'], 'de')
- self.assertEqual(p['language']['description'][0], 'German')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['script'])
- self.assertIsNone(p['region'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'de', 'German')
+ self.assertNil(p, ['extlang', 'script', 'region', 'variants',
+ 'extensions', 'grandfathered'])
# Language codes are case-insensitive.
self.assertEqual(parse_code('en'), parse_code('EN'))
@@ -73,26 +66,16 @@ def test_bare_language(self):
def test_language_script(self):
# Languages with scripts should parse fine.
p = parse_code('zh-Hans')
- self.assertEqual(p['language']['subtag'], 'zh')
- self.assertEqual(p['language']['description'][0], 'Chinese')
- self.assertEqual(p['script']['subtag'].lower(), 'hans')
- self.assertEqual(p['script']['description'][0], 'Han (Simplified variant)')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['region'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'zh', 'Chinese')
+ self.assertTagDesc(p['script'], 'hans', 'Han (Simplified variant)')
+ self.assertNil(p, ['extlang', 'region', 'variants', 'extensions',
+ 'grandfathered'])
p = parse_code('zh-HANT')
- self.assertEqual(p['language']['subtag'], 'zh')
- self.assertEqual(p['language']['description'][0], 'Chinese')
- self.assertEqual(p['script']['subtag'].lower(), 'hant')
- self.assertEqual(p['script']['description'][0], 'Han (Traditional variant)')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['region'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'zh', 'Chinese')
+ self.assertTagDesc(p['script'], 'hant', 'Han (Traditional variant)')
+ self.assertNil(p, ['extlang', 'region', 'variants', 'extensions',
+ 'grandfathered'])
# Scripts cannot stand without a language.
self.assertInvalid('Cyrl')
@@ -107,37 +90,22 @@ def test_language_script(self):
def test_language_region(self):
# Language with region codes should be fine.
p = parse_code('en-us')
- self.assertEqual(p['language']['subtag'], 'en')
- self.assertEqual(p['language']['description'][0], 'English')
- self.assertEqual(p['region']['subtag'].lower(), 'us')
- self.assertEqual(p['region']['description'][0], 'United States')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['script'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'en', 'English')
+ self.assertTagDesc(p['region'], 'us', 'United States')
+ self.assertNil(p, ['extlang', 'script', 'variants', 'extensions',
+ 'grandfathered'])
p = parse_code('en-gb')
- self.assertEqual(p['language']['subtag'], 'en')
- self.assertEqual(p['language']['description'][0], 'English')
- self.assertEqual(p['region']['subtag'].lower(), 'gb')
- self.assertEqual(p['region']['description'][0], 'United Kingdom')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['script'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'en', 'English')
+ self.assertTagDesc(p['region'], 'gb', 'United Kingdom')
+ self.assertNil(p, ['extlang', 'script', 'variants', 'extensions',
+ 'grandfathered'])
p = parse_code('es-419')
- self.assertEqual(p['language']['subtag'], 'es')
- self.assertEqual(p['language']['description'][0], 'Spanish')
- self.assertEqual(p['region']['subtag'].lower(), '419')
- self.assertEqual(p['region']['description'][0], 'Latin America and the Caribbean')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['script'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'es', 'Spanish')
+ self.assertTagDesc(p['region'], '419', 'Latin America and the Caribbean')
+ self.assertNil(p, ['extlang', 'script', 'variants', 'extensions',
+ 'grandfathered'])
# Regions cannot be given without a language.
self.assertInvalid('419')
@@ -151,28 +119,16 @@ def test_language_region(self):
def test_language_script_region(self):
p = parse_code('en-Latn-us')
- self.assertEqual(p['language']['subtag'], 'en')
- self.assertEqual(p['language']['description'][0], 'English')
- self.assertEqual(p['region']['subtag'].lower(), 'us')
- self.assertEqual(p['region']['description'][0], 'United States')
- self.assertEqual(p['script']['subtag'].lower(), 'latn')
- self.assertEqual(p['script']['description'][0], 'Latin')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'en', 'English')
+ self.assertTagDesc(p['script'], 'latn', 'Latin')
+ self.assertTagDesc(p['region'], 'us', 'United States')
+ self.assertNil(p, ['extlang', 'variants', 'extensions', 'grandfathered'])
p = parse_code('sr-Cyrl-RS')
- self.assertEqual(p['language']['subtag'], 'sr')
- self.assertEqual(p['language']['description'][0], 'Serbian')
- self.assertEqual(p['region']['subtag'].lower(), 'rs')
- self.assertEqual(p['region']['description'][0], 'Serbia')
- self.assertEqual(p['script']['subtag'].lower(), 'cyrl')
- self.assertEqual(p['script']['description'][0], 'Cyrillic')
- self.assertIsNone(p['extlang'])
- self.assertIsNone(p['variant'])
- self.assertEqual(p['extensions'], [])
- self.assertIsNone(p['grandfathered'])
+ self.assertTagDesc(p['language'], 'sr', 'Serbian')
+ self.assertTagDesc(p['script'], 'cyrl', 'Cyrillic')
+ self.assertTagDesc(p['region'], 'rs', 'Serbia')
+ self.assertNil(p, ['extlang', 'variants', 'extensions', 'grandfathered'])
# Scripts and regions still require a language.
self.assertInvalid('Latn-us')
@@ -181,3 +137,31 @@ def test_language_script_region(self):
self.assertInvalid('minecraft-Latn-us')
self.assertMalformed('en-cursive-us')
self.assertMalformed('en-Latn-murica')
+
+    def test_language_variants(self):
+        """A language followed by one or more variant subtags."""
+        unset = ['extlang', 'script', 'region', 'extensions',
+                 'grandfathered']
+
+        # A single variant.
+        parsed = parse_code('sl-rozaj')
+        self.assertTagDesc(parsed['language'], 'sl', 'Slovenian')
+        self.assertTagDesc(parsed['variants'][0], 'rozaj', 'Resian')
+        self.assertNil(parsed, unset)
+
+        # Two variants, checked in the order they appear in the code.
+        parsed = parse_code('sl-rozaj-biske')
+        self.assertTagDesc(parsed['language'], 'sl', 'Slovenian')
+        self.assertTagDesc(parsed['variants'][0], 'rozaj', 'Resian')
+        self.assertTagDesc(parsed['variants'][1], 'biske',
+                           'The San Giorgio dialect of Resian')
+        self.assertNil(parsed, unset)
+
+        # Variants still require a language.
+        for code in ('rozaj', 'rozaj-biske'):
+            self.assertInvalid(code)
+
+        # Invalid variants don't work.
+        self.assertMalformed('sl-rozajbad')
+
+    def test_language_region_variants(self):
+        """A language, a region and a variant together."""
+        parsed = parse_code('de-CH-1901')
+
+        expected = [
+            (parsed['language'], 'de', 'German'),
+            (parsed['region'], 'ch', 'Switzerland'),
+            (parsed['variants'][0], '1901', 'Traditional German orthography'),
+        ]
+        for entry, tag, desc in expected:
+            self.assertTagDesc(entry, tag, desc)
+
+        self.assertNil(parsed, ['extlang', 'script', 'extensions',
+                                'grandfathered'])

0 comments on commit 6e41995

Please sign in to comment.