Skip to content
This repository has been archived by the owner on Jul 22, 2022. It is now read-only.

Commit

Permalink
Support latin1 characters
Browse files Browse the repository at this point in the history
with a reproducer for issue #150

Signed-off-by: Hiroshi Miura <miurahr@linux.com>
  • Loading branch information
miurahr committed Apr 14, 2022
1 parent e28af7c commit 1d9a4ca
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 2 deletions.
103 changes: 101 additions & 2 deletions src/pykakasi/properties.py
Expand Up @@ -40,6 +40,8 @@ class Ch:
bracket_bra = 0x7B
tilda = 0x7E
delete = 0x7F
latin1_inverted_exclam = 0x00A1
latin1_y_diaeresis = 0x00FF
ideographic_space = 0x3000
postal_mark_face = 0x3020
wavy_dash = 0x3030
Expand Down Expand Up @@ -101,7 +103,7 @@ class Convert_Tables:
a2 f0 | Å ‰ ♯ ♭ ♪ † ‡ ¶ ◯
----------------------------------------------------------
Greek convertion table
Greek conversion table
----------------------------------------------------------
"Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
"Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
Expand Down Expand Up @@ -245,7 +247,7 @@ class Convert_Tables:
".",
"/",
]
# cyriilic
# cyrillic
cyrillic_table = { # basic cyrillic characters
"\u0410": "A",
"\u0411": "B",
Expand Down Expand Up @@ -359,6 +361,103 @@ class Convert_Tables:
"\uff40",
] # [\]^_`
alpha_table_3 = ["\uff5b", "\uff5c", "\uff5d", "\uff5e"] # {|}~
# ASCII replacements for the Latin-1 Supplement range U+00A1..U+00FF.
# The list is positional: entry ``i`` is the replacement for code point
# ``0x00A1 + i``, so it must keep exactly 95 entries in code point order.
# No entry is empty: an empty replacement would silently delete the
# character from the converted text.
latin1_table = [
    "!",  # U+00A1 inverted exclamation mark
    "cent",  # U+00A2 cent sign
    "GBP",  # U+00A3 pound sign
    "currency",  # U+00A4 currency sign
    "yen",  # U+00A5 yen sign
    "|",  # U+00A6 broken bar
    "ss",  # U+00A7 section sign
    "..",  # U+00A8 diaeresis
    "(c)",  # U+00A9 copyright sign
    "a",  # U+00AA feminine ordinal indicator
    "<<",  # U+00AB left-pointing double angle quotation mark
    "not",  # U+00AC not sign
    "-",  # U+00AD soft hyphen
    "(R)",  # U+00AE registered sign
    "~",  # U+00AF macron
    ".",  # U+00B0 degree sign
    "+-",  # U+00B1 plus-minus sign
    "^2",  # U+00B2 superscript two
    "^3",  # U+00B3 superscript three
    "`",  # U+00B4 acute accent
    "u",  # U+00B5 micro sign
    "D",  # U+00B6 pilcrow sign
    ".",  # U+00B7 middle dot
    ",",  # U+00B8 cedilla
    "^1",  # U+00B9 superscript one
    "o",  # U+00BA masculine ordinal indicator (parallels "a" for U+00AA)
    ">>",  # U+00BB right-pointing double angle quotation mark
    "1/4",  # U+00BC vulgar fraction one quarter
    "1/2",  # U+00BD vulgar fraction one half
    "3/4",  # U+00BE vulgar fraction three quarters
    "?",  # U+00BF inverted question mark
    "A",  # U+00C0 capital A with grave
    "A",  # U+00C1 capital A with acute
    "A",  # U+00C2 capital A with circumflex
    "A",  # U+00C3 capital A with tilde
    "A",  # U+00C4 capital A with diaeresis
    "A",  # U+00C5 capital A with ring above
    "AE",  # U+00C6 capital AE
    "C",  # U+00C7 capital C with cedilla
    "E",  # U+00C8 capital E with grave
    "E",  # U+00C9 capital E with acute
    "E",  # U+00CA capital E with circumflex
    "E",  # U+00CB capital E with diaeresis
    "I",  # U+00CC capital I with grave
    "I",  # U+00CD capital I with acute
    "I",  # U+00CE capital I with circumflex
    "I",  # U+00CF capital I with diaeresis
    "Eth",  # U+00D0 capital eth
    "N",  # U+00D1 capital N with tilde
    "O",  # U+00D2 capital O with grave
    "O",  # U+00D3 capital O with acute
    "O",  # U+00D4 capital O with circumflex
    "O",  # U+00D5 capital O with tilde
    "O",  # U+00D6 capital O with diaeresis
    "x",  # U+00D7 multiplication sign
    "O",  # U+00D8 capital O with stroke
    "U",  # U+00D9 capital U with grave
    "U",  # U+00DA capital U with acute
    "U",  # U+00DB capital U with circumflex
    "U",  # U+00DC capital U with diaeresis
    "Y",  # U+00DD capital Y with acute
    "Th",  # U+00DE capital thorn
    "ss",  # U+00DF sharp s
    "a",  # U+00E0 small a with grave
    "a",  # U+00E1 small a with acute
    "a",  # U+00E2 small a with circumflex
    "a",  # U+00E3 small a with tilde
    "a",  # U+00E4 small a with diaeresis
    "a",  # U+00E5 small a with ring above
    "ae",  # U+00E6 small ae
    "c",  # U+00E7 small c with cedilla
    "e",  # U+00E8 small e with grave
    "e",  # U+00E9 small e with acute
    "e",  # U+00EA small e with circumflex
    "e",  # U+00EB small e with diaeresis
    "i",  # U+00EC small i with grave
    "i",  # U+00ED small i with acute
    "i",  # U+00EE small i with circumflex
    "i",  # U+00EF small i with diaeresis
    "eth",  # U+00F0 small eth
    "n",  # U+00F1 small n with tilde
    "o",  # U+00F2 small o with grave
    "o",  # U+00F3 small o with acute
    "o",  # U+00F4 small o with circumflex
    "o",  # U+00F5 small o with tilde
    "o",  # U+00F6 small o with diaeresis
    "/",  # U+00F7 division sign
    "o",  # U+00F8 small o with stroke
    "u",  # U+00F9 small u with grave
    "u",  # U+00FA small u with acute
    "u",  # U+00FB small u with circumflex
    "u",  # U+00FC small u with diaeresis
    "y",  # U+00FD small y with acute
    "th",  # U+00FE small thorn
    "y",  # U+00FF small y with diaeresis
]


# Rebind the class name to a single instance of itself: after this line the
# module-level name ``Convert_Tables`` refers to that instance, not the class.
Convert_Tables = Convert_Tables()
3 changes: 3 additions & 0 deletions src/pykakasi/scripts.py
Expand Up @@ -324,6 +324,7 @@ def isRegion(cls, char: str):
or (Ch.greece_alpha <= c <= Ch.greece_omega)
or (Ch.cyrillic_A <= c <= Ch.cyrillic_ya)
or (Ch.zenkaku_exc_mark <= c <= Ch.zenkaku_number_nine)
or (Ch.latin1_inverted_exclam <= c <= Ch.latin1_y_diaeresis)
or (0xFF20 <= c <= 0xFF5E)
or c == 0x0451
or c == 0x0401
Expand Down Expand Up @@ -351,6 +352,8 @@ def _convert(self, text):
return chr(0x0041 + c - 0xFF21) # u\ff21A => u\0041:@A..Z[\]^_`
elif 0xFF41 <= c < 0xFF5F:
return chr(0x0061 + c - 0xFF41) # u\ff41a => u\0061:a..z{|}
elif Ch.latin1_inverted_exclam <= c <= Ch.latin1_y_diaeresis:
return Convert_Tables.latin1_table[c - Ch.latin1_inverted_exclam]
else:
return "" # pragma: no cover

Expand Down
10 changes: 10 additions & 0 deletions tests/test_pykakasi_structured.py
Expand Up @@ -623,3 +623,13 @@ def test_kakasi_unihandecode(case, expected):
assert result[i]["hepburn"] == e["hepburn"]
assert result[i]["kunrei"] == e["kunrei"]
assert result[i]["passport"] == e["passport"]


def test_issue_150():
    """Regression test for issue #150: a Latin-1 character between kanji.

    Converts a string made of a kanji, U+00D7 (multiplication sign) and
    another kanji, and expects three result segments with the multiplication
    sign kept as-is in both the ``orig`` and ``hira`` fields.
    """
    kakasi = pykakasi.kakasi()
    result = kakasi.convert("三\u00D7五")
    # Check the segment count first so a wrong segmentation fails with a
    # clear assertion instead of an IndexError from the lookups below.
    assert len(result) == 3
    assert result[0]["hira"] == "さん"
    assert result[1]["orig"] == "\u00D7"
    assert result[1]["hira"] == "×"
    assert result[2]["hira"] == "ご"

0 comments on commit 1d9a4ca

Please sign in to comment.