-
Notifications
You must be signed in to change notification settings - Fork 26
/
__init__.py
198 lines (162 loc) · 9.17 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# -*- coding: utf-8 -*-
from .mapping import TRANSLIT_DICT
import sys
def __encode_utf8(_string):
    ''' Prepare a transliterated string for returning to the caller.

    :param _string: The string to hand back.
    :return: ``_string`` itself on Python 3 and later; on older interpreters
        the result of encoding ``_string`` as UTF-8.
    '''
    # Python 3+ needs no conversion, so hand the value straight back.
    if sys.version_info >= (3, 0):
        return _string
    return _string.encode('utf-8')
def __decode_utf8(_string):
    ''' Return the given string as text, decoding UTF-8 byte strings.

    The check is on the value's type rather than on the interpreter version:
    a byte string (``str`` on Python 2, ``bytes`` on Python 3) is decoded,
    anything else is returned untouched. This avoids calling ``.decode`` on
    text that is already unicode, and it lets byte strings be processed
    character by character on Python 3 instead of byte by byte.

    :param _string: A text string or a UTF-8 encoded byte string.
    :return: The decoded text when ``_string`` is a byte string, otherwise
        ``_string`` itself.
    :raises UnicodeDecodeError: If a byte string is not valid UTF-8.
    '''
    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this single check
    # covers both major versions.
    if isinstance(_string, bytes):
        return _string.decode('utf-8')
    return _string
def to_latin(string_to_transliterate, lang_code='sr'):
    ''' Convert a cyrillic string into its latin transliteration.

    :param string_to_transliterate: The cyrillic text to convert.
    :param lang_code: Code of the cyrillic language to convert from, looked up
        in lowercase. Defaults to Serbian (sr).
    :return: The latin transliteration of the input. The input is returned
        untouched when the language is unknown or has no 'tolatin' mapping.
    '''
    language_key = lang_code.lower()

    # Unknown language: nothing we can do, hand the input back.
    if language_key not in TRANSLIT_DICT:
        return string_to_transliterate

    # Known language whose cyrillic-to-latin table is missing or empty.
    char_map = TRANSLIT_DICT[language_key]['tolatin']
    if not char_map:
        return string_to_transliterate

    # Map every character through the table; characters without an entry
    # (most likely digits or punctuation) are kept as they are.
    decoded = __decode_utf8(string_to_transliterate)
    latinized = ''.join(char_map.get(char, char) for char in decoded)

    return __encode_utf8(latinized)
def to_cyrillic(string_to_transliterate, lang_code='sr'):
    ''' Transliterate latin string of characters to cyrillic string of characters.

    The input is scanned left to right. Groups of latin letters that stand for
    a single cyrillic letter (for example "lj", "nj", "dž", plus further
    language-specific groups) are merged and looked up as one key in the
    language's 'tocyrillic' dictionary. Anything that is not a key of that
    dictionary is copied to the output unchanged.

    :param string_to_transliterate: The latin string to transliterate into cyrillic characters.
    :param lang_code: Indicates the cyrillic language code we are translating to.
        Matched case-insensitively. Defaults to Serbian (sr).
    :return: A string of cyrillic characters transliterated from the given latin string.
        The original string is returned untouched when the language is not supported
        or has no 'tocyrillic' mapping.
    '''
    # Normalize the code once so that the dictionary lookup and every
    # per-language rule below agree on it (e.g. 'BG' behaves exactly like 'bg').
    lang_code = lang_code.lower()

    # First check if we support the cyrillic alphabet we want to transliterate to.
    if lang_code not in TRANSLIT_DICT:
        # If we don't support it, then just return the original string.
        return string_to_transliterate

    # If we do support it, check that the implementation is not missing before proceeding.
    transliteration_dict = TRANSLIT_DICT[lang_code]['tocyrillic']
    if not transliteration_dict:
        return string_to_transliterate

    # Initialize the output cyrillic string variable.
    cyrillic_str = ''

    string_to_transliterate = __decode_utf8(string_to_transliterate)

    # Transliterate by traversing the input string character by character.
    length_of_string_to_transliterate = len(string_to_transliterate)
    index = 0

    while index < length_of_string_to_transliterate:
        # Grab a character from the string at the current index.
        c = string_to_transliterate[index]

        # Look one character ahead; this is an empty string at the end of the input.
        # NOTE: an empty string is "in" every string, so the `c_plus_1 in u'...'`
        # tests below also succeed on the last character. That is harmless:
        # appending the empty look-ahead leaves `c` unchanged.
        c_plus_1 = string_to_transliterate[index + 1:index + 2]

        # Watch out for Lj and lj. Don't want to interpret Lj/lj as L/l and j.
        # Watch out for Nj and nj. Don't want to interpret Nj/nj as N/n and j.
        # Watch out for Dž and dž. Don't want to interpret Dž/dž as D/d and ž.
        if (
            ((c == u'L' or c == u'l') and c_plus_1 == u'j') or
            ((c == u'N' or c == u'n') and c_plus_1 == u'j') or
            ((c == u'D' or c == u'd') and c_plus_1 == u'ž') or
            (lang_code == 'mk' and (c == u'D' or c == u'd') and c_plus_1 == u'z') or
            (lang_code == 'bg' and (
                (c in u'Zz' and c_plus_1 in u'Hh') or  # Zh, zh
                (c in u'Tt' and c_plus_1 in u'Ss') or  # Ts, ts
                (c in u'Ss' and c_plus_1 in u'Hh') or  # Sh, sh (and also covers Sht, sht)
                (c in u'Cc' and c_plus_1 in u'Hh') or  # Ch, ch
                (c in u'Yy' and c_plus_1 in u'Uu') or  # Yu, yu
                (c in u'Yy' and c_plus_1 in u'Aa')  # Ya, ya
            )) or
            (lang_code == 'ru' and (
                (c in u'Cc' and c_plus_1 in u'HhKkZz') or  # c, ch, ck, cz
                (c in u'Tt' and c_plus_1 in u'Hh') or  # th
                (c in u'Ww' and c_plus_1 in u'Hh') or  # wh
                (c in u'Pp' and c_plus_1 in u'Hh') or  # ph
                (c in u'Ee' and c_plus_1 == u'\'') or  # e'
                (c == u'i' and c_plus_1 == u'y' and
                 string_to_transliterate[index + 2:index + 3] not in u'aou') or  # iy[^AaOoUu]
                (c in u'Jj' and c_plus_1 in u'UuAaEeIiOo') or  # j, ju, ja, je, ji, jo
                (c in u'Ss' and c_plus_1 in u'HhZz') or  # s, sh, sz
                (c in u'Yy' and c_plus_1 in u'AaOoUuEeIi\'') or  # y, ya, yo, yu, ye, yi, y'
                (c in u'Zz' and c_plus_1 in u'Hh') or  # z, zh
                (c == u'\'' and c_plus_1 == u'\'')  # ''
            )) or
            (lang_code == 'ua' and (
                (c in u'Jj' and c_plus_1 in u'eau') or  # je, ja, ju
                (c in u'Šš' and c_plus_1 in u'č')  # šč
            )) or
            (lang_code == 'mn' and (
                (c in u'Kk' and c_plus_1 == u'h') or  # Х х
                (c in u'Ss' and c_plus_1 == u'h') or  # Ш ш
                (c in u'Tt' and c_plus_1 == u's') or  # Ц ц
                (c in u'Cc' and c_plus_1 == u'h') or  # Ч ч
                (c in u'Yy' and c_plus_1 in u'eoua')  # Е Ё Ю Я
            ))
        ):
            # Consume the look-ahead character as part of the current key.
            index += 1
            c += c_plus_1

        # `index` now points at the last consumed character, so a third letter
        # exists exactly when index + 1 is still inside the string. (Requiring
        # index + 2 here would miss a trigraph at the very end of the input.)
        has_following_char = index + 1 < length_of_string_to_transliterate

        # In Bulgarian, the letter "щ" is represented by three latin letters: "sht",
        # so we need this logic to support the third latin letter.
        if lang_code == 'bg' and \
                has_following_char and \
                c in (u'sh', u'Sh', u'SH') and \
                string_to_transliterate[index + 1] in u'Tt':
            index += 1
            c += string_to_transliterate[index]

        # Similarly in Russian, the letter "щ" is represented by "shh".
        if lang_code == 'ru' and \
                has_following_char and \
                c in (u'sh', u'Sh', u'SH') and \
                string_to_transliterate[index + 1] in u'Hh':  # shh
            index += 1
            c += string_to_transliterate[index]

        # The Lj/Nj merge at the top of the condition applies to every language,
        # but it is wrong for Mongolian, where n and j must stay separate letters
        # (original author's example: "Sü(nj)idmaa"). Undo the merge so that the
        # "j" is processed on its own in the next iteration.
        if lang_code == 'mn' and c in [u'Lj', u'lj', u'Nj', u'nj']:
            index -= 1
            c = c[:-1]

        # If the key is in the dictionary, it has a cyrillic counterpart so let's transliterate it.
        if c in transliteration_dict:
            # ay, ey, iy, oy, uy: in Russian a "y" that follows a vowel becomes a short i.
            if lang_code == 'ru' and c in u'Yy' and \
                    cyrillic_str and cyrillic_str[-1].lower() in u"аеиоуэя":
                cyrillic_str += u"й" if c == u'y' else u"Й"
            else:
                # Transliterate the current key.
                cyrillic_str += transliteration_dict[c]
        # If the key is not in the transliteration dictionary,
        # it is most likely a number or a special character so just keep it.
        else:
            cyrillic_str += c

        index += 1

    return __encode_utf8(cyrillic_str)
def supported():
    ''' List the language codes that have an entry in the transliteration table.

    :return: A new list holding the keys of TRANSLIT_DICT in ascending order.
    '''
    language_codes = list(TRANSLIT_DICT.keys())
    language_codes.sort()
    return language_codes