Skip to content
This repository has been archived by the owner on Jul 22, 2022. It is now read-only.

Commit

Permalink
Merge 8dfd550 into 624d464
Browse files Browse the repository at this point in the history
  • Loading branch information
miurahr committed May 4, 2021
2 parents 624d464 + 8dfd550 commit dc79c89
Show file tree
Hide file tree
Showing 17 changed files with 1,786 additions and 752 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ setup_requires =
klepto
setuptools>=42
setuptools-scm[toml]>=3.5.0
wheel
install_requires =
klepto
importlib_metadata;python_version<"3.8"
Expand Down Expand Up @@ -74,6 +73,7 @@ check =
docutils
check-manifest
flake8
flake8-black
readme-renderer
pygments
isort
Expand Down
10 changes: 6 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,20 @@


class MyBuild(build_py):
    """``build_py`` command that also generates the pykakasi dictionary files."""

    def run(self):
        """Run the regular ``build_py`` step, then generate the dictionaries.

        The dictionaries are written to ``<build_lib>/pykakasi/data``.
        Nothing is generated when the command runs in dry-run mode.
        """
        build_py.run(self)

        # The generator module lives in src/ next to this file and is not
        # installed yet, so src/ has to be put on sys.path before importing.
        root_dir = os.path.abspath(os.path.dirname(__file__))
        sys.path.insert(1, os.path.join(root_dir, "src"))

        from kakasidict import Genkanwadict

        if not self.dry_run:
            kanwa = Genkanwadict()
            dstdir = os.path.join(self.build_lib, "pykakasi", "data")
            kanwa.generate_dictionaries(dstdir)


# Register the custom build step; the version comes from setuptools-scm
# with the local version part suppressed.
setup(
    cmdclass={"build_py": MyBuild}, use_scm_version={"local_scheme": "no-local-version"}
)
62 changes: 33 additions & 29 deletions src/kakasidict.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,26 @@
class Genkanwadict:
records = {} # type: Dict[str, Dict[str, List[Tuple[str, str]]]]

# Matches a single backslash escape sequence. Compiled with re.VERBOSE so
# that every alternative can carry its own explanatory comment.
ESCAPE_SEQUENCE_RE = re.compile(
    r"""
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )""",
    re.UNICODE | re.VERBOSE,
)

def decode_escapes(self, s: str) -> str:
    """Return *s* with its backslash escape sequences decoded.

    Only the substrings matched by ``self.ESCAPE_SEQUENCE_RE`` are passed
    through the ``unicode-escape`` codec; all other text in *s* is left
    exactly as it is.
    """

    def decode_match(match):
        # Decode just the matched escape sequence, e.g. r"\u3042".
        return codecs.decode(match.group(0), "unicode-escape")

    return self.ESCAPE_SEQUENCE_RE.sub(decode_match, s)

def run(self, src: str, dst: str):
with open(src, 'r', encoding="utf-8") as f:
with open(src, "r", encoding="utf-8") as f:
for line in f:
self.parsekdict(line.strip())
f.close()
Expand All @@ -40,31 +44,31 @@ def mkdict(self, src: str, dst: str):
with open(src, "r", encoding="utf-8") as f:
for raw in f:
line = raw.strip()
if line.startswith(';;'): # skip comment
if line.startswith(";;"): # skip comment
continue
if re.match(r"^$", line):
continue
try:
(v, k) = self.decode_escapes(line).split(' ')
(v, k) = self.decode_escapes(line).split(" ")
dic[k] = v
max_key_len = max(max_key_len, len(k))
except ValueError:
raise Exception("Cannot process dictionary line: ", line)
d = file_archive(dst, dic, serialized=True)
d['_max_key_len_'] = max_key_len
d["_max_key_len_"] = max_key_len
d.dump()

def maketrans(self, src, dst):
dict = {}
with open(src, 'r', encoding='utf-8') as f:
with open(src, "r", encoding="utf-8") as f:
for raw in f:
line = raw.strip()
if line.startswith(';;'): # skip commnet
if line.startswith(";;"): # skip comment
continue
if re.match(r"^$", line):
continue
try:
(v, k) = self.decode_escapes(line).split(' ')
(v, k) = self.decode_escapes(line).split(" ")
dict[ord(k)] = v
except ValueError:
raise Exception("Cannot process dictionary line: ", line)
Expand All @@ -78,14 +82,14 @@ def maketrans(self, src, dst):
# for kanwadict

def parsekdict(self, line: str):
    """Parse one kanwa dictionary line and record it via ``updaterec``.

    A line has the form ``"<yomi> <kanji>"`` separated by a single space.
    When the last character of the reading has a code point no greater
    than ``"z"``, that character is split off and passed as ``tail``;
    otherwise ``tail`` is the empty string.

    Blank lines and lines starting with ``;;`` are ignored.

    Raises:
        ValueError: if a non-blank, non-comment line does not consist of
            exactly two space-separated fields.
    """
    # Blank lines are skipped here just as they are in mkdict/maketrans;
    # without this guard they fail on the tuple unpacking below.
    if not line or line.startswith(";;"):
        return
    (yomi, kanji) = line.split(" ")
    if ord(yomi[-1:]) <= ord("z"):
        tail = yomi[-1:]
        yomi = yomi[:-1]
    else:
        tail = ""
    self.updaterec(kanji, yomi, tail)

def updaterec(self, kanji: str, yomi: str, tail: str):
Expand All @@ -107,32 +111,32 @@ def kanwaout(self, out):

def generate_dictionaries(self, dstdir):
    """Build every dictionary file from ``src/data`` into *dstdir*.

    *dstdir* is created when missing. For each output file, any existing
    copy is removed first and the file is then rebuilt by the matching
    builder method: ``mkdict`` for the ``*3.db`` tables, ``maketrans``
    for ``itaijidict4.db`` and ``run`` for ``kanwadict4.db``.
    """
    DICTS = [
        ("hepburndict.utf8", "hepburndict3.db"),
        ("kunreidict.utf8", "kunreidict3.db"),
        ("passportdict.utf8", "passportdict3.db"),
        ("hepburnhira.utf8", "hepburnhira3.db"),
        ("kunreihira.utf8", "kunreihira3.db"),
        ("passporthira.utf8", "passporthira3.db"),
        ("halfkana.utf8", "halfkana3.db"),
    ]
    srcdir = os.path.join(root_dir, "src", "data")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dstdir, exist_ok=True)

    # (builder, source file, output file), in the order they must be built.
    jobs = [(self.mkdict, src_f, pkl_f) for (src_f, pkl_f) in DICTS]
    jobs.append((self.maketrans, "itaijidict.utf8", "itaijidict4.db"))
    jobs.append((self.run, "kakasidict.utf8", "kanwadict4.db"))

    for build, src_f, dst_f in jobs:
        src = os.path.join(srcdir, src_f)
        dst = os.path.join(dstdir, dst_f)
        # Remove any existing output before the builder runs.
        if os.path.exists(dst):
            os.unlink(dst)
        build(src, dst)

0 comments on commit dc79c89

Please sign in to comment.