-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 8ed97c3
Showing
6 changed files
with
3,281 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
Copyright (c) 2021, oguna | ||
All rights reserved. | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are met: | ||
* Redistributions of source code must retain the above copyright notice, | ||
this list of conditions and the following disclaimer. | ||
* Redistributions in binary form must reproduce the above copyright notice, | ||
this list of conditions and the following disclaimer in the documentation | ||
and/or other materials provided with the distribution. | ||
* Neither the name of the <organization> nor the names of its contributors | ||
may be used to endorse or promote products derived from this software | ||
without specific prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY | ||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | ||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | ||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
Copyright 2010-2018, Google Inc. | ||
All rights reserved. | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are | ||
met: | ||
|
||
* Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
* Redistributions in binary form must reproduce the above | ||
copyright notice, this list of conditions and the following disclaimer | ||
in the documentation and/or other materials provided with the | ||
distribution. | ||
* Neither the name of Google Inc. nor the names of its | ||
contributors may be used to endorse or promote products derived from | ||
this software without specific prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
Copyright (c) 2011-2021, The UniDic Consortium | ||
All rights reserved. | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are | ||
met: | ||
|
||
* Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
|
||
* Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the | ||
distribution. | ||
|
||
* Neither the name of the UniDic Consortium nor the names of its | ||
contributors may be used to endorse or promote products derived | ||
from this software without specific prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# yet-another-migemo-dict | ||
|
||
ライセンスの緩いMigemo用辞書を提供するプロジェクト。 | ||
|
||
C/Migemoで用いられているMigemo用辞書は、SKKプロジェクトの辞書から生成しているため、ファイルはGPLライセンス下であると考えられます。 | ||
この場合、Migemoを利用するプロジェクトでは、辞書ファイルをプログラムに同梱して配布しづらくなります。 | ||
|
||
そこで本プロジェクトでは、BSDライセンスであるMozcと、GPL/LGPL/BSDライセンスであるUniDicからMigemo用辞書を生成することで、ライセンス的に利用しやすい辞書を提供します。 | ||
|
||
## 辞書元 | ||
|
||
| ファイル | プロジェクト | ライセンス | | ||
|---|---|---| | ||
| single_kanji.tsv | [Mozc](https://github.com/google/mozc) | 3-clause BSD | | ||
| lex_x_x.csv | [UniDic](https://unidic.ninjal.ac.jp/) | GPL / LGPL / BSD | | ||
|
||
## 生成方法 | ||
|
||
[UniDic](https://unidic.ninjal.ac.jp/)から現代書き言葉のUniDicをダウンロードし、 | ||
ダウンロードした`.tar.gz`に格納されている`lex_x_x.csv` (`x`は数字)をこのフォルダ内に配置してください。 | ||
|
||
次に、`build.py` を実行すると、`migemo-dict`ファイルを出力します。 | ||
このファイルの単語は、読みの辞書順に並んでいます。 | ||
|
||
```shell | ||
$ python build.py | ||
``` | ||
|
||
## 格納対象の単語 | ||
|
||
`single_kanji.tsv` に格納されている漢字と読みの対応はすべて格納対象としています。 | ||
|
||
一方、`lex_x_x.csv` からは、漢字のみか、漢字にひらがなが並んだ単語を対象としています。 | ||
(例:朝、謝まる) | ||
|
||
## ライセンス | ||
|
||
辞書元はどちらもBSDで配布されているため、本プロジェクトで生成した辞書もBSDとなります。 | ||
ライセンスの条項に従いご利用ください。 | ||
|
||
## TODO | ||
- 漢字の間にひらがながある単語のサポート(例:歩み行く) | ||
- [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd/)の適用による最新用語のサポート |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import re | ||
import csv | ||
|
||
# Accumulator filled by the loaders below: reading (str) -> list of words.
dictionary = {}
|
||
# Convert katakana to hiragana.
def kata2hira(hira: str) -> str:
    """Return *hira* with each character in 'ァ'..'ン' shifted to hiragana.

    Characters outside that range (kanji, ASCII, 'ー', 'ヴ', ...) are passed
    through unchanged.
    """
    first = ord('ァ')
    last = ord('ン')
    # Constant code-point distance between the katakana and hiragana rows.
    shift = ord('ぁ') - first
    return ''.join(
        chr(ord(ch) + shift) if first <= ord(ch) <= last else ch
        for ch in hira
    )
|
||
# Load the single-kanji file.
# Each useful line has the form "<key>\t<characters>"; every character of the
# second field is stored as a separate word under <key>.
with open('single_kanji.tsv', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Blank line, or a key with no characters after it: nothing to add
            # (indexing fields[1] here would raise IndexError).
            continue
        key = fields[0]
        # list.extend() iterates the string, so each character is appended
        # individually. Repeated keys accumulate into the same list; the
        # previous code called the non-existent list.concat() in that case.
        dictionary.setdefault(key, []).extend(fields[1])
|
||
# Load the UniDic lexicon file.
# Only surface forms made of one or more kanji, optionally followed by
# hiragana (okurigana), are accepted.
p = re.compile('([一-鿐]+)([ぁ-ん]*)')
# NOTE(review): the file name is hard-coded to lex_3_1.csv; other lex_x_x.csv
# versions must be renamed or this line edited.
# newline='' is what the csv module requires so that line endings and quoted
# fields containing newlines are parsed by the reader itself.
with open('lex_3_1.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        if len(row) <= 24:
            # Blank or truncated row: columns 0 and 24 are both needed.
            continue
        word = row[0]
        reading = row[24]
        r = p.fullmatch(word)
        if not r:
            continue
        kanji = r[1]
        okurigana = r[2]
        kana = kata2hira(reading)
        if okurigana:
            # The reading must end with the same okurigana as the surface
            # form; strip it so the key is the reading of the kanji part only.
            if not kana.endswith(okurigana):
                continue
            kana = kana[:-len(okurigana)]
        # Skip entries with no usable reading ('*' is treated as "no value").
        if kana == '' or kana == '*':
            continue
        dictionary.setdefault(kana, []).append(kanji)
|
||
# Output: one line per reading, ordered by reading. Each line is the reading
# followed by its de-duplicated, sorted words, all separated by tabs.
sortedKeys = sorted(dictionary)
with open('migemo-dict', mode='w', encoding='utf-16') as f:
    for k in sortedKeys:
        unique_words = sorted(set(dictionary[k]))
        joined = '\t'.join(unique_words)
        f.write(f'{k}\t{joined}\n')
Oops, something went wrong.