Commit 502d5f2

Merge 371df90 into 043c47d
mocobeta committed Apr 3, 2019
2 parents: 043c47d + 371df90
Showing 5 changed files with 20 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -5,7 +5,7 @@ python:
   - "3.4"
   - "3.5"
   - "3.6"
-  - "3.7"
+  - "3.7-dev"
 addons:
   apt:
     packages:
7 changes: 7 additions & 0 deletions ipadic/Noun.proper.csv.patch
@@ -0,0 +1,7 @@
--- Noun.proper.csv 2007-07-31 23:50:07.000000000 +0900
+++ Noun.proper.csv.20190403 2019-04-03 13:50:50.052298892 +0900
@@ -27325,3 +27325,4 @@
 桃ノ木鼻,1288,1288,8538,名詞,固有名詞,一般,*,*,*,桃ノ木鼻,モモノキハナ,モモノキハナ
 ドウ坂,1288,1288,3765,名詞,固有名詞,一般,*,*,*,ドウ坂,ドウザカ,ドーザカ
 戸城山,1288,1288,8538,名詞,固有名詞,一般,*,*,*,戸城山,トシロヤマ,トシロヤマ
+令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ
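
Each line in an IPADIC CSV source is one dictionary entry: surface form, left/right context IDs, word cost, and a feature string. A minimal sketch of how the added 令和 row breaks down, assuming the standard mecab-ipadic column order (the variable names below are illustrative, not taken from this repository):

    # Split the new dictionary row into its conventional fields.
    # Assumed layout: surface, left context ID, right context ID, word cost,
    # then POS/inflection features, base form, reading, pronunciation.
    entry = "令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ"
    fields = entry.split(",")
    surface, left_id, right_id, cost = fields[0], fields[1], fields[2], int(fields[3])
    pos_features = fields[4:10]      # 名詞, 固有名詞, 一般, *, *, *
    base_form, reading, pronunciation = fields[10], fields[11], fields[12]
    print(surface, cost, pos_features[0], reading)   # 令和 5904 名詞 レイワ

Lower word costs are preferred during lattice search, so the cost of 5904 makes 令和 cheap enough to be chosen as a single token alongside the neighboring proper-noun entries (costs 3765 to 8538).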
6 changes: 5 additions & 1 deletion ipadic/build.sh
@@ -15,6 +15,11 @@ if [ ! -e ${IPADIC_DIR} ]; then
     exit 1
 fi
 
+if [[ "${IPADIC_DIR%%/}" =~ mecab-ipadic-2.7.0-20070801$ ]]; then
+    # apply patch to mecab-ipadic
+    patch -u -N ${IPADIC_DIR%%/}/Noun.proper.csv < Noun.proper.csv.patch
+fi
+
 ENC=$2
 if [ -z ${ENC} ]; then
     ENC=euc-jp
@@ -28,7 +33,6 @@ if [ -e "${OUT_DIR}.zip" ]; then
     rm "${OUT_DIR}.zip"
 fi
 
-
 if [ -e ${WORKING_DIR} ]; then
     rm -rf ${WORKING_DIR}
 fi
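
The new step runs only when the dictionary directory matches the stock mecab-ipadic-2.7.0-20070801 name; patch -u applies the unified diff shipped above, and -N tells patch to skip hunks that appear already applied, so rerunning build.sh against an already-patched tree is harmless. A rough Python rendering of the same logic, purely illustrative (ipadic_dir stands in for the script's first argument):

    # Sketch of the patching step added to build.sh, in Python for clarity.
    import re
    import subprocess

    ipadic_dir = "mecab-ipadic-2.7.0-20070801"   # assumed value of $1
    if re.search(r"mecab-ipadic-2\.7\.0-20070801$", ipadic_dir.rstrip("/")):
        with open("Noun.proper.csv.patch", "rb") as patch_file:
            # -u: unified diff format; -N: ignore already-applied hunks
            subprocess.run(
                ["patch", "-u", "-N", ipadic_dir.rstrip("/") + "/Noun.proper.csv"],
                stdin=patch_file,
                check=False,   # patch exits non-zero when a hunk was already applied
            )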
Binary file modified ipadic/sysdic.zip
7 changes: 7 additions & 0 deletions tests/test_tokenizer.py
@@ -67,6 +67,13 @@ def test_tokenize2(self):
         self.assertEqual(1, len(tokens))
         self._check_token(tokens[0], u'한국어', u'記号,一般,*,*,*,*,한국어,*,*', NodeType.UNKNOWN)
 
+    def test_tokenize_patched_dic(self):
+        text = u'令和元年'
+        tokens = Tokenizer().tokenize(text)
+        self.assertEqual(2, len(tokens))
+        self._check_token(tokens[0], u'令和', u'名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ', NodeType.SYS_DICT)
+        self._check_token(tokens[1], u'元年', u'名詞,一般,*,*,*,*,元年,ガンネン,ガンネン', NodeType.SYS_DICT)
+
     def test_tokenize_unknown(self):
         text = u'2009年10月16日'
         tokens = Tokenizer().tokenize(text)
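
The new test pins down the user-visible effect of the patched dictionary: 令和元年 now splits into 令和 plus 元年, both resolved from the system dictionary (NodeType.SYS_DICT) instead of falling back to unknown-word handling. The same check as a standalone snippet, assuming janome is installed with the rebuilt sysdic.zip:

    # Tokenize "令和元年" (first year of the Reiwa era); printing a token
    # yields janome's default "surface<TAB>features" form.
    from janome.tokenizer import Tokenizer

    for token in Tokenizer().tokenize(u'令和元年'):
        print(token)
    # 令和    名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ
    # 元年    名詞,一般,*,*,*,*,元年,ガンネン,ガンネン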
