Commit 502d5f2

Merge 371df90 into 043c47d
mocobeta committed Apr 3, 2019
2 parents: 043c47d + 371df90
Showing 5 changed files with 20 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -5,7 +5,7 @@ python:
   - "3.4"
   - "3.5"
   - "3.6"
-  - "3.7"
+  - "3.7-dev"
 addons:
   apt:
     packages:
7 changes: 7 additions & 0 deletions ipadic/Noun.proper.csv.patch
@@ -0,0 +1,7 @@
--- Noun.proper.csv 2007-07-31 23:50:07.000000000 +0900
+++ Noun.proper.csv.20190403 2019-04-03 13:50:50.052298892 +0900
@@ -27325,3 +27325,4 @@
 桃ノ木鼻,1288,1288,8538,名詞,固有名詞,一般,*,*,*,桃ノ木鼻,モモノキハナ,モモノキハナ
 ドウ坂,1288,1288,3765,名詞,固有名詞,一般,*,*,*,ドウ坂,ドウザカ,ドーザカ
 戸城山,1288,1288,8538,名詞,固有名詞,一般,*,*,*,戸城山,トシロヤマ,トシロヤマ
+令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ
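
Each line in an IPADIC CSV source is one dictionary entry: surface form, left/right context IDs, word cost, and a feature string. A minimal sketch of how the added 令和 row breaks down, assuming the standard mecab-ipadic column order (the variable names below are illustrative, not taken from this repository):

    # Split the new dictionary row into its conventional fields.
    # Assumed layout: surface, left context ID, right context ID, word cost,
    # then POS/inflection features, base form, reading, pronunciation.
    entry = "令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ"
    fields = entry.split(",")
    surface, left_id, right_id, cost = fields[0], fields[1], fields[2], int(fields[3])
    pos_features = fields[4:10]      # 名詞, 固有名詞, 一般, *, *, *
    base_form, reading, pronunciation = fields[10], fields[11], fields[12]
    print(surface, cost, pos_features[0], reading)   # 令和 5904 名詞 レイワ

Lower word costs are preferred during lattice search, so the cost of 5904 makes 令和 cheap enough to be chosen as a single token alongside the neighboring proper-noun entries (costs 3765 to 8538).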
6 changes: 5 additions & 1 deletion ipadic/build.sh
@@ -15,6 +15,11 @@ if [ ! -e ${IPADIC_DIR} ]; then
     exit 1
 fi
 
+if [[ "${IPADIC_DIR%%/}" =~ mecab-ipadic-2.7.0-20070801$ ]]; then
+    # apply patch to mecab-ipadic
+    patch -u -N ${IPADIC_DIR%%/}/Noun.proper.csv < Noun.proper.csv.patch
+fi
+
 ENC=$2
 if [ -z ${ENC} ]; then
     ENC=euc-jp
@@ -28,7 +33,6 @@ if [ -e "${OUT_DIR}.zip" ]; then
     rm "${OUT_DIR}.zip"
 fi
 
-
 if [ -e ${WORKING_DIR} ]; then
     rm -rf ${WORKING_DIR}
 fi
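
The new step runs only when the dictionary directory matches the stock mecab-ipadic-2.7.0-20070801 name; patch -u applies the unified diff shipped above, and -N tells patch to skip hunks that appear already applied, so rerunning build.sh against an already-patched tree is harmless. A rough Python rendering of the same logic, purely illustrative (ipadic_dir stands in for the script's first argument):

    # Sketch of the patching step added to build.sh, in Python for clarity.
    import re
    import subprocess

    ipadic_dir = "mecab-ipadic-2.7.0-20070801"   # assumed value of $1
    if re.search(r"mecab-ipadic-2\.7\.0-20070801$", ipadic_dir.rstrip("/")):
        with open("Noun.proper.csv.patch", "rb") as patch_file:
            # -u: unified diff format; -N: ignore already-applied hunks
            subprocess.run(
                ["patch", "-u", "-N", ipadic_dir.rstrip("/") + "/Noun.proper.csv"],
                stdin=patch_file,
                check=False,   # patch exits non-zero when a hunk was already applied
            )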
Binary file modified ipadic/sysdic.zip
7 changes: 7 additions & 0 deletions tests/test_tokenizer.py
@@ -67,6 +67,13 @@ def test_tokenize2(self):
         self.assertEqual(1, len(tokens))
         self._check_token(tokens[0], u'한국어', u'記号,一般,*,*,*,*,한국어,*,*', NodeType.UNKNOWN)
 
+    def test_tokenize_patched_dic(self):
+        text = u'令和元年'
+        tokens = Tokenizer().tokenize(text)
+        self.assertEqual(2, len(tokens))
+        self._check_token(tokens[0], u'令和', u'名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ', NodeType.SYS_DICT)
+        self._check_token(tokens[1], u'元年', u'名詞,一般,*,*,*,*,元年,ガンネン,ガンネン', NodeType.SYS_DICT)
+
     def test_tokenize_unknown(self):
         text = u'2009年10月16日'
         tokens = Tokenizer().tokenize(text)
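
The new test pins down the user-visible effect of the patched dictionary: 令和元年 now splits into 令和 plus 元年, both resolved from the system dictionary (NodeType.SYS_DICT) instead of falling back to unknown-word handling. The same check as a standalone snippet, assuming janome is installed with the rebuilt sysdic.zip:

    # Tokenize "令和元年" (first year of the Reiwa era); printing a token
    # yields janome's default "surface<TAB>features" form.
    from janome.tokenizer import Tokenizer

    for token in Tokenizer().tokenize(u'令和元年'):
        print(token)
    # 令和    名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ
    # 元年    名詞,一般,*,*,*,*,元年,ガンネン,ガンネン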
