Permalink
Browse files

customize ik-analysis plugin

  • Loading branch information...
U-Medcl-THINK\Administrator U-Medcl-THINK\Administrator
U-Medcl-THINK\Administrator authored and U-Medcl-THINK\Administrator committed Jul 13, 2011
1 parent 71cc2e2 commit 21abad12a0096173e8836dd042ca403751ab7ad1
Showing with 557,846 additions and 407 deletions.
  1. +9 −0 .idea/libraries/junit.xml
  2. +1 −0 .idea/misc.xml
  3. +15 −14 .idea/modules.xml
  4. +1 −0 .idea/modules/elasticsearch-root.iml
  5. +5 −3 .idea/modules/plugin-analysis-icu.iml
  6. +20 −0 .idea/modules/plugin-analysis-ik.iml
  7. +10 −0 .idea/projectCodeStyle.xml
  8. +3 −0 .idea/vcs.xml
  9. +33 −2 config/elasticsearch.yml
  10. +9 −0 config/ik/IKAnalyzer.cfg.xml
  11. +33 −0 config/ik/custom/ext_stopword.dic
  12. +2 −0 config/ik/custom/mydict.dic
  13. +275,910 −0 config/ik/main.dic
  14. +25 −0 config/ik/preposition.dic
  15. +312 −0 config/ik/quantifier.dic
  16. +33 −0 config/ik/stopword.dic
  17. +37 −0 config/ik/suffix.dic
  18. +131 −0 config/ik/surname.dic
  19. +126 −388 modules/elasticsearch/src/main/java/config/names.txt
  20. +137 −0 plugins/analysis/ik/build.gradle
  21. +1 −0 plugins/analysis/ik/src/main/java/es-plugin.properties
  22. +15 −0 plugins/analysis/ik/src/main/java/org/elasticsearch/index/analysis/IkAnalysisBinderProcessor.java
  23. +20 −0 plugins/analysis/ik/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
  24. +31 −0 plugins/analysis/ik/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java
  25. +27 −0 plugins/analysis/ik/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java
  26. +256 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/Context.java
  27. +137 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/IKSegmentation.java
  28. +214 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/Lexeme.java
  29. +104 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/cfg/Configuration.java
  30. +292 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/dic/DictSegment.java
  31. +529 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/dic/Dictionary.java
  32. +75 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/dic/Hit.java
  33. +60 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/help/CharacterHelper.java
  34. +39 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
  35. +420 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/lucene/IKQueryParser.java
  36. +19 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/lucene/IKSimilarity.java
  37. +65 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
  38. +196 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/seg/CJKSegmenter.java
  39. +16 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/seg/ISegmenter.java
  40. +236 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/seg/LetterSegmenter.java
  41. +612 −0 plugins/analysis/ik/src/main/java/org/wltea/analyzer/seg/QuantifierSegmenter.java
  42. +83 −0 plugins/analysis/ik/src/main/uml/IKAnalysisBinderProcessor.uml
  43. +39 −0 plugins/analysis/ik/src/test/java/CfgTester.java
  44. +45 −0 plugins/analysis/ik/src/test/java/CharacterTest.java
  45. +485 −0 plugins/analysis/ik/src/test/java/DictionaryTester.java
  46. +97 −0 plugins/analysis/ik/src/test/java/IKAnalyzerDemo.java
  47. +37 −0 plugins/analysis/ik/src/test/java/IKTokenerTest.java
  48. +349 −0 plugins/analysis/ik/src/test/java/SegmentorTester.java
  49. +11 −0 plugins/analysis/ik/src/test/java/extended/ik_dict/IKAnalyzer.cfg.xml
  50. 0 plugins/analysis/ik/src/test/java/extended/ik_dict/ext_dict/mydict.dic
  51. +33 −0 plugins/analysis/ik/src/test/java/extended/ik_dict/ext_stopwords/ext_stopword.dic
  52. +275,910 −0 plugins/analysis/ik/src/test/java/extended/ik_dict/main.dic
  53. +25 −0 plugins/analysis/ik/src/test/java/extended/ik_dict/preposition.dic
  54. +312 −0 plugins/analysis/ik/src/test/java/extended/ik_dict/quantifier.dic
  55. +33 −0 plugins/analysis/ik/src/test/java/extended/ik_dict/stopword.dic
  56. +37 −0 plugins/analysis/ik/src/test/java/extended/ik_dict/suffix.dic
  57. +131 −0 plugins/analysis/ik/src/test/java/extended/ik_dict/surname.dic
  58. +3 −0 settings.gradle
@@ -0,0 +1,9 @@
+<component name="libraryTable">
+ <library name="junit"><!-- Library entry named "junit"; its only classes root is the junit-4.8.1 jar under ../repository relative to the project directory. -->
+ <CLASSES>
+ <root url="jar://$PROJECT_DIR$/../repository/junit-4.8.1.jar!/" />
+ </CLASSES>
+ <JAVADOC />
+ <SOURCES />
+ </library>
+</component>
View
1 .idea/misc.xml 100644 → 100755
@@ -36,5 +36,6 @@
<component name="SvnBranchConfigurationManager">
<option name="mySupportsUserInfoFilter" value="true" />
</component>
+ <component name="WebServicesPlugin" addRequiredLibraries="true" />
</project>
View
29 .idea/modules.xml 100644 → 100755
@@ -2,25 +2,26 @@
<project version="4">
<component name="ProjectModuleManager">
<modules>
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//benchmark-micro.iml" filepath="$PROJECT_DIR$/.idea/modules//benchmark-micro.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/benchmark-micro.iml" filepath="$PROJECT_DIR$/.idea/modules/benchmark-micro.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/elasticsearch.iml" filepath="$PROJECT_DIR$/.idea/modules/elasticsearch.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/elasticsearch-root.iml" filepath="$PROJECT_DIR$/.idea/modules/elasticsearch-root.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-analysis-icu.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-analysis-icu.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-cloud-aws.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-cloud-aws.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-analysis-ik.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-analysis-ik.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-cloud-aws.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-cloud-aws.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-lang-groovy.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-lang-groovy.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-lang-javascript.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-lang-javascript.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-lang-python.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-lang-python.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-mapper-attachments.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-mapper-attachments.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-river-couchdb.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-river-couchdb.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-river-rabbitmq.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-river-rabbitmq.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-lang-javascript.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-lang-javascript.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-lang-python.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-lang-python.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-mapper-attachments.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-mapper-attachments.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-river-couchdb.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-river-couchdb.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-river-rabbitmq.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-river-rabbitmq.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-river-twitter.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-river-twitter.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-river-wikipedia.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-river-wikipedia.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-transport-memcached.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-transport-memcached.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-transport-thrift.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-transport-thrift.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-transport-wares.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-transport-wares.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//plugins-hadoop.iml" filepath="$PROJECT_DIR$/.idea/modules//plugins-hadoop.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//test-integration.iml" filepath="$PROJECT_DIR$/.idea/modules//test-integration.iml" />
- <module fileurl="file://$PROJECT_DIR$/.idea/modules//test-testng.iml" filepath="$PROJECT_DIR$/.idea/modules//test-testng.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-river-wikipedia.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-river-wikipedia.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-transport-memcached.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-transport-memcached.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-transport-thrift.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-transport-thrift.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-transport-wares.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-transport-wares.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/plugins-hadoop.iml" filepath="$PROJECT_DIR$/.idea/modules/plugins-hadoop.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/test-integration.iml" filepath="$PROJECT_DIR$/.idea/modules/test-integration.iml" />
+ <module fileurl="file://$PROJECT_DIR$/.idea/modules/test-testng.iml" filepath="$PROJECT_DIR$/.idea/modules/test-testng.iml" />
</modules>
</component>
</project>
@@ -30,6 +30,7 @@
<orderEntry type="module" module-name="plugin-river-rabbitmq" />
<orderEntry type="module" module-name="plugin-river-wikipedia" />
<orderEntry type="module" module-name="test-integration" />
+ <orderEntry type="module" module-name="plugin-analysis-ik" />
</component>
</module>
View
@@ -1,13 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
-<module type="JAVA_MODULE" version="4">
+<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
- <output url="file://$MODULE_DIR$/../../plugins/analysis/icu/build/classes/main" />
- <output-test url="file://$MODULE_DIR$/../../plugins/analysis/icu/build/classes/test" />
+ <output url="file://$MODULE_DIR$/../../plugins/analysis/icu/target/classes" />
+ <output-test url="file://$MODULE_DIR$/../../plugins/analysis/icu/target/test-classes" />
<exclude-output />
<content url="file://$MODULE_DIR$/../../plugins/analysis/icu">
<sourceFolder url="file://$MODULE_DIR$/../../plugins/analysis/icu/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/../../plugins/analysis/icu/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/../../plugins/analysis/icu/build" />
+ <excludeFolder url="file://$MODULE_DIR$/../../plugins/analysis/icu/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
@@ -27,6 +28,7 @@
</SOURCES>
</library>
</orderEntry>
+ <orderEntry type="library" name="groovy-1.7.5" level="application" />
</component>
</module>
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+ <!-- IntelliJ IDEA module for the IK analysis plugin; main and test classes compile to separate directories under build/classes. -->
+ <component name="NewModuleRootManager" inherit-compiler-output="false">
+ <output url="file://$MODULE_DIR$/../../plugins/analysis/ik/build/classes/main" />
+ <output-test url="file://$MODULE_DIR$/../../plugins/analysis/ik/build/classes/test" />
+ <exclude-output />
+ <content url="file://$MODULE_DIR$/../../plugins/analysis/ik">
+ <sourceFolder url="file://$MODULE_DIR$/../../plugins/analysis/ik/src/main/java" isTestSource="false" />
+ <sourceFolder url="file://$MODULE_DIR$/../../plugins/analysis/ik/src/test/java" isTestSource="true" />
+ <excludeFolder url="file://$MODULE_DIR$/../../plugins/analysis/ik/build" />
+ </content>
+ <orderEntry type="inheritedJdk" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ <orderEntry type="module" module-name="elasticsearch" />
+ <orderEntry type="library" name="junit" level="application" />
+ <orderEntry type="library" name="junit" level="project" />
+ </component>
+</module>
+
View
10 .idea/projectCodeStyle.xml 100644 → 100755
@@ -147,11 +147,21 @@
<option name="LABEL_INDENT_ABSOLUTE" value="false" />
<option name="USE_RELATIVE_INDENTS" value="false" />
</ADDITIONAL_INDENT_OPTIONS>
+ <codeStyleSettings language="ECMA Script Level 4">
+ <option name="METHOD_ANNOTATION_WRAP" value="0" />
+ <option name="FIELD_ANNOTATION_WRAP" value="0" />
+ <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+ </codeStyleSettings>
<codeStyleSettings language="JavaScript">
<option name="METHOD_ANNOTATION_WRAP" value="0" />
<option name="FIELD_ANNOTATION_WRAP" value="0" />
<option name="PARENT_SETTINGS_INSTALLED" value="true" />
</codeStyleSettings>
+ <codeStyleSettings language="PHP">
+ <option name="METHOD_ANNOTATION_WRAP" value="0" />
+ <option name="FIELD_ANNOTATION_WRAP" value="0" />
+ <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+ </codeStyleSettings>
</value>
</option>
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
View
3 .idea/vcs.xml 100644 → 100755
@@ -1,5 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
+ <component name="ClearCaseSharedConfig">
+ <option name="myUseUcmModel" value="true" />
+ </component>
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
View
@@ -1,6 +1,6 @@
# Cluster Settings
-#cluster:
-# name: elasticsearch
+cluster:
+ name: elasticsearch_medcl
# Path Settings
#path:
@@ -26,3 +26,34 @@
# zen:
# ping.multicast.enabled: false
# ping.unicast.hosts: ["host1", "host2"]
+
+# To use the IK analyzer, set type: org.elasticsearch.index.analysis.IkAnalyzerProvider, or use name: ik
+#index:
+# analysis:
+# analyzer:
+# ik12:
+# alias: [news_analyzer, news_analyzer_ik]
+# type: custom
+# filter: [standard, lowercase, stop, word_delimiter]
+# tokenizer: standard
+
+# tokenizer: myTokenizer1
+# filter: [myTokenFilter1, myTokenFilter2]
+# char_filter: [my_html]
+# tokenizer:
+# myTokenizer1:
+# type: standard
+# max_token_length: 900
+# filter:
+# myTokenFilter1:
+# type: stop
+# stopwords: [stop1, stop2, stop3, stop4]
+# myTokenFilter2:
+# type: length
+# min: 0
+# max: 2000
+# char_filter:
+# my_html:
+# type: html_strip
+# escaped_tags: [xxx, yyy]
+# read_ahead: 1024
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
+<properties>
+ <comment>IK Analyzer 扩展配置</comment>
+ <!-- Users can configure their own extension dictionaries here -->
+ <entry key="ext_dict">custom/mydict.dic;</entry>
+ <!-- Users can configure their own extension stop-word dictionaries here -->
+ <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
+</properties>
@@ -0,0 +1,33 @@
+也
+了
+仍
+从
+以
+使
+则
+却
+
+又
+及
+对
+就
+并
+很
+或
+把
+
+是
+的
+着
+给
+而
+被
+让
+在
+还
+比
+等
+当
+与
+于
+但
@@ -0,0 +1,2 @@
+medcl
+脑残片
Oops, something went wrong.

0 comments on commit 21abad1

Please sign in to comment.