From a0c3d826c2c50790b5e93e63a1bfeb821d75ce95 Mon Sep 17 00:00:00 2001 From: mozillazg Date: Mon, 8 May 2017 22:57:29 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=B9=E4=B8=BA=E4=BD=BF=E7=94=A8=E5=89=8D?= =?UTF-8?q?=E7=BC=80=E9=9B=86=E5=90=88=E4=BB=A3=E6=9B=BF=20trie=20?= =?UTF-8?q?=E5=AD=98=E5=82=A8=E8=AF=8D=E8=AF=AD=E6=95=B0=E6=8D=AE=E5=AD=97?= =?UTF-8?q?=E5=85=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pypinyin/contrib/mmseg.py | 96 ++++++++++++--------------------------- 1 file changed, 30 insertions(+), 66 deletions(-) diff --git a/pypinyin/contrib/mmseg.py b/pypinyin/contrib/mmseg.py index 1b801afc..dd7fd38c 100644 --- a/pypinyin/contrib/mmseg.py +++ b/pypinyin/contrib/mmseg.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- """最大正向匹配分词""" -from collections import defaultdict class Seg(object): - def __init__(self, trie): - self.trie = trie + def __init__(self, prefix_set): + self.prefix_set = prefix_set + self.trie = prefix_set def cut(self, text): """分词 @@ -15,79 +15,43 @@ def cut(self, text): """ remain = text while remain: - # 按最大词长切分 - text = remain[:self.trie.max_word_legth] - while len(text) > 1: - matched_tree = self.trie.match(text) - # 尾节点是个词语 - if '' in matched_tree: - yield text - # 从待处理文本中删除匹配的词语 - remain = remain[len(text):] - break + matched = '' + # 一次加一个字的匹配 + for index in range(len(remain)): + word = remain[:index + 1] + if word in self.prefix_set: + matched = word else: - # 不是个词语,删除右边一个字,重新开始匹配 - text = text[:-1] - else: - # 只剩一个字了还没匹配到,这个字算一个词语 - yield text - # 从待处理文本中删除匹配的词语 - remain = remain[len(text):] + # 前面的字符串是个词语 + if matched: + yield matched + matched = '' + remain = remain[index:] + else: # 前面为空 + yield word + remain = remain[index + 1:] + # 有结果了,剩余的重新开始匹配 + break + else: # 整个文本就是一个词语 + yield remain + break -class Trie(object): +class PrefixSet(object): def __init__(self): - self._data = defaultdict(tree) - self._max_word_legth = 0 + self._prefix_set = set() def train(self, word_s): - """更新 trie + """更新 prefix set :param word_s: 词语库列表 :type word_s: iterable :return: None """ for word in word_s: - # 更新最大词长 - word_length = len(word) - if word_length > self._max_word_legth: - self._max_word_legth = word_length - # 把词语的每个字更新到 trie 中 - # 父节点 - pre_tree = self._data - for letter in word: - # 当前节点 - current_tree = pre_tree[letter] - # 当前节点是子节点的父节点 - pre_tree = current_tree - # 标记最后一个字的节点,表示到这里是一个词语 - current_tree[''] = None - - def match(self, text): - """返回匹配的节点 - - :param word: 待匹配的文本 - :rtype: tree - - ``'' in tree`` 表示这个节点是词语的尾节点, 当前 text 是个词语 - """ - # 父节点 - pre_tree = self._data - for char in text: - if char in pre_tree: - # 当前节点 - current_tree = pre_tree[char] - # 当前节点是子节点的父节点 - pre_tree = current_tree - else: - return tree() - return current_tree - - @property - def max_word_legth(self): - """最大词长""" - return self._max_word_legth - + # 把词语的每个前缀更新到 prefix_set 中 + for index in range(len(word)): + self._prefix_set.add(word[:index + 1]) -def tree(): - return defaultdict(tree) + def __contains__(self, key): + return key in self._prefix_set