### 搭建一个分词系统

In [1]:
train_v2_file = 'data/train-v3.0.json'
dic_words_file = 'data/综合类中文词库.xlsx'

In [2]:
import numpy as np
import pandas as pd

In [3]:
df_dic_words = pd.read_excel(dic_words_file, header=None)
df_dic_words.columns = ["word", "info", "v"]

In [4]:
df_dic_words.head()

Unnamed: 0,word,info,v
0,酢,9 @,237692
1,做做事,120 v,191456
2,做做饭,134 n,95350
3,做做,210 v,223109
4,做作,208 a,34124


In [5]:
# df_dic_words[df_dic_words['word'].isin(['我', '学习'])]

In [6]:
# # df_dic_words = df_dic_words.reset_index()
# df_dic_words = df_dic_words.set_index('word')
# # 'dict', 'list', 'series', 'split', 'records', 'index'
# dic_words1 = df_dic_words.to_dict(orient='index')
# dic_words1

In [7]:
import json


class DictTree(object):
    def __init__(self, name='root', word=None, info=None):
        self.name = name
        self.info = info
        self.word = word
        self.children = {}
    
    def __str__(self):
        return json.dumps({self.word: self.info}, ensure_ascii=False)
    
    def set_info(self, info):
        self.info = info
    
    def find_child(self, name):
        """查询子节点是否存在"""
        if name in self.children.keys():
            return True
        else:
            return False
    
    def add_word(self, chars, info, pointer=0):
        """将数据创建成字典树"""
        if len(chars[pointer:]) <= 0:
            self.info = info
            self.word = chars
        else:
            if not self.find_child(chars[pointer]):
                self.children[chars[pointer]] = DictTree(chars[pointer])
            self.children[chars[pointer]].add_word(chars, info, pointer + 1)
    
    def add_words(self, words, infos):
        """添加多组词"""
        for word, info in zip(words, infos):
            self.add_word(word, info)
    
    def add_words2(self, words):
        """添加多组词，不添加此相关信息"""
        for word in words:
            self.add_word(word, None)
            
    def cut_word(self, chars):
        """
        查询字典树中是否有某个词
        return: 没有该词会返回空，有该值会返回该词的信息
        """
        if len(chars) == 0:
            if self.word is None:
                return None
            else:
                return self
        elif self.find_child(chars[0]):
            return self.children[chars[0]].cut_word(chars[1:])
        else:
            return None
    
    def _cut_words(self, chars):
        """
        查询字典树中是否有某个词
        return: 没有该词会返回空，有该值会返回该词的信息
        """
        words = set()
        if len(chars) == 0:
            return words
        if self.word is not None:
            words.add(self)
        if self.find_child(chars[0]):
            words.update(self.children[chars[0]]._cut_words(chars[1:]))
        return words
        
    def cut_words(self, sentence):
        """查询字典树"""
        words = set()
        if len(sentence) == 0:
            return words
        for n in range(len(sentence)):
            words.update(self._cut_words(sentence[n:]))
        return words
    
    def to_dict(self):
        """输出成dict"""
        children = {}
        for child_name, child_node in self.children.items():
            children[child_name] = child_node.to_dict()
        return {
            'name': self.name,
            'info': self.info,
            'children': children,
        }
    
    def read_dict(self, node_dict):
        """从dict中读入"""
        self.name = node_dict['name']
        self.info = node_dict['info']
        for child_name, child_dict in node_dict['children'].items():
            self.children[child_name] = Node(None).read_dict(child_dict)


In [8]:
# 测试
dict_tree = DictTree()
words = {'abc': 'aabbcc', 'a': 'aa', 'acd': 'aaccdd', 'bcd': 'bbccdd'}
dict_tree.add_words(words.keys(), zip(words.keys(), words.values()))
# print(json.dumps(dict_tree.to_dict(), indent=1))
print(dict_tree.cut_word('abc'))
print(dict_tree.cut_word('acd'))
print(dict_tree.cut_word('ab'))
print(dict_tree.cut_word('ace'))
print(dict_tree.cut_word('a'))
print([i.word for i in dict_tree.cut_words('abcdacd')])

{"abc": ["abc", "aabbcc"]}
{"acd": ["acd", "aaccdd"]}
None
None
{"a": ["a", "aa"]}
['abc', 'bcd', 'a']


In [9]:
def deal_line(x, dict_tree):
    dict_tree.add_word(x['word'], x.to_dict())
    
dict_tree = DictTree()
df_dic_words.apply(lambda x: deal_line(x, dict_tree), axis=1);  # 分号防止打印

In [10]:
# '做张做致': {'info': '1  i', 'v': 281467},
print(dict_tree.cut_word('做张做致'))

{"做张做致": {"word": "做张做致", "info": "1  i", "v": 281467}}


In [11]:
print([i.word for i in dict_tree.cut_words('我们学习人工智能，人工智能是未来')])

['我们', '们', '习', '智能', '工智', '人', '学', '智', '人工智能', '未', '能', '是', '人工', '工', '我', '学习']
