### 搭建一个分词系统

In [1]:
# Input data locations, given as relative paths.
# NOTE(review): the variable is named "v2" but points at train-v3.0.json;
# confirm which version is intended.
train_v2_file = 'data/train-v3.0.json'
# Excel workbook holding the word dictionary that is loaded further down.
dic_words_file = 'data/综合类中文词库.xlsx'

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the dictionary workbook; header=None makes pandas treat the first row
# as data rather than as column names.
df_dic_words = pd.read_excel(dic_words_file, header=None)
# Name the three columns explicitly (this assignment fails if the sheet does
# not have exactly three columns).
df_dic_words.columns = ["word", "info", "v"]

In [16]:
# Preview the first rows of the loaded dictionary.
df_dic_words.head()

Unnamed: 0,word,info,v
0,酢,9 @,237692
1,做做事,120 v,191456
2,做做饭,134 n,95350
3,做做,210 v,223109
4,做作,208 a,34124


In [5]:
# df_dic_words[df_dic_words['word'].isin(['我', '学习'])]

In [90]:
# # df_dic_words = df_dic_words.reset_index()
# df_dic_words = df_dic_words.set_index('word')
# # 'dict', 'list', 'series', 'split', 'records', 'index'
# dic_words1 = df_dic_words.to_dict(orient='index')
# dic_words1

In [79]:
class Node(object):
    """A single trie node: a key, an optional payload and named children.

    Attributes:
        name: Key of this node inside its parent's ``children`` mapping.
        info: Payload stored on the node; ``None`` when nothing was stored.
        children: Mapping from child name to child ``Node``.
    """

    def __init__(self, name, info=None):
        self.name = name
        self.info = info
        self.children = {}

    def set_info(self, info):
        """Replace the payload stored on this node."""
        self.info = info

    def add_child(self, name, info=None) -> 'Node':
        """Return the child called ``name``, creating it first if missing.

        ``info`` is only used when a new child is created; an existing child
        keeps whatever payload it already has.
        """
        if name not in self.children:
            self.children[name] = Node(name, info=info)
        return self.children[name]

    def find_child(self, name) -> bool:
        """Return ``True`` if a child called ``name`` exists, else ``False``."""
        return name in self.children

    def get_child(self, name):
        """Return the child called ``name``, or ``None`` if there is none."""
        return self.children.get(name)

    def to_dict(self) -> dict:
        """Serialise this node and all descendants into nested plain dicts.

        Returns:
            ``{'name': ..., 'info': ..., 'children': {child_name: {...}}}``
            where every child is serialised the same way.
        """
        return {
            'name': self.name,
            'info': self.info,
            'children': {
                child_name: child_node.to_dict()
                for child_name, child_node in self.children.items()
            },
        }

    def read_dict(self, node_dict) -> 'Node':
        """Populate this node (recursively) from the output of ``to_dict``.

        Children found in ``node_dict`` are added to, or overwrite entries in,
        the existing ``children`` mapping.

        Returns:
            ``self``. Returning the node is required: child nodes are built
            with ``Node(None).read_dict(child_dict)`` and the result is stored
            in ``children``; without a return value every child would be
            ``None``.
        """
        self.name = node_dict['name']
        self.info = node_dict['info']
        for child_name, child_dict in node_dict['children'].items():
            self.children[child_name] = Node(None).read_dict(child_dict)
        return self

    
class DictTree(object):
    """Trie (prefix tree) mapping words to an arbitrary info payload.

    Each element of a word (each character, for a string) is one level of
    ``Node`` objects below the root stored in ``self.dic_words``.
    """

    def __init__(self):
        self.dic_words = Node('root')

    def cut_words(self, sentence):
        """Segment ``sentence`` using the trie (delegates to ``cut_words_``)."""
        return self.cut_words_(sentence, self.dic_words)

    def cut_words_(self, sentence, child_dic_words):
        """Placeholder for sentence segmentation.

        TODO: not implemented yet; currently always returns ``None``.
        """
        return None

    def cut_word(self, word):
        """Look up ``word`` in the trie.

        Returns:
            The info stored for ``word``. ``None`` when the word is absent,
            including when it is only a prefix of stored words and has no
            info of its own.
        """
        return self.cut_word_(word, self.dic_words)

    def cut_word_(self, chars, child_dic_words):
        """Follow ``chars`` downwards from ``child_dic_words``.

        Implemented as a loop rather than by recursing on ``chars[1:]`` so a
        long key neither copies the remaining sequence at every step nor
        exceeds the interpreter's recursion limit.

        Returns:
            The ``info`` of the node reached after consuming every element of
            ``chars``, or ``None`` as soon as a step has no matching child.
        """
        node = child_dic_words
        for char in chars:
            node = node.get_child(char)
            if node is None:
                return None
        return node.info

    def get_dic_tree(self, chars, info):
        """Insert the word ``chars`` with payload ``info`` into the trie."""
        self.get_dic_tree_(chars, info, self.dic_words)

    def get_dic_tree_(self, chars, info, child_dic_words):
        """Insert ``chars`` below ``child_dic_words`` and store ``info``.

        Missing nodes along the path are created; ``info`` is set on the node
        for the last element (on ``child_dic_words`` itself when ``chars`` is
        empty), replacing any payload already stored there. Iterative for the
        same reasons as ``cut_word_``.
        """
        node = child_dic_words
        for char in chars:
            node = node.add_child(char)
        node.set_info(info)

    def add_words(self, dic_words: dict):
        """Insert many words at once; ``dic_words`` maps word -> info."""
        for dic_word, info in dic_words.items():
            self.get_dic_tree(dic_word, info)


In [81]:
# Smoke test: stored words return their info; a bare prefix ('ab') and an
# unknown word ('ace') print None.
dict_tree = DictTree()
dict_tree.add_words({
    'abc': 'aabbcc',
    'a': 'aa',
    'acd': 'aaccdd',
    'bcd': 'bbccdd',
})
# One lookup result per line, in the order the words are listed.
print(*map(dict_tree.cut_word, ('abc', 'acd', 'ab', 'ace', 'a')), sep='\n')

aabbcc
aaccdd
None
None
aa


In [88]:
def deal_line(x, dict_tree):
    """Insert one dictionary row into ``dict_tree``.

    The row's ``word`` field is the key; the whole row, converted to a plain
    dict, is stored as that word's info.
    """
    row_info = x.to_dict()
    dict_tree.get_dic_tree(x['word'], row_info)


dict_tree = DictTree()
# Feed every row (axis=1) to deal_line; the trailing semicolon keeps the
# notebook from echoing the result of apply().
df_dic_words.apply(deal_line, axis=1, args=(dict_tree,));

In [89]:
# Spot-check one word after loading the dictionary. Sample entry for reference:
# '做张做致': {'info': '1  i', 'v': 281467},
print(dict_tree.cut_word('做张做致'))

{'word': '做张做致', 'info': '1  i', 'v': 281467}
