### 搭建一个分词系统

In [50]:
train_v2_file = 'data/train-v3.0.json'
dic_words_file = 'data/综合类中文词库.xlsx'

In [51]:
import numpy as np
import pandas as pd

In [52]:
df_dic_words = pd.read_excel(dic_words_file, header=None)
df_dic_words.columns = ["word", "info", "v"]

In [53]:
df_dic_words.head()

Unnamed: 0,word,info,v
0,酢,9 @,237692
1,做做事,120 v,191456
2,做做饭,134 n,95350
3,做做,210 v,223109
4,做作,208 a,34124


In [54]:
# df_dic_words[df_dic_words['word'].isin(['我', '学习'])]

In [55]:
# df_dic_words['v_8'] = df_dic_words['v']*(10**(-8))

In [56]:
# df_dic_words['v_8_log'] = -np.log(df_dic_words['v_8'])

In [57]:
# df_dic_words.head()

In [58]:
# df_dic_words = df_dic_words.reset_index()
df_dic_words = df_dic_words.set_index('word')
df_dic_words.head()

Unnamed: 0_level_0,info,v
word,Unnamed: 1_level_1,Unnamed: 2_level_1
酢,9 @,237692
做做事,120 v,191456
做做饭,134 n,95350
做做,210 v,223109
做作,208 a,34124


In [59]:
# 'dict', 'list', 'series', 'split', 'records', 'index'
dic_words1 = df_dic_words.to_dict(orient='index')
dic_words1

{'酢': {'info': '9  @', 'v': 237692},
 '做做事': {'info': '120  v', 'v': 191456},
 '做做饭': {'info': '134  n', 'v': 95350},
 '做做': {'info': '210  v', 'v': 223109},
 '做作': {'info': '208  a', 'v': 34124},
 '做主': {'info': '215  v', 'v': 116461},
 '做针线': {'info': '131  v', 'v': 63534},
 '做张做智': {'info': '1  i', 'v': 281475},
 '做张做致': {'info': '1  i', 'v': 281467},
 '做张做势': {'info': '122  v', 'v': 93360},
 '做贼心虚': {'info': '145  i', 'v': 204043},
 '做月子': {'info': '131  l', 'v': 24879},
 '做一天和尚撞一天钟': {'info': '4  i', 'v': 83718},
 '做一日和尚撞一天钟': {'info': '2  l', 'v': 159394},
 '做秀': {'info': '136  v', 'v': 124867},
 '做小伏低': {'info': '1  i', 'v': 278838},
 '做戏': {'info': '137  v', 'v': 262302},
 '做文章': {'info': '189  v', 'v': 183824},
 '做为': {'info': '146  v', 'v': 198044},
 '做头': {'info': '133  n', 'v': 52466},
 '做通': {'info': '126  v', 'v': 289114},
 '做私商勾当': {'info': '105  n', 'v': 259815},
 '做寿': {'info': '128  v', 'v': 144379},
 '做手脚': {'info': '144  v', 'v': 211425},
 '做事': {'info': '526  v', '

In [94]:
class Node(object):
    def __init__(self, name, info=None):
        self.name = name
        self.info = info
        self.children = {}
    
    def add_child(self, name, node):
        self.children[name] = node
    
    def find_child(self, name):
        """查询子节点是否存在"""
        if name in self.children.keys():
            return True
        else:
            return False
    
    def get_child(self, name):
        """获取某个子节点"""
        if name in self.children.keys():
            return self.children[name]
        else:
            return None

class DictTree(object):
    
    def __init__(self):
        self.dic_words = Node('root')
    
    def cut_words(self, sentence):
        """查询字典树"""
        return self.cut_words_(sentence, self.dic_words)
    
    def cut_words_(self, sentence, child_dic_words):
        for char in sentence:
            if char in child_dic_words.keys():
                    return child_dic_words
    
    def get_dic_tree(self, chars, info):
        """将数据创建成字典树"""
        self.get_dic_tree_(chars, info, self.dic_words)

    def get_dic_tree_(self, chars, info, dic_words):
        """将数据创建成字典树"""
        if len(chars) <= 0:
            raise Exception("单词不允许为空")
        elif len(chars) == 1:
            child_node = Node(chars[0], info)
            dic_words.add_child(chars[0], child_node)
        else:
            child_node = dic_words.get_child(chars[0])
            if child_node is None:
                child_node = Node(chars[0])
            dic_words.add_child(chars[0], child_node)
            self.get_dic_tree_(chars[1:], info, child_node)
    
    def add_words(self, dic_words: dict):
        """添加词，格式为{词: 信息}"""
        for dic_word, info in dic_words.items():
            self.get_dic_tree(dic_word, info)


In [95]:
dict_tree = DictTree()
dict_tree.add_words({'abcd': 'aaa', 'acde': 'bbb', 'bcd': 'ccc'})

In [97]:
dict_tree.dic_words.children

{'a': <__main__.Node at 0x260167b2d68>, 'b': <__main__.Node at 0x260167b2198>}