## 总结：
1. 二元语法模型：每次计算只涉及连续的两个单词的二元连续语言模型；
2. 平滑策略：使语法模型频次折线平滑为曲线
3. 词网：句子中所有能在词典中匹配到的单词（一元语法），按起始位置逐行组织成的网状结构

In [2]:
from pyhanlp import *

### 3.3.1加载语料库

In [12]:
# Java class that loads a corpus file into a list of sentences
CorpusLoader = SafeJClass('com.hankcs.hanlp.corpus.document.CorpusLoader')
# Java class used further down to compute statistics over the sentences and save them as text
NatureDictionaryMaker = SafeJClass('com.hankcs.hanlp.corpus.dictionary.NatureDictionaryMaker')

# NOTE(review): my_cws_corpus() is defined in a later cell; that cell has to be run first.
corpus_path = my_cws_corpus()
sents = CorpusLoader.convert2SentenceList(corpus_path)
# Print every sentence of the loaded corpus
for sent in sents:
    print(sent)

[商品, 和, 服务]
[商品, 和服, 物美价廉]
[服务, 和, 货币]


### 3.3.2统计一元语法

In [6]:
def train_bigram(corpus_path, model_path):
    """
    Train a model from a segmented corpus and save it as text files.

    :param corpus_path: path of the corpus file to load
    :param model_path: path prefix handed to ``NatureDictionaryMaker.saveTxtTo``
    """
    sents = CorpusLoader.convert2SentenceList(corpus_path)
    for sent in sents:
        for word in sent:
            # Only fill in the placeholder label 'n' when the word has none,
            # so labels already present in the corpus are not overwritten.
            if word.label is None:
                word.setLabel('n')
    maker = NatureDictionaryMaker()
    maker.compute(sents)
    maker.saveTxtTo(model_path)
    
# Path prefix for the saved model files, relative to the current working directory
model_path = './data/my_cws_test/my_cws_model'
train_bigram(corpus_path, model_path)

# my_cws_model.ngram.txt 为自动生成的二元语法词典

In [18]:
# from tests.test_utility import test_data_path
# 实现test_data_path 方法
import zipfile
import os

from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    Return the directory that holds the test data, creating it if necessary.

    The directory is ``<HANLP_DATA_PATH>/test``.

    :return: path of the test data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs also creates a missing parent directory and does not fail when
    # the directory already exists (no check-then-create race).
    os.makedirs(data_path, exist_ok=True)
    return data_path


# from test.test_utility import ensure_data
def ensure_data(data_name, data_url):
    """
    Return the local path of a test data set, downloading it first if needed.

    If ``data_url`` ends with ``.zip`` the archive is downloaded next to the
    target, extracted into the test data directory and then removed.

    :param data_name: name of the file or directory under the test data directory
    :param data_url: URL to download from when the data is not present locally
    :return: ``<test data directory>/<data_name>``
    """
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path

    if not data_url.endswith('.zip'):
        download(data_url, dest_path)
        return dest_path

    # Keep the archive path separate from the returned path. The original code
    # tried to strip the suffix with dest_path[:len('.zip')], which keeps only
    # the first four characters of the path instead of dropping '.zip'.
    archive_path = dest_path + '.zip'
    download(data_url, archive_path)
    # NOTE(review): assumes the archive unpacks to an entry named data_name
    # directly under root_path - confirm for new data sets.
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(root_path)
    remove_file(archive_path)
    return dest_path

In [9]:
# from tests.book.ch03.demo_corpus_loader import my_cws_corpus
def my_cws_corpus():
    """
    Return the path of the small demo corpus, writing the file on first use.

    The file lives in the test data directory and holds three
    space-separated sentences, one per line.

    :return: path of ``my_cws_corpus.txt``
    """
    corpus_path = os.path.join(test_data_path(), 'my_cws_corpus.txt')
    if os.path.isfile(corpus_path):
        return corpus_path

    # First call: create the corpus file.
    with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
        corpus_file.write('商品 和 服务\n商品 和服 物美价廉\n服务 和 货币')
    return corpus_path

In [20]:
# Microsoft Research Asia (MSR) corpus, taken from the SIGHAN 2005 bakeoff archive
# from tests.book.ch03.msr import msr_model
# ensure_data downloads the archive when the data is not present locally yet
sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
msr_train = os.path.join(sighan05, 'training', 'msr_training.utf8')
# The model prefix lives in the test data directory, not inside the corpus directory
msr_model = os.path.join(test_data_path(), 'msr_cws')
msr_test = os.path.join(sighan05, 'testing', 'msr_test.txt')
msr_output = os.path.join(sighan05, 'testing', 'msr_bigram_output.txt')
msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')

In [36]:
from jpype import JString

from pyhanlp import *
# from tests.book.ch03.demo_corpus_loader import my_cws_corpus
# from tests.book.ch03.msr import msr_model
# from tests.test_utility import test_data_path

In [33]:
# Bind the Java classes used by the functions below
NatureDictionaryMaker = SafeJClass('com.hankcs.hanlp.corpus.dictionary.NatureDictionaryMaker')
CorpusLoader = SafeJClass('com.hankcs.hanlp.corpus.document.CorpusLoader')
WordNet = JClass('com.hankcs.hanlp.seg.common.WordNet')
Vertex = JClass('com.hankcs.hanlp.seg.common.Vertex')
ViterbiSegment = JClass('com.hankcs.hanlp.seg.Viterbi.ViterbiSegment')
# load_bigram refers to this class as DijkstraSegment; the binding used to be
# misspelled, which raised NameError for ret_viterbi=False.
DijkstraSegment = JClass('com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment')
# Misspelled alias kept so code that used the old name keeps working
DijksraSegment = DijkstraSegment
CoreDictionary = LazyLoadingJClass('com.hankcs.hanlp.dictionary.CoreDictionary')
Nature = JClass('com.hankcs.hanlp.corpus.tag.Nature')

In [47]:
def train_bigram(corpus_path, model_path):
    """
    Train a bigram model from a segmented corpus and save it as text files.

    :param corpus_path: path of the corpus file to load
    :param model_path: path prefix handed to ``NatureDictionaryMaker.saveTxtTo``
    """
    sentences = CorpusLoader.convert2SentenceList(corpus_path)

    # Give every word that has no label the placeholder label 'n'.
    unlabeled_words = (
        token
        for sentence in sentences
        for token in sentence
        if token.label is None
    )
    for token in unlabeled_words:
        token.setLabel('n')

    # Compute the dictionary statistics and write them out.
    dictionary_maker = NatureDictionaryMaker()
    dictionary_maker.compute(sentences)
    dictionary_maker.saveTxtTo(model_path)
    
    
def load_bigram(model_path, verbose=True, ret_viterbi=True):
    """
    Point HanLP at a trained model and return a segmenter.

    :param model_path: path prefix of the model; ``.txt`` (unigram dictionary)
        and ``.ngram.txt`` (bigram dictionary) are appended to it
    :param verbose: when true, print the frequency of a sample word, the word
        lattice of a sample sentence and the path found through it
    :param ret_viterbi: when true return a ``ViterbiSegment``, otherwise a
        ``DijkstraSegment``
    :return: the segmenter, with named entity recognition and the custom
        dictionary switched off
    """
    # Configure the dictionary paths HanLP reads the model from.
    HanLP.Config.CoreDictionaryPath = model_path + '.txt'
    HanLP.Config.BiGramDictionaryPath = model_path + '.ngram.txt'

    if verbose:
        sample = '商品和服务'
        # Frequency of one word in the core dictionary
        print(CoreDictionary.getTermFrequency('商品'))
        lattice = generate_wordnet(sample, CoreDictionary.trie)
        print(lattice)
        print(viterbi(lattice))

    segment_class = ViterbiSegment if ret_viterbi else DijkstraSegment
    segmenter = segment_class()
    segmenter = segmenter.enableAllNamedEntityRecognize(False)
    return segmenter.enableCustomDictionary(False)


def generate_wordnet(sent, trie):
    """
    Build the word lattice of a sentence from dictionary matches.

    :param sent: the sentence
    :param trie: the dictionary trie that is searched for matches in ``sent``
    :return: the word lattice (a ``WordNet`` instance)
    """
    searcher = trie.getSearcher(JString(sent), 0)
    wordnet = WordNet(sent)
    # Add one vertex per dictionary match. The row index is the match's start
    # offset plus one (row 0 presumably holds the sentence-start vertex).
    while searcher.next():
        wordnet.add(
            searcher.begin + 1,
            Vertex(
                sent[searcher.begin:searcher.begin + searcher.length],
                searcher.value,
                searcher.index
            )
        )
    # Fill rows that received no vertex, presumably so the lattice stays connected.
    vertexes = wordnet.getVertexes()
    i = 0
    while i < len(vertexes):
        if len(vertexes[i]) == 0:   # empty row
            # Fallback for when the range below is empty and the loop body never runs
            j = i + 1
            # Look for the first non-empty row after i (the last row is not examined)
            for j in range(i + 1, len(vertexes) - 1):
                if len(vertexes[j]):
                    break
            # Cover rows [i, j) with one vertex; row k corresponds to sent[k - 1]
            wordnet.add(i, Vertex.newPunctuationInstance(sent[i - 1: j - 1]))
            i = j
        else:
            # Skip ahead by the length of the last vertex's word in this row
            i += len(vertexes[i][-1].realWord)
    return wordnet


def viterbi(wordnet):
    """
    Find a path through the word lattice and return its words.

    :param wordnet: lattice whose ``getVertexes()`` returns the rows of vertices
    :return: list of ``realWord`` values along the path, from first row to last
    """
    rows = wordnet.getVertexes()
    last_row = len(rows) - 1

    # Forward pass: let every vertex update the vertices that start right
    # after its own word ends.
    for start in range(last_row):
        for source in rows[start]:
            for target in rows[start + len(source.realWord)]:
                target.updateFrom(source)

    # Backward pass: follow the predecessor links from the first vertex of
    # the last row, then flip the collected vertices into forward order.
    backtrace = []
    current = rows[last_row].getFirst()
    while current:
        backtrace.append(current)
        current = current.getFrom()
    backtrace.reverse()
    return [vertex.realWord for vertex in backtrace]

In [48]:
# Build the toy corpus, train on it, then load the model.
# load_bigram prints a short demo because verbose defaults to True.
corpus_path = my_cws_corpus()
model_path = os.path.join(test_data_path(), 'my_cws_model')
train_bigram(corpus_path, model_path)
load_bigram(model_path)

2209
0:[ ]
1:[商, 商品]
2:[品]
3:[和, 和服]
4:[服, 服务]
5:[务]
6:[ ]

[' ', '商品', '和', '服务', ' ']


<jpype._jclass.com.hankcs.hanlp.seg.Viterbi.ViterbiSegment at 0x7faf6d043390>