In [9]:
MAXLEN = 5
vocabulary = set()
with open('./word_freq_list.txt', encoding='utf-8') as f:
    for l in f.readlines():
        vocabulary.add(l.split(' ')[1])

In [10]:
def forward_split(text):
    split_words = []
    while text != '':
        sub_string = text[:MAXLEN]
        while sub_string != '':
            if sub_string in vocabulary or len(sub_string) == 1:
                split_words.append(sub_string)
                break
            else:
                sub_string = sub_string[:-1]
        text = text[len(sub_string):]
    return split_words

In [11]:
def backward_split(text):
    split_words = []
    while text != '':
        sub_string = text[-MAXLEN:]
        while sub_string != '':
            if sub_string in vocabulary or len(sub_string) == 1:
                split_words.append(sub_string)
                break
            else:
                sub_string = sub_string[1:]
        text = text[:-len(sub_string)]
    return split_words[::-1]

In [12]:
def contrast(forward_list, backward_list):
    forward_hit = [(1 if w in vocabulary else 0) for w in forward_list]
    backward_hit = [(1 if w in vocabulary else 0) for w in forward_list]
    forward_acc = sum(forward_hit) / len(forward_list)
    backward_acc = sum(backward_hit) / len(backward_list)
    if forward_acc > backward_acc:
        return forward_list, forward_acc
    elif backward_acc > forward_acc:
        return backward_list, backward_acc
    else:
        forward_single = [(1 if len(w) == 1 else 0) for w in forward_list]
        backward_single = [(1 if len(w) == 1 else 0) for w in backward_list]
        if sum(backward_single) < sum(forward_single):
            return backward_list, backward_acc
        elif sum(backward_single) > sum(forward_single):
            return forward_list, forward_acc
        else:
            if len(backward_list) < len(forward_list):
                return backward_list, backward_acc
            else:
                return forward_list, forward_acc

In [13]:
def evaluation(gold_string_list, predict_string_list):
    gold_len = [len(i) for i in gold_string_list]
    gold_set = {(sum(gold_len[:i]), sum(gold_len[:i + 1])) for i, _ in enumerate(gold_len)}

    predict_len = [len(i) for i in predict_string_list]
    predict_set = {(sum(predict_len[:i]), sum(predict_len[:i + 1])) for i, _ in enumerate(predict_len)}

    intersection_set = gold_set.intersection(predict_set)
    return len(intersection_set), len(gold_set), len(predict_set)

In [20]:
gold_list = []
i = 0
with open('./pku_test_gold.txt', encoding='utf-8') as f:
    for l in f.readlines():
        text_string_list = l.split('。')
        for text_string in text_string_list:
            gold_list.append(text_string.split('  '))

predict_list = []
with open('./pku_test.txt', encoding='utf-8') as f:
    for l in f.readlines():
        text_string_list = l.split('。')
        for text_string in text_string_list:
            forward_words_list = forward_split(text_string)
            backward_words_list = backward_split(text_string)
            better_words_list, _ = contrast(forward_words_list, backward_words_list)
            predict_list.append(better_words_list)

In [23]:
len(gold_list)
predict_list[-2]

['（', '新华社', '记者', '李', '昌', '元', '摄', '）', '\n']

In [24]:
intersection_num = 0
gold_num = 0
predict_num = 0
for i in range(len(gold_list)):
    intersection_num_i, gold_num_i, predict_num_i = evaluation(gold_list[i], predict_list[i])
    intersection_num += intersection_num_i
    gold_num += gold_num_i
    predict_num += predict_num_i
precision = intersection_num / predict_num
recall = intersection_num / gold_num
print('正确标记的个数：{}'.format(intersection_num))
print('金标语料分词数目：{}'.format(gold_num))
print('输出分词数目：{}'.format(predict_num))
print('准确率：{}'.format(precision))
print('召回率：{}'.format(recall))


正确标记的个数：93508
金标语料分词数目：109741
输出分词数目：110174
准确率：0.8487301904260534
召回率：0.8520789859760709
