In [1]:
#*_* coding:utf-8 *_*

In [2]:
from tqdm import tqdm
import math

In [3]:
fpath = "dict.txt"

### 载入词典

In [4]:
lfreq = {}
lftotal = 0
with open(fpath,'r') as f:
    linenum = 0
    for line in tqdm(f.read().strip().split('\n')):
        linenum += 1
        try:
            word,freq,_ = line.split(' ')
            freq = int(freq)
            lfreq[word] = freq
            lftotal += freq
            for ch in range(len(word)):
                wfrag = word[:ch + 1]
                if wfrag not in lfreq:
                    lfreq[wfrag] = 0
        except:
            pass

100%|██████████| 584429/584429 [00:03<00:00, 150561.30it/s]


In [5]:
len(lfreq)
# lfreq["香侬"]  # 词库中没有香侬这个词

827207

In [6]:
def get_DAG(sentence):
    DAG = {}
    n = len(sentence)
    for k in range(n):
        temlist = []
        i = k
        frag = sentence[k]
        while i<n and frag in lfreq:
            if lfreq[frag]:
                temlist.append(i)
            i += 1
            frag = sentence[k:i+1]     ## 左闭右开
            
        if not temlist:
            temlist.append(k)
        DAG[k] = temlist
    return DAG

In [7]:
DAG = get_DAG("欢迎加入香侬科技")

In [8]:
DAG

{0: [0, 1], 1: [1], 2: [2, 3], 3: [3], 4: [4], 5: [5], 6: [6, 7], 7: [7]}

In [9]:
def get_route(DAG,sentence,route):
    n = len(sentence)
    route[n] = (0,0)
    logtotal = math.log(lftotal)
    for idx in range(n-1,-1,-1):
        route[idx] = max((math.log(lfreq.get(sentence[idx:x+1]) or 1)- logtotal +route[x+1][0],x)
                              for x in DAG[idx])

In [10]:
route = {}
get_route(DAG,"欢迎加入香侬科技",route)
route

{8: (0, 0),
 7: (-11.25195129228439, 7),
 6: (-8.621081625431708, 7),
 5: (-22.381217252373446, 5),
 4: (-31.99305551884484, 4),
 3: (-40.36109639411737, 3),
 2: (-41.787215342110194, 3),
 1: (-51.655308867679885, 1),
 0: (-51.525888820365445, 1)}

In [11]:
dag = get_DAG("2015年3月")
dag

{0: [0], 1: [1], 2: [2], 3: [3], 4: [4], 5: [5], 6: [6]}

In [12]:
import re

In [13]:
def seg(sentence):
    sentence = sentence.strip()
    DAG = get_DAG(sentence)
    route = {}
    get_route(DAG,sentence,route)
    x = 0
    n = len(sentence)
    buf = ''
    lseg = []
    re_eng = re.compile('[a-zA-Z0-9]',re.U)
    while x<n:
        y = route[x][1] + 1    #[] 为前闭后开
        l_word = sentence[x:y]
        if re_eng.match(l_word) and len(l_word)==1:
            buf += l_word
            x = y
        else:
            if buf:
                lseg.append(buf)
                buf = ''
            lseg.append(l_word)
            x = y
    if buf:
        lseg.append(buf)      ## 保证末尾字词不被忽略
    return lseg

In [14]:
sentence = "欢迎加入香侬科技"
se = seg(sentence)
se

['欢迎', '加入', '香', '侬', '科技']

In [15]:
sentence = "2015年3月"
se = seg(sentence)
se

['2015', '年', '3', '月']

### 载入人民日报语料 进行测试

In [16]:
fpath = "/home/kebo/人民日报语料库2014/2014/0102"

In [17]:
import os

In [18]:
filelist = []
for file in tqdm(os.listdir(fpath)):
    if not re.match("^\._",file):
        file = os.path.join(fpath,file)
        filelist.append(file)

100%|██████████| 5286/5286 [00:00<00:00, 205293.58it/s]


In [19]:
len(filelist)

2643

In [20]:
def transfor(line):
    line = re.sub("/[a-z]+","",line)
    line = line.replace("\n","").strip()
    line = line.replace("[","").strip()
    line = line.replace("]","").strip()
    cut_line = line.split(" ")
    line = "".join(cut_line)
    return line,cut_line

In [21]:
eva_list = []
seg_list = []
for file in tqdm(filelist):
    with open(file) as f:
        for line in f.readlines():
            line,cut_line = transfor(line)
            seg_list.append(seg(line))
            eva_list.append(cut_line)

100%|██████████| 2643/2643 [00:16<00:00, 159.41it/s]


In [22]:
len(eva_list)

20791

In [23]:
def car_preci(eva_list,seg_list):
    if len(eva_list) != len(seg_list):
        print("error !")
        return 0
    p_score = 0.0
    r_score=0.0
    f_score=0.0
    for i in tqdm(range(len(eva_list))):
        if len(seg_list[i])>0:
            count = 0
            for w in seg_list[i]:
                if w in eva_list[i]:
                    count+=1
            p_score += count/len(seg_list[i])
            r_score += count/len(eva_list[i])
    p_score = p_score/len(eva_list)
    r_score = r_score/len(eva_list)
    f = p_score*r_score*2/(p_score+r_score)
    return (p_score,r_score,f)

In [24]:
p_score,r_score,f = car_preci(eva_list,seg_list)

100%|██████████| 20791/20791 [00:01<00:00, 10861.15it/s]


In [25]:
print(p_score,r_score,f)

0.787546769422249 0.8270464959795102 0.8068134681691268


### 测试结果
    准确率：0.787546769422249
    召回率：0.8270464959795102
    f值：0.8068134681691268