# Chapter11 条件随机场
## LinearChainCRF 模型

In [1]:
import numpy as np

class LinearChainCRF():
    """Linear-chain conditional random field with indicator features.

    Features are harvested from the training data: one state feature per
    observed (observation, label) pair and one transition feature per
    observed (previous label, label) pair.  Weights are stored in ``self.w``
    (state-feature weights first, transition-feature weights after) and are
    trained by ``fit`` with an iterative-scaling update solved by Newton's
    method.  Decoding is done with the Viterbi algorithm.
    """

    class FeatureFunc():
        """State indicator feature bound to one (observation, label) pair."""

        def __init__(self, x, y_i):
            self.x = x
            self.y_i = y_i

        def __call__(self, x, y_i):
            """Return 1.0 if both observation and label match, else 0.0."""
            return float(self.y_i == y_i and self.x == x)

    class TransFeatureFunc():
        """Transition indicator feature bound to one (previous label, label) pair."""

        def __init__(self, x, y_i_1, y_i):
            self.x = x
            self.y_i_1 = y_i_1
            self.y_i = y_i

        def __call__(self, x, y_i_1, y_i):
            """Return 1.0 if the label pair matches, else 0.0.

            The observation ``x`` is accepted but not compared, so the
            feature depends on the label transition only.
            """
            return float(self.y_i_1 == y_i_1 and self.y_i == y_i)

    def get_all_feature_funcs(self, observ_arr, status_arr):
        """Collect the distinct state and transition features seen in the data.

        Args:
            observ_arr: iterable of observation sequences.
            status_arr: iterable of label sequences aligned with ``observ_arr``.

        Returns:
            Tuple ``(feature_funcs, trans_feature_funcs)`` of lists of
            callables, in first-occurrence order of their keys.

        Side effects:
            Sets ``self.func_keys`` and ``self.trans_func_keys`` (the keys of
            the returned features, kept for debugging).
        """
        feature_dict = {}
        trans_feature_dict = {}
        for o_seq, s_seq in zip(observ_arr, status_arr):
            for x, y in zip(o_seq, s_seq):
                feature_dict[(x, y)] = self.FeatureFunc(x, y)
            for i in range(1, len(o_seq)):
                # NOTE: the key must stay consistent with what
                # TransFeatureFunc compares (labels only, not the observation).
                key = (s_seq[i - 1], s_seq[i])
                trans_feature_dict[key] = self.TransFeatureFunc(o_seq[i], s_seq[i - 1], s_seq[i])
        # The two key lists below exist only to make debugging easier.
        self.func_keys = list(feature_dict)
        self.trans_func_keys = list(trans_feature_dict)
        return (list(feature_dict.values()), list(trans_feature_dict.values()))

    def get_empirical_val(self, observ_arr, status_arr):
        """Compute empirical statistics of the training data.

        Requires ``self.feature_funcs`` and ``self.trans_feature_funcs``.

        Returns:
            Tuple ``(obs_emp_dict, feature_emp, trans_feature_emp)`` where
            ``obs_emp_dict`` maps each observation sequence (as a tuple) to
            its relative frequency, and the two arrays hold each feature's
            total count divided by the number of training sequences.
        """
        feature_emp = np.zeros(len(self.feature_funcs))
        trans_feature_emp = np.zeros(len(self.trans_feature_funcs))
        obs_emp_dict = {}
        sample_num = len(observ_arr)
        for seq_o, seq_s in zip(observ_arr, status_arr):
            tuple_o = tuple(seq_o)
            obs_emp_dict[tuple_o] = obs_emp_dict.get(tuple_o, 0) + 1 / sample_num
            pairs = list(zip(seq_o, seq_s))
            for k, func in enumerate(self.feature_funcs):
                for o, s in pairs:
                    feature_emp[k] += func(o, s)
            for k, func in enumerate(self.trans_feature_funcs):
                # Pair every position (from the second on) with its
                # predecessor's label; no None sentinel is needed.
                for (_, prev_s), (o, s) in zip(pairs, pairs[1:]):
                    trans_feature_emp[k] += func(o, prev_s, s)
        return (obs_emp_dict, feature_emp / sample_num, trans_feature_emp / sample_num)

    def get_seq_feature_array(self, seq_o):
        """Evaluate every feature at every position and label (pair) of ``seq_o``.

        Returns:
            ``feature_array`` of shape (len(seq_o), S, K_state) and
            ``trans_feature_array`` of shape (len(seq_o), S, S, K_trans),
            where S is the number of labels.  Transition features are left
            at zero for position 0, which has no predecessor.
        """
        s_num = len(self.status)
        feature_array = np.zeros([len(seq_o), s_num, len(self.feature_funcs)])
        trans_feature_array = np.zeros([len(seq_o), s_num, s_num, len(self.trans_feature_funcs)])
        for i, o in enumerate(seq_o):
            for a, s_a in enumerate(self.status):
                for k, func in enumerate(self.feature_funcs):
                    feature_array[i, a, k] = func(o, s_a)
                if i > 0:
                    # Here s_a plays the role of the previous label.
                    for b, s_b in enumerate(self.status):
                        for k, func in enumerate(self.trans_feature_funcs):
                            trans_feature_array[i, a, b, k] = func(o, s_a, s_b)
        return feature_array, trans_feature_array

    def __init__(self, observ_arr, status_arr):
        """Build features, empirical statistics and cached feature arrays.

        Args:
            observ_arr: list of observation sequences.
            status_arr: list of label sequences aligned with ``observ_arr``.
        """
        print('start init')
        self.feature_funcs, self.trans_feature_funcs = self.get_all_feature_funcs(observ_arr, status_arr)
        print('feature_funcs num %d, trans_feature_funcs num %d' % (len(self.feature_funcs), len(self.trans_feature_funcs)))
        obs_emp_dict, feature_emp, trans_feature_emp = self.get_empirical_val(observ_arr, status_arr)
        self.train_obs = list(obs_emp_dict.keys())
        self.obs_emp = list(obs_emp_dict.values())
        self.empirical = np.concatenate((feature_emp, trans_feature_emp))
        # Distinct labels in first-occurrence order, so that label indices
        # (and therefore argmax tie-breaking) are reproducible across runs.
        self.status = list(dict.fromkeys(s for lst in status_arr for s in lst))
        self.w = np.zeros(len(self.feature_funcs) + len(self.trans_feature_funcs))
        # Feature values do not depend on the weights, so they are computed
        # once per distinct training sequence and reused during training.
        self.feature_array_dict = {}
        self.trans_feature_array_dict = {}
        for seq_o in self.train_obs:
            feature_array, trans_feature_array = self.get_seq_feature_array(seq_o)
            self.feature_array_dict[seq_o] = feature_array
            self.trans_feature_array_dict[seq_o] = trans_feature_array

    def get_M_matrix(self, seq_o):
        """Return the exponentiated score matrices for a training sequence.

        ``seq_o`` must be one of the tuples in ``self.train_obs`` because the
        cached feature arrays are looked up by it.  The result has shape
        (len(seq_o), S, S); entry [i, a, b] is exp(transition score a->b at
        position i plus state score of b at position i).
        """
        fea_arr = self.feature_array_dict[seq_o]
        trans_fea_arr = self.trans_feature_array_dict[seq_o]
        state_num = len(self.feature_funcs)
        # Reshape to (len, 1, S) so the state score broadcasts over the
        # previous-label axis.
        fea_val = (fea_arr * self.w[:state_num]).sum(axis=2).reshape(len(seq_o), -1, len(self.status))
        trans_fea_val = (trans_fea_arr * self.w[state_num:]).sum(axis=3)
        M = trans_fea_val + fea_val
        return np.exp(M)

    def forward(self, seq_o, M):
        """Return the forward vectors, shape (len(seq_o) + 1, S).

        Row 0 is all ones; row i + 1 is row i multiplied by ``M[i]``.
        """
        forward_prob = []
        prob = np.ones(len(self.status))
        forward_prob.append(prob)
        for i in range(len(seq_o)):
            prob = np.matmul(prob, M[i])
            forward_prob.append(prob)
        return np.array(forward_prob)

    def backward(self, seq_o, M):
        """Return the backward vectors, shape (len(seq_o) + 1, S, 1).

        The last entry is all ones; entry i is ``M[i]`` times entry i + 1.
        """
        backward_prob = []
        prob = np.ones((len(self.status), 1))
        backward_prob.append(prob)
        for i in range(len(seq_o) - 1, -1, -1):
            prob = np.matmul(M[i], prob)
            backward_prob.append(prob)
        backward_prob.reverse()
        return np.array(backward_prob)

    def get_seq_T_val(self):
        """Return, per training sequence and final label, the largest total
        number of active features over all label paths ending in that label.

        Returns:
            Integer array of shape (len(self.train_obs), S).
        """
        s_num = len(self.status)
        # Builtin int is used because the np.int alias no longer exists in
        # current NumPy releases.
        T = np.zeros([len(self.train_obs), s_num], dtype=int)
        for seq_no, seq_o in enumerate(self.train_obs):
            fea_cnt = self.feature_array_dict[seq_o].sum(axis=2)
            trans_cnt = self.trans_feature_array_dict[seq_o].sum(axis=3)
            for i in range(len(seq_o)):
                # Max-sum recursion over the previous label (axis 0).
                T[seq_no] = (T[seq_no].reshape([s_num, -1]) + trans_cnt[i]).max(axis=0) + fea_cnt[i]
        return T

    def newton(self, f, g, x0, eps):
        """Find a root of ``f`` by Newton's method.

        Args:
            f: function whose root is sought.
            g: derivative of ``f``.
            x0: starting point.
            eps: stop when two successive iterates differ by at most ``eps``.

        Returns:
            The last iterate.
        """
        prex = x0
        x = x0 - f(x0) / g(x0)
        while abs(x - prex) > eps:
            prex, x = x, x - f(x) / g(x)
        return x

    # Train the parameters with the iterative scaling method.
    def fit(self, epoch = 100, eps = 0.001):
        """Update ``self.w`` in place by iterative scaling.

        Weights are updated one feature at a time; the forward/backward
        quantities are recomputed for every feature because the previous
        update has already changed ``self.w``.

        Args:
            epoch: number of passes over all features.
            eps: convergence tolerance handed to ``newton``.
        """
        print('start to train')
        T = self.get_seq_T_val()
        Tx = T.max(axis = 1)
        poly_size = Tx.max() + 1
        fea_func_num = len(self.feature_funcs)
        trans_fea_func_num = len(self.trans_feature_funcs)
        for it in range(epoch):
            print('start %d epoch' % it)
            loss = 0.
            for k in range(fea_func_num + trans_fea_func_num):
                # Tmax[t] accumulates the expected count of feature k over
                # the sequences whose maximal feature total equals t.
                Tmax = np.zeros(poly_size)
                func_i = k if k < fea_func_num else k - fea_func_num
                for seq_no, seq_o in enumerate(self.train_obs):
                    M = self.get_M_matrix(seq_o)
                    forward_prob = self.forward(seq_o, M)
                    backward_prob = self.backward(seq_o, M)
                    Z = forward_prob[-1].sum()

                    fea_arr = self.feature_array_dict[seq_o]
                    trans_fea_arr = self.trans_feature_array_dict[seq_o]
                    fea_seq_E = 0.
                    for i in range(len(seq_o)):
                        if k < fea_func_num:
                            # Marginal over the label at position i.
                            fea_prob = (forward_prob[i+1] * backward_prob[i+1].flatten()) / Z
                            fea_seq_E = fea_seq_E + (fea_arr[i, :, func_i] * fea_prob).sum()
                        else:
                            # Marginal over the (previous, current) label pair.
                            fea_prob = np.outer(forward_prob[i], backward_prob[i+1])*M[i] / Z
                            fea_seq_E = fea_seq_E + (trans_fea_arr[i, :, :, func_i] * fea_prob).sum()
                    Tmax[Tx[seq_no]] += (self.obs_emp[seq_no] * fea_seq_E)
                target = self.empirical[k]
                # Solve sum_t Tmax[t] * x**t == target for x = exp(dw).  The
                # derivative skips the constant term by filtering on the
                # exponent t (not on an unrelated outer loop index).
                dw = np.log(self.newton(
                    lambda x: sum(e * x ** t for t, e in enumerate(Tmax)) - target,
                    lambda x: sum(t * e * x ** (t - 1) for t, e in enumerate(Tmax) if t > 0),
                    1, eps))
                self.w[k] += dw
                loss += abs(target - Tmax.sum())
            loss = loss / (fea_func_num + trans_fea_func_num)
            print("epoch: %d, loss: %.4f" % (it, loss))

    def viterbi(self, seq_o):
        """Return the highest-scoring label sequence for ``seq_o``.

        Scores are summed weights (log domain).  ``seq_o`` may be any
        sequence of observations, not only a training sequence.
        """
        fea_arr, trans_fea_arr = self.get_seq_feature_array(seq_o)
        state_num = len(self.feature_funcs)
        fea_val = (fea_arr * self.w[:state_num]).sum(axis=2)
        trans_fea_val = (trans_fea_arr * self.w[state_num:]).sum(axis=3)
        s_num = len(self.status)
        # The constant start score is the same for every label, so it does
        # not influence the argmax.
        cur_prob = np.ones(s_num)
        max_index = []
        probs = []
        for i in range(len(seq_o)):
            trans_prob = cur_prob[:, None] + trans_fea_val[i]
            # Best previous label for each current label.
            max_index.append(np.argmax(trans_prob, axis = 0))
            cur_prob = np.max(trans_prob, axis = 0) + fea_val[i]
            probs.append(cur_prob)
        path = [''] * len(seq_o)
        cur_idx = np.argmax(probs[-1])
        # Follow the back-pointers from the last position to the first.
        for i in range(len(seq_o) - 1, -1, -1):
            path[i] = self.status[cur_idx]
            cur_idx = max_index[i][cur_idx]
        return path

    def predict(self, test_arr):
        """Decode every observation sequence in ``test_arr`` with Viterbi."""
        return [self.viterbi(seq_o) for seq_o in test_arr]

## 算法测试
### 由于特征函数太多会导致训练很慢，因此未使用全部训练样本

In [2]:
import re
# Read the PKU word-segmentation training corpus.  The context manager closes
# the handle deterministically, and the explicit encoding keeps the read
# independent of the platform's default locale (the corpus is UTF-8).
with open('data/pku_training.utf8', encoding='utf-8') as train_file:
    train_raw = train_file.read()
# Only the first 30 lines are kept so that training stays tractable.
train_lines = train_raw.split('\n')[:30]
def get_train_data(train_lines):
    """Convert space-segmented corpus lines into character/tag sequences.

    Every line is cut into clauses at the punctuation marks in the split
    pattern.  Within a clause each space-separated word contributes its
    characters as observations and a tag per character: 'S' for a
    single-character word, otherwise 'B', zero or more 'M', then 'E'.
    Clauses without any characters are dropped.

    Returns:
        Tuple ``(observ_arr, status_arr)`` of parallel lists of lists.
    """
    observ_arr = []
    status_arr = []
    for line in train_lines:
        for clause in re.split('[，。；！]', line):
            chars = []
            tags = []
            for word in clause.split(' '):
                size = len(word)
                if size == 0:
                    # Consecutive spaces produce empty fields; ignore them.
                    continue
                chars.extend(word)
                if size == 1:
                    tags.append('S')
                else:
                    tags.extend(['B'] + ['M'] * (size - 2) + ['E'])
            if chars:
                observ_arr.append(chars)
                status_arr.append(tags)
    return observ_arr, status_arr

def predict(model, test_list):
    """Segment each string in ``test_list`` and print the result.

    ``model.viterbi`` is called with the list of characters and must return
    one tag per character.  A space is emitted before the character of a 'B'
    tag, after the character of an 'E' tag, and on both sides of an 'S'
    character; characters with any other tag are emitted unchanged.
    """
    for sentence in test_list:
        tags = model.viterbi(list(sentence))
        pieces = []
        for pos, char in enumerate(sentence):
            tag = tags[pos]
            if tag == 'S':
                pieces.append(' ' + char + ' ')
            elif tag == 'B':
                pieces.append(' ' + char)
            elif tag == 'E':
                pieces.append(char + ' ')
            else:
                pieces.append(char)
        print(''.join(pieces))

# Build character/tag training sequences from the selected corpus lines,
# construct the CRF (which extracts features and caches feature arrays),
# and train it for 5 epochs.
observ_arr, status_arr = get_train_data(train_lines)
model = LinearChainCRF(observ_arr, status_arr)
model.fit(5)
# Unsegmented sample sentences; the module-level predict() prints each one
# with spaces inserted at the predicted word boundaries.
inputs = ['中国人民愿与世界各国人民一道',
        '为开创持久和平、共同发展的新世纪而不懈努力',
        '中国的外交工作取得了重要成果',
        '符合和平与发展的时代主题']
predict(model, inputs)

start init
feature_funcs num 941, trans_feature_funcs num 8
start to train
start 0 epoch
epoch: 0, loss: 0.0191
start 1 epoch
epoch: 1, loss: 0.0187
start 2 epoch
epoch: 2, loss: 0.0183
start 3 epoch
epoch: 3, loss: 0.0180
start 4 epoch
epoch: 4, loss: 0.0176
 中国  人民 愿  与  世界  各国  人民  一道 
 为  开创 持 久  和 平  、  共同  发展  的  新  世纪  而  不懈  努力 
 中国  的  外交  工作  取得  了  重要  成果 
 符合  和 平  与  发展  的  时代  主题 
