In [1]:
#! -*- coding:utf-8 -*-

import json
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import re, os
import codecs


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

import keras
keras.__version__

Using TensorFlow backend.


'2.3.1'

# bert parameters

In [2]:
maxlen = 100
config_path = 'model/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'model/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'model/chinese_L-12_H-768_A-12/vocab.txt'


token_dict = {}

with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]') # space类用未经训练的[unused1]表示
            else:
                R.append('[UNK]') # 剩余的字符是[UNK]
        return R

tokenizer = OurTokenizer(token_dict)

# Arrange word matrix

In [3]:
import jieba
import numpy as np

def count_words(s):
    stop_words = ['$', '?', '_', '“', '”', '、', '。', '《', '》','，','（', '）', '\n', '的', '了', '是']
    tokenstr = []
    result = {}
    
    word2pos = {}
    pos2word = {}
    
    words = jieba.cut(s)
    
    i = 0 
    for word in words:
        if word in stop_words: continue
        tokenstr.append(word)
        result[word] = result.get(word, 0) + 1
        pos2word[i] = word
        
        indexs = word2pos.get(word, [])
        indexs.append(i)
        word2pos[word] = indexs
        
        i+=1
    
    result = dict(sorted(result.items(), key=lambda x: (x[1],x[0]), reverse=True))
    wordslist = list(result.keys())
    assert len(set(tokenstr)) == len(wordslist)
    return (wordslist, tokenstr, word2pos, pos2word)


def fill_table(TD_list, related_tables,target_width, qqueue):
    TD_list[0] = qqueue[0] # TD_list 长度为target_width 第一个位置对应此单词在wlist中的索引。0,1,2...
    count = 1

    while qqueue != [] and count < target_width:
        use_index = qqueue[0] # 单词索引
        del qqueue[0]
        use_list = related_tables[use_index]  #取出use_index单词对应的相关单词。
        len1 = len(use_list)   # 查看 i对应 的相关单词的个数。
        len2 = target_width - count 
        if len1 >= len2:   # 大体意思应该是查看单词i对应的相关单词个数如果满足 target_width就直接从相关单词按顺序取出来填充到TD_list中。
            TD_list[count:] = use_list[:len2]  
            assert len(TD_list) == target_width
            count = target_width
            break
        else:              # 如果不满足就有多少填多少。剩下的用 -1填充。
            TD_list[count:count + len1] = use_list
            assert len(TD_list) == target_width
            count += len1
            for next_id in use_list:
                qqueue.append(next_id)
    for i in range(count, target_width):
        TD_list[i] = -1
        
        
def reorder(table, word2pos, pos2word, wlist, word2id):
    sort_table = []
    topn, neighbor = np.array(table).shape
    for i in range(topn):
        tmp = []
        tmp += word2pos[wlist[table[i][0]]] # record each center word index
        length = len(tmp)                   # occurred times of center words
        t = []                              # t is use to related words index
        for j in range(1, neighbor):
            t += word2pos[wlist[table[i][j]]]
        index = np.random.randint(len(t), size = 20-length)
        t = np.array(t)
        t = list(t[index])
        tmp = tmp + t                       # conccat the index of center word and index of its related words
        tmp.sort()
        for j in range(len(tmp)):
            tmp[j] = word2id[pos2word[tmp[j]]] # convert index to word_id
            # tmp[j] = pos2word[tmp[j]]       # convert index to word
        sort_table.append(tmp)
    
    return np.array(sort_table)


def text2matrix(s, sliding_window=3, target_width=5):
    """
    
    """
    (wlist, tokenwords, word2pos, pos2word) = count_words(s)
    word2id = {k:v for k,v in zip(wlist, range(len(wlist)))}
    wordslist_length = len(wlist)
    
    AM_table = [[0 for i in range(wordslist_length)] for j in range(wordslist_length)]
    
    # generate occurred matrix with sliding_window
    for num in range(len(tokenwords)-sliding_window+1):
        for i in range(sliding_window-1):
            for j in range(i+1, sliding_window):
                AM_table[wlist.index(tokenwords[num + i])][wlist.index(tokenwords[num + j])] += 1
                AM_table[wlist.index(tokenwords[num + j])][wlist.index(tokenwords[num + i])] += 1
                
    related_tables = {}
    for i in range(wordslist_length):
        related_tables[i] = [[index, num] for index, num in enumerate(AM_table[i]) if num > 0 and index != i]
        related_tables[i].sort(key=lambda x: x[1], reverse=True)
        related_tables[i] = [element[0] for element in related_tables[i]]
    
    TD_table = [[-1 for i in range(target_width)] for j in range(wordslist_length)]
    for i in range(wordslist_length):
        fill_table(TD_table[i], related_tables, target_width, [i]) # fill TD table with -1
    
    # TD_table = reorder(TD_table, word2pos, pos2word, wlist, word2id)
    
    # convert id to words: arrange word matrix
    awm = []
    for row in TD_table:
        awm.append([pos2word[i] for i in row])
    return wlist, awm # ,TD_table

# Load data for binary classify

neg = pd.read_excel('./data/bert_keras/neg.xls', header=None)
pos = pd.read_excel('./data/bert_keras/pos.xls', header=None)

data = []

for d in neg[0]:
    data.append((d, 0))

for d in pos[0]:
    data.append((d, 1))


# 按照9:1的比例划分训练集和验证集
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

# Load Data with multi labels(Baidu)

In [4]:
# 类别
n_class = 19
# 是否处理数据不平衡
imbalance = True
fixed_nums = 900
# 是否使用Arrange word matrix
AWM = True


x,y = None, None
if n_class == 13:
    x = open('./data/kkb/x.txt','r',encoding='utf8').readlines()
    y = eval(open('./data/kkb/y.txt', 'r',encoding='utf8').readlines()[0])
elif n_class == 19:
    x = open('./data/kkb/x_19.txt','r',encoding='utf8').readlines()
    y = eval(open('./data/kkb/y_19.txt','r',encoding='utf8').readlines()[0])

print('{} class task, data size:{},{}'.format(n_class,len(x),len(y)))


# 类别不平衡
if imbalance:
    ys = [str(i) for i in y]
    from collections import Counter
    counts = Counter(ys)
    for k,v in counts.items():
        #print(k,':',v)
        pass

    df = pd.DataFrame({'x':x,'y':ys})
    new_x,new_y = [],[]
    for k, v in counts.items():
        x_ = np.random.choice(df[df.y == k].x, fixed_nums)
        y_ = [eval(k)]*900
        new_x.extend(x_)
        new_y.extend(y_)
    print('after deal data imbalance, data size:',len(new_x),len(new_y))
    x,y = new_x, new_y

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(range(n_class))
y = mlb.fit_transform(y)
y = [list(i) for i in y]

# arrage word matrix
x_awm = []
if AWM:
    for row in x:
        _, awm = text2matrix(row)
        x_awm.append(''.join(list(np.reshape(awm[:20],(-1)))))
    x = x_awm


data = list(zip(x,y))
np.random.shuffle(data)
train_data = data[:-4000]
val_data = data[-4000:-2000]
test_data = data[-2000:]

19 class task, data size:22151,22151
after deal data imbalance, data size: 13500 13500


Building prefix dict from the default dictionary ...
I1023 10:28:17.963426 139844851050304 __init__.py:111] Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
I1023 10:28:17.965262 139844851050304 __init__.py:131] Loading model from cache /tmp/jieba.cache
Loading model cost 0.512 seconds.
I1023 10:28:18.477261 139844851050304 __init__.py:163] Loading model cost 0.512 seconds.
Prefix dict has been built succesfully.
I1023 10:28:18.478954 139844851050304 __init__.py:164] Prefix dict has been built succesfully.


data_19 = data
data_19[0]

data[0][0]

# Data generator

In [5]:
def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])


class data_generator:
    def __init__(self, data, batch_size=32, multi_labels=False):
        self.data = data
        self.batch_size = batch_size
        self.multi_labels = multi_labels
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = tokenizer.encode(first=text)
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    if self.multi_labels: Y = Y.reshape(-1, np.shape(Y)[-1])
                    yield [X1, X2], Y
                    [X1, X2, Y] = [], [], []

# Model for binarry classify

bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])
x = Lambda(lambda x: x[:, 0])(x)
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5), # 用足够小的学习率
    metrics=['accuracy']
)
model.summary()


train_D = data_generator(train_data)
valid_D = data_generator(valid_data)

model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=1,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D)
)

# Model for Multi_labels

## 1.Metrics

In [6]:
def micro_f1(y_true, y_pred):
    """F1 metric.
    
    Computes the micro_f1 and macro_f1, metrics for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0)
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)
    
    """Macro_F1 metric.
    """
    precision = true_positives/(predicted_positives+K.epsilon())
    recall = true_positives/(possible_positives+K.epsilon())
    macro_f1 = K.mean(2*precision*recall/(precision+recall+K.epsilon()))
        
    """Micro_F1 metric.
    """
    precision = K.sum(true_positives)/K.sum(predicted_positives)
    recall = K.sum(true_positives)/K.sum(possible_positives)
    micro_f1 = 2*precision*recall/(precision+recall+K.epsilon())
    return micro_f1

def macro_f1(y_true, y_pred):
    """F1 metric.
    
    Computes the micro_f1 and macro_f1, metrics for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0)
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)
    
    """Macro_F1 metric.
    """
    precision = true_positives/(predicted_positives+K.epsilon())
    recall = true_positives/(possible_positives+K.epsilon())
    macro_f1 = K.mean(2*precision*recall/(precision+recall+K.epsilon()))
        
    """Micro_F1 metric.
    """
    precision = K.sum(true_positives)/K.sum(predicted_positives)
    recall = K.sum(true_positives)/K.sum(possible_positives)
    micro_f1 = 2*precision*recall/(precision+recall+K.epsilon())
    return macro_f1

In [7]:
def f1_np(y_true, y_pred):
    """F1 metric.
    
    Computes the micro_f1 and macro_f1, metrics for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)), axis=0)
    predicted_positives = np.sum(np.round(np.clip(y_pred, 0, 1)), axis=0)
    possible_positives = np.sum(np.round(np.clip(y_true, 0, 1)), axis=0)
    
    """Macro_F1 metric.
    """
    precision = true_positives/(predicted_positives+1e-8)
    recall = true_positives/(possible_positives+1e-8)
    macro_f1 = np.mean(2*precision*recall/(precision+recall))
        
    """Micro_F1 metric.
    """
    precision = np.sum(true_positives)/np.sum(predicted_positives)
    recall = np.sum(true_positives)/np.sum(possible_positives)
    micro_f1 = 2*precision*recall/(precision+recall+1e-8)
    return micro_f1, macro_f1
    

y_true = np.array([[1,0,1,1,0],[1,1,0,1,1]])
y_pred = np.array([[0,1,1,1,0],[1,1,1,0,1]])

f1_np(y_true, y_pred)

(0.7142857092857143, 0.7333333277777777)

## 2.Model

In [8]:
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])
x = Lambda(lambda x: x[:, 0])(x)
p = Dense(n_class, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
# val_metric = Metrics([val_x,val_y])
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5), # 用足够小的学习率
    metrics=[micro_f1,macro_f1]
)
model.summary()

W1023 10:29:03.600848 139844851050304 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, None, 768)    101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 768)          0           model_2[1][0]              

## 3.Train

In [9]:
train_D = data_generator(train_data,multi_labels=True)
valid_D = data_generator(val_data,multi_labels=True)
test_D = data_generator(test_data,multi_labels=True)

model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=5,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D),
    # callbacks=[val_metric],
)
# tff.summary()
# val_metric.val_macro_f1s, val_metric.val_micro_f1s

W1023 10:29:17.918670 139844851050304 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7f2d582b6c50>

## 4.Evalulation of Test

In [10]:
test_D = data_generator(test_data,multi_labels=True)

model.evaluate_generator(test_D.__iter__(), len(test_D))

[0.07132768630981445, 0.9013189673423767, 0.7753145694732666]