In [1]:
import collections
import gc
import json
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
from random import choice, seed, randint, random
import pandas as pd
import numpy as np
import tensorflow as tf
import keras.backend as K
import keras
from keras.models import Sequential, Model
from keras.layers import Input, CuDNNGRU as GRU, CuDNNLSTM as LSTM, Dropout, BatchNormalization
from keras.layers import Dense, Concatenate, Activation, Embedding, SpatialDropout1D, Bidirectional, Lambda, Conv1D
from keras.layers import Add, Average
from keras.optimizers import Nadam, Adam, Adamax
from keras.activations import absolute_import
from keras.legacy import interfaces
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import Callback
from keras.utils import to_categorical
from sklearn.model_selection import KFold as KF
from sklearn.model_selection import StratifiedKFold as SKF
from keras_bert.loader import load_trained_model_from_checkpoint
from keras_bert import AdamWarmup, calc_train_steps
from keras.engine import Layer
from keras.engine import InputSpec
from keras.objectives import categorical_crossentropy
from keras.objectives import sparse_categorical_crossentropy
from keras import activations, initializers, regularizers, constraints
from keras.models import load_model
from keras_bert import get_custom_objects
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score
from special_tokens import CHINESE_MAP
from metric_utils import compute_f1, compute_exact
from collections import OrderedDict, Counter

Using TensorFlow backend.


In [2]:
# ---- Paths and run-level constants -----------------------------------------
BERT_PRETRAINED_DIR = "../../../chinese_bert/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/"
TRN_FILENAME = "../data/train_20200228.csv"
DEV_FILENAME = "../data/dev_20200228.csv"
MAX_EPOCH = 20
MAX_LEN = 60                       # total token budget for an encoded query pair
MAX_DOC_LEN = MAX_LEN // 2         # budget reserved for the first query
THRE = 0.5
B_SIZE = 32
ACCUM_STEP = int(32 // B_SIZE)     # gradient-accumulation steps so that B_SIZE * ACCUM_STEP == 32
FOLD_ID = [-1]                     # -1 selects the fixed train/dev split; other ids select CV folds
FOLD_NUM = 25
SEED = 2020
PREFIX = "roberta_large_v2_augm" + "_seed" + str(SEED)
SHUFFLE = True
DOC_STRIDE = 128

# ---- Model / optimizer configuration shared by the cells below --------------
cfg = {
    "verbose": PREFIX,
    "base_dir": BERT_PRETRAINED_DIR,
    "span_mode": True,
    "lr": 2e-5,
    "min_lr": 6e-8,
    "ch_type": "tx_ft",
    "trainable": True,
    "bert_trainable": True,
    "accum_step": ACCUM_STEP,
    "cls_num": 4,
    "unit1": 128,
    "unit2": 128,
    "unit3": 512,
    "conv_num": 128,
    "maxlen": MAX_LEN,
    "adv_training": False,
}

# ---- Data: missing cells become empty strings -------------------------------
train_data = pd.read_csv(TRN_FILENAME).fillna("")
dev_data = pd.read_csv(DEV_FILENAME).fillna("")
all_data = pd.concat([train_data, dev_data], axis=0, ignore_index=True)

def get_data(df_data):
    """Group candidate queries by their anchor query.

    Args:
        df_data: DataFrame with the columns ``query1``, ``query2`` and ``label``.

    Returns:
        dict mapping every distinct ``query1`` value to
        ``{"pos": [...], "neg": [...]}``, where ``pos`` holds the ``query2``
        values labelled 1 and ``neg`` those labelled 0. Rows with any other
        label are reported on stdout and left out.
    """
    res = {}
    for anchor, group in df_data.groupby('query1'):
        pos_qs, neg_qs = [], []
        for q, lable in zip(group["query2"], group["label"]):
            if lable == 1:
                pos_qs.append(q)
                continue
            if lable == 0:
                neg_qs.append(q)
                continue
            print("wrong data", anchor, q, lable)
        res[anchor] = {"pos": pos_qs, "neg": neg_qs}
    return res

# train_data_dict = get_data(train_data)

In [3]:
def get_vocab():
    """Load the BERT vocabulary file as a token -> line-index mapping.

    The vocabulary file name depends on whether ``cfg["verbose"]`` mentions
    "albert". Every value of ``CHINESE_MAP`` must be present in the vocabulary
    (asserted) and is removed from the returned mapping.

    Returns:
        dict mapping token string to its 0-based line number in the vocab file.
    """
    is_albert = "albert" in cfg["verbose"].lower()
    vocab_name = 'vocab_chinese.txt' if is_albert else 'vocab.txt'
    dict_path = os.path.join(BERT_PRETRAINED_DIR, vocab_name)

    with open(dict_path, mode="r", encoding="utf8") as f:
        tokens = [raw.strip() for raw in f.readlines()]

    word_index = {}
    for position, token in enumerate(tokens):
        word_index[token] = position

    for _, mapped_token in CHINESE_MAP.items():
        assert mapped_token in word_index
        del word_index[mapped_token]
    return word_index


def get_label():
    """Return the label vocabulary for the binary matching task.

    Returns:
        (label2id, id2label, labels): the label string -> id mapping, its
        inverse, and the ordered list of label strings.
    """
    labels = ["0", "1"]
    label2id = dict(zip(labels, range(len(labels))))
    id2label = dict(enumerate(labels))
    return label2id, id2label, labels
    
    
def get_coefs(word, *arr):
    """Pair ``word`` with its remaining positional arguments packed as a float16 array."""
    vector = np.asarray(arr, dtype=np.float16)
    return word, vector


def load_embed(path, dim=300, word_index=None):
    """Read a whitespace-separated text embedding file into a dict.

    The file is streamed line by line instead of being loaded with
    ``readlines()``, so large embedding files are never held in memory as a
    whole. Blank lines are skipped instead of raising ``IndexError``.

    Args:
        path: text file where every line is ``<word> <v1> ... <v_dim>``.
        dim: expected number of vector components; lines with a different
            count (e.g. a header line) are printed and skipped.
        word_index: optional container of words to keep; when it is truthy,
            words not contained in it are skipped.

    Returns:
        dict mapping word to the array produced by ``get_coefs``.
    """
    embedding_index = {}
    with open(path, mode="r", encoding="utf8") as f:
        for line in f:
            l = line.strip().split()
            if not l:
                # Blank line: nothing to parse.
                continue
            word, arr = l[0], l[1:]
            if len(arr) != dim:
                print("[!] l = {}".format(l))
                continue
            if word_index and word not in word_index:
                continue
            word, arr = get_coefs(word, arr)
            embedding_index[word] = arr
    return embedding_index


def build_matrix(path, word_index=None, max_features=None, dim=300):
    """Build an embedding matrix whose rows are addressed by ``word_index`` ids.

    Args:
        path: embedding text file passed on to ``load_embed``.
        word_index: mapping from word to row id.
        max_features: largest row id to fill; defaults to ``len(word_index) + 1``.
        dim: embedding dimensionality.

    Returns:
        (embedding_matrix, unknown_words): a ``(max_features + 1, dim)`` array
        that is zero for rows without a vector, and the list of words within
        range that were not found in the embedding file.
    """
    embedding_index = load_embed(path, dim=dim, word_index=word_index)
    if max_features is None:
        max_features = len(word_index) + 1
    embedding_matrix = np.zeros((max_features + 1, dim))
    unknown_words = []

    for word, row in word_index.items():
        if row > max_features:
            continue
        if word in embedding_index:
            embedding_matrix[row] = embedding_index[word]
        else:
            unknown_words.append(word)
    return embedding_matrix, unknown_words


def load_word_embed(word_embed_f1="../../../chinese_embedding/Tencent_AILab_ChineseEmbedding.txt", 
               word_embed_f2="../../../chinese_embedding/cc.zh.300.vec", 
               save_filename="./word_embedding_matrix",
               word_index=None):
    """Return the concatenated character-embedding matrix, cached on disk.

    If ``save_filename + ".npy"`` exists it is loaded and returned. Otherwise
    the matrix is built from the sources selected by ``cfg["ch_type"]``
    ("tx" -> ``word_embed_f1`` with dim 200, "ft" -> ``word_embed_f2`` with
    dim 300), concatenated along the last axis, saved and returned.
    A source that is not selected contributes a zero-width block.

    Returns:
        float32 array of embeddings.
    """
    cache_file = save_filename + ".npy"
    if os.path.exists(cache_file):
        return np.load(cache_file).astype("float32")

    def _source(tag, path, dim):
        # Either the real matrix for this source or a zero-width placeholder.
        if tag in cfg["ch_type"]:
            return build_matrix(path, word_index=word_index, dim=dim)
        return np.zeros(shape=(len(word_index) + 2, 0)), []

    tx_embed, tx_unk = _source("tx", word_embed_f1, 200)
    ft_embed, ft_unk = _source("ft", word_embed_f2, 300)

    word_embedding_matrix = np.concatenate([tx_embed, ft_embed], axis=-1).astype("float32")
    print(word_embedding_matrix.shape, len(tx_unk), len(ft_unk))
    np.save(save_filename, word_embedding_matrix)
    return word_embedding_matrix
    
    
# Vocabulary, label maps and the cached character-embedding matrix.
word_index = get_vocab()
label2id, id2label, labels = get_label()
word_embedding_matrix = load_word_embed(word_index=word_index)

# Derived configuration entries used by the model and the generators.
NUM_CLASS = len(label2id)
cfg.update({"x_pad": word_index["[PAD]"], "num_class": NUM_CLASS})
cfg["filename"] = (
    "{}_{}_{}_{}".format(PREFIX, cfg["ch_type"], FOLD_NUM, cfg["lr"])
    + ("_adv_training" if cfg["adv_training"] else "")
)
print(label2id, id2label, labels)

{'0': 0, '1': 1} {0: '0', 1: '1'} ['0', '1']


In [4]:
def build_model(cfg, summary=False, word_embedding_matrix=None):
    """Build and compile the BERT + character-BiLSTM pair classifier.

    The network feeds token ids through a pretrained BERT/ALBERT encoder and,
    in parallel, through a pretrained character embedding followed by a
    BiLSTM. Both streams are concatenated, passed through a second BiLSTM and
    the first time step is classified. The loss is attached with ``add_loss``,
    so the model is compiled without a ``loss`` argument and expects the
    label as its third input.

    Args:
        cfg: configuration dict; reads ``verbose``, ``base_dir``, ``cls_num``,
            ``bert_trainable``, ``maxlen``, ``trainable``, ``x_pad``,
            ``unit1``, ``unit3``, ``num_class``, ``num_example``, ``lr``,
            ``min_lr`` and optionally ``accum_step``.
        summary: print ``train_model.summary()`` when true.
        word_embedding_matrix: pretrained embedding weights for the
            ``char_embed`` layer. Required.

    Returns:
        The compiled training model with inputs ``[t1_in, t2_in, o1_in]`` and
        output ``po1``.

    Raises:
        ValueError: if ``word_embedding_matrix`` is None. The character
            embedding layer is always used, so the matrix cannot be omitted
            (previously this surfaced as a NameError after BERT was loaded).
    """
    if word_embedding_matrix is None:
        raise ValueError("build_model requires word_embedding_matrix for the char_embed layer")

    def _get_model(base_dir, cfg_=None):
        # Load the pretrained encoder; ALBERT goes through bert4keras, everything else through keras_bert.
        if "albert"in cfg_["verbose"].lower():
            from bert4keras.bert import build_bert_model
            config_file = os.path.join(base_dir, 'albert_config.json')
            checkpoint_file = os.path.join(base_dir, 'model.ckpt-best')
            model = build_bert_model(
                    config_path=config_file,
                    checkpoint_path=checkpoint_file,
                    model='albert',
                    return_keras_model=True
            )
            if cfg_["cls_num"] > 1:
                # Concatenate the last `cls_num` outputs of the shared ALBERT layer.
                output = Concatenate(axis=-1)([model.get_layer("Encoder-1-FeedForward-Norm").get_output_at(-i) for i in range(1, cfg_["cls_num"] + 1)])
                model = Model(model.inputs[: 2], outputs=output)
            model.trainable = cfg_["bert_trainable"]
        else:
            config_file = os.path.join(base_dir, 'bert_config.json')
            checkpoint_file = os.path.join(base_dir, 'bert_model.ckpt')
            if not os.path.exists(config_file):
                # Fall back to the alternative file naming used by some RoBERTa-large releases.
                config_file = os.path.join(base_dir, 'bert_config_large.json')
                checkpoint_file = os.path.join(base_dir, 'roberta_l24_large_model')
            model = load_trained_model_from_checkpoint(config_file, 
                                                       checkpoint_file, 
                                                       training=False, 
                                                       trainable=cfg_["bert_trainable"], 
                                                       output_layer_num=cfg_["cls_num"],
                                                       seq_len=cfg_['maxlen'])

            # model = Model(inputs=model.inputs[: 2], outputs=model.layers[-7].output)

        return model

    def _get_opt(num_example, warmup_proportion=0.1, lr=2e-5, min_lr=None):
        # AdamWarmup schedule over MAX_EPOCH epochs, optionally wrapped for gradient accumulation.
        total_steps, warmup_steps = calc_train_steps(
            num_example=num_example,
            batch_size=B_SIZE,
            epochs=MAX_EPOCH,
            warmup_proportion=warmup_proportion,
        )
        opt = AdamWarmup(total_steps, warmup_steps, lr=lr, min_lr=min_lr)
        if cfg.get("accum_step", None) and cfg["accum_step"] > 1:
            print("[!] using accum_step = {}".format(cfg["accum_step"]))
            from accum_optimizer import AccumOptimizer
            opt = AccumOptimizer(opt, steps_per_update=cfg["accum_step"])

        return opt

    bert_model = _get_model(cfg["base_dir"], cfg)

    embed = Embedding(input_dim=word_embedding_matrix.shape[0], 
                      output_dim=word_embedding_matrix.shape[1],
                      weights=[word_embedding_matrix],
                      trainable=cfg["trainable"],
                      name="char_embed"
                     )

    t1_in = Input(shape=(None, ))
    t2_in = Input(shape=(None, ))
    o1_in = Input(shape=(1, ))
    o2_in = Input(shape=(1, ))  # created but not connected to the model

    t1, t2, o1, o2 = t1_in, t2_in, o1_in, o2_in

    ## Char information
    # 1.0 where the token id differs from the padding id, 0.0 elsewhere.
    mask = Lambda(lambda x: K.cast(K.not_equal(x, cfg["x_pad"]), 'float32'))(t1)
    word_embed = embed(t1)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([word_embed, mask])
    word_embed = Bidirectional(LSTM(cfg["unit1"], return_sequences=True), merge_mode="sum")(word_embed)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([word_embed, mask])

    t = bert_model([t1, t2])
    t = Concatenate(axis=-1)([t, word_embed])
    t = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([t, mask]) 
    t = Bidirectional(LSTM(cfg["unit3"], return_sequences=True), merge_mode="concat")(t)
    # t = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([t, mask]) 
    # t = Conv1D(cfg["conv_num"], kernel_size=3, padding="same")(t) 
    # Classify from the first time step of the sequence.
    t = Lambda(lambda x: x[:, 0, :], name="extract_layer")(t)
    if cfg.get("num_class", 1) == 2:
        # Binary case: single sigmoid unit trained with binary cross-entropy.
        po1_logit = Dense(1, name="po1_logit")(t)
        po1 = Activation('sigmoid', name="po1")(po1_logit)
        train_model = Model(inputs=[t1_in, t2_in, o1_in],
                            outputs=[po1])        
        o1_loss = K.binary_crossentropy(o1, po1)
        loss = K.mean(o1_loss)
    else:
        po1_logit = Dense(cfg["num_class"], name="po1_logit")(t)
        po1 = Activation('softmax', name="po1")(po1_logit)
        train_model = Model(inputs=[t1_in, t2_in, o1_in],
                            outputs=[po1])
        loss = K.categorical_crossentropy(o1, po1, axis=-1)
        loss = K.mean(loss)

    train_model.add_loss(loss)
    opt = _get_opt(num_example=cfg["num_example"], lr=cfg["lr"], min_lr=cfg['min_lr'])
    train_model.compile(optimizer=opt)
    if summary:
        train_model.summary()
    return train_model


# print("----------------build model ---------------")
# model = build_model(cfg, summary=True, word_embedding_matrix=word_embedding_matrix)
# del model

In [5]:
def token2id_X(x, x_dict, x2=None, maxlen=None, maxlen1=None):
    """Encode one text, or a pair of texts, as BERT-style ids and segment ids.

    Texts are split into single characters; characters missing from
    ``x_dict`` map to ``x_dict["[UNK]"]``.

    Args:
        x: first text.
        x_dict: token -> id mapping containing "[CLS]", "[SEP]" and "[UNK]".
        x2: optional second text; when truthy the pair layout
            ``[CLS] x [SEP] x2 [SEP]`` is produced.
        maxlen: total budget including special tokens.
        maxlen1: budget of the first segment including its two special tokens
            (pair mode only).

    Returns:
        (ids, segments): two lists of equal length; segments are 0 for the
        first text and 1 for the second.
    """
    def _encode(tokens):
        # "[UNK]" is only looked up when a token is actually missing.
        return [x_dict[tok] if tok in x_dict else x_dict["[UNK]"] for tok in tokens]

    if not x2:
        body_budget = maxlen - 2
        ids = _encode(["[CLS]"] + list(x)[: body_budget] + ["[SEP]"])
        return ids, [0] * len(ids)

    total_budget = maxlen - 3
    first_budget = maxlen1 - 2
    assert total_budget > first_budget
    second_budget = total_budget - first_budget - 1

    first_ids = _encode(["[CLS]"] + list(x)[: first_budget] + ["[SEP]"])
    second_ids = _encode(list(x2)[: second_budget] + ["[SEP]"])

    ids = first_ids + second_ids
    segments = [0] * len(first_ids) + [1] * len(second_ids)
    return ids, segments


def seq_padding(X, maxlen=None, padding_value=None, debug=False):
    """Right-pad (and, if needed, truncate) sequences to a common length.

    Args:
        X: list of sequences.
        maxlen: target length; defaults to the longest sequence in ``X``.
        padding_value: value appended to sequences shorter than ``maxlen``.
        debug: print the input and the padded result when true.

    Returns:
        2-D numpy array with one row per sequence and ``maxlen`` columns.
        Sequences longer than an explicit ``maxlen`` are cut to ``maxlen``;
        previously they were left untouched, which produced a ragged array.
    """
    if maxlen is None:
        maxlen = max(len(x) for x in X)

    # A negative repeat count yields an empty list, so full-length rows get no padding.
    pad_X = np.array([
        list(x)[: maxlen] + [padding_value] * (maxlen - len(x)) for x in X
    ])
    if debug:
        print("[!] before pading {}\n".format(X))
        print("[!] after pading {}\n".format(pad_X))
    return pad_X


class data_generator:
    """Endless batch generator with on-the-fly pair augmentation.

    Each yielded item is ``([T1, T2, O1], None)``: padded token ids, padded
    segment ids and a column vector of labels. The target slot is ``None``.
    """

    def __init__(self, data, batch_size=B_SIZE, shuffle=SHUFFLE, augm_frac=0.75):
        """Store the frame and pre-compute the per-query positive/negative lists.

        Args:
            data: DataFrame with ``query1``, ``query2`` and ``label`` columns.
            batch_size: number of examples per yielded batch.
            shuffle: shuffle the row order at the start of every pass.
            augm_frac: a row is augmented when ``random() > augm_frac``.
        """
        self.data = data
        self.batch_size = batch_size
        # Number of batches per pass is derived from cfg["num_example"], not from len(data).
        self.steps = cfg["num_example"] // self.batch_size
        self.shuffle = shuffle
        self.data_dict = get_data(data)
        self.augm_frac = augm_frac
        if cfg["num_example"] % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        """Return the number of batches per pass."""
        return self.steps

    def __iter__(self):
        """Yield batches forever, re-shuffling (if enabled) on every pass."""

        while True:
            idxs = list(range(len(self.data)))
            if self.shuffle:
                np.random.shuffle(idxs)
            # Buffers are reset at the start of every pass (O2 is never filled).
            T1, T2, O1, O2 = [], [], [], []
            for i in idxs:
                d = self.data.iloc[i]
                text = d["query1"]
                label_text = d["query2"]
                o1 = d["label"]

                # Augmentation: re-pair queries that share the same query1 group.
                if random() > self.augm_frac:
                    data_d = self.data_dict[text]
                    pos_data = data_d["pos"]
                    neg_data = data_d["neg"]
                    # Only groups that have both positive and negative candidates are augmented.
                    if pos_data and neg_data:
                        if random() > 0.5:
                            # Positive pair: a positive candidate, paired with another
                            # positive candidate when one exists, else with query1.
                            o1 = 1
                            label_text = choice(pos_data)
                            if len(pos_data) >= 2:
                                _pos_data = [e for e in pos_data if e != label_text]
                                text = choice(_pos_data)
                        else:
                            # Negative pair: a positive candidate against a negative candidate.
                            o1 = 0
                            text = choice(pos_data)
                            label_text = choice(neg_data)

                # Randomly swap the order of the two queries.
                if random() > 0.5:
                    text, label_text = label_text, text

                # Rows whose label is the empty string are skipped.
                if o1 == "":
                    continue
                o1 = float(o1)
                assert 0 <= o1 <= 1

                O1.append(o1)
                t1, t2 = token2id_X(text, x2=label_text, x_dict=word_index, maxlen=MAX_LEN, maxlen1=MAX_DOC_LEN)
                assert len(t1) == len(t2)

                T1.append(t1)
                T2.append(t2)

                # Flush on a full batch or on the last row of the pass.
                # NOTE(review): if the last row is skipped by the `continue` above, the
                # trailing partial batch is not yielded and is discarded when the
                # buffers are reset at the top of the next pass.
                if len(T1) == self.batch_size or i == idxs[-1]:
                    O1 = np.array(O1).reshape(-1, 1)
                    T1 = seq_padding(T1, padding_value=cfg["x_pad"])
                    T2 = seq_padding(T2, padding_value=0)
                    assert T1.shape == T2.shape and T1.shape[0] == O1.shape[0]

                    yield [T1, T2, O1], None
                    T1, T2, O1, = [], [], []
                    
                        
# gen = data_generator(train_data)
# for i, e in enumerate(gen):
#     if i > 400:
#         break
#     print("i = {}".format(i), "-" * 81)
#     # print(e[0])
#     for _e in e[0]:
#         print(_e.shape, _e.sum(axis=0).sum(axis=0))
# del gen
# print("finish")

In [6]:
def get_model(model_):
    """Build an inference model from the training model.

    The result takes only the first two inputs of ``model_`` and outputs the
    ``po1`` layer, so no label input is needed for prediction.
    """
    inference_inputs = [model_.inputs[position] for position in (0, 1)]
    probability = model_.get_layer("po1").output
    return Model(inputs=inference_inputs, outputs=[probability])


def evaluate(sub_model, data, bs=32):
    """Score ``sub_model`` on a labelled frame.

    Args:
        sub_model: inference model taking ``[token_ids, segment_ids]``.
        data: DataFrame with ``query1``, ``query2`` and ``label`` columns.
        bs: prediction batch size.

    Returns:
        (auc, acc): ROC-AUC of the predicted scores and accuracy at a 0.5
        threshold.
    """
    last_pos = len(data) - 1
    batch_tokens, batch_segments = [], []
    gold, batch_preds = [], []

    for pos in range(len(data)):
        record = data.iloc[pos]
        ids, segs = token2id_X(record["query1"], x2=record["query2"], x_dict=word_index,
                               maxlen=MAX_LEN, maxlen1=MAX_DOC_LEN)
        assert len(ids) == len(segs)

        batch_tokens.append(ids)
        batch_segments.append(segs)
        gold.append(float(record["label"]))

        # Keep collecting until the batch is full or the frame is exhausted.
        if len(batch_tokens) != bs and pos != last_pos:
            continue

        token_arr = seq_padding(batch_tokens, padding_value=cfg["x_pad"])
        segment_arr = seq_padding(batch_segments, padding_value=0)
        assert token_arr.shape == segment_arr.shape
        batch_preds.append(sub_model.predict([token_arr, segment_arr]))
        batch_tokens, batch_segments = [], []

    scores = np.concatenate(batch_preds, axis=0).reshape(-1)
    gold = np.array(gold).reshape(-1).astype("int32")
    auc = roc_auc_score(gold, scores)
    acc = accuracy_score(gold, np.array(scores > 0.5, "int32"))
    return auc, acc
    

class Evaluate(Callback):
    """Keras callback that scores the model on held-out data after every epoch.

    The per-epoch AUC is appended to ``self.F1`` (the attribute name is kept
    for compatibility even though it stores AUC), and the model is saved to
    ``<filename>.h5`` whenever the AUC improves.
    """

    def __init__(self, data, filename=None):
        """
        Args:
            data: DataFrame with ``query1``, ``query2`` and ``label`` columns.
            filename: path prefix (without ``.h5``) for saved models.
        """
        # Initialise the base class so that the attributes Keras expects exist.
        super(Evaluate, self).__init__()
        self.F1 = []
        self.best = 0.
        self.filename = filename
        self.data = data

    def on_epoch_begin(self, epoch, logs=None):
        """Before the first epoch, check that the model can be saved and re-loaded."""
        if epoch == 0:
            print("[!] test load&save model")
            f = self.filename + ".h5"
            # NOTE(review): with overwrite=False Keras may prompt for confirmation
            # if the file already exists — confirm this is intended in a notebook.
            self.model.save(f, include_optimizer=False, overwrite=False)
            # The loaded model is discarded; loading it is the check.
            if "albert" in cfg["verbose"]:
                load_model(f)
            else:
                load_model(f, custom_objects=get_custom_objects())

    def on_epoch_end(self, epoch, logs=None):
        """Checkpoint at fixed epochs, evaluate, and keep the best model by AUC."""
        epoch_no = epoch + 1
        if epoch_no in [3, 6, 9, 10, 12, 15, 18, 20]:
            self.model.save(self.filename + "_{}.h5".format(epoch_no), include_optimizer=False)

        sub_model = get_model(self.model)
        auc, acc = evaluate(sub_model, data=self.data)
        self.F1.append(auc)
        if auc > self.best:
            self.model.save(self.filename + ".h5", include_optimizer=False)
            self.best = auc
            print("[!] epoch = {}, new best_auc = {}".format(epoch_no, auc))
        print('[!] epoch = {}, auc = {}, best auc {}'.format(epoch_no, auc, self.best))
        print('[!] epoch = {}, acc = {}\n'.format(epoch_no, acc))

In [7]:
def search_layer(inputs, name, exclude_from=None):
    """Search for a layer by name, starting from `inputs`.

    Notes: `inputs` is either a layer or the output tensor of a layer;
    `name` is the name of the layer being looked for.
    Implementation: recursively walks upward from `inputs` until a layer
    called `name` is found; returns None if there is no such layer.

    `exclude_from` is the set of layers already visited; it is created on the
    first call and shared by the recursive calls.
    """
    if exclude_from is None:
        exclude_from = set()

    # Accept either a layer or a tensor carrying Keras history.
    if isinstance(inputs, keras.layers.Layer):
        layer = inputs
    else:
        layer = inputs._keras_history[0]

    if layer.name == name:
        return layer
    elif layer in exclude_from:
        return None
    else:
        exclude_from.add(layer)
        if isinstance(layer, keras.models.Model):
            # Nested model: look through its own layers first.
            model = layer
            # NOTE(review): this loop rebinds `layer`; when nothing matches, the
            # lookup below uses the nested model's last layer rather than the
            # nested model itself.
            for layer in model.layers:
                if layer.name == name:
                    return layer
        inbound_layers = layer._inbound_nodes[0].inbound_layers
        if not isinstance(inbound_layers, list):
            inbound_layers = [inbound_layers]
        if len(inbound_layers) > 0:
            for layer in inbound_layers:
                layer = search_layer(layer, name, exclude_from)
                if layer is not None:
                    return layer
                
def adversarial_training(model, embedding_names, epsilon=1):
    """Add adversarial training to a model.

    `model` is the Keras model that adversarial training is added to and
    `embedding_names` are the names of the Embedding layers inside the model.
    Must be called after the model has been compiled.

    Each training step perturbs every located embedding matrix by
    `epsilon * grad / (||grad|| + 1e-8)`, runs the original train function and
    then removes the perturbation again.
    """
    if model.train_function is None:  # no train function has been built yet
        model._make_train_function()  # build it manually
    old_train_function = model.train_function  # keep the original train function

    # Locate the Embedding layers
    embedding_layers = []
    for embedding_name in embedding_names:
        for output in model.outputs:
            embedding_layer = search_layer(output, embedding_name)
            if embedding_layer is not None:
                embedding_layers.append(embedding_layer)
                break
    # NOTE(review): only non-None layers are appended above, so this check can
    # never raise; names that are not found are silently skipped.
    for embedding_layer in embedding_layers:
        if embedding_layer is None:
            raise Exception('Embedding layer not found')

    # Gradients with respect to the embeddings
    embeddings = [embedding_layer.embeddings for embedding_layer in embedding_layers] # embedding matrices
    gradients = K.gradients(model.total_loss, embeddings)  # embedding gradients
    # gradients = K.zeros_like(embeddings) + gradients[0]  # convert to a dense tensor
    gradients = [K.zeros_like(embedding) + gradient for embedding, gradient in zip(embeddings, gradients)]

    # Wrap as a backend function
    inputs = (model._feed_inputs +
              model._feed_targets +
              model._feed_sample_weights)  # all input placeholders
    embedding_gradients = K.function(
        inputs=inputs,
        outputs=gradients,
        name='embedding_gradients',
    )  # wrapped as a function

    def train_function(inputs):  # redefined train function
#         grads = embedding_gradients(inputs)[0]  # embedding gradient
#         delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8)  # compute the perturbation
        grads = embedding_gradients(inputs)  # embedding gradients
        deltas = [epsilon * grad / (np.sqrt((grad**2).sum()) + 1e-8) for grad in grads]  # compute the perturbations
        # Inject the perturbations
        # K.set_value(embeddings, K.eval(embeddings) + delta)  
        for embedding, delta in zip(embeddings, deltas):
            K.set_value(embedding, K.eval(embedding) + delta)
            
        outputs = old_train_function(inputs)  # gradient-descent step
        # Remove the perturbations
        # K.set_value(embeddings, K.eval(embeddings) - delta)  # remove the perturbation
        for embedding, delta in zip(embeddings, deltas):
            K.set_value(embedding, K.eval(embedding) - delta)       
        return outputs

    model.train_function = train_function  # override the original train function


# Once the function is defined, enabling adversarial training takes a single line
# adversarial_training(model, 'Embedding-Token', 0.5)

In [8]:
adv_layer_names = ['Embedding-Token', 'char_embed']


def _run_fold(fold_id, fold_train, fold_dev):
    """Train one model on ``fold_train`` and evaluate it on ``fold_dev`` after every epoch.

    Args:
        fold_id: fold identifier; used in the seeds and the checkpoint name.
        fold_train: training DataFrame.
        fold_dev: evaluation DataFrame.

    Returns:
        The ``Evaluate`` callback holding the per-epoch scores.
    """
    cfg["num_example"] = len(fold_train)
    print("-" * 81)
    print("[!] start fold_id =", fold_id, fold_train.shape, fold_dev.shape)
    print(cfg)
    K.clear_session()
    gc.collect()
    train_D = data_generator(fold_train)
    seed(SEED + fold_id)
    np.random.seed(SEED + fold_id)
    tf.random.set_random_seed(SEED + fold_id)
    model = build_model(cfg, summary=True, word_embedding_matrix=word_embedding_matrix)
    if cfg["adv_training"]:
        print("[!] using adv_training")
        adversarial_training(model, adv_layer_names, 0.5)
    fold_evaluator = Evaluate(filename=cfg["filename"] + "_fold{}".format(fold_id), data=fold_dev)
    model.fit_generator(train_D.__iter__(),
                        steps_per_epoch=len(train_D),
                        epochs=MAX_EPOCH,
                        callbacks=[fold_evaluator],
                        shuffle=True
                        )
    del model
    gc.collect()
    print("[!] finish fold_id =", fold_id)
    print("-" * 81)
    return fold_evaluator


# Fold id -1: train on the provided train split and evaluate on the provided dev split.
# The global train_data / dev_data frames are intentionally NOT deleted here,
# because later cells still read them.
if -1 in FOLD_ID:
    fold_id = -1
    evaluator = _run_fold(fold_id, train_data, dev_data)


# Remaining fold ids: stratified K-fold over train + dev combined.
skf = SKF(FOLD_NUM, shuffle=True, random_state=SEED)
print(all_data.shape)
for fold_id, (trn_ind, val_ind) in enumerate(skf.split(range(len(all_data)), all_data["label"])):
    if fold_id not in FOLD_ID:
        continue

    fold_dev = all_data.iloc[val_ind].reset_index(drop=True)
    fold_train = all_data.iloc[trn_ind].reset_index(drop=True)
    evaluator = _run_fold(fold_id, fold_train, fold_dev)
    


---------------------------------------------------------------------------------
[!] start fold_id = -1 (8747, 5) (2002, 5)
{'verbose': 'roberta_large_v2_augm_seed2020', 'base_dir': '../../../chinese_bert/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/', 'span_mode': True, 'lr': 2e-05, 'min_lr': 6e-08, 'ch_type': 'tx_ft', 'trainable': True, 'bert_trainable': True, 'accum_step': 1, 'cls_num': 4, 'unit1': 128, 'unit2': 128, 'unit3': 512, 'conv_num': 128, 'maxlen': 60, 'adv_training': False, 'x_pad': 0, 'num_class': 2, 'filename': 'roberta_large_v2_augm_seed2020_tx_ft_25_2e-05', 'num_example': 8747}
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (I



[!] epoch = 1, new best_auc = 0.9790332560901669
[!] epoch = 1, auc = 0.9790332560901669, best auc 0.9790332560901669
[!] epoch = 1, acc = 0.9180819180819181

Epoch 2/10
[!] epoch = 2, new best_auc = 0.981319404646218
[!] epoch = 2, auc = 0.981319404646218, best auc 0.981319404646218
[!] epoch = 2, acc = 0.9325674325674326

Epoch 3/10
[!] epoch = 3, auc = 0.9790093771542953, best auc 0.981319404646218
[!] epoch = 3, acc = 0.9365634365634365

Epoch 4/10
[!] epoch = 4, new best_auc = 0.9824531349928156
[!] epoch = 4, auc = 0.9824531349928156, best auc 0.9824531349928156
[!] epoch = 4, acc = 0.9385614385614386

Epoch 5/10
[!] epoch = 5, auc = 0.9793997458450652, best auc 0.9824531349928156
[!] epoch = 5, acc = 0.9415584415584416

Epoch 6/10
[!] epoch = 6, auc = 0.9802022857332702, best auc 0.9824531349928156
[!] epoch = 6, acc = 0.9425574425574426

Epoch 7/10
[!] epoch = 7, auc = 0.9790555777041339, best auc 0.9824531349928156
[!] epoch = 7, acc = 0.9445554445554446

Epoch 8/10
[!] epoch 

In [9]:
# Per-epoch dev AUC values recorded by the Evaluate callback, and the best of them.
evaluator.F1, max(evaluator.F1)

([0.9790332560901669,
  0.981319404646218,
  0.9790093771542953,
  0.9824531349928156,
  0.9793997458450652,
  0.9802022857332702,
  0.9790555777041339,
  0.9809113863071952,
  0.9795554780355318,
  0.9800071013878853],
 0.9824531349928156)

In [10]:
def test(sub_model, data, bs=32):
    """Predict scores for every row of ``data``.

    Args:
        sub_model: inference model taking ``[token_ids, segment_ids]``.
        data: DataFrame with ``query1`` and ``query2`` columns.
        bs: prediction batch size.

    Returns:
        1-D array with the flattened predictions in row order.
    """
    last_pos = len(data) - 1
    batch_tokens, batch_segments = [], []
    batch_preds = []

    for pos in range(len(data)):
        record = data.iloc[pos]
        ids, segs = token2id_X(record["query1"], x2=record["query2"], x_dict=word_index,
                               maxlen=MAX_LEN, maxlen1=MAX_DOC_LEN)
        assert len(ids) == len(segs)

        batch_tokens.append(ids)
        batch_segments.append(segs)

        # Keep collecting until the batch is full or the frame is exhausted.
        if len(batch_tokens) != bs and pos != last_pos:
            continue

        token_arr = seq_padding(batch_tokens, padding_value=cfg["x_pad"])
        segment_arr = seq_padding(batch_segments, padding_value=0)
        assert token_arr.shape == segment_arr.shape
        batch_preds.append(sub_model.predict([token_arr, segment_arr]))
        batch_tokens, batch_segments = [], []

    return np.concatenate(batch_preds, axis=0).reshape(-1)


def ensemble_predictions(predictions, weights=None, type_="linear"):
    """Combine several prediction arrays into one by weighted averaging.

    Args:
        predictions: sequence of equally shaped prediction arrays.
        weights: optional sequence (list or numpy array) with one weight per
            prediction; uniform when omitted or empty. Weights are normalised
            to sum to 1.
        type_: averaging mode:
            "linear"    – weighted arithmetic mean;
            "harmonic"  – weighted harmonic mean;
            "geometric" – weighted geometric mean (via logs);
            "rank"      – weighted mean of the per-model ranks, divided by
                          (number of samples + 1).

    Returns:
        numpy array with the combined predictions.

    Raises:
        AssertionError: if the number of weights differs from the number of
            predictions.
        ValueError: if ``type_`` is not one of the supported modes
            (previously this surfaced as an UnboundLocalError).
    """
    # `weights is None` instead of `not weights`, so numpy arrays are accepted.
    if weights is None or len(weights) == 0:
        weights = [1. / len(predictions) for _ in range(len(predictions))]
    assert len(predictions) == len(weights)
    weights = np.asarray(weights, dtype=float)
    weights = weights / weights.sum()
    assert np.isclose(np.sum(weights), 1.0)

    if type_ == "linear":
        return np.average(predictions, weights=weights, axis=0)
    if type_ == "harmonic":
        res = np.average([1 / p for p in predictions], weights=weights, axis=0)
        return 1 / res
    if type_ == "geometric":
        numerator = np.average(
            [np.log(p) for p in predictions], weights=weights, axis=0
        )
        return np.exp(numerator / weights.sum())
    if type_ == "rank":
        from scipy.stats import rankdata
        res = np.average([rankdata(p) for p in predictions], weights=weights, axis=0)
        return res / (len(res) + 1)
    raise ValueError("unknown ensemble type_: {!r}".format(type_))


from glob import glob
from time import time


# Saved models to ensemble. The list pairs the plain `<name>.h5` file with a
# `<name>_<N>.h5` file for each model family.
model_files =[
            "albert_xxlarge_tx_ft_5fold-1_2e-05.h5",
            "albert_xxlarge_tx_ft_5fold-1_2e-05_6.h5",
            "roberta_base_tx_ft_5fold-1_2e-05.h5", 
            "roberta_base_tx_ft_5fold-1_2e-05_10.h5", 
            "roberta_large_tx_ft_5fold-1_2e-05.h5",
            "roberta_large_tx_ft_5fold-1_2e-05_10.h5",
            "UER_large_tx_ft_5fold-1_6e-06.h5",
            "UER_large_tx_ft_5fold-1_6e-06_10.h5",
            "roberta_large_v2_tx_ft_5fold-1_2e-05.h5",    
            "roberta_large_v2_tx_ft_5fold-1_2e-05_10.h5", 
             ]

# Report which files are present, then fail fast on duplicates or missing files.
for f in model_files:
    print(f, os.path.exists(f))
assert len(model_files) == len(set(model_files)) 
assert all([os.path.exists(f) for f in model_files]) 
preds = []
# NOTE(review): requires `dev_data` to still be defined in the kernel at this point.
O1 = dev_data["label"].values.reshape(-1)
for f in model_files:
    print("-" * 80)
    # Drop the previous model's graph before loading the next one.
    K.clear_session()
    t0 = time()
    print("[!]", f)
    # Files with "albert" in the name are loaded without the keras_bert custom objects.
    if "albert" in f:
        model = load_model(f)
    else:
        model = load_model(f, custom_objects=get_custom_objects())
    sub_model = get_model(model)
    pred = test(sub_model, dev_data)
    auc = roc_auc_score(O1, pred)
    acc = accuracy_score(O1, np.array(pred > 0.5, "int32"))    
    # Elapsed seconds, AUC and accuracy for this single model.
    print("[{}]".format(time() - t0), auc, acc)
    print("-" * 80)
    preds.append(pred)
    del model
    gc.collect()

    
# Uniform linear average over all models, scored on the same labels.
pred = ensemble_predictions(preds)
print(pred.shape)
auc = roc_auc_score(O1, pred)
acc = accuracy_score(O1, np.array(pred > 0.5, "int32"))    
print(auc, acc)

albert_xxlarge_tx_ft_5fold-1_2e-05.h5 False
albert_xxlarge_tx_ft_5fold-1_2e-05_6.h5 False
roberta_base_tx_ft_5fold-1_2e-05.h5 False
roberta_base_tx_ft_5fold-1_2e-05_10.h5 False
roberta_large_tx_ft_5fold-1_2e-05.h5 False
roberta_large_tx_ft_5fold-1_2e-05_10.h5 False
UER_large_tx_ft_5fold-1_6e-06.h5 False
UER_large_tx_ft_5fold-1_6e-06_10.h5 False
roberta_large_v2_tx_ft_5fold-1_2e-05.h5 False
roberta_large_v2_tx_ft_5fold-1_2e-05_10.h5 False


AssertionError: 

In [None]:
# Split the per-model predictions into the even-indexed and odd-indexed entries
# of `preds` and average each group uniformly.
pred1 = ensemble_predictions(preds[0::2])
pred2 = ensemble_predictions(preds[1::2])
print(pred2[: 3], pred1[: 3])
# Sweep the blend weight of pred1 from 0.01 to 0.99 (geometric blend) and
# print weight, AUC and accuracy for each setting.
for i in range(1, 100):
    wgt = i / 100
    pred = ensemble_predictions([pred1, pred2], weights=[wgt, 1 - wgt], type_="geometric")
    auc = roc_auc_score(O1, pred)
    acc = accuracy_score(O1, np.array(pred > 0.5, "int32"))    
    print(wgt, auc, acc)    

In [None]:
import jieba
from collections import Counter

def get_cnt(data, col1, col2, cate):
    """Count jieba tokens over the texts of one category.

    Args:
        data: DataFrame with a ``category`` column and the text column(s).
        col1: name of the first text column.
        col2: optional name of a second text column; skipped when falsy.
        cate: category value selecting the rows to count.

    Returns:
        ``Counter`` mapping token to its number of occurrences.
    """
    subset = data[data["category"] == cate]
    texts = subset[col1].tolist()
    if col2:
        texts = texts + subset[col2].tolist()
    tokenized = [list(jieba.cut(text)) for text in texts]
    token_counts = Counter()
    for sentence_tokens in tokenized:
        token_counts.update(sentence_tokens)
    return token_counts
    
    
# Tokens that are filtered out of the per-category frequency listings below.
stop_words = ['？', '吗', '了', '，', '的', '?', '有', '得', '地', '是', '什么',
              '怎么办', '哪些', '怎么回事', '怎么', '要', '能', '呢', '会']
# For every category in the training data, print the 20 most frequent tokens
# (stop words removed) counted over query1 and query2 together.
for cate in train_data["category"].value_counts().index:
    print("-" * 40, cate, "-" * 40)
    cnt1 = get_cnt(train_data, col1="query1", col2="query2", cate=cate)
    cnt1 = [(k, cnt) for k, cnt in cnt1.most_common() if k not in stop_words]
    print(cnt1[: 20])
    print("-" * 81)



In [None]:
# For every category in the dev data, print the 10 most frequent tokens of
# query1 and of query2 separately.
# get_cnt takes (data, col1, col2, cate); the previous `col=` keyword did not
# exist and raised a TypeError. col2=None counts a single column.
for cate in dev_data["category"].value_counts().index:
    print("------------------------")
    print(cate)
    cnt1 = get_cnt(dev_data, col1="query1", col2=None, cate=cate)
    cnt2 = get_cnt(dev_data, col1="query2", col2=None, cate=cate)
    print(cnt1.most_common(10))
    print(cnt2.most_common(10))

In [None]:
def _foo(x, col):
    cate = x["category"]
    q = x[col]
    if cate == "咳血":
        return q.count("咯血") + q.count("咳血")
    
    
    return q.count(cate)


    
# Add, for each query column, how many times the row's category name occurs in it
# (computed row by row with _foo), then preview the first 12 rows.
train_data["q1_cnt_cate"] = train_data.apply(lambda x: _foo(x, "query1"), axis=1)
train_data["q2_cnt_cate"] = train_data.apply(lambda x: _foo(x, "query2"), axis=1)
train_data.head(12)