In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import jieba
import re
import os
import time
import gc

from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, AdamW
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
from tqdm import tqdm
from sklearn.metrics import f1_score, auc, roc_curve, classification_report

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
import transformers
transformers.__version__

'2.11.0'

In [4]:
from config import root_path, stopwords_path, device, best_model_path, batch_size, train_V0_path, test_V0_path,\
    user_dict_path, train_augmented_V0201_path, train_augmented_V0204_path, train_all_path, test_path, pretrained_bert_path, lr, \
    is_cuda, max_gradient_norm, num_directions, lstm_hidden_size
import config

## dataset

In [5]:
class QAMatchDataset(Dataset):
    def __init__(self, df, tokenizer, max_seq_len, mode):
        assert mode in ['train', 'dev', 'test']

        self.mode = mode
        self.tokenizer = tokenizer
        self.df = df
        self.max_seq_len = max_seq_len
        # self.df = pd.read_csv(file)
        # self.seqs, self.seq_masks, self.seq_segments, self.labels = self.get_input(file)

    def __getitem__(self, idx):
        token_seq_1 = self.df.iloc[idx]['question']
        token_seq_2 = self.df.iloc[idx]['reply_content']
        if self.mode in ['train', 'dev']:
            label_tensor = torch.tensor(self.df.iloc[idx]['label'])
        else:
            label_tensor = None
        token_seq_1 = self.tokenizer.tokenize(token_seq_1)
        token_seq_2 = self.tokenizer.tokenize(token_seq_2)
#         print("token_seq_1:", token_seq_1, "lens:", len(token_seq_1))|
#         print("token_seq_2:", token_seq_2, "lens:", len(token_seq_2))
        
        # truncate
        if len(token_seq_1) > self.max_seq_len:
            token_seq_1 = token_seq_1[:self.max_seq_len]
        if len(token_seq_2) > self.max_seq_len:
            token_seq_2 = token_seq_2[:self.max_seq_len]
        
        # padding
        token_seq_1 += ['[PAD]'] * (self.max_seq_len - len(token_seq_1))
        token_seq_2 += ['[PAD]'] * (self.max_seq_len - len(token_seq_2))
        
        seq = ["[CLS]"] + token_seq_1 + ["[SEP]"] + token_seq_2 + ["[SEP]"]
        seq = self.tokenizer.convert_tokens_to_ids(seq)

        seq_segment = [0] * (len(token_seq_1) + 2) + [1] * (len(token_seq_2) + 1)
        
        return torch.Tensor(seq).type(torch.long), torch.Tensor(seq_segment).type(torch.long), label_tensor
    
    def collate_fn(self, samples):
#         print(samples[0])
        seqs = torch.stack([s[0] for s in samples])
        seq_segments = torch.stack([s[1] for s in samples])

        if self.mode in ['train', 'dev']:
            labels = torch.stack([s[2] for s in samples])
        else:
            labels = None

        # attention mask处理
        seq_masks = torch.zeros(seqs.shape, dtype=torch.long)
        seq_masks = seq_masks.masked_fill(seqs != 0, 1)

        return seqs, seq_masks, seq_segments, labels
    def __len__(self):
        return len(self.df)

## model

In [6]:
class CNN(nn.Module):
    # 这个模型的作用：接受一个句子的embedding（从pretrain model中提取的），输出这个句子的句向量
    # 最后输出的句向量大小：len(window_sizes) * feature_size ,即窗口个数 * 由每个窗口提取到的特征
    # 积后得到的feature_map的大小 = (seq_len - window_size) / stride + 1
    def __init__(self, embedding_size, feature_size, window_sizes, max_seq_len):
        super(CNN, self).__init__()
        self.convs = nn.ModuleList([
            nn.Sequential(nn.Conv1d(in_channels=embedding_size, out_channels=feature_size, kernel_size=h),
                         nn.ReLU(),
                         nn.MaxPool1d(kernel_size=max_seq_len-h+1))   
            for h in window_sizes
        ])
        # Conv1d指的就是在纵列方向上做卷积，out_channels指每种kernel(窗口)要有几个
        # MaxPool1d指的就是在纵列方向上做池化，kernel_size设成feature_map的大小，就相当于对每个feature_map做max pool
        # 以下是网络结构中每一层的维度变化
#                 x                             permute                                conv                             MaxPool1d
# (batch, seq_len, embedding_size) -> (batch, embedding_size, seq_len) -> (batch, feature_size, max_seq_len-h+1) -> (batch, feature_size, 1)
    
    def forward(self, x):
        """
        :param x: (batch, max_seq_len, embedding_size)
        
        :return output: (batch, len(window_sizes) * feature_size)
        """
        x = x.permute(0, 2, 1)   # (batch, embedding_size, seq_len)  因为一维卷积是在最后维度上扫的
        out = [conv(x) for conv in self.convs]    # out[i]: (batch, feature_size, 1)
        out = torch.cat(out, dim=1)    # (batch, len(window_sizes)*feature_size, 1) 把所有窗口得到的feature拼接起来
        out = out.view(-1, out.size(1)) # (batch, len(window_sizes)*feature_size)
        
        return out
    
class BertModelWithCNN(nn.Module):
    def __init__(self, config):
        super(BertModelWithCNN, self).__init__()        
        self.bert_config = BertConfig.from_pretrained(os.path.join(config.pretrained_bert_path, 'config.json'))
#         self.bert_config.output_hidden_states = True
        self.max_seq_len = config.max_seq_len_r
        self.bert = BertModel.from_pretrained(config.pretrained_bert_path, output_hidden_states=False)
        
        # 用max_seq_len_r作为q和r的最大序列长度
        self.cnn = CNN(self.bert_config.hidden_size, config.feature_size, config.window_sizes, self.max_seq_len)
        
        self.dropout = nn.Dropout(p=config.dropout_rate)
        self.linear = nn.Linear(2 * self.bert_config.hidden_size + 4 * len(config.window_sizes) * config.feature_size, 1)
        self.loss_fn = nn.BCELoss()
        for param in self.bert.parameters():
            param.requires_grad = True     # fine-tune，每个参数都要更新

    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, labels=None):
        """
        :param batch_seqs: input_ids
        :param batch_seq_masks: attention_mask
        :param batch_seq_segments: token_type_ids
        :param labels:
        :return: outputs: (loss, logits, ...)
                 outputs: (logits, ...)
        注：
        hidden_size: bert中的hidden_size
        lstm_hidden_szie: lstm中的hidden_size
        """
        # last_hidden_state, (batch_size, sequence_length, hidden_size)
        last_hidden_state, pooler_output = self.bert(input_ids=batch_seqs,
                                                    attention_mask=batch_seq_masks,
                                                    token_type_ids=batch_seq_segments)[:2]
        
        last_cls_hidden_state = last_hidden_state[:, 0]
        
        q_embeddings = last_hidden_state[:, 1 : self.max_seq_len+2]    # 第一句+[SEP]
        r_embeddings = last_hidden_state[:, self.max_seq_len+2 :]   # 第二句+[SEP]
        
        q_cnn_embeddings = self.cnn(q_embeddings)   # (batch, len(window_sizes)*feature_size)
        r_cnn_embeddings = self.cnn(r_embeddings)   # (batch, len(window_sizes)*feature_size)
        
        q_r_cnn_gap = torch.abs(q_cnn_embeddings - r_cnn_embeddings)
        q_r_cnn_muliple = q_cnn_embeddings * r_cnn_embeddings
        
        # concatenate these four tensor -> (batch_size, 2 * hidden_size 4 * len(window_sizes) * feature_size)
        x = torch.cat([pooler_output, last_cls_hidden_state, q_cnn_embeddings, r_cnn_embeddings, q_r_cnn_gap, q_r_cnn_muliple], dim=1)

        # dropout
        x = self.dropout(x)
        # FC层 -> (batch, 1)
        x = self.linear(x)
        # sigmoid
        output = torch.sigmoid(x)    # (batch_size, 1) 即模型预测每个样本为1的概率

        logits = x
        proba_0 = 1.0 - output     # (batch_size, 1)
        probabilities = torch.cat((proba_0, output), dim=1)   # (batch_size, 2)
        if labels is not None:
            # 有标签，则返回loss, logits, probabilities
            loss = self.loss_fn(output.squeeze(), labels.type(torch.float))
            outputs = (loss, logits, probabilities)
        else:
            # 无标签，则返回logits, probabilities
            outputs = (logits, probabilities)

        return outputs

In [7]:
def writeToLog(path, content):
    with open(path, 'a') as fp:
        fp.write(content)
        fp.write('\n')

## train

In [8]:
def train(train_dataloader, dev_dataloader, bert_tokenizer, best_model_path, output_path, fold, version, epochs=5, patience=2, checkpoint=None):
    # ---------------------- Model definition ---------------------- #
    print("\t* Building model...")
    bulid_time = time.time()
    model = BertModelWithCNN(config).to(device)
    print("\t* Building model time:{:.4f}s".format(time.time()-bulid_time))
    # ---------------------- Preparation for training -------------- #
    param_optimizer = list(model.named_parameters())
    # 这里，指定部分参数不参与权重衰减
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.001
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.85, patience=patience)

    best_score = 0.0    # 记录validation最好的结果
    best_thres = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epoch_count = []
    train_losses = []
    valid_losses = []
    best_model_saved_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')

    # 如果有给参数checkpoint，则继续训练
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        best_score = checkpoint['best_score']
        best_thres = checkpoint['best_thres']
        print("\t* Training will continue on existing model from epoch{}...".format(start_epoch))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epoch_count = checkpoint['epoch_count']
        train_losses = checkpoint['train_losses']
        valid_losses = checkpoint['valid_losses']

    # Compute loss and accuracy before starting (or resuming) training
    # 如果准备start training，这里的valid结果就是预训练BERT（做fine-tune之前）对下游任务的效果
    # 如果准备resuming training，这里的valid结果就是上一次fine-tune的结果
    valid_loss, valid_accuracy, valid_f1, valid_auc, thres = validate(model, dev_dataloader)
    print("\t* Validation loss before training: {:.4f}, accuracy:{:.4f}, "
          "f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".
          format(valid_loss, (valid_accuracy * 100), valid_f1, thres, valid_auc))
    print("\n", 20 * "=", "Training Bert model o device: {}".format(device), 20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs+1):
        print("-> Start epoch {}".format(epoch))
        writeToLog(output_path, "-> Start epoch {}".format(epoch))
        epoch_count.append(epoch)
        # train
        epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc = train_for_one_epoch(model,
                                                                                          train_dataloader,
                                                                                          optimizer,
                                                                                          max_gradient_norm)
        train_losses.append(epoch_loss)
        print("-> Training time:{:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, auc: {:.4f}".
              format(epoch_time, epoch_loss, epoch_accuracy*100, epoch_f1, epoch_auc))
        writeToLog(output_path, "-> Training time:{:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, auc: {:.4f}".
              format(epoch_time, epoch_loss, epoch_accuracy*100, epoch_f1, epoch_auc))
        
        # validation
        valid_loss, valid_accuracy, valid_f1, valid_auc, thres = validate(model, dev_dataloader)
        print("-> Validation loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".
              format(valid_loss, valid_accuracy * 100, valid_f1, thres, valid_auc))
        writeToLog(output_path, "-> Validation loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".
              format(valid_loss, valid_accuracy * 100, valid_f1, thres, valid_auc))
        
        valid_losses.append(valid_loss)
        scheduler.step(valid_auc)

        # 以valid_auc为评测标准
        
        if valid_auc <= best_score:
            patience_counter += 1
        else:
            best_score = valid_auc
            best_thres = thres
            patience_counter = 0
            best_model_saved_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')
            torch.save({
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "best_thres": best_thres,
                "epochs_count": epoch_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, best_model_saved_path)

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
    if patience_counter != 0:
        # 如果最后一个epoch不是最好的模型，则读取之前的最好的模型
        best_checkpoint = torch.load(best_model_saved_path)
        model.load_state_dict(best_checkpoint['model'])
    return model, best_score


def train_for_one_epoch(model, dataloader, optimizer, max_gradient_norm):
    model.train()

    epoch_start_time = time.time()
    running_loss = 0.0   # 记录整个epoch的累加loss
    correct_count = 0.0
    batch_avg_time = 0.0 # 记录该epoch平均batch花费时间
    all_preds = []
    all_pred_probas = []
    all_labels = []

    tqdm_dataloader = tqdm(dataloader)
    for batch_index, data in enumerate(tqdm_dataloader):
        batch_start_time = time.time()
        if is_cuda:
            data = [t.to(device) for t in data if t is not None]
        # 梯度置零
        optimizer.zero_grad()
        seqs, seq_masks, seq_segments, labels = data
        outputs = model(seqs, seq_masks, seq_segments, labels)
        # 回传梯度
        loss = outputs[0]
        logits = outputs[1]
        probabilities = outputs[2]
        # probabilities = nn.functional.softmax(logits, dim=-1)
        loss.backward()
        # 梯度裁剪
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()

        running_loss += loss.item()
        pred = torch.argmax(probabilities, dim=1)
        correct_count = correct_count + (pred == labels).sum().item()
        batch_avg_time += time.time() - batch_start_time
        all_preds.append(pred.cpu())
        all_labels.append(labels.cpu())
        all_pred_probas.append(probabilities.detach().cpu())

        description = "Batch num: {}. Avg. batch proc. time: {:.4f}s, loss: {:.4f}".\
            format(batch_index+1, batch_avg_time/(batch_index+1), running_loss/(batch_index+1))
        tqdm_dataloader.set_description(description)
        del data
        torch.cuda.empty_cache()
        
    all_labels = torch.cat(all_labels)    # 把每个batch的labels平铺成一维tensor (samples, )
    all_preds = torch.cat(all_preds)      # 把每个batch的preds平铺成一维tensor (samples, )
    all_pred_probas = torch.cat(all_pred_probas) # 把每个batch的probas平铺成tensor (samples, 2)

    fpr, tpr, thresholds = roc_curve(all_labels, all_pred_probas[:, 1], pos_label=1)

    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_count / len(dataloader.dataset)
    epoch_time = time.time() - epoch_start_time
    epoch_auc = auc(fpr, tpr)
    epoch_f1 = f1_score(all_labels, all_preds)

    return epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc
#     return epoch_time, epoch_loss, epoch_accuracy, 0, epoch_auc


def validate(model, dataloader):
    model.eval()
    running_loss = 0.0  # 记录整个epoch的累加loss
    correct_count = 0.0
    # all_preds = []
    all_labels = []
    all_pred_probas = []
    tqdm_dataloader = tqdm(dataloader)

    # Deactivate autograd for evaluation
    with torch.no_grad():   # 必须加这个，减少显存的使用
        for batch_index, data in enumerate(tqdm_dataloader):
            if is_cuda:
                data = [t.to(device) for t in data if t is not None]

            seqs, seq_masks, seq_segments, labels = data
            outputs = model(seqs, seq_masks, seq_segments, labels)
            loss = outputs[0]
            logits = outputs[1]
            probabilities = outputs[2]
            # probabilities = nn.functional.softmax(logits, dim=-1)

            running_loss += loss.item()
            # _, pred = torch.max(logits, dim=1)

            # correct_count = correct_count + (pred == labels).sum().item()
            # all_preds.append(pred.cpu())
            all_labels.append(labels.cpu())
            all_pred_probas.append(probabilities.cpu())
            
            del data
            torch.cuda.empty_cache()
                    
    all_labels = torch.cat(all_labels)  # 把每个batch的labels平铺成一维tensor shape: (samples, )
    # all_preds = torch.cat(all_preds)  # 把每个batch的preds平铺成一维tensor shape: (samples, )
    all_pred_probas = torch.cat(all_pred_probas)  # 把每个batch的probas变成tensor（原来是[tensor, tensor, ...]）


    # best_f1, best_thres = search_f1(all_labels, all_pred_probas[:, 1])
    # all_preds = (all_pred_probas[:, 1] > best_thres).type(torch.long)
    all_preds = torch.argmax(all_pred_probas, dim=1)
    correct_count = (all_preds == all_labels).sum().item()

    fpr, tpr, thresholds = roc_curve(all_labels, all_pred_probas[:, 1], pos_label=1)

    valid_loss = running_loss / len(dataloader)
    valid_acc = correct_count / len(dataloader.dataset)
    valid_f1 = f1_score(all_labels, all_preds)
    # valid_f1 = best_f1
    valid_auc = auc(fpr, tpr)
    best_thres = 0
    return valid_loss, valid_acc, valid_f1, valid_auc, best_thres
    # return valid_loss, valid_acc, 0, 0
    
def search_f1(y_true, y_pred):
    """

    :param y_true: 一维tensor
    :param y_pred: 一维tensor，y_pred[i]表示第i个样本在label为1上的预测概率
    :return:
    """
    best_score = 0.0
    best_thres = 0.0
    for i in range(30, 70):
        thres = i / 100
        y_pred_bin = (y_pred > thres)   # 大于thres的为1，小于thres的为0
        # print("y_pred_bin shape:", y_pred_bin.shape)
        score = f1_score(y_true, y_pred_bin)
        if score > best_score:
            best_score = score
            best_thres = thres

    return best_score, best_thres
    
def get_pred_probas(model, dataloader):
    model.eval()
    probas = None

    with torch.no_grad():
        for data in dataloader:
            # 将所有tensors移到GPU上
            if is_cuda:
                data = [t.to(device) for t in data if t is not None]
#             print(len(data))
#             print(data)
#             break
            seqs, seq_masks, seq_segments = data[:3]
            outputs = model(seqs,
                            seq_masks,
                            seq_segments)
            logits = outputs[0]
            probabilities = outputs[1]   # (batch, 2)
            # probabilities = nn.functional.softmax(logits, dim=1)
            # _, pred = torch.max(logits.data, dim=1)

            if probas is None:
                probas = probabilities
            else:
                # 将每个batch的预测结果拼接起来
                probas = torch.cat([probas, probabilities])
                
            del data
            torch.cuda.empty_cache()

    return probas.cpu()

## KFold

In [9]:
def k_fold_cross_val(train_df, test_df, k, bert_tokenizer, best_model_path, output_path, version):
    kf = KFold(n_splits=k)
    test_dataset = QAMatchDataset(test_df, bert_tokenizer, config.max_seq_len_r, mode='test')
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=3, collate_fn=test_dataset.collate_fn)
    dev_labels = []
    dev_probas = []
    k_test_probas = []
    k_best_scores = []
    for fold, (train_idxs, dev_idxs) in enumerate(kf.split(train_df)):
        print("\t* Start "+str(fold)+" fold")
        writeToLog(output_path, "\t* Start "+str(fold)+" fold")
        dev_labels.extend(train_df.iloc[dev_idxs]['label'].tolist())
        # ---------------------- Data loading -------------------------- #
        print("\t* Building dataset...")
        train_dataset = QAMatchDataset(train_df.iloc[train_idxs], bert_tokenizer, config.max_seq_len_r, 'train')
        dev_dataset = QAMatchDataset(train_df.iloc[dev_idxs], bert_tokenizer, config.max_seq_len_r, 'dev')

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=3,
                                      collate_fn=train_dataset.collate_fn)
        dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, num_workers=3,
                                    collate_fn=dev_dataset.collate_fn)
        best_model_fold_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')
        checkpoint = None
        if not(os.path.exists(best_model_fold_path)):
            # 若没有
            model, best_score = train(train_dataloader, dev_dataloader, bert_tokenizer, best_model_path, output_path, 
                                      fold, version, epochs=5, patience=3, checkpoint=None)
        else:
            checkpoint = torch.load(best_model_fold_path)
            model = BertModelWithCNN(config).to(device)
            model.load_state_dict(checkpoint['model'])
            best_score = checkpoint['best_score']
        k_best_scores.append(best_score)
        
        fold_dev_proba = get_pred_probas(model, dev_dataloader)
        fold_test_proba = get_pred_probas(model, test_dataloader)
        
        dev_probas.append(fold_dev_proba)  # (k, len(dev_idxs), 2)
        k_test_probas.append(fold_test_proba) # (k, len(test_dataset), 2)
#         model.to(torch.device('cpu'))
        del model, train_dataloader, dev_dataloader, checkpoint
        torch.cuda.empty_cache() 
        time.sleep(5)
    
    dev_probas = torch.cat(dev_probas)  # (len(train_df), 2)    # 把每一折的验证集的预测结果拼接，得到整个训练集的预测结果
    
    k_test_probas = torch.stack(k_test_probas) # (k, len(test_dataset), 2)， 只是把[tensor, tensor, ... ]转为tensor
#     test_probas = torch.mean(k_test_probas, dim=0)  # (len(test_dataset), 2)  取每一折的平均

    # k折模型加权融合
    k_best_scores = np.array(k_best_scores)              
    k_weights = k_best_scores / k_best_scores.sum()             # (k,)
    k_weights = np.expand_dims(np.expand_dims(k_weights,1),1)   # (k, 1, 1)
    print('k_best_score :', k_best_scores)
    print('k weights :', k_weights)
    k_test_probas = k_test_probas * k_weights               # 广播机制，使得每个模型预测的概率乘上该模型的权重 (k, len(test_dataset), 2)
    test_probas = torch.sum(k_test_probas, dim=0)           # 求和
    # search f1
    best_f1, best_thres = search_f1(dev_labels, dev_probas[:, 1])
    print(best_f1, best_thres)
    test_preds = (test_probas[:, 1] > best_thres).type(torch.long)
    
    # 不用search f1
    # test_preds = torch.argmax(test_probas, dim=1) 
    return test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres

## 操作

In [10]:
model_version = 'V6.5'     # 模型版本
scheme_version = 'V6.5'     # 方案版本
# train_df = pd.read_csv(train_all_path)
train_df = pd.read_csv(train_augmented_V0204_path)
# test_df = pd.read_csv(test_path)
# train_df = pd.read_csv(train_V0_path)
test_df = pd.read_csv(test_V0_path)
k = 5
bert_tokenizer = BertTokenizer.from_pretrained(os.path.join(pretrained_bert_path, 'vocab.txt'))
output_path = os.path.join(root_path, 'output/'+scheme_version+'.txt')

print("\t* K fold training and validating...")
test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres = k_fold_cross_val(train_df, test_df, k, bert_tokenizer, 
                                                                                          best_model_path, output_path, model_version)
dev_preds = (dev_probas[:, 1] > best_thres).type(torch.long)
fpr, tpr, thresholds = roc_curve(dev_labels, dev_probas[:, 1], pos_label=1)
dev_auc = auc(fpr, tpr)
print('dev auc: ',dev_auc)

print("\t* Saving dev result...")
with open(os.path.join(root_path, 'report/'+scheme_version+'_'+'classification_report.txt'), 'w') as fp:
    fp.write(classification_report(dev_labels, dev_preds))
    fp.write('\n')
    fp.write('f1-score: {:.4f}'.format(f1_score(dev_labels, dev_preds)))
    fp.write(' auc: {:.4f}'.format(dev_auc))

print("\t* Predicting...")
test_df['pred'] = test_preds.cpu().numpy()
k_test_probas = k_test_probas.cpu().numpy()

print("\t* Saving test result...")
# 保存预测结果
time_str = '' + time.strftime("%Y%m%d%H%M", time.localtime())                                  
test_df[['dialog_id', 'reply_id', 'pred']].to_csv(os.path.join(root_path,'submission/'+scheme_version+'_'+time_str+'.csv'),
                                                  sep='\t',
                                                  index=0,
                                                  header=0)
# 保存K折预测概率结果
k_test_probas_path = os.path.join(root_path, 'result/'+scheme_version+'_'+str(k)+'_test_probas.npz')
if not os.path.exists(k_test_probas_path):
    np.save(k_test_probas_path, k_test_probas)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


	* K fold training and validating...
	* Start 0 fold
	* Building dataset...
	* Building model...


  0%|          | 0/270 [00:00<?, ?it/s]

	* Building model time:5.6000s


100%|██████████| 270/270 [00:21<00:00, 12.64it/s]
  0%|          | 0/1080 [00:00<?, ?it/s]

	* Validation loss before training: 0.6978, accuracy:49.6410, f1_score: 0.2835, best_thres: 0.0000, auc: 0.4548

-> Start epoch 1


Batch num: 1080. Avg. batch proc. time: 0.2514s, loss: 0.3401: 100%|██████████| 1080/1080 [06:03<00:00,  2.98it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:363.0459s, loss: 0.3401, accuracy: 85.4144%, f1_score: 0.6889, auc: 0.8961


100%|██████████| 270/270 [00:21<00:00, 12.56it/s]


-> Validation loss: 0.2700, accuracy: 88.7885%, f1_score: 0.7768, best_thres: 0.0000, auc: 0.9395


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1080. Avg. batch proc. time: 0.2555s, loss: 0.2255: 100%|██████████| 1080/1080 [06:08<00:00,  2.93it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:368.8642s, loss: 0.2255, accuracy: 90.8322%, f1_score: 0.8160, auc: 0.9571


100%|██████████| 270/270 [00:21<00:00, 12.54it/s]


-> Validation loss: 0.2664, accuracy: 89.5993%, f1_score: 0.7865, best_thres: 0.0000, auc: 0.9451


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1080. Avg. batch proc. time: 0.2547s, loss: 0.1547: 100%|██████████| 1080/1080 [06:07<00:00,  3.66it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:368.0699s, loss: 0.1547, accuracy: 94.1478%, f1_score: 0.8841, auc: 0.9792


100%|██████████| 270/270 [00:21<00:00, 12.53it/s]


-> Validation loss: 0.2801, accuracy: 90.4563%, f1_score: 0.8097, best_thres: 0.0000, auc: 0.9515


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1080. Avg. batch proc. time: 0.2566s, loss: 0.1067: 100%|██████████| 1080/1080 [06:09<00:00,  3.71it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:369.9736s, loss: 0.1067, accuracy: 96.1342%, f1_score: 0.9235, auc: 0.9900


100%|██████████| 270/270 [00:21<00:00, 12.53it/s]
  0%|          | 0/1080 [00:00<?, ?it/s]

-> Validation loss: 0.3175, accuracy: 90.3289%, f1_score: 0.7961, best_thres: 0.0000, auc: 0.9471
-> Start epoch 5


Batch num: 1080. Avg. batch proc. time: 0.2559s, loss: 0.0824: 100%|██████████| 1080/1080 [06:08<00:00,  3.74it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:369.0720s, loss: 0.0824, accuracy: 97.0898%, f1_score: 0.9424, auc: 0.9940


100%|██████████| 270/270 [00:21<00:00, 12.55it/s]


-> Validation loss: 0.3162, accuracy: 91.1513%, f1_score: 0.8154, best_thres: 0.0000, auc: 0.9494
	* Start 1 fold
	* Building dataset...
	* Building model...


  0%|          | 0/270 [00:00<?, ?it/s]

	* Building model time:2.5562s


100%|██████████| 270/270 [00:22<00:00, 12.19it/s]
  0%|          | 0/1080 [00:00<?, ?it/s]

	* Validation loss before training: 0.6259, accuracy:73.1874, f1_score: 0.2879, best_thres: 0.0000, auc: 0.6201

-> Start epoch 1


Batch num: 1080. Avg. batch proc. time: 0.2546s, loss: 0.3327: 100%|██████████| 1080/1080 [06:07<00:00,  3.78it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:367.8882s, loss: 0.3327, accuracy: 86.1064%, f1_score: 0.7050, auc: 0.9003


100%|██████████| 270/270 [00:21<00:00, 12.52it/s]


-> Validation loss: 0.2740, accuracy: 88.8812%, f1_score: 0.7744, best_thres: 0.0000, auc: 0.9349


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1080. Avg. batch proc. time: 0.2563s, loss: 0.2243: 100%|██████████| 1080/1080 [06:10<00:00,  3.66it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:370.4188s, loss: 0.2243, accuracy: 91.1218%, f1_score: 0.8212, auc: 0.9571


100%|██████████| 270/270 [00:21<00:00, 12.49it/s]


-> Validation loss: 0.2720, accuracy: 89.6456%, f1_score: 0.7864, best_thres: 0.0000, auc: 0.9411


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1080. Avg. batch proc. time: 0.2566s, loss: 0.1541: 100%|██████████| 1080/1080 [06:10<00:00,  3.61it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:370.5147s, loss: 0.1541, accuracy: 94.2318%, f1_score: 0.8852, auc: 0.9793


100%|██████████| 270/270 [00:21<00:00, 12.47it/s]


-> Validation loss: 0.2737, accuracy: 90.0973%, f1_score: 0.7988, best_thres: 0.0000, auc: 0.9470


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1080. Avg. batch proc. time: 0.2573s, loss: 0.1088: 100%|██████████| 1080/1080 [06:10<00:00,  3.57it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:370.9790s, loss: 0.1088, accuracy: 96.1169%, f1_score: 0.9230, auc: 0.9891


100%|██████████| 270/270 [00:21<00:00, 12.46it/s]


-> Validation loss: 0.2938, accuracy: 90.5722%, f1_score: 0.8059, best_thres: 0.0000, auc: 0.9485


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1080. Avg. batch proc. time: 0.2571s, loss: 0.0841: 100%|██████████| 1080/1080 [06:10<00:00,  2.91it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:370.8441s, loss: 0.0841, accuracy: 96.9508%, f1_score: 0.9395, auc: 0.9935


100%|██████████| 270/270 [00:22<00:00, 12.12it/s]


-> Validation loss: 0.2967, accuracy: 91.2092%, f1_score: 0.8193, best_thres: 0.0000, auc: 0.9517
	* Start 2 fold
	* Building dataset...
	* Building model...


  0%|          | 0/270 [00:00<?, ?it/s]

	* Building model time:2.1128s


100%|██████████| 270/270 [00:22<00:00, 12.09it/s]
  0%|          | 0/1080 [00:00<?, ?it/s]

	* Validation loss before training: 0.8161, accuracy:24.7047, f1_score: 0.3819, best_thres: 0.0000, auc: 0.3402

-> Start epoch 1


Batch num: 1080. Avg. batch proc. time: 0.2547s, loss: 0.3617: 100%|██████████| 1080/1080 [06:08<00:00,  3.68it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:368.4080s, loss: 0.3617, accuracy: 84.7165%, f1_score: 0.6598, auc: 0.8783


100%|██████████| 270/270 [00:21<00:00, 12.44it/s]


-> Validation loss: 0.2782, accuracy: 88.2326%, f1_score: 0.7533, best_thres: 0.0000, auc: 0.9340


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1080. Avg. batch proc. time: 0.2545s, loss: 0.2436: 100%|██████████| 1080/1080 [06:07<00:00,  3.68it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:367.9894s, loss: 0.2436, accuracy: 90.2849%, f1_score: 0.8008, auc: 0.9486


100%|██████████| 270/270 [00:21<00:00, 12.41it/s]


-> Validation loss: 0.2581, accuracy: 89.6456%, f1_score: 0.7833, best_thres: 0.0000, auc: 0.9454


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1080. Avg. batch proc. time: 0.2540s, loss: 0.1732: 100%|██████████| 1080/1080 [06:06<00:00,  3.76it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:366.5799s, loss: 0.1732, accuracy: 93.3341%, f1_score: 0.8660, auc: 0.9741


100%|██████████| 270/270 [00:21<00:00, 12.46it/s]


-> Validation loss: 0.2734, accuracy: 90.2131%, f1_score: 0.7972, best_thres: 0.0000, auc: 0.9468


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1080. Avg. batch proc. time: 0.2545s, loss: 0.1220: 100%|██████████| 1080/1080 [06:07<00:00,  3.64it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:367.2615s, loss: 0.1220, accuracy: 95.4885%, f1_score: 0.9099, auc: 0.9868


100%|██████████| 270/270 [00:21<00:00, 12.58it/s]


-> Validation loss: 0.2738, accuracy: 90.5837%, f1_score: 0.8043, best_thres: 0.0000, auc: 0.9476


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1080. Avg. batch proc. time: 0.2521s, loss: 0.0940: 100%|██████████| 1080/1080 [06:03<00:00,  3.74it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:363.8840s, loss: 0.0940, accuracy: 96.5686%, f1_score: 0.9315, auc: 0.9918


100%|██████████| 270/270 [00:21<00:00, 12.65it/s]


-> Validation loss: 0.2801, accuracy: 90.9775%, f1_score: 0.8189, best_thres: 0.0000, auc: 0.9483
	* Start 3 fold
	* Building dataset...
	* Building model...


  0%|          | 0/270 [00:00<?, ?it/s]

	* Building model time:2.3203s


100%|██████████| 270/270 [00:22<00:00, 12.20it/s]
  0%|          | 0/1080 [00:00<?, ?it/s]

	* Validation loss before training: 0.9139, accuracy:25.2288, f1_score: 0.3945, best_thres: 0.0000, auc: 0.3313

-> Start epoch 1


Batch num: 1080. Avg. batch proc. time: 0.2504s, loss: 0.3365: 100%|██████████| 1080/1080 [06:01<00:00,  3.75it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:362.0764s, loss: 0.3365, accuracy: 85.8347%, f1_score: 0.6960, auc: 0.8970


100%|██████████| 270/270 [00:21<00:00, 12.57it/s]


-> Validation loss: 0.2714, accuracy: 88.5208%, f1_score: 0.7769, best_thres: 0.0000, auc: 0.9392


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1080. Avg. batch proc. time: 0.2528s, loss: 0.2258: 100%|██████████| 1080/1080 [06:04<00:00,  3.57it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:364.9195s, loss: 0.2258, accuracy: 91.0902%, f1_score: 0.8188, auc: 0.9560


100%|██████████| 270/270 [00:21<00:00, 12.60it/s]


-> Validation loss: 0.2670, accuracy: 89.5749%, f1_score: 0.7928, best_thres: 0.0000, auc: 0.9467


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1080. Avg. batch proc. time: 0.2511s, loss: 0.1542: 100%|██████████| 1080/1080 [06:02<00:00,  3.67it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:362.3658s, loss: 0.1542, accuracy: 94.1567%, f1_score: 0.8828, auc: 0.9792


100%|██████████| 270/270 [00:21<00:00, 12.59it/s]


-> Validation loss: 0.2718, accuracy: 90.6637%, f1_score: 0.8063, best_thres: 0.0000, auc: 0.9517


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1080. Avg. batch proc. time: 0.2511s, loss: 0.1117: 100%|██████████| 1080/1080 [06:02<00:00,  3.76it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:362.8305s, loss: 0.1117, accuracy: 95.9027%, f1_score: 0.9181, auc: 0.9887


100%|██████████| 270/270 [00:21<00:00, 12.57it/s]


-> Validation loss: 0.2856, accuracy: 90.8375%, f1_score: 0.8150, best_thres: 0.0000, auc: 0.9528


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1080. Avg. batch proc. time: 0.2526s, loss: 0.0857: 100%|██████████| 1080/1080 [06:03<00:00,  3.74it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:363.8398s, loss: 0.0857, accuracy: 96.9364%, f1_score: 0.9387, auc: 0.9932


100%|██████████| 270/270 [00:21<00:00, 12.59it/s]


-> Validation loss: 0.3133, accuracy: 90.8027%, f1_score: 0.8176, best_thres: 0.0000, auc: 0.9498
	* Start 4 fold
	* Building dataset...
	* Building model...


  0%|          | 0/270 [00:00<?, ?it/s]

	* Building model time:2.3788s


100%|██████████| 270/270 [00:22<00:00, 12.20it/s]
  0%|          | 0/1080 [00:00<?, ?it/s]

	* Validation loss before training: 0.9451, accuracy:25.5647, f1_score: 0.4070, best_thres: 0.0000, auc: 0.5101

-> Start epoch 1


Batch num: 1080. Avg. batch proc. time: 0.2520s, loss: 0.3339: 100%|██████████| 1080/1080 [06:03<00:00,  3.81it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:363.6945s, loss: 0.3339, accuracy: 85.7449%, f1_score: 0.6933, auc: 0.8984


100%|██████████| 270/270 [00:21<00:00, 12.56it/s]


-> Validation loss: 0.2760, accuracy: 88.5903%, f1_score: 0.7719, best_thres: 0.0000, auc: 0.9365


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1080. Avg. batch proc. time: 0.2510s, loss: 0.2251: 100%|██████████| 1080/1080 [06:02<00:00,  3.76it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:362.7633s, loss: 0.2251, accuracy: 91.1597%, f1_score: 0.8195, auc: 0.9555


100%|██████████| 270/270 [00:21<00:00, 12.61it/s]


-> Validation loss: 0.2746, accuracy: 89.5981%, f1_score: 0.7977, best_thres: 0.0000, auc: 0.9447


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1080. Avg. batch proc. time: 0.2503s, loss: 0.1556: 100%|██████████| 1080/1080 [06:01<00:00,  3.60it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:361.6226s, loss: 0.1556, accuracy: 94.1306%, f1_score: 0.8820, auc: 0.9783


100%|██████████| 270/270 [00:21<00:00, 12.59it/s]


-> Validation loss: 0.2774, accuracy: 90.1541%, f1_score: 0.8156, best_thres: 0.0000, auc: 0.9504


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1080. Avg. batch proc. time: 0.2526s, loss: 0.1110: 100%|██████████| 1080/1080 [06:03<00:00,  3.72it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:363.7692s, loss: 0.1110, accuracy: 95.9288%, f1_score: 0.9183, auc: 0.9887


100%|██████████| 270/270 [00:21<00:00, 12.58it/s]


-> Validation loss: 0.3022, accuracy: 90.3162%, f1_score: 0.8185, best_thres: 0.0000, auc: 0.9506


  0%|          | 0/1080 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1080. Avg. batch proc. time: 0.2523s, loss: 0.0845: 100%|██████████| 1080/1080 [06:03<00:00,  3.70it/s]
  0%|          | 0/270 [00:00<?, ?it/s]

-> Training time:363.8187s, loss: 0.0845, accuracy: 97.1362%, f1_score: 0.9426, auc: 0.9931


100%|██████████| 270/270 [00:21<00:00, 12.61it/s]


-> Validation loss: 0.3171, accuracy: 90.4436%, f1_score: 0.8194, best_thres: 0.0000, auc: 0.9512
k_best_score : [0.95151878 0.95165389 0.94826516 0.9528116  0.95116603]
k weights : [[[0.20009162]]

 [[0.20012003]]

 [[0.19940743]]

 [[0.20036348]]

 [[0.20001744]]]
0.8169916434540391 0.53
dev auc:  0.9500097555117696
	* Saving dev result...
	* Predicting...
	* Saving test result...
