In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import jieba
import re
import os
import time
import gc

from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, AdamW
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score, auc, roc_curve, classification_report

In [2]:
# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
import sys
# Put the parent directory on the import path.
# NOTE(review): presumably so the project-level `config` module imported in the next cell resolves — confirm.
sys.path.append('..')

In [4]:
import config
from config import device, is_cuda

## dataset

In [5]:
class QAMatchDataset(Dataset):
    """Question/reply pair dataset producing fixed-length BERT inputs.

    Each sample is laid out as ``[CLS] question [SEP] reply [SEP]`` where the
    question part is truncated/padded to exactly ``max_seq_len_q`` tokens and the
    reply part to exactly ``max_seq_len_r`` tokens. ``[PAD]`` tokens are placed
    inside each part (before its ``[SEP]``), so every sample has length
    ``max_seq_len_q + max_seq_len_r + 3``.

    Args:
        df: DataFrame with ``question`` and ``reply_content`` columns, plus a
            ``label`` column when ``mode`` is ``'train'`` or ``'dev'``.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_len_q: number of question tokens kept per sample.
        max_seq_len_r: number of reply tokens kept per sample.
        mode: one of ``'train'``, ``'dev'`` or ``'test'``; labels are only read
            in the first two modes.
    """

    def __init__(self, df, tokenizer, max_seq_len_q, max_seq_len_r, mode):
        assert mode in ['train', 'dev', 'test']

        self.mode = mode
        self.tokenizer = tokenizer
        self.df = df
        self.max_seq_len_q = max_seq_len_q
        self.max_seq_len_r = max_seq_len_r

    def _tokenize(self, text):
        """Tokenize one cell value; a missing value (None/NaN) is treated as empty text."""
        if not isinstance(text, str):
            # Empty CSV cells are read by pandas as float NaN, which the tokenizer cannot handle.
            is_missing = text is None or (isinstance(text, float) and text != text)
            text = '' if is_missing else str(text)
        return list(self.tokenizer.tokenize(text))

    @staticmethod
    def _fit_length(tokens, length):
        """Truncate ``tokens`` to ``length`` items, then right-pad with ``[PAD]`` up to ``length``."""
        tokens = tokens[:length]
        return tokens + ['[PAD]'] * (length - len(tokens))

    def __getitem__(self, idx):
        """Return ``(input_ids, segment_ids, label)`` for row ``idx``.

        ``input_ids`` and ``segment_ids`` are 1-D ``torch.long`` tensors; ``label``
        is a 0-d tensor in train/dev mode and ``None`` in test mode.
        """
        row = self.df.iloc[idx]  # single positional lookup instead of one per column
        q_tokens = self._fit_length(self._tokenize(row['question']), self.max_seq_len_q)
        r_tokens = self._fit_length(self._tokenize(row['reply_content']), self.max_seq_len_r)

        tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + r_tokens + ["[SEP]"]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Segment 0 covers [CLS] + question + first [SEP]; segment 1 covers reply + last [SEP].
        segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(r_tokens) + 1)

        if self.mode in ['train', 'dev']:
            label_tensor = torch.tensor(row['label'])
        else:
            label_tensor = None

        # Build integer tensors directly rather than via a float tensor + cast.
        return (torch.tensor(input_ids, dtype=torch.long),
                torch.tensor(segment_ids, dtype=torch.long),
                label_tensor)

    def collate_fn(self, samples):
        """Stack samples into ``(seqs, seq_masks, seq_segments, labels)``.

        ``labels`` is ``None`` in test mode. The attention mask is 1 wherever the
        token id is non-zero.
        """
        seqs = torch.stack([s[0] for s in samples])
        seq_segments = torch.stack([s[1] for s in samples])

        if self.mode in ['train', 'dev']:
            labels = torch.stack([s[2] for s in samples])
        else:
            labels = None

        # NOTE(review): assumes the tokenizer maps '[PAD]' to id 0 (true for the
        # standard BERT vocab) — confirm before using another vocabulary.
        seq_masks = (seqs != 0).long()

        return seqs, seq_masks, seq_segments, labels

    def __len__(self):
        return len(self.df)

## model

In [6]:
class BertModelWithRNN(nn.Module):
    """BERT encoder plus one shared BiLSTM run over the question and reply spans.

    The classifier input concatenates BERT's pooler output, the last-layer [CLS]
    hidden state, the BiLSTM summaries of question and reply, their absolute
    difference and their element-wise product. A single linear unit followed by
    a sigmoid yields the probability of label 1.

    Args:
        params: dict with keys ``pretrained_model_path``, ``max_seq_len_q``,
            ``max_seq_len_r``, ``lstm_hidden_size``, ``num_directions`` and
            ``dropout_rate``.
    """

    def __init__(self, params):
        super(BertModelWithRNN, self).__init__()
        self.bert_config = BertConfig.from_pretrained(os.path.join(params['pretrained_model_path'], 'config.json'))
        self.max_seq_len_q = params['max_seq_len_q']
        self.max_seq_len_r = params['max_seq_len_r']
        self.lstm_hidden_size = params['lstm_hidden_size']
        # NOTE(review): the LSTM below is always bidirectional and the linear layer
        # is sized for two directions, so `num_directions` is expected to be 2.
        self.num_directions = params['num_directions']
        self.bert = BertModel.from_pretrained(params['pretrained_model_path'], output_hidden_states=False)
        # Tied-weight version: the same BiLSTM encodes both question and reply.
        self.bilstm = nn.LSTM(input_size=self.bert_config.hidden_size,
                              hidden_size=self.lstm_hidden_size,
                              bidirectional=True,
                              batch_first=True)

        self.linear = nn.Linear(2 * self.bert_config.hidden_size + 8 * self.lstm_hidden_size, 1)
        self.dropout = nn.Dropout(p=params['dropout_rate'])
        self.loss_fn = nn.BCELoss()
        for param in self.bert.parameters():
            param.requires_grad = True     # fine-tune: every BERT parameter is updated

    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, labels=None):
        """Score a batch of question/reply pairs.

        :param batch_seqs: input_ids
        :param batch_seq_masks: attention_mask
        :param batch_seq_segments: token_type_ids
        :param labels: optional labels, one per sample; when given, the loss is returned too
        :return: ``(loss, logits, probabilities)`` when ``labels`` is given,
                 otherwise ``(logits, probabilities)``. ``logits`` is (batch, 1) and
                 ``probabilities`` is (batch, 2) holding [P(label=0), P(label=1)].

        Note: ``hidden_size`` below is BERT's hidden size, ``lstm_hidden_size`` the LSTM's.
        """
        # last_hidden_state: (batch_size, sequence_length, hidden_size)
        # pooler_output: (batch_size, hidden_size)
        last_hidden_state, pooler_output = self.bert(input_ids=batch_seqs,
                                                     attention_mask=batch_seq_masks,
                                                     token_type_ids=batch_seq_segments)[:2]
        last_cls_hidden_state = last_hidden_state[:, 0]

        q_embeddings = last_hidden_state[:, 1 : self.max_seq_len_q+2]    # includes the middle [SEP]
        r_embeddings = last_hidden_state[:, self.max_seq_len_q+2:]       # includes the trailing [SEP]

        # LSTM output (batch_first): (batch, seq_len, num_directions * lstm_hidden_size);
        # each time step is the forward state concatenated with the backward state.
        q_lstm_output = self.bilstm(q_embeddings)[0]
        r_lstm_output = self.bilstm(r_embeddings)[0]

        # Split the directions: (batch, seq_len, num_directions, lstm_hidden_size)
        batch_size = q_embeddings.size(0)
        q_lstm_output = q_lstm_output.view(batch_size, -1, self.num_directions, self.lstm_hidden_size)
        r_lstm_output = r_lstm_output.view(batch_size, -1, self.num_directions, self.lstm_hidden_size)

        # Summary = [backward state at the first step ; forward state at the last step]
        q_lstm_embeddings = torch.cat([q_lstm_output[:, 0, -1, :], q_lstm_output[:, -1, 0, :]], dim=1)  # (batch, 2 * lstm_hidden_size)
        r_lstm_embeddings = torch.cat([r_lstm_output[:, 0, -1, :], r_lstm_output[:, -1, 0, :]], dim=1)  # (batch, 2 * lstm_hidden_size)

        # |seq_1 - seq_2|
        lstm_seq_gap = torch.abs(q_lstm_embeddings - r_lstm_embeddings)
        # seq_1 * seq_2 (element-wise)
        lstm_seq_multiple = q_lstm_embeddings * r_lstm_embeddings

        # Concatenate all features -> (batch_size, 2 * hidden_size + 8 * lstm_hidden_size)
        x = torch.cat([pooler_output, last_cls_hidden_state,
                       q_lstm_embeddings, r_lstm_embeddings, lstm_seq_gap, lstm_seq_multiple], dim=1)

        x = self.dropout(x)
        # Fully connected layer -> (batch, 1)
        x = self.linear(x)
        output = torch.sigmoid(x)    # (batch_size, 1): predicted probability of label 1

        logits = x
        proba_0 = 1.0 - output     # (batch_size, 1)
        probabilities = torch.cat((proba_0, output), dim=1)   # (batch_size, 2)
        if labels is not None:
            # squeeze(-1) keeps the batch dimension, so a batch of one stays shape (1,)
            # and matches the label shape; a bare squeeze() would collapse it to 0-d.
            loss = self.loss_fn(output.squeeze(-1), labels.type(torch.float))
            outputs = (loss, logits, probabilities)
        else:
            outputs = (logits, probabilities)

        return outputs

In [7]:
def writeToLog(path, content):
    """Append ``content`` followed by a newline to the text file at ``path``."""
    with open(path, 'a') as log_file:
        log_file.writelines((content, '\n'))

## train

In [8]:
def train(train_dataloader, dev_dataloader, params, bert_tokenizer, best_model_path, output_path, fold,
          version, checkpoint=None):
    """Fine-tune a new ``BertModelWithRNN`` for one fold, early-stopping on validation AUC.

    The model is validated once before training and after every epoch. Whenever
    the validation AUC improves, a checkpoint is written to
    ``<best_model_path>/best-fine-tune-<version>-k<fold>.bin``. Training stops after
    ``params['early_stoping']`` consecutive epochs without improvement; if the last
    epoch was not the best one, the best checkpoint is reloaded before returning.

    Args:
        train_dataloader: batches used for optimisation.
        dev_dataloader: batches used for validation / model selection.
        params: hyper-parameter dict; keys read here are ``lr``, ``l2_weight``,
            ``epochs``, ``max_gradient_norm`` and ``early_stoping`` (the model
            constructor reads further keys).
        bert_tokenizer: unused; kept for interface compatibility.
        best_model_path: directory that receives the checkpoint.
        output_path: text log file appended to via ``writeToLog``.
        fold: fold index, used in the checkpoint file name.
        version: model version tag, used in the checkpoint file name.
        checkpoint: unused; kept for interface compatibility.

    Returns:
        ``(model, best_score)`` where ``best_score`` is the best validation AUC
        as a plain Python float.
    """
    # ---------------------- Model definition ---------------------- #
    print("\t* Building model...")
    build_start = time.time()
    model = BertModelWithRNN(params).to(device)
    print("\t* Building model time:{:.4f}s".format(time.time()-build_start))
    # ---------------------- Preparation for training -------------- #
    # All parameters share one optimiser group; L2 regularisation comes from `l2_weight`.
    optimizer = Adam(model.parameters(), lr=params['lr'], weight_decay=params['l2_weight'])
    num_training_steps = len(train_dataloader) * params['epochs']
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

    best_score = 0.0    # best validation AUC seen so far
    best_thres = 0.0
    start_epoch = 1
    # Curves stored alongside the checkpoint
    epoch_count = []
    train_losses = []
    valid_losses = []
    best_model_saved_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')
    checkpoint_saved = False  # becomes True once this run has written a checkpoint

    # Validation before any training step: the score of the freshly built model
    # (pre-trained BERT with randomly initialised LSTM/linear layers).
    valid_loss, valid_accuracy, valid_f1, valid_auc, thres = validate(model, dev_dataloader)
    print("\t* Validation loss before training: {:.4f}, accuracy:{:.4f}, "
          "f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".
          format(valid_loss, (valid_accuracy * 100), valid_f1, thres, valid_auc))
    print("\n", 20 * "=", "Training Bert model o device: {}".format(device), 20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, params['epochs']+1):
        start_msg = "-> Start epoch {}".format(epoch)
        print(start_msg)
        writeToLog(output_path, start_msg)
        epoch_count.append(epoch)

        # train
        epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc = train_for_one_epoch(
            model, train_dataloader, optimizer, scheduler, params['max_gradient_norm'])
        train_losses.append(epoch_loss)
        train_msg = "-> Training time:{:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, auc: {:.4f}".\
            format(epoch_time, epoch_loss, epoch_accuracy*100, epoch_f1, epoch_auc)
        print(train_msg)
        writeToLog(output_path, train_msg)

        # validation
        valid_loss, valid_accuracy, valid_f1, valid_auc, thres = validate(model, dev_dataloader)
        valid_msg = "-> Validation loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".\
            format(valid_loss, valid_accuracy * 100, valid_f1, thres, valid_auc)
        print(valid_msg)
        writeToLog(output_path, valid_msg)
        valid_losses.append(valid_loss)

        if valid_auc <= best_score:
            patience_counter += 1
        else:
            # Store a plain float: a NumPy scalar inside the checkpoint cannot be
            # read back by torch.load in its safe `weights_only` mode.
            best_score = float(valid_auc)
            best_thres = thres
            patience_counter = 0
            torch.save({
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,    # per-fold validation AUC, later used to weight the fold ensemble
                "best_thres": best_thres,
                "epochs_count": epoch_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, best_model_saved_path)
            checkpoint_saved = True

        if patience_counter >= params['early_stoping']:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    if patience_counter != 0 and checkpoint_saved:
        # The last epoch was not the best one: restore the best weights of this run.
        best_checkpoint = torch.load(best_model_saved_path, map_location=device)
        model.load_state_dict(best_checkpoint['model'])
    return model, best_score


def train_for_one_epoch(model, dataloader, optimizer, scheduler, max_gradient_norm):
    """Run one optimisation pass over ``dataloader``.

    For every batch: forward, backward, gradient-norm clipping, optimizer step
    and scheduler step. Predictions are the arg-max over the two-column
    probabilities returned by the model.

    Returns:
        ``(epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc)`` where
        the loss is averaged over batches and the accuracy over
        ``len(dataloader.dataset)``.
    """
    model.train()

    started_at = time.time()
    loss_sum = 0.0          # loss accumulated over the whole epoch
    n_correct = 0.0
    batch_time_sum = 0.0    # total per-batch processing time, for the progress bar
    pred_chunks, proba_chunks, label_chunks = [], [], []

    progress = tqdm(dataloader)
    for step, batch in enumerate(progress, start=1):
        batch_started_at = time.time()
        if is_cuda:
            batch = [t.to(device) for t in batch if t is not None]

        optimizer.zero_grad()
        seqs, seq_masks, seq_segments, labels = batch
        outputs = model(seqs, seq_masks, seq_segments, labels)
        loss = outputs[0]
        probabilities = outputs[2]

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        scheduler.step()

        loss_sum += loss.item()
        pred = torch.argmax(probabilities, dim=1)
        n_correct = n_correct + (pred == labels).sum().item()
        batch_time_sum += time.time() - batch_started_at
        pred_chunks.append(pred.cpu())
        label_chunks.append(labels.cpu())
        proba_chunks.append(probabilities.detach().cpu())

        progress.set_description(
            "Batch num: {}. Avg. batch proc. time: {:.4f}s, loss: {:.4f}".format(
                step, batch_time_sum/step, loss_sum/step))

    # Flatten the per-batch results: labels/preds -> (samples,), probas -> (samples, 2)
    all_labels = torch.cat(label_chunks)
    all_preds = torch.cat(pred_chunks)
    all_pred_probas = torch.cat(proba_chunks)

    fpr, tpr, _ = roc_curve(all_labels, all_pred_probas[:, 1], pos_label=1)

    epoch_loss = loss_sum / len(dataloader)
    epoch_accuracy = n_correct / len(dataloader.dataset)
    epoch_time = time.time() - started_at
    epoch_auc = auc(fpr, tpr)
    epoch_f1 = f1_score(all_labels, all_preds)

    return epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc


def validate(model, dataloader):
    """Evaluate ``model`` on a labelled dataloader with autograd disabled.

    Predictions are the arg-max over the two-column probabilities returned by
    the model.

    Returns:
        ``(valid_loss, valid_acc, valid_f1, valid_auc, best_thres)``; the loss is
        averaged over batches, the accuracy over ``len(dataloader.dataset)``, and
        ``best_thres`` is always 0 (no threshold search is done here).
    """
    model.eval()
    loss_sum = 0.0  # loss accumulated over all batches
    label_chunks = []
    proba_chunks = []
    progress = tqdm(dataloader)

    # No gradients are needed for evaluation; this also reduces GPU memory use.
    with torch.no_grad():
        for batch in progress:
            if is_cuda:
                batch = [t.to(device) for t in batch if t is not None]

            seqs, seq_masks, seq_segments, labels = batch
            outputs = model(seqs, seq_masks, seq_segments, labels)

            loss_sum += outputs[0].item()
            label_chunks.append(labels.cpu())
            proba_chunks.append(outputs[2].cpu())

    all_labels = torch.cat(label_chunks)        # (samples,)
    all_pred_probas = torch.cat(proba_chunks)   # (samples, 2)

    all_preds = torch.argmax(all_pred_probas, dim=1)
    n_correct = (all_preds == all_labels).sum().item()

    fpr, tpr, _ = roc_curve(all_labels, all_pred_probas[:, 1], pos_label=1)

    valid_loss = loss_sum / len(dataloader)
    valid_acc = n_correct / len(dataloader.dataset)
    valid_f1 = f1_score(all_labels, all_preds)
    valid_auc = auc(fpr, tpr)
    best_thres = 0
    return valid_loss, valid_acc, valid_f1, valid_auc, best_thres
    
def search_f1(y_true, y_pred):
    """Grid-search the decision threshold that maximises F1.

    Thresholds 0.30, 0.31, ..., 0.69 are tried in increasing order; a sample is
    predicted positive when its score is strictly greater than the threshold.

    :param y_true: 1-D tensor of ground-truth labels
    :param y_pred: 1-D tensor; ``y_pred[i]`` is the predicted probability of label 1 for sample i
    :return: ``(best_score, best_thres)``; the lowest threshold wins ties, and
             ``(0.0, 0.0)`` is returned when no threshold scores above zero
    """
    best_score, best_thres = 0.0, 0.0
    for thres in (step / 100 for step in range(30, 70)):
        score = f1_score(y_true, y_pred > thres)
        if score > best_score:
            best_score, best_thres = score, thres

    return best_score, best_thres
    
def get_pred_probas(model, dataloader, is_test=False):
    """Run inference over ``dataloader`` and collect the predicted probabilities.

    The model is called without labels, so it returns ``(logits, probabilities)``;
    only the probabilities are kept. Each batch result is moved to the CPU
    immediately and everything is concatenated once at the end.

    Args:
        model: callable as ``model(seqs, seq_masks, seq_segments)``.
        dataloader: iterable of ``(seqs, seq_masks, seq_segments, labels)`` batches;
            ``labels`` is ignored (and may be ``None``) when ``is_test`` is true.
        is_test: when true, only the probabilities are returned.

    Returns:
        ``probas`` (CPU tensor, one row per sample) when ``is_test`` is true,
        otherwise ``(probas, labels)`` as CPU tensors. An empty dataloader yields
        an empty ``(0, 2)`` probability tensor and an empty label tensor.
    """
    model.eval()
    proba_chunks = []
    label_chunks = []
    with torch.no_grad():
        for data in dataloader:
            # Move every tensor of the batch to the GPU (None labels are dropped).
            if is_cuda:
                data = [t.to(device) for t in data if t is not None]

            if is_test:
                seqs, seq_masks, seq_segments = data[:3]
            else:
                seqs, seq_masks, seq_segments, labels = data
                label_chunks.append(labels.cpu())

            outputs = model(seqs, seq_masks, seq_segments)
            proba_chunks.append(outputs[1].cpu())   # (batch, 2)

    # One concatenation at the end instead of re-copying a growing tensor per batch.
    probas = torch.cat(proba_chunks) if proba_chunks else torch.empty((0, 2))
    if is_test:
        return probas
    all_labels = torch.cat(label_chunks) if label_chunks else torch.empty((0,), dtype=torch.long)
    return probas, all_labels

## KFold

In [9]:
def k_fold_cross_val(train_df, test_df, params, k, bert_tokenizer, best_model_path, output_path, version):
    """Train (or reload) one model per fold and build a weighted test ensemble.

    For every fold the model is trained via ``train`` unless its checkpoint file
    ``best-fine-tune-<version>-k<fold>.bin`` already exists in ``best_model_path``,
    in which case the checkpoint is loaded instead. The out-of-fold probabilities
    are written into ``train_df`` (columns ``proba_0`` / ``proba_1``, modified in
    place). Test probabilities of the folds are combined with weights
    proportional to each fold's best validation score, and the decision threshold
    is the one ``search_f1`` finds on the pooled out-of-fold predictions.

    Args:
        train_df: labelled DataFrame; receives the ``proba_0``/``proba_1`` columns.
        test_df: unlabelled DataFrame to predict.
        params: hyper-parameter dict shared with ``train`` and the model.
        k: number of folds.
        bert_tokenizer: tokenizer handed to the datasets.
        best_model_path: directory holding the per-fold checkpoints.
        output_path: text log file appended to via ``writeToLog``.
        version: model version tag used in checkpoint names.

    Returns:
        ``(test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres)``.
        Note that ``k_test_probas`` holds the per-fold test probabilities already
        multiplied by their fold weight.
    """
    kf = KFold(n_splits=k)
    test_dataset = QAMatchDataset(test_df, bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], mode='test')
    test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=3, collate_fn=test_dataset.collate_fn)

    # Make sure the out-of-fold columns exist so they can be filled by position.
    for proba_col in ('proba_0', 'proba_1'):
        if proba_col not in train_df.columns:
            train_df[proba_col] = np.nan
    proba_col_positions = [train_df.columns.get_loc(c) for c in ('proba_0', 'proba_1')]

    dev_labels = []
    dev_probas = []
    k_test_probas = []
    k_best_scores = []
    for fold, (train_idxs, dev_idxs) in enumerate(kf.split(train_df)):
        fold_msg = "\t* Start "+str(fold)+" fold"
        print(fold_msg)
        writeToLog(output_path, fold_msg)
        # ---------------------- Data loading -------------------------- #
        print("\t* Building dataset...")
        train_dataset = QAMatchDataset(train_df.iloc[train_idxs], bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], 'train')
        dev_dataset = QAMatchDataset(train_df.iloc[dev_idxs], bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], 'dev')

        train_dataloader = DataLoader(train_dataset, batch_size=params['batch_size'], num_workers=3,
                                      collate_fn=train_dataset.collate_fn)
        dev_dataloader = DataLoader(dev_dataset, batch_size=512, num_workers=3,
                                    collate_fn=dev_dataset.collate_fn)
        best_model_fold_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')
        checkpoint = None
        if not(os.path.exists(best_model_fold_path)):
            # No checkpoint for this fold yet: train from scratch.
            model, best_score = train(train_dataloader, dev_dataloader, params, bert_tokenizer, best_model_path, output_path,
                                      fold, version, checkpoint=None)
        else:
            # Reuse the checkpoint saved by an earlier run.
            checkpoint = torch.load(best_model_fold_path)
            model = BertModelWithRNN(params).to(device)
            model.load_state_dict(checkpoint['model'])
            best_score = checkpoint['best_score']
        k_best_scores.append(best_score)

        fold_dev_proba, dev_label = get_pred_probas(model, dev_dataloader)
        # KFold yields *positional* indices, so write with iloc: label-based .loc
        # would hit the wrong rows whenever train_df does not carry a default
        # RangeIndex. A single vectorised assignment also replaces the per-row loop.
        train_df.iloc[dev_idxs, proba_col_positions] = fold_dev_proba.numpy().astype(np.float64)
        fold_test_proba = get_pred_probas(model, test_dataloader, is_test=True)

        dev_labels.append(dev_label)
        dev_probas.append(fold_dev_proba)      # per fold: (len(dev_idxs), 2)
        k_test_probas.append(fold_test_proba)  # per fold: (len(test_dataset), 2)
        # Release the fold's model before building the next one.
        del model, train_dataloader, dev_dataloader, checkpoint
        torch.cuda.empty_cache()
        time.sleep(5)

    # Pool the out-of-fold results: together the dev folds cover the whole training set.
    dev_labels = torch.cat(dev_labels)  # (len(train_df),)
    dev_probas = torch.cat(dev_probas)  # (len(train_df), 2)

    k_test_probas = torch.stack(k_test_probas) # (k, len(test_dataset), 2)

    # Weighted fusion of the k fold models, weights proportional to validation score.
    k_best_scores = np.array(k_best_scores)
    k_weights = k_best_scores / k_best_scores.sum()             # (k,)
    k_weights = np.expand_dims(np.expand_dims(k_weights,1),1)   # (k, 1, 1)
    print('k_best_score :', k_best_scores)
    print('k weights :', k_weights)
    k_test_probas = k_test_probas * k_weights               # broadcast: scale each fold's probabilities by its weight
    test_probas = torch.sum(k_test_probas, dim=0)           # weighted sum over folds
    # Pick the decision threshold on the pooled out-of-fold predictions.
    best_f1, best_thres = search_f1(dev_labels, dev_probas[:, 1])
    print(best_f1, best_thres)
    test_preds = (test_probas[:, 1] > best_thres).type(torch.long)

    return test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres

## 操作

In [10]:
model_version = 'FFTPD-5fold-V7.0'     # model (checkpoint) version tag
scheme_version = 'FFTPD-5fold-V7.0'    # scheme version tag used in output file names
train_df = pd.read_csv(config.augmented_V0204_path)
test_df = pd.read_csv(config.test_V0_path)
k = 5
params = {
    'batch_size': 24,
    'epochs': 20,
    'lr': 3e-05,
    'l2_weight':0,
    'weight_decay': 0,
    'dropout_rate': 0.5,
    'momentum': 0,
    'early_stoping':3,
    'patience': 2,
    'lstm_hidden_size': 512,
    'num_directions':2,
    'max_seq_len_q': config.max_seq_len_q,
    'max_seq_len_r': config.max_seq_len_r,
    'max_gradient_norm': 10.0,
    'pretrained_model_path': config.pretrained_roberta_wwm_ext_large_path, 
}

# Load the tokenizer from the model directory: passing the path of the single
# vocab file is deprecated (see the warning in this cell's previous output).
bert_tokenizer = BertTokenizer.from_pretrained(params['pretrained_model_path'])

# Create every output directory up front so a multi-hour run cannot fail at save time.
os.makedirs(config.best_model_path, exist_ok=True)
for sub_dir in ('output', 'report', 'result', 'submission'):
    os.makedirs(os.path.join(config.root_path, sub_dir), exist_ok=True)

output_path = os.path.join(config.root_path, 'output/'+scheme_version+'.txt')

print("\t* K fold training and validating...")
test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres = k_fold_cross_val(train_df, test_df, params, k, 
                                                                                          bert_tokenizer, config.best_model_path, 
                                                                                          output_path, model_version)
# Out-of-fold evaluation over the whole training set.
dev_preds = (dev_probas[:, 1] > best_thres).type(torch.long)
fpr, tpr, thresholds = roc_curve(dev_labels, dev_probas[:, 1], pos_label=1)
dev_auc = auc(fpr, tpr)
print('dev auc: ',dev_auc)

print("\t* Saving dev result...")
with open(os.path.join(config.root_path, 'report/'+scheme_version+'_'+'classification_report.txt'), 'w') as fp:
    fp.write(classification_report(dev_labels, dev_preds))
    fp.write('\n')
    fp.write('f1-score: {:.4f}'.format(f1_score(dev_labels, dev_preds)))
    fp.write(' auc: {:.4f}'.format(dev_auc))

# train_df now carries the out-of-fold proba_0 / proba_1 columns.
train_df.to_csv(os.path.join(config.root_path, 'result/'+scheme_version+'_pred_result.csv'), index=False)

print("\t* Predicting...")
test_df['pred'] = test_preds.cpu().numpy()
k_test_probas = k_test_probas.cpu().numpy()

print("\t* Saving test result...")
# Save the submission file (tab-separated, no header, no index).
time_str = '' + time.strftime("%Y%m%d%H%M", time.localtime())
test_df[['dialog_id', 'reply_id', 'pred']].to_csv(os.path.join(config.root_path,'submission/'+scheme_version+'_'+time_str+'.csv'),
                                                  sep='\t',
                                                  index=False,
                                                  header=False)
# Save the per-fold (weighted) test probabilities.
k_test_probas_path = os.path.join(config.root_path, 'result/'+scheme_version+'_'+str(k)+'_test_probas.npz')
# np.save appends '.npy' to names that do not end with it, so check and write the
# file name that actually lands on disk; otherwise the guard can never match.
k_test_probas_file = k_test_probas_path + '.npy'
if not os.path.exists(k_test_probas_file):
    np.save(k_test_probas_file, k_test_probas)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


	* K fold training and validating...
	* Start 0 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:11.0902s


100%|██████████| 17/17 [00:43<00:00,  2.42s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.7368, accuracy:25.7934, f1_score: 0.3937, best_thres: 0.0000, auc: 0.5189

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.5003s, loss: 0.3425: 100%|██████████| 1439/1439 [12:05<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:725.5674s, loss: 0.3425, accuracy: 85.4231%, f1_score: 0.6868, auc: 0.8929


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.2575, accuracy: 89.4255%, f1_score: 0.7793, best_thres: 0.0000, auc: 0.9433


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.5021s, loss: 0.2114: 100%|██████████| 1439/1439 [12:07<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.0937s, loss: 0.2114, accuracy: 91.8428%, f1_score: 0.8361, auc: 0.9612


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.2461, accuracy: 89.9583%, f1_score: 0.8006, best_thres: 0.0000, auc: 0.9524


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.5017s, loss: 0.1336: 100%|██████████| 1439/1439 [12:07<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:727.3010s, loss: 0.1336, accuracy: 95.1439%, f1_score: 0.9034, auc: 0.9839


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.2631, accuracy: 90.9544%, f1_score: 0.8164, best_thres: 0.0000, auc: 0.9521
-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.5024s, loss: 0.0913: 100%|██████████| 1439/1439 [12:08<00:00,  2.03it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.5138s, loss: 0.0913, accuracy: 96.7539%, f1_score: 0.9358, auc: 0.9924


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.2967, accuracy: 91.2208%, f1_score: 0.8187, best_thres: 0.0000, auc: 0.9521
-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.5026s, loss: 0.0704: 100%|██████████| 1439/1439 [12:08<00:00,  2.03it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.8695s, loss: 0.0704, accuracy: 97.4663%, f1_score: 0.9498, auc: 0.9954


100%|██████████| 17/17 [00:45<00:00,  2.59s/it]


-> Validation loss: 0.3321, accuracy: 90.2942%, f1_score: 0.8170, best_thres: 0.0000, auc: 0.9535


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.5040s, loss: 0.0559: 100%|██████████| 1439/1439 [12:10<00:00,  1.98it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.4813s, loss: 0.0559, accuracy: 98.0020%, f1_score: 0.9603, auc: 0.9968


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.4099, accuracy: 89.5413%, f1_score: 0.8069, best_thres: 0.0000, auc: 0.9536


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.5040s, loss: 0.0461: 100%|██████████| 1439/1439 [12:10<00:00,  2.04it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.5241s, loss: 0.0461, accuracy: 98.4305%, f1_score: 0.9688, auc: 0.9978


100%|██████████| 17/17 [00:45<00:00,  2.58s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3794, accuracy: 91.7188%, f1_score: 0.8352, best_thres: 0.0000, auc: 0.9514
-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.5036s, loss: 0.0379: 100%|██████████| 1439/1439 [12:09<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.9625s, loss: 0.0379, accuracy: 98.7462%, f1_score: 0.9751, auc: 0.9984


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3966, accuracy: 91.7304%, f1_score: 0.8333, best_thres: 0.0000, auc: 0.9534
-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.5030s, loss: 0.0308: 100%|██████████| 1439/1439 [12:09<00:00,  2.02it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.2062s, loss: 0.0308, accuracy: 98.9836%, f1_score: 0.9798, auc: 0.9990


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.4190, accuracy: 91.8346%, f1_score: 0.8330, best_thres: 0.0000, auc: 0.9498
-> Early stopping: patience limit reached, stopping...
	* Start 1 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:7.5745s


100%|██████████| 17/17 [00:45<00:00,  2.53s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.6775, accuracy:66.8172, f1_score: 0.2217, best_thres: 0.0000, auc: 0.4975

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.5045s, loss: 0.3473: 100%|██████████| 1439/1439 [12:11<00:00,  1.98it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.3971s, loss: 0.3473, accuracy: 85.3507%, f1_score: 0.6823, auc: 0.8899


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.2761, accuracy: 88.8117%, f1_score: 0.7675, best_thres: 0.0000, auc: 0.9333


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.5025s, loss: 0.2143: 100%|██████████| 1439/1439 [12:08<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.5185s, loss: 0.2143, accuracy: 91.4924%, f1_score: 0.8291, auc: 0.9604


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.2570, accuracy: 89.1707%, f1_score: 0.7899, best_thres: 0.0000, auc: 0.9477


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.5035s, loss: 0.1409: 100%|██████████| 1439/1439 [12:09<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.0945s, loss: 0.1409, accuracy: 94.6690%, f1_score: 0.8941, auc: 0.9829


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.2800, accuracy: 88.8696%, f1_score: 0.7940, best_thres: 0.0000, auc: 0.9505


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.5037s, loss: 0.1003: 100%|██████████| 1439/1439 [12:10<00:00,  2.02it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.4001s, loss: 0.1003, accuracy: 96.3920%, f1_score: 0.9287, auc: 0.9908


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.2970, accuracy: 89.8772%, f1_score: 0.8055, best_thres: 0.0000, auc: 0.9468
-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.5037s, loss: 0.0771: 100%|██████████| 1439/1439 [12:10<00:00,  2.02it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.2817s, loss: 0.0771, accuracy: 97.2607%, f1_score: 0.9458, auc: 0.9943


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.3167, accuracy: 91.0818%, f1_score: 0.8227, best_thres: 0.0000, auc: 0.9516


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.5037s, loss: 0.0573: 100%|██████████| 1439/1439 [12:10<00:00,  2.00it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.4325s, loss: 0.0573, accuracy: 98.0367%, f1_score: 0.9611, auc: 0.9968


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3508, accuracy: 90.8038%, f1_score: 0.8187, best_thres: 0.0000, auc: 0.9471
-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.5031s, loss: 0.0464: 100%|██████████| 1439/1439 [12:09<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.5327s, loss: 0.0464, accuracy: 98.4218%, f1_score: 0.9687, auc: 0.9978


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3256, accuracy: 91.5219%, f1_score: 0.8288, best_thres: 0.0000, auc: 0.9497
-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.5034s, loss: 0.0398: 100%|██████████| 1439/1439 [12:09<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.9202s, loss: 0.0398, accuracy: 98.6159%, f1_score: 0.9725, auc: 0.9985


100%|██████████| 17/17 [00:45<00:00,  2.58s/it]


-> Validation loss: 0.3878, accuracy: 91.6840%, f1_score: 0.8336, best_thres: 0.0000, auc: 0.9495
-> Early stopping: patience limit reached, stopping...
	* Start 2 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:7.5144s


100%|██████████| 17/17 [00:45<00:00,  2.53s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.6894, accuracy:55.3046, f1_score: 0.3120, best_thres: 0.0000, auc: 0.4940

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.5043s, loss: 0.3524: 100%|██████████| 1439/1439 [12:10<00:00,  2.04it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.1473s, loss: 0.3524, accuracy: 85.1624%, f1_score: 0.6741, auc: 0.8852


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.2747, accuracy: 88.9507%, f1_score: 0.7749, best_thres: 0.0000, auc: 0.9355


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.5025s, loss: 0.2169: 100%|██████████| 1439/1439 [12:08<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.5365s, loss: 0.2169, accuracy: 91.4982%, f1_score: 0.8275, auc: 0.9591


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.2514, accuracy: 90.1784%, f1_score: 0.8013, best_thres: 0.0000, auc: 0.9466


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.5042s, loss: 0.1380: 100%|██████████| 1439/1439 [12:10<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.9994s, loss: 0.1380, accuracy: 94.8746%, f1_score: 0.8978, auc: 0.9832


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.2810, accuracy: 90.6185%, f1_score: 0.8150, best_thres: 0.0000, auc: 0.9493


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.5036s, loss: 0.0939: 100%|██████████| 1439/1439 [12:09<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.0781s, loss: 0.0939, accuracy: 96.5860%, f1_score: 0.9319, auc: 0.9916


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.3107, accuracy: 91.5335%, f1_score: 0.8296, best_thres: 0.0000, auc: 0.9532


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.5045s, loss: 0.0729: 100%|██████████| 1439/1439 [12:11<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.5170s, loss: 0.0729, accuracy: 97.4576%, f1_score: 0.9493, auc: 0.9947


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.2750, accuracy: 91.0007%, f1_score: 0.8225, best_thres: 0.0000, auc: 0.9495
-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.5034s, loss: 0.0569: 100%|██████████| 1439/1439 [12:09<00:00,  2.03it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.8340s, loss: 0.0569, accuracy: 97.9962%, f1_score: 0.9601, auc: 0.9965


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3723, accuracy: 91.5219%, f1_score: 0.8373, best_thres: 0.0000, auc: 0.9516
-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.5037s, loss: 0.0447: 100%|██████████| 1439/1439 [12:10<00:00,  2.04it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.2336s, loss: 0.0447, accuracy: 98.4884%, f1_score: 0.9698, auc: 0.9980


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.3372, accuracy: 91.4408%, f1_score: 0.8349, best_thres: 0.0000, auc: 0.9569


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.5038s, loss: 0.0387: 100%|██████████| 1439/1439 [12:10<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.4638s, loss: 0.0387, accuracy: 98.6651%, f1_score: 0.9733, auc: 0.9983


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3761, accuracy: 91.8578%, f1_score: 0.8385, best_thres: 0.0000, auc: 0.9547
-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.5033s, loss: 0.0344: 100%|██████████| 1439/1439 [12:09<00:00,  2.02it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.6662s, loss: 0.0344, accuracy: 98.8417%, f1_score: 0.9768, auc: 0.9986


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.3260, accuracy: 91.7304%, f1_score: 0.8344, best_thres: 0.0000, auc: 0.9593


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 10


Batch num: 1439. Avg. batch proc. time: 0.5036s, loss: 0.0270: 100%|██████████| 1439/1439 [12:09<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.1285s, loss: 0.0270, accuracy: 99.0589%, f1_score: 0.9812, auc: 0.9990


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3763, accuracy: 91.8230%, f1_score: 0.8347, best_thres: 0.0000, auc: 0.9525
-> Start epoch 11


Batch num: 1439. Avg. batch proc. time: 0.5036s, loss: 0.0222: 100%|██████████| 1439/1439 [12:09<00:00,  2.02it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.9909s, loss: 0.0222, accuracy: 99.2674%, f1_score: 0.9853, auc: 0.9993


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4357, accuracy: 92.4253%, f1_score: 0.8467, best_thres: 0.0000, auc: 0.9559
-> Start epoch 12


Batch num: 1439. Avg. batch proc. time: 0.5034s, loss: 0.0221: 100%|██████████| 1439/1439 [12:09<00:00,  2.03it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.8599s, loss: 0.0221, accuracy: 99.2297%, f1_score: 0.9846, auc: 0.9994


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.3797, accuracy: 92.1705%, f1_score: 0.8467, best_thres: 0.0000, auc: 0.9603


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 13


Batch num: 1439. Avg. batch proc. time: 0.5032s, loss: 0.0144: 100%|██████████| 1439/1439 [12:10<00:00,  2.00it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.5207s, loss: 0.0144, accuracy: 99.4875%, f1_score: 0.9897, auc: 0.9997


100%|██████████| 17/17 [00:45<00:00,  2.58s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4287, accuracy: 92.4485%, f1_score: 0.8481, best_thres: 0.0000, auc: 0.9557
-> Start epoch 14


Batch num: 1439. Avg. batch proc. time: 0.5022s, loss: 0.0149: 100%|██████████| 1439/1439 [12:08<00:00,  2.05it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.4413s, loss: 0.0149, accuracy: 99.4672%, f1_score: 0.9893, auc: 0.9997


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.4116, accuracy: 92.8886%, f1_score: 0.8573, best_thres: 0.0000, auc: 0.9606


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 15


Batch num: 1439. Avg. batch proc. time: 0.5032s, loss: 0.0111: 100%|██████████| 1439/1439 [12:09<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.7843s, loss: 0.0111, accuracy: 99.5627%, f1_score: 0.9912, auc: 0.9998


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4855, accuracy: 92.8307%, f1_score: 0.8538, best_thres: 0.0000, auc: 0.9560
-> Start epoch 16


Batch num: 1439. Avg. batch proc. time: 0.5025s, loss: 0.0088: 100%|██████████| 1439/1439 [12:08<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.8486s, loss: 0.0088, accuracy: 99.6149%, f1_score: 0.9923, auc: 0.9999


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.5079, accuracy: 92.9465%, f1_score: 0.8583, best_thres: 0.0000, auc: 0.9566
-> Start epoch 17


Batch num: 1439. Avg. batch proc. time: 0.5033s, loss: 0.0073: 100%|██████████| 1439/1439 [12:09<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.7826s, loss: 0.0073, accuracy: 99.6583%, f1_score: 0.9931, auc: 1.0000


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.5442, accuracy: 92.9002%, f1_score: 0.8571, best_thres: 0.0000, auc: 0.9539
-> Early stopping: patience limit reached, stopping...
	* Start 3 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:7.2433s


100%|██████████| 17/17 [00:45<00:00,  2.53s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.7082, accuracy:34.8546, f1_score: 0.3954, best_thres: 0.0000, auc: 0.5098

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.5047s, loss: 0.3388: 100%|██████████| 1439/1439 [12:11<00:00,  2.00it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.7620s, loss: 0.3388, accuracy: 85.6783%, f1_score: 0.6892, auc: 0.8950


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.2704, accuracy: 88.6250%, f1_score: 0.7737, best_thres: 0.0000, auc: 0.9381


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.5030s, loss: 0.2043: 100%|██████████| 1439/1439 [12:09<00:00,  1.97it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.2696s, loss: 0.2043, accuracy: 91.9212%, f1_score: 0.8363, auc: 0.9640


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.2509, accuracy: 89.8529%, f1_score: 0.8042, best_thres: 0.0000, auc: 0.9501


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.5043s, loss: 0.1272: 100%|██████████| 1439/1439 [12:10<00:00,  1.98it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.0869s, loss: 0.1272, accuracy: 95.3178%, f1_score: 0.9067, auc: 0.9856


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.2882, accuracy: 90.8027%, f1_score: 0.8159, best_thres: 0.0000, auc: 0.9508


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.5043s, loss: 0.0870: 100%|██████████| 1439/1439 [12:10<00:00,  1.97it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.1117s, loss: 0.0870, accuracy: 96.8380%, f1_score: 0.9368, auc: 0.9930


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.3336, accuracy: 90.9649%, f1_score: 0.8244, best_thres: 0.0000, auc: 0.9539


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.5031s, loss: 0.0658: 100%|██████████| 1439/1439 [12:09<00:00,  1.97it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.4414s, loss: 0.0658, accuracy: 97.7559%, f1_score: 0.9551, auc: 0.9956


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3454, accuracy: 91.4746%, f1_score: 0.8356, best_thres: 0.0000, auc: 0.9534
-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.5043s, loss: 0.0520: 100%|██████████| 1439/1439 [12:10<00:00,  1.93it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.1840s, loss: 0.0520, accuracy: 98.2800%, f1_score: 0.9657, auc: 0.9972


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.3439, accuracy: 91.5904%, f1_score: 0.8360, best_thres: 0.0000, auc: 0.9552


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.5021s, loss: 0.0432: 100%|██████████| 1439/1439 [12:07<00:00,  2.02it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:727.9621s, loss: 0.0432, accuracy: 98.5203%, f1_score: 0.9704, auc: 0.9980


100%|██████████| 17/17 [00:45<00:00,  2.58s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4025, accuracy: 91.1966%, f1_score: 0.8280, best_thres: 0.0000, auc: 0.9536
-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.5043s, loss: 0.0346: 100%|██████████| 1439/1439 [12:10<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.1287s, loss: 0.0346, accuracy: 98.7896%, f1_score: 0.9758, auc: 0.9988


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.3982, accuracy: 91.9147%, f1_score: 0.8398, best_thres: 0.0000, auc: 0.9563


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.5042s, loss: 0.0315: 100%|██████████| 1439/1439 [12:11<00:00,  2.00it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.8316s, loss: 0.0315, accuracy: 98.9836%, f1_score: 0.9796, auc: 0.9987


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.3954, accuracy: 92.2043%, f1_score: 0.8435, best_thres: 0.0000, auc: 0.9607


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 10


Batch num: 1439. Avg. batch proc. time: 0.5039s, loss: 0.0243: 100%|██████████| 1439/1439 [12:10<00:00,  1.98it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.7543s, loss: 0.0243, accuracy: 99.2037%, f1_score: 0.9840, auc: 0.9994


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4339, accuracy: 91.7410%, f1_score: 0.8349, best_thres: 0.0000, auc: 0.9583
-> Start epoch 11


Batch num: 1439. Avg. batch proc. time: 0.5022s, loss: 0.0198: 100%|██████████| 1439/1439 [12:07<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.1375s, loss: 0.0198, accuracy: 99.3311%, f1_score: 0.9866, auc: 0.9995


100%|██████████| 17/17 [00:45<00:00,  2.58s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4252, accuracy: 92.2970%, f1_score: 0.8438, best_thres: 0.0000, auc: 0.9579
-> Start epoch 12


Batch num: 1439. Avg. batch proc. time: 0.5030s, loss: 0.0177: 100%|██████████| 1439/1439 [12:09<00:00,  2.00it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.2526s, loss: 0.0177, accuracy: 99.3717%, f1_score: 0.9874, auc: 0.9997


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.4670, accuracy: 92.1464%, f1_score: 0.8442, best_thres: 0.0000, auc: 0.9548
-> Early stopping: patience limit reached, stopping...
	* Start 4 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:7.2356s


100%|██████████| 17/17 [00:45<00:00,  2.54s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.6702, accuracy:70.4622, f1_score: 0.1785, best_thres: 0.0000, auc: 0.5168

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.5051s, loss: 0.3333: 100%|██████████| 1439/1439 [12:11<00:00,  2.02it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:732.0974s, loss: 0.3333, accuracy: 86.0837%, f1_score: 0.6963, auc: 0.8983


100%|██████████| 17/17 [00:46<00:00,  2.58s/it]


-> Validation loss: 0.2694, accuracy: 89.0305%, f1_score: 0.7720, best_thres: 0.0000, auc: 0.9409


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.5025s, loss: 0.2075: 100%|██████████| 1439/1439 [12:08<00:00,  2.02it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.4996s, loss: 0.2075, accuracy: 91.9183%, f1_score: 0.8361, auc: 0.9622


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.2639, accuracy: 90.2467%, f1_score: 0.8034, best_thres: 0.0000, auc: 0.9501


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.5034s, loss: 0.1317: 100%|██████████| 1439/1439 [12:09<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.8498s, loss: 0.1317, accuracy: 95.2164%, f1_score: 0.9041, auc: 0.9842


100%|██████████| 17/17 [00:46<00:00,  2.58s/it]


-> Validation loss: 0.2829, accuracy: 90.7912%, f1_score: 0.8159, best_thres: 0.0000, auc: 0.9509


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.5024s, loss: 0.0925: 100%|██████████| 1439/1439 [12:08<00:00,  1.98it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.4474s, loss: 0.0925, accuracy: 96.7251%, f1_score: 0.9344, auc: 0.9921


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.3214, accuracy: 90.8259%, f1_score: 0.8249, best_thres: 0.0000, auc: 0.9513


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.5044s, loss: 0.0698: 100%|██████████| 1439/1439 [12:11<00:00,  1.94it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:731.3344s, loss: 0.0698, accuracy: 97.5474%, f1_score: 0.9508, auc: 0.9952


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.3431, accuracy: 91.0112%, f1_score: 0.8225, best_thres: 0.0000, auc: 0.9516


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.5027s, loss: 0.0547: 100%|██████████| 1439/1439 [12:08<00:00,  1.97it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.9590s, loss: 0.0547, accuracy: 98.1352%, f1_score: 0.9626, auc: 0.9972


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.3863, accuracy: 91.7178%, f1_score: 0.8313, best_thres: 0.0000, auc: 0.9550


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.5021s, loss: 0.0455: 100%|██████████| 1439/1439 [12:08<00:00,  1.97it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.7707s, loss: 0.0455, accuracy: 98.4393%, f1_score: 0.9686, auc: 0.9978


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3934, accuracy: 91.8800%, f1_score: 0.8399, best_thres: 0.0000, auc: 0.9549
-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.5009s, loss: 0.0391: 100%|██████████| 1439/1439 [12:06<00:00,  2.01it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:726.2758s, loss: 0.0391, accuracy: 98.6391%, f1_score: 0.9727, auc: 0.9983


100%|██████████| 17/17 [00:45<00:00,  2.54s/it]


-> Validation loss: 0.3483, accuracy: 91.4862%, f1_score: 0.8330, best_thres: 0.0000, auc: 0.9554


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.4984s, loss: 0.0297: 100%|██████████| 1439/1439 [12:02<00:00,  1.98it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:722.7025s, loss: 0.0297, accuracy: 98.9460%, f1_score: 0.9788, auc: 0.9991


100%|██████████| 17/17 [00:45<00:00,  2.56s/it]


-> Validation loss: 0.4087, accuracy: 91.8337%, f1_score: 0.8420, best_thres: 0.0000, auc: 0.9578


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 10


Batch num: 1439. Avg. batch proc. time: 0.5034s, loss: 0.0275: 100%|██████████| 1439/1439 [12:09<00:00,  1.98it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.8124s, loss: 0.0275, accuracy: 99.0358%, f1_score: 0.9806, auc: 0.9992


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]


-> Validation loss: 0.4039, accuracy: 91.9032%, f1_score: 0.8380, best_thres: 0.0000, auc: 0.9603


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 11


Batch num: 1439. Avg. batch proc. time: 0.5032s, loss: 0.0234: 100%|██████████| 1439/1439 [12:09<00:00,  1.98it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:729.3972s, loss: 0.0234, accuracy: 99.2008%, f1_score: 0.9839, auc: 0.9995


100%|██████████| 17/17 [00:46<00:00,  2.59s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4336, accuracy: 91.5557%, f1_score: 0.8387, best_thres: 0.0000, auc: 0.9563
-> Start epoch 12


Batch num: 1439. Avg. batch proc. time: 0.5038s, loss: 0.0190: 100%|██████████| 1439/1439 [12:10<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:730.2837s, loss: 0.0190, accuracy: 99.2819%, f1_score: 0.9856, auc: 0.9996


100%|██████████| 17/17 [00:45<00:00,  2.58s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4552, accuracy: 91.9842%, f1_score: 0.8400, best_thres: 0.0000, auc: 0.9554
-> Start epoch 13


Batch num: 1439. Avg. batch proc. time: 0.5026s, loss: 0.0179: 100%|██████████| 1439/1439 [12:08<00:00,  1.99it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:728.8217s, loss: 0.0179, accuracy: 99.3485%, f1_score: 0.9869, auc: 0.9996


100%|██████████| 17/17 [00:46<00:00,  2.60s/it]


-> Validation loss: 0.4213, accuracy: 91.9032%, f1_score: 0.8400, best_thres: 0.0000, auc: 0.9585
-> Early stopping: patience limit reached, stopping...
k_best_score : [0.95357881 0.95162498 0.96060127 0.96069611 0.9602661 ]
k weights : [[[0.19921144]]

 [[0.19880327]]

 [[0.2006785 ]]

 [[0.20069831]]

 [[0.20060848]]]
0.8334334426835467 0.64
dev auc:  0.9489429355166976
	* Saving dev result...
	* Predicting...
	* Saving test result...
