In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import jieba
import re
import os
import time
import gc

from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, AdamW
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score, auc, roc_curve, classification_report

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
import sys
sys.path.append('..')

In [4]:
import config
from config import device, is_cuda

## dataset

In [5]:
class QAMatchDataset(Dataset):
    def __init__(self, df, tokenizer, max_seq_len_q, max_seq_len_r, mode):
        assert mode in ['train', 'dev', 'test']

        self.mode = mode
        self.tokenizer = tokenizer
        self.df = df
        self.max_seq_len_q = max_seq_len_q
        self.max_seq_len_r = max_seq_len_r
        # self.df = pd.read_csv(file)
        # self.seqs, self.seq_masks, self.seq_segments, self.labels = self.get_input(file)

    def __getitem__(self, idx):
        token_seq_1 = self.df.iloc[idx]['question']
        token_seq_2 = self.df.iloc[idx]['reply_content']
        if self.mode in ['train', 'dev']:
            label_tensor = torch.tensor(self.df.iloc[idx]['label'])
        else:
            label_tensor = None
        token_seq_1 = self.tokenizer.tokenize(token_seq_1)
        token_seq_2 = self.tokenizer.tokenize(token_seq_2)
#         print("token_seq_1:", token_seq_1, "lens:", len(token_seq_1))
#         print("token_seq_2:", token_seq_2, "lens:", len(token_seq_2))

        # truncate
        if len(token_seq_1) > self.max_seq_len_q:
            token_seq_1 = token_seq_1[:self.max_seq_len_q]
        if len(token_seq_2) > self.max_seq_len_r:
            token_seq_2 = token_seq_2[:self.max_seq_len_r]

        seq = ["[CLS]"] + token_seq_1 + ["[SEP]"] + token_seq_2 + ["[SEP]"]
        seq = self.tokenizer.convert_tokens_to_ids(seq)

        seq_segments = [0] * (len(token_seq_1) + 2) + [1] * (len(token_seq_2) + 1)

        return torch.Tensor(seq).type(torch.long), torch.Tensor(seq_segments).type(torch.long), \
            torch.Tensor([len(token_seq_1), len(token_seq_2)]).type(torch.long), label_tensor
            

    def collate_fn(self, samples):
        seqs = [s[0] for s in samples]
        seq_segments = [s[1] for s in samples]
        seq_lens = torch.stack([s[2] for s in samples])

        if self.mode in ['train', 'dev']:
            labels = torch.stack([s[3] for s in samples])
        else:
            labels = None

        seqs = pad_sequence(seqs, batch_first=True)
        seq_segments = pad_sequence(seq_segments, batch_first=True)

        # attention mask处理
        seq_masks = torch.zeros(seqs.shape, dtype=torch.long)
        seq_masks = seq_masks.masked_fill(seqs != 0, 1)

        return seqs, seq_masks, seq_segments, seq_lens, labels
    def __len__(self):
        return len(self.df)

## model

In [6]:
class BertModelTrain(nn.Module):
    def __init__(self, params):
        super(BertModelTrain, self).__init__()        
        self.bert_config = BertConfig.from_pretrained(os.path.join(params['pretrained_model_path'], 'config.json'))
#         self.bert_config.output_hidden_states = True
        self.bert = BertModel.from_pretrained(params['pretrained_model_path'], output_hidden_states=False)
        self.bilstm = nn.LSTM(input_size=self.bert_config.hidden_size, 
                              hidden_size=params['lstm_hidden_size'], 
                              bidirectional=True, 
                              batch_first=True)
        self.linear = nn.Linear(6 * self.bert_config.hidden_size + 4 * params['num_directions'] * params['lstm_hidden_size'], 1)
        self.dropout = nn.Dropout(p=params['dropout_rate'])
        self.loss_fn = nn.BCELoss()
        for param in self.bert.parameters():
            param.requires_grad = True     # fine-tune，每个参数都要更新

    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, batch_seq_lens, labels=None):
        """
        :param batch_seqs: input_ids
        :param batch_seq_masks: attention_mask
        :param batch_seq_segments: token_type_ids
        :param batch_seq_lens: (batch, 2)   记录着每一个样本对中，两个文本的真实长度（即不加[CLS]/[SEP]）
        :param labels:
        :return: outputs: (loss, logits, ...)
                 outputs: (logits, ...)
        注：
        hidden_size: bert中的hidden_size
        lstm_hidden_szie: lstm中的hidden_size
        """
        # q_embeddings: last_hidden_state, (batch_size, sequence_length, hidden_size)
        # pooler_output : 最后一层[CLS]token的hidden state过一个FN+tanh输出的logits  (batch_size, hidden_size)
        q_embeddings, pooler_output = self.bert(input_ids=batch_seqs,
                                                attention_mask=batch_seq_masks,
                                                token_type_ids=batch_seq_segments)[:2]
        # lstm_hidden_size = 512
        # lstm_output:t=1到t=seq_len的最后一层的hidden state, shape(batch_first):(batch, seq_len, num_directions * lstm_hidden_size)
        # t时刻的token向量= ht(->) concat ht(<-) , shape是lstm_hidden_size*2
        lstm_output = self.bilstm(q_embeddings)[0]
        
        # last layer cls hidden state: (batch_size, hidden_size)
        last_cls_hidden_state = q_embeddings[:, 0]
        
        # lstm层：seq_1和seq_2的句向量embedding（max pooling），均是(batch, num_directions * lstm_hidden_size)
        lstm_seq_1_embeddings, lstm_seq_2_embeddings = self.get_seq_embeddings(lstm_output, batch_seq_lens)
        # |seq_1 - seq_2|
        lstm_seq_gap = torch.abs(lstm_seq_1_embeddings - lstm_seq_2_embeddings)
        # seq_1 * seq_2 （对应维度相乘）
        lstm_seq_multiple = lstm_seq_1_embeddings * lstm_seq_2_embeddings
        
        # 倒数第一层：seq_1和seq_2的句向量embedding（max pooling），均是(batch, hidden_size)
        last_seq_1_embeddings, last_seq_2_embeddings = self.get_seq_embeddings(q_embeddings, batch_seq_lens)
        # |seq_1 - seq_2|
        last_seq_gap = torch.abs(last_seq_1_embeddings - last_seq_2_embeddings)
        # seq_1 * seq_2 （对应维度相乘）
        last_seq_multiple = last_seq_1_embeddings * last_seq_2_embeddings
        
        # concatenate this four tensor -> (batch_size, 6 * hidden_size + 4 * num_directions * lstm_hidden_size)
        x = torch.cat([pooler_output, last_cls_hidden_state, 
                       last_seq_1_embeddings, last_seq_gap, last_seq_multiple, last_seq_2_embeddings,
                       lstm_seq_1_embeddings, lstm_seq_gap, lstm_seq_multiple, lstm_seq_2_embeddings], dim=1)
        # dropout
        x = self.dropout(x)
#         x = nn.functional.dropout(x, p=0.6)
        # FC层 -> (batch, 1)
        x = self.linear(x)
        # sigmoid
        output = torch.sigmoid(x)    # (batch_size, 1) 即模型预测每个样本为1的概率

        logits = x
        proba_0 = 1.0 - output     # (batch_size, 1)
        probabilities = torch.cat((proba_0, output), dim=1)   # (batch_size, 2)
        if labels is not None:
            # 有标签，则返回loss, logits, probabilities
            loss = self.loss_fn(output.squeeze(), labels.type(torch.float))
            outputs = (loss, logits, probabilities)
        else:
            # 无标签，则返回logits, probabilities
            outputs = (logits, probabilities)

        return outputs
    
    def get_seq_embeddings(self, q_embeddings, batch_seq_lens):
        """
        获取batch中每个样本对中，seq_1和seq_2的
        :param q_embeddings: last_hidden_state, (batch_size, sequence_length, hidden_size)
        :param batch_seq_lens: (batch, 2)   记录着每一个样本对中，两个文本的真实长度（即不加[CLS]/[SEP]）
        """
        batch_seq_1 = []
        batch_seq_2 = []
        for batch in range(q_embeddings.shape[0]):
            seq_1_end_index = 1 + batch_seq_lens[batch][0]     # # 要加1，否则最后一个字取不到。这里不包含第一句的[SEP]
            seq_2_start_index = seq_1_end_index + 1
            seq_2_end_index = seq_2_start_index + batch_seq_lens[batch][1]   # 第二句不包括最后的[SEP]
            seq_1_embedding = q_embeddings[batch, 1:seq_1_end_index,:]
            seq_2_embedding = q_embeddings[batch, seq_2_start_index:seq_2_end_index,:]
            batch_seq_1.append(torch.max(seq_1_embedding, dim=0)[0])    # 句向量用max pooling
            batch_seq_2.append(torch.max(seq_2_embedding, dim=0)[0])    
        batch_seq_1 = torch.stack(batch_seq_1)
        batch_seq_2 = torch.stack(batch_seq_2)
        return batch_seq_1.to(device), batch_seq_2.to(device)

In [7]:
def writeToLog(path, content):
    with open(path, 'a') as fp:
        fp.write(content)
        fp.write('\n')

## train

In [8]:
def train(train_dataloader, dev_dataloader, params, bert_tokenizer, best_model_path, output_path, fold,
          version, checkpoint=None):
    # ---------------------- Model definition ---------------------- #
    print("\t* Building model...")
    bulid_time = time.time()
    model = BertModelTrain(params).to(device)
    print("\t* Building model time:{:.4f}s".format(time.time()-bulid_time))
    # ---------------------- Preparation for training -------------- #
#     param_optimizer = list(model.named_parameters())
    # 这里，指定部分参数不参与权重衰减
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
#     optimizer_grouped_parameters = [{
#         'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#         'weight_decay': params['weight_decay']
#     }, {
#         'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#         'weight_decay': 0.0
#     }]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': params['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
#     optimizer = AdamW(optimizer_grouped_parameters, lr=params['lr'])
    optimizer = Adam(model.parameters(), lr=params['lr'])
#     optimizer = SGD(model.parameters(),lr=params['lr'],momentum=params['momentum'], weight_decay=params['l2_weight'])
#     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.85, patience=params['patience'])
    num_training_steps = len(train_dataloader) * params['epochs']
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

    best_score = 0.0    # 记录validation最好的结果
    best_thres = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epoch_count = []
    train_losses = []
    valid_losses = []
    train_f1s = []
    valid_f1s = []
    train_aucs = []
    valid_aucs = []
    best_model_saved_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')

    # Compute loss and accuracy before starting (or resuming) training
    # 如果准备start training，这里的valid结果就是预训练BERT（做fine-tune之前）对下游任务的效果
    # 如果准备resuming training，这里的valid结果就是上一次fine-tune的结果
    valid_loss, valid_accuracy, valid_f1, valid_auc, thres = validate(model, dev_dataloader)
    print("\t* Validation loss before training: {:.4f}, accuracy:{:.4f}, "
          "f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".
          format(valid_loss, (valid_accuracy * 100), valid_f1, thres, valid_auc))
    print("\n", 20 * "=", "Training Bert model o device: {}".format(device), 20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, params['epochs']+1):
        print("-> Start epoch {}".format(epoch))
        writeToLog(output_path, "-> Start epoch {}".format(epoch))
        epoch_count.append(epoch)
        # train
        epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc = train_for_one_epoch(model,
                                                                                          train_dataloader,
                                                                                          optimizer,
                                                                                          scheduler,
                                                                                          params['max_gradient_norm'])
        train_losses.append(epoch_loss)
        train_f1s.append(epoch_f1)
        train_aucs.append(epoch_auc)
        print("-> Training time:{:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, auc: {:.4f}".
              format(epoch_time, epoch_loss, epoch_accuracy*100, epoch_f1, epoch_auc))
        writeToLog(output_path, "-> Training time:{:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, auc: {:.4f}".
              format(epoch_time, epoch_loss, epoch_accuracy*100, epoch_f1, epoch_auc))
        
        # validation
        valid_loss, valid_accuracy, valid_f1, valid_auc, thres = validate(model, dev_dataloader)
        print("-> Validation loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".
              format(valid_loss, valid_accuracy * 100, valid_f1, thres, valid_auc))
        writeToLog(output_path, "-> Validation loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".
              format(valid_loss, valid_accuracy * 100, valid_f1, thres, valid_auc))
        
        valid_losses.append(valid_loss)
        valid_f1s.append(valid_f1)
        valid_aucs.append(valid_auc)
#         scheduler.step(valid_loss)
        
        if valid_auc <= best_score:
            patience_counter += 1
        else:
            best_score = valid_auc
            best_thres = thres
            patience_counter = 0
            best_model_saved_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')
            torch.save({
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,    # k fold时以valid auc来看每折的模型的能力，从而对最终的预测结果进行加权平均
                "best_thres": best_thres,
                "epochs_count": epoch_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, best_model_saved_path)

        if patience_counter >= params['early_stoping']:
            print("-> Early stopping: patience limit reached, stopping...")
            break
            
    if patience_counter != 0:
        # 如果最后一个epoch不是最好的模型，则读取之前的最好的模型
        best_checkpoint = torch.load(best_model_saved_path)
        model.load_state_dict(best_checkpoint['model'])
#     return model, best_score, epoch_count, train_losses, train_f1s, train_aucs, valid_losses, valid_f1s, valid_aucs
    return model, best_score

def train_for_one_epoch(model, dataloader, optimizer, scheduler, max_gradient_norm):
    model.train()

    epoch_start_time = time.time()
    running_loss = 0.0   # 记录整个epoch的累加loss
    correct_count = 0.0
    batch_avg_time = 0.0 # 记录该epoch平均batch花费时间
    all_preds = []
    all_pred_probas = []
    all_labels = []

    tqdm_dataloader = tqdm(dataloader)
    for batch_index, data in enumerate(tqdm_dataloader):
        batch_start_time = time.time()
        if is_cuda:
            data = [t.to(device) for t in data if t is not None]
        # 梯度置零
        optimizer.zero_grad()
        seqs, seq_masks, seq_segments, seq_lens, labels = data
        outputs = model(seqs, seq_masks, seq_segments, seq_lens, labels)
        # 回传梯度
        loss = outputs[0]
        logits = outputs[1]
        probabilities = outputs[2]
        # probabilities = nn.functional.softmax(logits, dim=-1)
        loss.backward()
        # 梯度裁剪
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        pred = torch.argmax(probabilities, dim=1)
        correct_count = correct_count + (pred == labels).sum().item()
        batch_avg_time += time.time() - batch_start_time
        all_preds.append(pred.cpu())
        all_labels.append(labels.cpu())
        all_pred_probas.append(probabilities.detach().cpu())

        description = "Batch num: {}. Avg. batch proc. time: {:.4f}s, loss: {:.4f}".\
            format(batch_index+1, batch_avg_time/(batch_index+1), running_loss/(batch_index+1))
        tqdm_dataloader.set_description(description)
#         del data
#         torch.cuda.empty_cache()
        
    all_labels = torch.cat(all_labels)    # 把每个batch的labels平铺成一维tensor (samples, )
    all_preds = torch.cat(all_preds)      # 把每个batch的preds平铺成一维tensor (samples, )
    all_pred_probas = torch.cat(all_pred_probas) # 把每个batch的probas平铺成tensor (samples, 2)

    fpr, tpr, thresholds = roc_curve(all_labels, all_pred_probas[:, 1], pos_label=1)

    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_count / len(dataloader.dataset)
    epoch_time = time.time() - epoch_start_time
    epoch_auc = auc(fpr, tpr)
    epoch_f1 = f1_score(all_labels, all_preds)

    return epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc
#     return epoch_time, epoch_loss, epoch_accuracy, 0, epoch_auc


def validate(model, dataloader):
    model.eval()
    running_loss = 0.0  # 记录整个epoch的累加loss
    correct_count = 0.0
    # all_preds = []
    all_labels = []
    all_pred_probas = []
    tqdm_dataloader = tqdm(dataloader)

    # Deactivate autograd for evaluation
    with torch.no_grad():   # 必须加这个，减少显存的使用
        for batch_index, data in enumerate(tqdm_dataloader):
            if is_cuda:
                data = [t.to(device) for t in data if t is not None]

            seqs, seq_masks, seq_segments, seq_lens, labels = data
            outputs = model(seqs, seq_masks, seq_segments, seq_lens, labels)
            loss = outputs[0]
            logits = outputs[1]
            probabilities = outputs[2]
            # probabilities = nn.functional.softmax(logits, dim=-1)

            running_loss += loss.item()
            # _, pred = torch.max(logits, dim=1)

            # correct_count = correct_count + (pred == labels).sum().item()
            # all_preds.append(pred.cpu())
            all_labels.append(labels.cpu())
            all_pred_probas.append(probabilities.cpu())
            
#             del data
#             torch.cuda.empty_cache()
                    
    all_labels = torch.cat(all_labels)  # 把每个batch的labels平铺成一维tensor shape: (samples, )
    # all_preds = torch.cat(all_preds)  # 把每个batch的preds平铺成一维tensor shape: (samples, )
    all_pred_probas = torch.cat(all_pred_probas)  # 把每个batch的probas变成tensor（原来是[tensor, tensor, ...]）


    # best_f1, best_thres = search_f1(all_labels, all_pred_probas[:, 1])
    # all_preds = (all_pred_probas[:, 1] > best_thres).type(torch.long)
    all_preds = torch.argmax(all_pred_probas, dim=1)
    correct_count = (all_preds == all_labels).sum().item()

    fpr, tpr, thresholds = roc_curve(all_labels, all_pred_probas[:, 1], pos_label=1)

    valid_loss = running_loss / len(dataloader)
    valid_acc = correct_count / len(dataloader.dataset)
    valid_f1 = f1_score(all_labels, all_preds)
    # valid_f1 = best_f1
    valid_auc = auc(fpr, tpr)
    best_thres = 0
    return valid_loss, valid_acc, valid_f1, valid_auc, best_thres
    # return valid_loss, valid_acc, 0, 0
    
def search_f1(y_true, y_pred):
    """

    :param y_true: 一维tensor
    :param y_pred: 一维tensor，y_pred[i]表示第i个样本在label为1上的预测概率
    :return:
    """
    best_score = 0.0
    best_thres = 0.0
    for i in range(30, 70):
        thres = i / 100
        y_pred_bin = (y_pred > thres)   # 大于thres的为1，小于thres的为0
        # print("y_pred_bin shape:", y_pred_bin.shape)
        score = f1_score(y_true, y_pred_bin)
        if score > best_score:
            best_score = score
            best_thres = thres

    return best_score, best_thres
    
def get_pred_probas(model, dataloader, is_test=False):
    model.eval()
    probas = None
    all_labels = []
    with torch.no_grad():
        for data in dataloader:
            # 将所有tensors移到GPU上
            if is_cuda:
                data = [t.to(device) for t in data if t is not None]
                
            if is_test:
                seqs, seq_masks, seq_segments, seq_lens = data[:4]
            else:
                seqs, seq_masks, seq_segments, seq_lens, labels = data
                all_labels.append(labels)
            outputs = model(seqs,
                            seq_masks,
                            seq_segments,
                            seq_lens)
            logits = outputs[0]
            probabilities = outputs[1]   # (batch, 2)

            if probas is None:
                probas = probabilities
            else:
                # 将每个batch的预测结果拼接起来
                probas = torch.cat([probas, probabilities])
    if is_test:
        return probas.cpu()
    all_labels = torch.cat(all_labels)  # (len, )
    return probas.cpu(), all_labels.cpu()

## KFold

In [9]:
def k_fold_cross_val(train_df, test_df, params, k, bert_tokenizer, best_model_path, output_path, version):
    kf = KFold(n_splits=k)
    test_dataset = QAMatchDataset(test_df, bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], mode='test')
    test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=3, collate_fn=test_dataset.collate_fn)
    dev_labels = []
    dev_probas = []
    k_test_probas = []
    k_best_scores = []
    for fold, (train_idxs, dev_idxs) in enumerate(kf.split(train_df)):
        print("\t* Start "+str(fold)+" fold")
        writeToLog(output_path, "\t* Start "+str(fold)+" fold")
#         dev_labels.extend(train_df.iloc[dev_idxs]['label'].tolist())
        # ---------------------- Data loading -------------------------- #
        print("\t* Building dataset...")
        train_dataset = QAMatchDataset(train_df.iloc[train_idxs], bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], 'train')
        dev_dataset = QAMatchDataset(train_df.iloc[dev_idxs], bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], 'dev')

        train_dataloader = DataLoader(train_dataset, batch_size=params['batch_size'], num_workers=3,
                                      collate_fn=train_dataset.collate_fn)
        dev_dataloader = DataLoader(dev_dataset, batch_size=512, num_workers=3,
                                    collate_fn=dev_dataset.collate_fn)
        best_model_fold_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')
        checkpoint = None
        if not(os.path.exists(best_model_fold_path)):
            # 若没有
            
            model, best_score = train(train_dataloader, dev_dataloader, params, bert_tokenizer, best_model_path, output_path, 
                                      fold, version, checkpoint=None)
        else:
            checkpoint = torch.load(best_model_fold_path)
            model = BertModelTrain(params).to(device)
            model.load_state_dict(checkpoint['model'])
            best_score = checkpoint['best_score']
        k_best_scores.append(best_score)
        
        fold_dev_proba, dev_label = get_pred_probas(model, dev_dataloader)
        for idx, proba in zip(dev_idxs, fold_dev_proba):
            train_df.loc[idx, 'proba_0'] = proba[0].item()
            train_df.loc[idx, 'proba_1'] = proba[1].item()
        fold_test_proba = get_pred_probas(model, test_dataloader, is_test=True)
        
        dev_labels.append(dev_label)
        dev_probas.append(fold_dev_proba)  # (k, len(dev_idxs), 2)
        k_test_probas.append(fold_test_proba) # (k, len(test_dataset), 2)
#         model.to(torch.device('cpu'))
        del model, train_dataloader, dev_dataloader, checkpoint
        torch.cuda.empty_cache() 
        time.sleep(5)
    
    dev_labels = torch.cat(dev_labels)  # (len(train_df),)      # 把每一折的验证集的label拼接，得到整个训练集的label
    dev_probas = torch.cat(dev_probas)  # (len(train_df), 2)    # 把每一折的验证集的预测结果拼接，得到整个训练集的预测结果
    
    k_test_probas = torch.stack(k_test_probas) # (k, len(test_dataset), 2)， 只是把[tensor, tensor, ... ]转为tensor
#     test_probas = torch.mean(k_test_probas, dim=0)  # (len(test_dataset), 2)  取每一折的平均

    # k折模型加权融合
    k_best_scores = np.array(k_best_scores)              
    k_weights = k_best_scores / k_best_scores.sum()             # (k,)
    k_weights = np.expand_dims(np.expand_dims(k_weights,1),1)   # (k, 1, 1)
    print('k_best_score :', k_best_scores)
    print('k weights :', k_weights)
    k_test_probas = k_test_probas * k_weights               # 广播机制，使得每个模型预测的概率乘上该模型的权重 (k, len(test_dataset), 2)
    test_probas = torch.sum(k_test_probas, dim=0)           # 求和
    # search f1
    best_f1, best_thres = search_f1(dev_labels, dev_probas[:, 1])
    print(best_f1, best_thres)
    test_preds = (test_probas[:, 1] > best_thres).type(torch.long)
    
    # 不用search f1
    # test_preds = torch.argmax(test_probas, dim=1) 
    return test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres

## 操作

In [10]:
model_version = 'FFTPD-5fold-V5.0.3'     # 模型版本
scheme_version = 'FFTPD-5fold-V5.0.3'     # 方案版本
# train_df = pd.read_csv(train_all_path)
train_df = pd.read_csv(config.augmented_V0204_path)
# test_df = pd.read_csv(test_path)
# train_df = pd.read_csv(train_V0_path)
test_df = pd.read_csv(config.test_V0_path)
k = 5

params = {
    'batch_size': 24,
    'epochs': 20,
    'lr': 2e-05,
    'l2_weight':0.0005,
    'weight_decay': 0.01,
    'dropout_rate': 0.5,
    'momentum': 0.8,
    'early_stoping':3,
    'patience': 2,
    'lstm_hidden_size': 768,
    'num_directions': 2,
    'max_seq_len_q': 24,
    'max_seq_len_r': 52,
    'max_gradient_norm': 10.0,
    'pretrained_model_path': config.pretrained_roberta_wwm_ext_large_path, 
}

bert_tokenizer = BertTokenizer.from_pretrained(os.path.join(params['pretrained_model_path'], 'vocab.txt'))
output_path = os.path.join(config.root_path, 'output/'+scheme_version+'.txt')

print("\t* K fold training and validating...")
test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres = k_fold_cross_val(train_df, test_df, params, k, 
                                                                                          bert_tokenizer, config.best_model_path, 
                                                                                          output_path, model_version)
dev_preds = (dev_probas[:, 1] > best_thres).type(torch.long)
fpr, tpr, thresholds = roc_curve(dev_labels, dev_probas[:, 1], pos_label=1)
dev_auc = auc(fpr, tpr)
print('dev auc: ',dev_auc)

print("\t* Saving dev result...")
with open(os.path.join(config.root_path, 'report/'+scheme_version+'_'+'classification_report.txt'), 'w') as fp:
    fp.write(classification_report(dev_labels, dev_preds))
    fp.write('\n')
    fp.write('f1-score: {:.4f}'.format(f1_score(dev_labels, dev_preds)))
    fp.write(' auc: {:.4f}'.format(dev_auc))

# 保存每一折对验证集的预测概率（0和1都有）
train_df.to_csv(os.path.join(config.root_path, 'result/'+scheme_version+'_pred_result.csv'), index=0)

print("\t* Predicting...")
test_df['pred'] = test_preds.cpu().numpy()
k_test_probas = k_test_probas.cpu().numpy()

print("\t* Saving test result...")
# 保存预测结果
time_str = '' + time.strftime("%Y%m%d%H%M", time.localtime())
test_df[['dialog_id', 'reply_id', 'pred']].to_csv(os.path.join(config.root_path,'submission/'+scheme_version+'_'+time_str+'.csv'),
                                                  sep='\t',
                                                  index=0,
                                                  header=0)
# 保存K折预测概率结果
k_test_probas_path = os.path.join(config.root_path, 'result/'+scheme_version+'_'+str(k)+'_test_probas.npz')
if not os.path.exists(k_test_probas_path):
    np.save(k_test_probas_path, k_test_probas)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


	* K fold training and validating...
	* Start 0 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:11.2278s


100%|██████████| 17/17 [00:40<00:00,  2.28s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.7197, accuracy:75.5733, f1_score: 0.0000, best_thres: 0.0000, auc: 0.5579

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.3729s, loss: 0.3674: 100%|██████████| 1439/1439 [09:02<00:00,  2.83it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:542.6291s, loss: 0.3674, accuracy: 84.5978%, f1_score: 0.6670, auc: 0.8773


100%|██████████| 17/17 [00:42<00:00,  2.34s/it]


-> Validation loss: 0.2670, accuracy: 88.8348%, f1_score: 0.7729, best_thres: 0.0000, auc: 0.9385


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3699s, loss: 0.2158: 100%|██████████| 1439/1439 [08:58<00:00,  2.62it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:538.5128s, loss: 0.2158, accuracy: 91.4780%, f1_score: 0.8278, auc: 0.9598


100%|██████████| 17/17 [00:42<00:00,  2.39s/it]


-> Validation loss: 0.2776, accuracy: 89.1012%, f1_score: 0.7929, best_thres: 0.0000, auc: 0.9462


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3941s, loss: 0.1348: 100%|██████████| 1439/1439 [09:35<00:00,  2.58it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:575.2229s, loss: 0.1348, accuracy: 95.0484%, f1_score: 0.9017, auc: 0.9835


100%|██████████| 17/17 [00:43<00:00,  2.46s/it]


-> Validation loss: 0.3002, accuracy: 90.1436%, f1_score: 0.8088, best_thres: 0.0000, auc: 0.9484


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3765s, loss: 0.0871: 100%|██████████| 1439/1439 [09:08<00:00,  2.70it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:548.3905s, loss: 0.0871, accuracy: 96.9508%, f1_score: 0.9396, auc: 0.9930


100%|██████████| 17/17 [00:41<00:00,  2.34s/it]


-> Validation loss: 0.3137, accuracy: 90.7575%, f1_score: 0.8179, best_thres: 0.0000, auc: 0.9538


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3713s, loss: 0.0592: 100%|██████████| 1439/1439 [09:00<00:00,  2.78it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:540.2462s, loss: 0.0592, accuracy: 97.9093%, f1_score: 0.9586, auc: 0.9967


100%|██████████| 17/17 [00:42<00:00,  2.35s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3957, accuracy: 91.3134%, f1_score: 0.8239, best_thres: 0.0000, auc: 0.9455
-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.3708s, loss: 0.0469: 100%|██████████| 1439/1439 [08:59<00:00,  2.78it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:539.7977s, loss: 0.0469, accuracy: 98.4392%, f1_score: 0.9690, auc: 0.9978


100%|██████████| 17/17 [00:42<00:00,  2.34s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3818, accuracy: 91.5798%, f1_score: 0.8282, best_thres: 0.0000, auc: 0.9526
-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.3733s, loss: 0.0389: 100%|██████████| 1439/1439 [09:03<00:00,  2.74it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:543.2008s, loss: 0.0389, accuracy: 98.7462%, f1_score: 0.9751, auc: 0.9984


100%|██████████| 17/17 [00:41<00:00,  2.34s/it]


-> Validation loss: 0.3712, accuracy: 91.7304%, f1_score: 0.8329, best_thres: 0.0000, auc: 0.9558


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.3704s, loss: 0.0308: 100%|██████████| 1439/1439 [08:58<00:00,  2.76it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:539.0560s, loss: 0.0308, accuracy: 99.0097%, f1_score: 0.9803, auc: 0.9990


100%|██████████| 17/17 [00:41<00:00,  2.34s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4259, accuracy: 91.6956%, f1_score: 0.8336, best_thres: 0.0000, auc: 0.9530
-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.3705s, loss: 0.0270: 100%|██████████| 1439/1439 [08:59<00:00,  2.75it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:539.6955s, loss: 0.0270, accuracy: 99.1226%, f1_score: 0.9826, auc: 0.9991


100%|██████████| 17/17 [00:41<00:00,  2.34s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3877, accuracy: 92.1126%, f1_score: 0.8397, best_thres: 0.0000, auc: 0.9554
-> Start epoch 10


Batch num: 1439. Avg. batch proc. time: 0.3698s, loss: 0.0216: 100%|██████████| 1439/1439 [08:58<00:00,  2.86it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:538.4773s, loss: 0.0216, accuracy: 99.2992%, f1_score: 0.9861, auc: 0.9995


100%|██████████| 17/17 [00:42<00:00,  2.35s/it]


-> Validation loss: 0.4310, accuracy: 91.9852%, f1_score: 0.8400, best_thres: 0.0000, auc: 0.9567


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 11


Batch num: 1439. Avg. batch proc. time: 0.3731s, loss: 0.0181: 100%|██████████| 1439/1439 [09:02<00:00,  2.71it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:542.9087s, loss: 0.0181, accuracy: 99.3977%, f1_score: 0.9880, auc: 0.9997


100%|██████████| 17/17 [00:42<00:00,  2.37s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4404, accuracy: 92.0199%, f1_score: 0.8372, best_thres: 0.0000, auc: 0.9565
-> Start epoch 12


Batch num: 1439. Avg. batch proc. time: 0.3749s, loss: 0.0149: 100%|██████████| 1439/1439 [09:05<00:00,  2.67it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:545.6332s, loss: 0.0149, accuracy: 99.4730%, f1_score: 0.9895, auc: 0.9997


100%|██████████| 17/17 [00:42<00:00,  2.35s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.5080, accuracy: 91.9041%, f1_score: 0.8398, best_thres: 0.0000, auc: 0.9542
-> Start epoch 13


Batch num: 1439. Avg. batch proc. time: 0.3734s, loss: 0.0135: 100%|██████████| 1439/1439 [09:03<00:00,  2.86it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:543.1933s, loss: 0.0135, accuracy: 99.5077%, f1_score: 0.9902, auc: 0.9997


100%|██████████| 17/17 [00:42<00:00,  2.36s/it]


-> Validation loss: 0.4624, accuracy: 92.1937%, f1_score: 0.8442, best_thres: 0.0000, auc: 0.9565
-> Early stopping: patience limit reached, stopping...
	* Start 1 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:7.4417s


100%|██████████| 17/17 [00:40<00:00,  2.30s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.9208, accuracy:24.9247, f1_score: 0.3941, best_thres: 0.0000, auc: 0.5323

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.3852s, loss: 0.3643: 100%|██████████| 1439/1439 [09:20<00:00,  2.76it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:560.5345s, loss: 0.3643, accuracy: 84.3256%, f1_score: 0.6617, auc: 0.8796


100%|██████████| 17/17 [00:40<00:00,  2.31s/it]


-> Validation loss: 0.2902, accuracy: 87.7693%, f1_score: 0.7561, best_thres: 0.0000, auc: 0.9305


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3831s, loss: 0.2323: 100%|██████████| 1439/1439 [09:17<00:00,  2.80it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:557.5900s, loss: 0.2323, accuracy: 90.9162%, f1_score: 0.8152, auc: 0.9535


100%|██████████| 17/17 [00:41<00:00,  2.32s/it]


-> Validation loss: 0.2693, accuracy: 89.8772%, f1_score: 0.7980, best_thres: 0.0000, auc: 0.9447


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3814s, loss: 0.1511: 100%|██████████| 1439/1439 [09:14<00:00,  2.76it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:555.0512s, loss: 0.1511, accuracy: 94.4489%, f1_score: 0.8891, auc: 0.9802


100%|██████████| 17/17 [00:41<00:00,  2.33s/it]


-> Validation loss: 0.2713, accuracy: 90.9775%, f1_score: 0.8128, best_thres: 0.0000, auc: 0.9515


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3798s, loss: 0.0983: 100%|██████████| 1439/1439 [09:12<00:00,  2.80it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:552.7950s, loss: 0.0983, accuracy: 96.5947%, f1_score: 0.9323, auc: 0.9909


100%|██████████| 17/17 [00:40<00:00,  2.31s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3358, accuracy: 91.0702%, f1_score: 0.8179, best_thres: 0.0000, auc: 0.9500
-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3872s, loss: 0.0669: 100%|██████████| 1439/1439 [09:23<00:00,  2.61it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:563.9622s, loss: 0.0669, accuracy: 97.7240%, f1_score: 0.9549, auc: 0.9957


100%|██████████| 17/17 [00:41<00:00,  2.35s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3492, accuracy: 91.0354%, f1_score: 0.8213, best_thres: 0.0000, auc: 0.9506
-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.3870s, loss: 0.0486: 100%|██████████| 1439/1439 [09:23<00:00,  2.67it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:563.6329s, loss: 0.0486, accuracy: 98.3581%, f1_score: 0.9674, auc: 0.9977


100%|██████████| 17/17 [00:41<00:00,  2.33s/it]


-> Validation loss: 0.4058, accuracy: 91.8114%, f1_score: 0.8311, best_thres: 0.0000, auc: 0.9526


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.4062s, loss: 0.0414: 100%|██████████| 1439/1439 [09:54<00:00,  2.57it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:594.5953s, loss: 0.0414, accuracy: 98.6419%, f1_score: 0.9730, auc: 0.9981


100%|██████████| 17/17 [00:42<00:00,  2.40s/it]


-> Validation loss: 0.3796, accuracy: 91.6145%, f1_score: 0.8323, best_thres: 0.0000, auc: 0.9554


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.4039s, loss: 0.0306: 100%|██████████| 1439/1439 [09:51<00:00,  2.55it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:591.2922s, loss: 0.0306, accuracy: 98.9257%, f1_score: 0.9786, auc: 0.9991


100%|██████████| 17/17 [00:42<00:00,  2.36s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4201, accuracy: 91.0007%, f1_score: 0.8243, best_thres: 0.0000, auc: 0.9523
-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.4050s, loss: 0.0270: 100%|██████████| 1439/1439 [09:52<00:00,  2.52it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:592.8185s, loss: 0.0270, accuracy: 99.1052%, f1_score: 0.9822, auc: 0.9992


100%|██████████| 17/17 [00:41<00:00,  2.37s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4524, accuracy: 91.6030%, f1_score: 0.8331, best_thres: 0.0000, auc: 0.9492
-> Start epoch 10


Batch num: 1439. Avg. batch proc. time: 0.4068s, loss: 0.0233: 100%|██████████| 1439/1439 [09:55<00:00,  2.57it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:595.4750s, loss: 0.0233, accuracy: 99.2124%, f1_score: 0.9843, auc: 0.9993


100%|██████████| 17/17 [00:42<00:00,  2.40s/it]


-> Validation loss: 0.4641, accuracy: 91.4408%, f1_score: 0.8306, best_thres: 0.0000, auc: 0.9514
-> Early stopping: patience limit reached, stopping...
	* Start 2 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:12.5361s


100%|██████████| 17/17 [00:44<00:00,  2.46s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 2.3258, accuracy:25.1564, f1_score: 0.4020, best_thres: 0.0000, auc: 0.4619

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.4067s, loss: 0.3782: 100%|██████████| 1439/1439 [09:54<00:00,  2.47it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:595.1355s, loss: 0.3782, accuracy: 84.5544%, f1_score: 0.6739, auc: 0.8833


100%|██████████| 17/17 [00:44<00:00,  2.45s/it]


-> Validation loss: 0.2672, accuracy: 88.8927%, f1_score: 0.7788, best_thres: 0.0000, auc: 0.9381


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.4051s, loss: 0.2065: 100%|██████████| 1439/1439 [09:52<00:00,  2.49it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:592.9643s, loss: 0.2065, accuracy: 91.8892%, f1_score: 0.8352, auc: 0.9635


100%|██████████| 17/17 [00:44<00:00,  2.48s/it]


-> Validation loss: 0.2806, accuracy: 90.0973%, f1_score: 0.7965, best_thres: 0.0000, auc: 0.9423


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.4065s, loss: 0.1284: 100%|██████████| 1439/1439 [09:55<00:00,  2.51it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:595.3150s, loss: 0.1284, accuracy: 95.0918%, f1_score: 0.9017, auc: 0.9853


100%|██████████| 17/17 [00:44<00:00,  2.49s/it]


-> Validation loss: 0.2913, accuracy: 90.3289%, f1_score: 0.8179, best_thres: 0.0000, auc: 0.9496


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3905s, loss: 0.0833: 100%|██████████| 1439/1439 [09:31<00:00,  2.40it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:571.2711s, loss: 0.0833, accuracy: 97.0203%, f1_score: 0.9404, auc: 0.9934


100%|██████████| 17/17 [00:44<00:00,  2.47s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3195, accuracy: 91.1281%, f1_score: 0.8284, best_thres: 0.0000, auc: 0.9495
-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3901s, loss: 0.0616: 100%|██████████| 1439/1439 [09:30<00:00,  2.66it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:570.7680s, loss: 0.0616, accuracy: 97.8369%, f1_score: 0.9567, auc: 0.9964


100%|██████████| 17/17 [00:43<00:00,  2.45s/it]


-> Validation loss: 0.3003, accuracy: 91.0702%, f1_score: 0.8292, best_thres: 0.0000, auc: 0.9547


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.4027s, loss: 0.0472: 100%|██████████| 1439/1439 [09:49<00:00,  2.61it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:589.6823s, loss: 0.0472, accuracy: 98.3379%, f1_score: 0.9667, auc: 0.9979


100%|██████████| 17/17 [00:44<00:00,  2.46s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3820, accuracy: 91.3829%, f1_score: 0.8289, best_thres: 0.0000, auc: 0.9492
-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.4020s, loss: 0.0389: 100%|██████████| 1439/1439 [09:48<00:00,  2.59it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:588.5362s, loss: 0.0389, accuracy: 98.6940%, f1_score: 0.9739, auc: 0.9985


100%|██████████| 17/17 [00:44<00:00,  2.45s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3701, accuracy: 91.5219%, f1_score: 0.8326, best_thres: 0.0000, auc: 0.9495
-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.4048s, loss: 0.0311: 100%|██████████| 1439/1439 [09:52<00:00,  2.45it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:592.3823s, loss: 0.0311, accuracy: 98.9257%, f1_score: 0.9785, auc: 0.9989


100%|██████████| 17/17 [00:44<00:00,  2.45s/it]


-> Validation loss: 0.4368, accuracy: 92.0662%, f1_score: 0.8415, best_thres: 0.0000, auc: 0.9493
-> Early stopping: patience limit reached, stopping...
	* Start 3 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:12.6269s


100%|██████████| 17/17 [00:43<00:00,  2.49s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.6532, accuracy:74.8291, f1_score: 0.0000, best_thres: 0.0000, auc: 0.5317

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.3928s, loss: 0.3624: 100%|██████████| 1439/1439 [09:34<00:00,  2.63it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:574.2233s, loss: 0.3624, accuracy: 84.7546%, f1_score: 0.6682, auc: 0.8786


100%|██████████| 17/17 [00:43<00:00,  2.46s/it]


-> Validation loss: 0.2632, accuracy: 89.0768%, f1_score: 0.7811, best_thres: 0.0000, auc: 0.9407


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3951s, loss: 0.2124: 100%|██████████| 1439/1439 [09:37<00:00,  2.62it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:577.8898s, loss: 0.2124, accuracy: 91.5274%, f1_score: 0.8277, auc: 0.9609


100%|██████████| 17/17 [00:43<00:00,  2.46s/it]


-> Validation loss: 0.2527, accuracy: 90.3510%, f1_score: 0.8178, best_thres: 0.0000, auc: 0.9527


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3936s, loss: 0.1257: 100%|██████████| 1439/1439 [09:35<00:00,  2.62it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:575.6380s, loss: 0.1257, accuracy: 95.4394%, f1_score: 0.9089, auc: 0.9855


100%|██████████| 17/17 [00:43<00:00,  2.47s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.2847, accuracy: 91.3703%, f1_score: 0.8268, best_thres: 0.0000, auc: 0.9504
-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3940s, loss: 0.0825: 100%|██████████| 1439/1439 [09:35<00:00,  2.63it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:576.0913s, loss: 0.0825, accuracy: 97.0986%, f1_score: 0.9420, auc: 0.9936


100%|██████████| 17/17 [00:43<00:00,  2.46s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3023, accuracy: 91.0460%, f1_score: 0.8252, best_thres: 0.0000, auc: 0.9506
-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3932s, loss: 0.0580: 100%|██████████| 1439/1439 [09:34<00:00,  2.65it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:574.8097s, loss: 0.0580, accuracy: 97.9760%, f1_score: 0.9595, auc: 0.9968


100%|██████████| 17/17 [00:43<00:00,  2.45s/it]


-> Validation loss: 0.3225, accuracy: 91.4977%, f1_score: 0.8302, best_thres: 0.0000, auc: 0.9499
-> Early stopping: patience limit reached, stopping...
	* Start 4 fold
	* Building dataset...
	* Building model...


  0%|          | 0/17 [00:00<?, ?it/s]

	* Building model time:12.3212s


100%|██████████| 17/17 [00:42<00:00,  2.33s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.8157, accuracy:74.4585, f1_score: 0.0000, best_thres: 0.0000, auc: 0.5224

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.4026s, loss: 0.3933: 100%|██████████| 1439/1439 [09:48<00:00,  2.47it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:588.7756s, loss: 0.3933, accuracy: 83.6079%, f1_score: 0.6256, auc: 0.8547


100%|██████████| 17/17 [00:42<00:00,  2.30s/it]


-> Validation loss: 0.3090, accuracy: 87.4667%, f1_score: 0.7364, best_thres: 0.0000, auc: 0.9250


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3946s, loss: 0.2413: 100%|██████████| 1439/1439 [09:36<00:00,  2.52it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:576.5042s, loss: 0.2413, accuracy: 90.3460%, f1_score: 0.8009, auc: 0.9499


100%|██████████| 17/17 [00:42<00:00,  2.29s/it]


-> Validation loss: 0.2721, accuracy: 89.9571%, f1_score: 0.7929, best_thres: 0.0000, auc: 0.9439


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3948s, loss: 0.1481: 100%|██████████| 1439/1439 [09:36<00:00,  2.50it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:576.9905s, loss: 0.1481, accuracy: 94.4144%, f1_score: 0.8873, auc: 0.9805


100%|██████████| 17/17 [00:42<00:00,  2.29s/it]


-> Validation loss: 0.2765, accuracy: 90.7796%, f1_score: 0.8182, best_thres: 0.0000, auc: 0.9470


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3941s, loss: 0.0913: 100%|██████████| 1439/1439 [09:35<00:00,  2.50it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:576.1085s, loss: 0.0913, accuracy: 96.8119%, f1_score: 0.9360, auc: 0.9921


100%|██████████| 17/17 [00:42<00:00,  2.29s/it]


-> Validation loss: 0.3118, accuracy: 90.5016%, f1_score: 0.8233, best_thres: 0.0000, auc: 0.9488


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3952s, loss: 0.0693: 100%|██████████| 1439/1439 [09:37<00:00,  2.46it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:577.7246s, loss: 0.0693, accuracy: 97.6864%, f1_score: 0.9536, auc: 0.9948


100%|██████████| 17/17 [00:42<00:00,  2.32s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3375, accuracy: 90.5479%, f1_score: 0.8215, best_thres: 0.0000, auc: 0.9481
-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.3936s, loss: 0.0515: 100%|██████████| 1439/1439 [09:35<00:00,  2.48it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:575.2621s, loss: 0.0515, accuracy: 98.2539%, f1_score: 0.9649, auc: 0.9971


100%|██████████| 17/17 [00:42<00:00,  2.30s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3536, accuracy: 91.5672%, f1_score: 0.8357, best_thres: 0.0000, auc: 0.9458
-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.3934s, loss: 0.0431: 100%|██████████| 1439/1439 [09:34<00:00,  2.51it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

-> Training time:575.0555s, loss: 0.0431, accuracy: 98.5551%, f1_score: 0.9710, auc: 0.9981


100%|██████████| 17/17 [00:42<00:00,  2.30s/it]


-> Validation loss: 0.3707, accuracy: 90.8607%, f1_score: 0.8266, best_thres: 0.0000, auc: 0.9457
-> Early stopping: patience limit reached, stopping...
k_best_score : [0.95665302 0.95536636 0.95472462 0.95265619 0.94878024]
k weights : [[[0.20063272]]

 [[0.20036288]]

 [[0.20022829]]

 [[0.19979449]]

 [[0.19898161]]]
0.8291517323775388 0.66
dev auc:  0.9468933740671792
	* Saving dev result...
	* Predicting...
	* Saving test result...
