In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import jieba
import re
import os
import time
import gc

from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, AdamW
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score, auc, roc_curve, classification_report

In [2]:
# Restrict this process to the GPU with index 1 via the CUDA_VISIBLE_DEVICES environment variable.
# NOTE(review): the device index is hard-coded — confirm it matches the machine this notebook runs on.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
import sys
# Put the parent directory on the module search path.
# Presumably this is where the project-level `config` module imported in the next cell lives — verify.
sys.path.append('..')

In [4]:
import config
from config import device, is_cuda

## dataset

In [5]:
class QAMatchDataset(Dataset):
    def __init__(self, df, tokenizer, max_seq_len_q, max_seq_len_r, mode):
        assert mode in ['train', 'dev', 'test']

        self.mode = mode
        self.tokenizer = tokenizer
        self.df = df
        self.max_seq_len_q = max_seq_len_q
        self.max_seq_len_r = max_seq_len_r
        # self.df = pd.read_csv(file)
        # self.seqs, self.seq_masks, self.seq_segments, self.labels = self.get_input(file)

    def __getitem__(self, idx):
        token_seq_1 = self.df.iloc[idx]['question']
        token_seq_2 = self.df.iloc[idx]['reply_content']
        if self.mode in ['train', 'dev']:
            label_tensor = torch.tensor(self.df.iloc[idx]['label'])
        else:
            label_tensor = None
        token_seq_1 = self.tokenizer.tokenize(token_seq_1)
        token_seq_2 = self.tokenizer.tokenize(token_seq_2)
#         print("token_seq_1:", token_seq_1, "lens:", len(token_seq_1))
#         print("token_seq_2:", token_seq_2, "lens:", len(token_seq_2))

        # truncate
        if len(token_seq_1) > self.max_seq_len_q:
            token_seq_1 = token_seq_1[:self.max_seq_len_q]
        if len(token_seq_2) > self.max_seq_len_r:
            token_seq_2 = token_seq_2[:self.max_seq_len_r]

        seq = ["[CLS]"] + token_seq_1 + ["[SEP]"] + token_seq_2 + ["[SEP]"]
        seq = self.tokenizer.convert_tokens_to_ids(seq)

        seq_segments = [0] * (len(token_seq_1) + 2) + [1] * (len(token_seq_2) + 1)

        return torch.Tensor(seq).type(torch.long), torch.Tensor(seq_segments).type(torch.long), \
            torch.Tensor([len(token_seq_1), len(token_seq_2)]).type(torch.long), label_tensor
            

    def collate_fn(self, samples):
        seqs = [s[0] for s in samples]
        seq_segments = [s[1] for s in samples]
        seq_lens = torch.stack([s[2] for s in samples])

        if self.mode in ['train', 'dev']:
            labels = torch.stack([s[3] for s in samples])
        else:
            labels = None

        seqs = pad_sequence(seqs, batch_first=True)
        seq_segments = pad_sequence(seq_segments, batch_first=True)

        # attention mask处理
        seq_masks = torch.zeros(seqs.shape, dtype=torch.long)
        seq_masks = seq_masks.masked_fill(seqs != 0, 1)

        return seqs, seq_masks, seq_segments, seq_lens, labels
    def __len__(self):
        return len(self.df)

## model

In [6]:
class BertModelTrain(nn.Module):
    """BERT encoder with a single-logit head for sentence-pair matching.

    The head consumes 15 concatenated feature vectors of size hidden_size: the
    pooler output, the [CLS] states of the last two layers, and for each of the
    last three layers the max-pooled vectors of both sentences together with
    their absolute difference and element-wise product.
    """

    def __init__(self, params):
        """
        :param params: dict; reads 'pretrained_model_path' (directory containing
                       config.json and the weights) and 'dropout_rate'
        """
        super(BertModelTrain, self).__init__()
        self.bert_config = BertConfig.from_pretrained(os.path.join(params['pretrained_model_path'], 'config.json'))
        self.bert = BertModel.from_pretrained(params['pretrained_model_path'], output_hidden_states=True)
        self.linear = nn.Linear(15*self.bert_config.hidden_size, 1)
        self.dropout = nn.Dropout(p=params['dropout_rate'])
        self.loss_fn = nn.BCELoss()
        for param in self.bert.parameters():
            param.requires_grad = True     # fine-tune: every encoder parameter is updated

    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, batch_seq_lens, labels=None):
        """
        :param batch_seqs: input_ids
        :param batch_seq_masks: attention_mask
        :param batch_seq_segments: token_type_ids
        :param batch_seq_lens: (batch, 2), true token counts of the two texts of
                               every pair (excluding [CLS]/[SEP])
        :param labels: optional 0/1 targets, (batch,)
        :return: (loss, logits, probabilities) when labels are given,
                 otherwise (logits, probabilities); logits is (batch, 1) and
                 probabilities is (batch, 2) holding [P(label=0), P(label=1)]
        """
        # q_embeddings: last_hidden_state, (batch_size, sequence_length, hidden_size)
        # pooler_output: last-layer [CLS] state passed through a dense layer + tanh, (batch_size, hidden_size)
        # hidden_states: sequence output of the embeddings and of every layer,
        #                each of shape (batch_size, sequence_length, hidden_size)
        q_embeddings, pooler_output, hidden_states = self.bert(input_ids=batch_seqs,
                                                               attention_mask=batch_seq_masks,
                                                               token_type_ids=batch_seq_segments)[:3]
        # sequence output of the last-but-one layer
        lbo_embeddings = hidden_states[-2]
        # sequence output of the third-from-last layer
        ltl_embeddings = hidden_states[-3]
        # last layer cls hidden state: (batch_size, hidden_size)
        last_cls_hidden_state = q_embeddings[:, 0]
        # last but one layer cls hidden state : (batch_size, hidden_size)
        lbo_cls_hidden_state = lbo_embeddings[:, 0]

        # last layer: max-pooled sentence vectors of seq_1 and seq_2, both (batch, hidden_size)
        last_seq_1_embeddings, last_seq_2_embeddings = self.get_seq_embeddings(q_embeddings, batch_seq_lens)
        # |seq_1 - seq_2|
        last_seq_gap = torch.abs(last_seq_1_embeddings - last_seq_2_embeddings)
        # seq_1 * seq_2 (element-wise)
        last_seq_multiple = last_seq_1_embeddings * last_seq_2_embeddings

        # last but one layer
        lbo_seq_1_embeddings, lbo_seq_2_embeddings = self.get_seq_embeddings(lbo_embeddings, batch_seq_lens)
        # |seq_1 - seq_2|
        lbo_seq_gap = torch.abs(lbo_seq_1_embeddings - lbo_seq_2_embeddings)
        # seq_1 * seq_2 (element-wise)
        lbo_seq_multiple = lbo_seq_1_embeddings * lbo_seq_2_embeddings

        # third-from-last layer
        ltl_seq_1_embeddings, ltl_seq_2_embeddings = self.get_seq_embeddings(ltl_embeddings, batch_seq_lens)
        # |seq_1 - seq_2|
        ltl_seq_gap = torch.abs(ltl_seq_1_embeddings - ltl_seq_2_embeddings)
        # seq_1 * seq_2 (element-wise)
        ltl_seq_multiple = ltl_seq_1_embeddings * ltl_seq_2_embeddings

        # concatenate the 15 feature vectors -> (batch_size, 15*hidden_size)
        x = torch.cat([pooler_output, last_cls_hidden_state, lbo_cls_hidden_state,
                       last_seq_1_embeddings, last_seq_gap, last_seq_multiple, last_seq_2_embeddings,
                       lbo_seq_1_embeddings, lbo_seq_gap, lbo_seq_multiple, lbo_seq_2_embeddings,
                       ltl_seq_1_embeddings, ltl_seq_gap, ltl_seq_multiple, ltl_seq_2_embeddings], dim=1)

        # dropout
        x = self.dropout(x)
        # fully connected layer -> (batch, 1)
        x = self.linear(x)
        # sigmoid
        output = torch.sigmoid(x)    # (batch_size, 1): predicted probability of label 1

        logits = x
        proba_0 = 1.0 - output     # (batch_size, 1)
        probabilities = torch.cat((proba_0, output), dim=1)   # (batch_size, 2)
        if labels is not None:
            # with labels: return loss, logits, probabilities
            loss = self.loss_fn(output.squeeze(1), labels.type(torch.float))
            outputs = (loss, logits, probabilities)
        else:
            # without labels: return logits, probabilities
            outputs = (logits, probabilities)

        return outputs

    @staticmethod
    def _max_pool_tokens(token_embeddings):
        """Max over the token axis of a (tokens, hidden) slice; zeros when there are no tokens."""
        if token_embeddings.shape[0] == 0:
            # torch.max cannot reduce over an empty dimension; use a neutral vector instead
            return token_embeddings.new_zeros(token_embeddings.shape[-1])
        return torch.max(token_embeddings, dim=0)[0]

    def get_seq_embeddings(self, q_embeddings, batch_seq_lens):
        """
        Max-pool the token states of seq_1 and seq_2 for every pair in the batch.

        An empty segment (length 0) yields an all-zero vector instead of raising.
        The results stay on the device of q_embeddings.

        :param q_embeddings: layer output, (batch_size, sequence_length, hidden_size)
        :param batch_seq_lens: (batch, 2), true token counts of the two texts of
                               every pair (excluding [CLS]/[SEP])
        :return: (seq_1_vectors, seq_2_vectors), each (batch_size, hidden_size)
        """
        # one host transfer for the whole batch instead of indexing the tensor per element
        lens = batch_seq_lens.tolist() if torch.is_tensor(batch_seq_lens) else batch_seq_lens
        batch_seq_1 = []
        batch_seq_2 = []
        for batch in range(q_embeddings.shape[0]):
            len_1, len_2 = lens[batch]
            seq_1_end_index = 1 + len_1                  # exclusive end; position 0 is [CLS]
            seq_2_start_index = seq_1_end_index + 1      # skip the [SEP] that follows seq_1
            seq_2_end_index = seq_2_start_index + len_2  # exclusive end; the final [SEP] is left out
            batch_seq_1.append(self._max_pool_tokens(q_embeddings[batch, 1:seq_1_end_index, :]))
            batch_seq_2.append(self._max_pool_tokens(q_embeddings[batch, seq_2_start_index:seq_2_end_index, :]))
        return torch.stack(batch_seq_1), torch.stack(batch_seq_2)

In [7]:
def writeToLog(path, content):
    """Append one line of text to a log file.

    :param path: log file path; the file is created if it does not exist
    :param content: text to append (str); a newline is added after it
    """
    # Pin the encoding so the log does not depend on the platform default.
    with open(path, 'a', encoding='utf-8') as fp:
        fp.write(content)
        fp.write('\n')

## train

In [8]:
def train(train_dataloader, dev_dataloader, params, bert_tokenizer, best_model_path, output_path, fold,
          version, checkpoint=None):
    """Fine-tune one fold and return the model restored to its best validation AUC.

    Uses plain Adam with a linear warm-up/decay schedule (params['weight_decay']
    is not applied). After every epoch the model is validated; whenever the
    validation AUC improves, a checkpoint is written to
    ``<best_model_path>/best-fine-tune-<version>-k<fold>.bin``. Training stops
    early after params['early_stoping'] consecutive epochs without improvement.

    :param train_dataloader: DataLoader over the training split
    :param dev_dataloader: DataLoader over the validation split
    :param params: dict; reads 'lr', 'epochs', 'max_gradient_norm', 'early_stoping'
                   (plus whatever BertModelTrain reads)
    :param bert_tokenizer: unused, kept for interface compatibility
    :param best_model_path: directory the best checkpoint is written to
    :param output_path: text log file that per-epoch results are appended to
    :param fold: fold index, used in the checkpoint file name
    :param version: model version string, used in the checkpoint file name
    :param checkpoint: unused, kept for interface compatibility
    :return: (model, best_score) where best_score is the best validation AUC
    """
    # ---------------------- Model definition ---------------------- #
    print("\t* Building model...")
    build_start = time.time()
    model = BertModelTrain(params).to(device)
    print("\t* Building model time:{:.4f}s".format(time.time()-build_start))
    # ---------------------- Preparation for training -------------- #
    optimizer = Adam(model.parameters(), lr=params['lr'])
    num_training_steps = len(train_dataloader) * params['epochs']
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

    best_score = 0.0    # best validation AUC seen so far
    best_thres = 0.0
    start_epoch = 1
    # Data for loss curves plot (stored in the checkpoint)
    epoch_count = []
    train_losses = []
    valid_losses = []
    best_model_saved_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')

    # Validation before any update: shows how the freshly built model performs
    # on the downstream task prior to fine-tuning.
    valid_loss, valid_accuracy, valid_f1, valid_auc, thres = validate(model, dev_dataloader)
    print("\t* Validation loss before training: {:.4f}, accuracy:{:.4f}, "
          "f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".
          format(valid_loss, (valid_accuracy * 100), valid_f1, thres, valid_auc))
    print("\n", 20 * "=", "Training Bert model o device: {}".format(device), 20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, params['epochs']+1):
        print("-> Start epoch {}".format(epoch))
        writeToLog(output_path, "-> Start epoch {}".format(epoch))
        epoch_count.append(epoch)
        # train
        epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc = train_for_one_epoch(model,
                                                                                          train_dataloader,
                                                                                          optimizer,
                                                                                          scheduler,
                                                                                          params['max_gradient_norm'])
        train_losses.append(epoch_loss)
        train_message = "-> Training time:{:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, auc: {:.4f}".\
            format(epoch_time, epoch_loss, epoch_accuracy*100, epoch_f1, epoch_auc)
        print(train_message)
        writeToLog(output_path, train_message)

        # validation
        valid_loss, valid_accuracy, valid_f1, valid_auc, thres = validate(model, dev_dataloader)
        valid_message = "-> Validation loss: {:.4f}, accuracy: {:.4f}%, f1_score: {:.4f}, best_thres: {:.4f}, auc: {:.4f}".\
            format(valid_loss, valid_accuracy * 100, valid_f1, thres, valid_auc)
        print(valid_message)
        writeToLog(output_path, valid_message)

        valid_losses.append(valid_loss)

        if valid_auc <= best_score:
            patience_counter += 1
        else:
            # Plain Python float, so the checkpoint holds no NumPy scalar objects.
            best_score = float(valid_auc)
            best_thres = thres
            patience_counter = 0
            torch.save({
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,    # per-fold validation AUC, later used to weight the folds' predictions
                "best_thres": best_thres,
                "epochs_count": epoch_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, best_model_saved_path)

        if patience_counter >= params['early_stoping']:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    # If the last epoch was not the best one, restore the best checkpoint.
    # The existence check covers the case where no epoch ever improved on the
    # initial score, so nothing was written.
    if patience_counter != 0 and os.path.exists(best_model_saved_path):
        best_checkpoint = torch.load(best_model_saved_path)
        model.load_state_dict(best_checkpoint['model'])
    return model, best_score


def train_for_one_epoch(model, dataloader, optimizer, scheduler, max_gradient_norm):
    """Run one optimisation pass over `dataloader`.

    For every batch: forward, backward, gradient-norm clipping, optimizer step
    and scheduler step. Predictions are collected to compute epoch metrics.

    :param model: model returning (loss, logits, probabilities) when given labels
    :param dataloader: training DataLoader yielding (seqs, masks, segments, lens, labels)
    :param optimizer: optimizer stepped once per batch
    :param scheduler: learning-rate scheduler stepped once per batch
    :param max_gradient_norm: max norm passed to clip_grad_norm_
    :return: (epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc)
    """
    model.train()

    started_at = time.time()
    loss_sum = 0.0          # loss accumulated over the epoch
    n_correct = 0.0
    batch_time_sum = 0.0    # processing time accumulated over the epoch
    pred_chunks, proba_chunks, label_chunks = [], [], []

    progress = tqdm(dataloader)
    for step, batch in enumerate(progress, start=1):
        step_started = time.time()
        if is_cuda:
            batch = [t.to(device) for t in batch if t is not None]
        # reset gradients
        optimizer.zero_grad()
        seqs, seq_masks, seq_segments, seq_lens, labels = batch
        outputs = model(seqs, seq_masks, seq_segments, seq_lens, labels)
        loss, probabilities = outputs[0], outputs[2]
        # back-propagate
        loss.backward()
        # gradient clipping
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        scheduler.step()

        loss_sum += loss.item()
        pred = torch.argmax(probabilities, dim=1)
        n_correct += (pred == labels).sum().item()
        batch_time_sum += time.time() - step_started
        pred_chunks.append(pred.cpu())
        label_chunks.append(labels.cpu())
        proba_chunks.append(probabilities.detach().cpu())

        progress.set_description(
            "Batch num: {}. Avg. batch proc. time: {:.4f}s, loss: {:.4f}".
            format(step, batch_time_sum/step, loss_sum/step))

    all_labels = torch.cat(label_chunks)        # (samples,)
    all_preds = torch.cat(pred_chunks)          # (samples,)
    all_pred_probas = torch.cat(proba_chunks)   # (samples, 2)

    fpr, tpr, thresholds = roc_curve(all_labels, all_pred_probas[:, 1], pos_label=1)

    epoch_loss = loss_sum / len(dataloader)
    epoch_accuracy = n_correct / len(dataloader.dataset)
    epoch_time = time.time() - started_at
    epoch_auc = auc(fpr, tpr)
    epoch_f1 = f1_score(all_labels, all_preds)

    return epoch_time, epoch_loss, epoch_accuracy, epoch_f1, epoch_auc


def validate(model, dataloader):
    """Evaluate `model` on a labelled dataloader without tracking gradients.

    Predictions are the argmax over the two class probabilities. The returned
    threshold is always 0 because the threshold search is not used here.

    :param model: model returning (loss, logits, probabilities) when given labels
    :param dataloader: DataLoader yielding (seqs, masks, segments, lens, labels)
    :return: (valid_loss, valid_acc, valid_f1, valid_auc, best_thres)
    """
    model.eval()
    loss_sum = 0.0  # loss accumulated over all batches
    label_chunks, proba_chunks = [], []
    progress = tqdm(dataloader)

    # no_grad keeps memory usage down during evaluation
    with torch.no_grad():
        for _, batch in enumerate(progress):
            if is_cuda:
                batch = [t.to(device) for t in batch if t is not None]

            seqs, seq_masks, seq_segments, seq_lens, labels = batch
            outputs = model(seqs, seq_masks, seq_segments, seq_lens, labels)
            loss, probabilities = outputs[0], outputs[2]

            loss_sum += loss.item()
            label_chunks.append(labels.cpu())
            proba_chunks.append(probabilities.cpu())

    all_labels = torch.cat(label_chunks)        # (samples,)
    all_pred_probas = torch.cat(proba_chunks)   # (samples, 2)

    all_preds = torch.argmax(all_pred_probas, dim=1)
    n_correct = (all_preds == all_labels).sum().item()

    fpr, tpr, thresholds = roc_curve(all_labels, all_pred_probas[:, 1], pos_label=1)

    valid_loss = loss_sum / len(dataloader)
    valid_acc = n_correct / len(dataloader.dataset)
    valid_f1 = f1_score(all_labels, all_preds)
    valid_auc = auc(fpr, tpr)
    return valid_loss, valid_acc, valid_f1, valid_auc, 0
    
def search_f1(y_true, y_pred):
    """Scan thresholds 0.30, 0.31, ..., 0.69 for the one with the highest F1.

    :param y_true: 1-D tensor of ground-truth labels
    :param y_pred: 1-D tensor, y_pred[i] is the predicted probability that sample i has label 1
    :return: (best_score, best_thres); both stay 0.0 if no threshold scores above 0
    """
    best_score, best_thres = 0.0, 0.0
    for thres in (step / 100 for step in range(30, 70)):
        # strictly greater than the threshold -> positive, otherwise negative
        score = f1_score(y_true, y_pred > thres)
        if score > best_score:
            best_score, best_thres = score, thres

    return best_score, best_thres
    
def get_pred_probas(model, dataloader, is_test=False):
    """Collect the predicted class probabilities for every sample of a dataloader.

    Per-batch results are moved to the CPU immediately and concatenated once at
    the end, rather than growing a tensor with torch.cat on every batch.

    :param model: model returning (logits, probabilities) when called without labels
    :param dataloader: DataLoader yielding (seqs, masks, segments, lens, labels);
                       labels is None for unlabelled data
    :param is_test: True for unlabelled data; only probabilities are returned
    :return: probabilities, CPU tensor (samples, 2), when is_test is True;
             otherwise (probabilities, labels) with labels a CPU tensor (samples,).
             An empty dataloader yields empty tensors.
    """
    model.eval()
    proba_chunks = []
    label_chunks = []
    with torch.no_grad():
        for data in dataloader:
            # move all tensors to the GPU (a None label slot is dropped)
            if is_cuda:
                data = [t.to(device) for t in data if t is not None]

            if is_test:
                seqs, seq_masks, seq_segments, seq_lens = data[:4]
            else:
                seqs, seq_masks, seq_segments, seq_lens, labels = data
                label_chunks.append(labels.cpu())
            outputs = model(seqs,
                            seq_masks,
                            seq_segments,
                            seq_lens)
            proba_chunks.append(outputs[1].cpu())   # (batch, 2)

    probas = torch.cat(proba_chunks) if proba_chunks else torch.empty((0, 2))
    if is_test:
        return probas
    all_labels = torch.cat(label_chunks) if label_chunks else torch.empty(0, dtype=torch.long)  # (len, )
    return probas, all_labels

## KFold

In [9]:
def k_fold_cross_val(train_df, test_df, params, k, bert_tokenizer, best_model_path, output_path, version):
    """Train (or reload) one model per fold and fuse their test predictions.

    For each fold the model is trained unless its checkpoint
    ``best-fine-tune-<version>-k<fold>.bin`` already exists in best_model_path,
    in which case that checkpoint is loaded instead. Out-of-fold probabilities
    are written into ``train_df`` in place (columns 'proba_0' and 'proba_1').
    Test probabilities of the folds are combined with weights proportional to
    each fold's best validation AUC, and the decision threshold is the one that
    maximises F1 on the out-of-fold predictions.

    :param train_df: labelled DataFrame; modified in place (see above)
    :param test_df: unlabelled DataFrame to predict
    :param params: dict; reads 'max_seq_len_q', 'max_seq_len_r', 'batch_size'
                   (plus whatever train() and BertModelTrain read)
    :param k: number of folds
    :param bert_tokenizer: tokenizer handed to the datasets
    :param best_model_path: directory holding the per-fold checkpoints
    :param output_path: text log file
    :param version: model version string used in checkpoint names
    :return: (test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres);
             note that k_test_probas is the *weighted* per-fold tensor
             (k, len(test_df), 2), i.e. already multiplied by the fold weights
    """
    kf = KFold(n_splits=k)
    test_dataset = QAMatchDataset(test_df, bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], mode='test')
    test_dataloader = DataLoader(test_dataset, batch_size=256, num_workers=3, collate_fn=test_dataset.collate_fn)
    dev_labels = []
    dev_probas = []
    k_test_probas = []
    k_best_scores = []

    # Create the out-of-fold columns up front so they can be filled by position.
    for column in ('proba_0', 'proba_1'):
        if column not in train_df.columns:
            train_df[column] = np.nan

    for fold, (train_idxs, dev_idxs) in enumerate(kf.split(train_df)):
        print("\t* Start "+str(fold)+" fold")
        writeToLog(output_path, "\t* Start "+str(fold)+" fold")
        # ---------------------- Data loading -------------------------- #
        print("\t* Building dataset...")
        train_dataset = QAMatchDataset(train_df.iloc[train_idxs], bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], 'train')
        dev_dataset = QAMatchDataset(train_df.iloc[dev_idxs], bert_tokenizer, params['max_seq_len_q'], params['max_seq_len_r'], 'dev')

        # Reshuffle the training samples every epoch; the dev loader keeps its
        # order so its predictions line up with dev_idxs below.
        train_dataloader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, num_workers=3,
                                      collate_fn=train_dataset.collate_fn)
        dev_dataloader = DataLoader(dev_dataset, batch_size=256, num_workers=3,
                                    collate_fn=dev_dataset.collate_fn)
        best_model_fold_path = os.path.join(best_model_path, 'best-fine-tune-'+version+'-k'+str(fold)+'.bin')
        checkpoint = None
        if not(os.path.exists(best_model_fold_path)):
            # no checkpoint for this fold yet: train from the pretrained weights
            model, best_score = train(train_dataloader, dev_dataloader, params, bert_tokenizer, best_model_path, output_path,
                                      fold, version, checkpoint=None)
        else:
            # reuse the fold's existing checkpoint
            checkpoint = torch.load(best_model_fold_path)
            model = BertModelTrain(params).to(device)
            model.load_state_dict(checkpoint['model'])
            best_score = checkpoint['best_score']
        k_best_scores.append(best_score)

        fold_dev_proba, dev_label = get_pred_probas(model, dev_dataloader)
        # dev_idxs are positions (KFold output), so write with iloc rather than
        # label-based loc; one vectorised assignment per column.
        fold_dev_proba_np = fold_dev_proba.numpy()
        train_df.iloc[dev_idxs, train_df.columns.get_loc('proba_0')] = fold_dev_proba_np[:, 0]
        train_df.iloc[dev_idxs, train_df.columns.get_loc('proba_1')] = fold_dev_proba_np[:, 1]
        fold_test_proba = get_pred_probas(model, test_dataloader, is_test=True)

        dev_labels.append(dev_label)
        dev_probas.append(fold_dev_proba)  # (k, len(dev_idxs), 2)
        k_test_probas.append(fold_test_proba) # (k, len(test_dataset), 2)
        # release the fold's model before building the next one
        del model, train_dataloader, dev_dataloader, checkpoint
        torch.cuda.empty_cache()
        time.sleep(5)

    dev_labels = torch.cat(dev_labels)  # (len(train_df),)   labels of all folds' dev splits
    dev_probas = torch.cat(dev_probas)  # (len(train_df), 2) out-of-fold predictions

    k_test_probas = torch.stack(k_test_probas) # (k, len(test_dataset), 2)

    # weighted fusion of the k models, weights proportional to validation AUC
    k_best_scores = np.array(k_best_scores, dtype=np.float64)
    k_weights = k_best_scores / k_best_scores.sum()             # (k,)
    k_weights = np.expand_dims(np.expand_dims(k_weights,1),1)   # (k, 1, 1)
    print('k_best_score :', k_best_scores)
    print('k weights :', k_weights)
    # Multiply tensor by tensor (not tensor by ndarray); broadcasting applies
    # each model's weight to all of its predictions -> (k, len(test_dataset), 2)
    k_test_probas = k_test_probas * torch.from_numpy(k_weights).to(k_test_probas.device)
    test_probas = torch.sum(k_test_probas, dim=0)           # weighted sum over folds
    # search f1
    best_f1, best_thres = search_f1(dev_labels, dev_probas[:, 1])
    print(best_f1, best_thres)
    test_preds = (test_probas[:, 1] > best_thres).type(torch.long)

    return test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres

## 操作

In [11]:
model_version = 'FFTPD-5fold-V4.8'     # model version (used in checkpoint file names)
scheme_version = 'FFTPD-5fold-V4.8'     # scheme version (used in log/report/result file names)
train_df = pd.read_csv(config.augmented_V0204_path)
test_df = pd.read_csv(config.test_V0_path)
k = 5

params = {
    'batch_size': 24,
    'epochs': 15,
    'lr': 2e-05,
    'l2_weight':0,
    'weight_decay': 0.01,
    'dropout_rate': 0.5,
    'early_stoping':3,
    'patience': 2,
    'max_seq_len_q': config.max_seq_len_q,
    'max_seq_len_r': config.max_seq_len_r,
    'max_gradient_norm': 10.0,
    'pretrained_model_path': config.pretrained_roberta_wwm_ext_large_path, 
}

# Load the tokenizer from the model directory: pointing from_pretrained() at the
# single vocab.txt file is deprecated (see the warning this cell used to emit).
bert_tokenizer = BertTokenizer.from_pretrained(params['pretrained_model_path'])

# Make sure every output location exists before the long training run starts.
for sub_dir in ('output', 'report', 'result', 'submission'):
    os.makedirs(os.path.join(config.root_path, sub_dir), exist_ok=True)
os.makedirs(config.best_model_path, exist_ok=True)

output_path = os.path.join(config.root_path, 'output/'+scheme_version+'.txt')

print("\t* K fold training and validating...")
test_preds, k_test_probas, dev_probas, dev_labels, best_f1, best_thres = k_fold_cross_val(train_df, test_df, params, k, 
                                                                                          bert_tokenizer, config.best_model_path, 
                                                                                          output_path, model_version)
dev_preds = (dev_probas[:, 1] > best_thres).type(torch.long)
fpr, tpr, thresholds = roc_curve(dev_labels, dev_probas[:, 1], pos_label=1)
dev_auc = auc(fpr, tpr)
print('dev auc: ',dev_auc)

print("\t* Saving dev result...")
with open(os.path.join(config.root_path, 'report/'+scheme_version+'_'+'classification_report.txt'), 'w') as fp:
    fp.write(classification_report(dev_labels, dev_preds))
    fp.write('\n')
    fp.write('f1-score: {:.4f}'.format(f1_score(dev_labels, dev_preds)))
    fp.write(' auc: {:.4f}'.format(dev_auc))

# train_df now carries the out-of-fold probabilities added by k_fold_cross_val
train_df.to_csv(os.path.join(config.root_path, 'result/'+scheme_version+'_pred_result.csv'), index=False)

print("\t* Predicting...")
test_df['pred'] = test_preds.cpu().numpy()
k_test_probas = k_test_probas.cpu().numpy()

print("\t* Saving test result...")
# save the predictions (tab-separated, no index, no header row)
time_str = time.strftime("%Y%m%d%H%M", time.localtime())
test_df[['dialog_id', 'reply_id', 'pred']].to_csv(os.path.join(config.root_path,'submission/'+scheme_version+'_'+time_str+'.csv'),
                                                  sep='\t',
                                                  index=False,
                                                  header=False)
# save the per-fold test probabilities.
# np.save writes the .npy format and appends '.npy' to any other suffix, so the
# former '.npz' name ended up on disk as '*.npz.npy'; name the file '.npy' directly.
k_test_probas_path = os.path.join(config.root_path, 'result/'+scheme_version+'_'+str(k)+'_test_probas.npy')
np.save(k_test_probas_path, k_test_probas)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


	* K fold training and validating...
	* Start 0 fold
	* Building dataset...
	* Building model...


  0%|          | 0/34 [00:00<?, ?it/s]

	* Building model time:10.8502s


100%|██████████| 34/34 [00:39<00:00,  1.05s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 0.9159, accuracy:75.5733, f1_score: 0.0000, best_thres: 0.0000, auc: 0.5303

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.3653s, loss: 0.3947: 100%|██████████| 1439/1439 [08:51<00:00,  2.87it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:531.1643s, loss: 0.3947, accuracy: 84.3053%, f1_score: 0.6584, auc: 0.8662


100%|██████████| 34/34 [00:40<00:00,  1.09s/it]


-> Validation loss: 0.2878, accuracy: 87.8504%, f1_score: 0.7589, best_thres: 0.0000, auc: 0.9320


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3635s, loss: 0.2434: 100%|██████████| 1439/1439 [08:48<00:00,  2.88it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:528.7234s, loss: 0.2434, accuracy: 90.4732%, f1_score: 0.8056, auc: 0.9510


100%|██████████| 34/34 [00:39<00:00,  1.07s/it]


-> Validation loss: 0.2707, accuracy: 89.1707%, f1_score: 0.7873, best_thres: 0.0000, auc: 0.9437


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3625s, loss: 0.1566: 100%|██████████| 1439/1439 [08:47<00:00,  2.88it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:527.2816s, loss: 0.1566, accuracy: 94.0464%, f1_score: 0.8811, auc: 0.9792


100%|██████████| 34/34 [00:40<00:00,  1.07s/it]


-> Validation loss: 0.2930, accuracy: 90.0162%, f1_score: 0.8103, best_thres: 0.0000, auc: 0.9513


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3600s, loss: 0.0978: 100%|██████████| 1439/1439 [08:43<00:00,  2.76it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:523.6300s, loss: 0.0978, accuracy: 96.5831%, f1_score: 0.9321, auc: 0.9915


100%|██████████| 34/34 [00:39<00:00,  1.07s/it]


-> Validation loss: 0.2924, accuracy: 91.2555%, f1_score: 0.8219, best_thres: 0.0000, auc: 0.9533


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3580s, loss: 0.0650: 100%|██████████| 1439/1439 [08:40<00:00,  2.84it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:520.9857s, loss: 0.0650, accuracy: 97.7529%, f1_score: 0.9554, auc: 0.9958


100%|██████████| 34/34 [00:39<00:00,  1.07s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3505, accuracy: 91.4871%, f1_score: 0.8300, best_thres: 0.0000, auc: 0.9513
-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.3583s, loss: 0.0451: 100%|██████████| 1439/1439 [08:41<00:00,  2.95it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:521.2943s, loss: 0.0451, accuracy: 98.5116%, f1_score: 0.9704, auc: 0.9979


100%|██████████| 34/34 [00:39<00:00,  1.07s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4045, accuracy: 91.2671%, f1_score: 0.8290, best_thres: 0.0000, auc: 0.9498
-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.3595s, loss: 0.0370: 100%|██████████| 1439/1439 [08:43<00:00,  2.80it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:523.3577s, loss: 0.0370, accuracy: 98.7954%, f1_score: 0.9761, auc: 0.9985


100%|██████████| 34/34 [00:40<00:00,  1.09s/it]


-> Validation loss: 0.4134, accuracy: 91.2439%, f1_score: 0.8283, best_thres: 0.0000, auc: 0.9491
-> Early stopping: patience limit reached, stopping...
	* Start 1 fold
	* Building dataset...
	* Building model...


  0%|          | 0/34 [00:00<?, ?it/s]

	* Building model time:6.8662s


100%|██████████| 34/34 [00:38<00:00,  1.07s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 1.8544, accuracy:24.4962, f1_score: 0.3935, best_thres: 0.0000, auc: 0.4698

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.3673s, loss: 0.4014: 100%|██████████| 1439/1439 [08:54<00:00,  2.81it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:534.4626s, loss: 0.4014, accuracy: 84.1634%, f1_score: 0.6644, auc: 0.8719


100%|██████████| 34/34 [00:38<00:00,  1.09s/it]


-> Validation loss: 0.2804, accuracy: 88.7074%, f1_score: 0.7662, best_thres: 0.0000, auc: 0.9333


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3679s, loss: 0.2241: 100%|██████████| 1439/1439 [08:55<00:00,  2.91it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:535.4907s, loss: 0.2241, accuracy: 91.2463%, f1_score: 0.8223, auc: 0.9577


100%|██████████| 34/34 [00:38<00:00,  1.08s/it]


-> Validation loss: 0.3068, accuracy: 89.2865%, f1_score: 0.7840, best_thres: 0.0000, auc: 0.9379


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3676s, loss: 0.1349: 100%|██████████| 1439/1439 [08:54<00:00,  2.84it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:535.0929s, loss: 0.1349, accuracy: 94.9904%, f1_score: 0.9004, auc: 0.9844


100%|██████████| 34/34 [00:38<00:00,  1.08s/it]


-> Validation loss: 0.3255, accuracy: 90.1668%, f1_score: 0.8042, best_thres: 0.0000, auc: 0.9456


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3650s, loss: 0.0890: 100%|██████████| 1439/1439 [08:51<00:00,  2.82it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:531.4488s, loss: 0.0890, accuracy: 96.9016%, f1_score: 0.9385, auc: 0.9928


100%|██████████| 34/34 [00:38<00:00,  1.08s/it]


-> Validation loss: 0.3388, accuracy: 90.7691%, f1_score: 0.8142, best_thres: 0.0000, auc: 0.9464


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3630s, loss: 0.0601: 100%|██████████| 1439/1439 [08:47<00:00,  2.83it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:528.0575s, loss: 0.0601, accuracy: 97.8948%, f1_score: 0.9583, auc: 0.9967


100%|██████████| 34/34 [00:38<00:00,  1.08s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3795, accuracy: 90.7691%, f1_score: 0.8142, best_thres: 0.0000, auc: 0.9442
-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.3609s, loss: 0.0436: 100%|██████████| 1439/1439 [08:45<00:00,  2.95it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:525.2426s, loss: 0.0436, accuracy: 98.5319%, f1_score: 0.9708, auc: 0.9980


100%|██████████| 34/34 [00:38<00:00,  1.08s/it]


-> Validation loss: 0.4031, accuracy: 91.2208%, f1_score: 0.8280, best_thres: 0.0000, auc: 0.9477


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.3618s, loss: 0.0340: 100%|██████████| 1439/1439 [08:46<00:00,  2.83it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:526.4853s, loss: 0.0340, accuracy: 98.8533%, f1_score: 0.9772, auc: 0.9987


100%|██████████| 34/34 [00:38<00:00,  1.09s/it]


-> Validation loss: 0.3725, accuracy: 91.8925%, f1_score: 0.8359, best_thres: 0.0000, auc: 0.9540


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.3608s, loss: 0.0262: 100%|██████████| 1439/1439 [08:44<00:00,  2.81it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:525.0881s, loss: 0.0262, accuracy: 99.1371%, f1_score: 0.9828, auc: 0.9992


100%|██████████| 34/34 [00:39<00:00,  1.08s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4331, accuracy: 92.0315%, f1_score: 0.8354, best_thres: 0.0000, auc: 0.9537
-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.3646s, loss: 0.0221: 100%|██████████| 1439/1439 [08:50<00:00,  2.82it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:530.6843s, loss: 0.0221, accuracy: 99.2819%, f1_score: 0.9857, auc: 0.9993


100%|██████████| 34/34 [00:39<00:00,  1.11s/it]


-> Validation loss: 0.4287, accuracy: 92.1821%, f1_score: 0.8426, best_thres: 0.0000, auc: 0.9562


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 10


Batch num: 1439. Avg. batch proc. time: 0.3596s, loss: 0.0159: 100%|██████████| 1439/1439 [08:43<00:00,  2.79it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:523.6956s, loss: 0.0159, accuracy: 99.4209%, f1_score: 0.9885, auc: 0.9997


100%|██████████| 34/34 [00:38<00:00,  1.08s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4853, accuracy: 92.2284%, f1_score: 0.8407, best_thres: 0.0000, auc: 0.9503
-> Start epoch 11


Batch num: 1439. Avg. batch proc. time: 0.3622s, loss: 0.0125: 100%|██████████| 1439/1439 [08:47<00:00,  2.82it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:527.3294s, loss: 0.0125, accuracy: 99.5280%, f1_score: 0.9906, auc: 0.9998


100%|██████████| 34/34 [00:38<00:00,  1.08s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.5320, accuracy: 92.4600%, f1_score: 0.8458, best_thres: 0.0000, auc: 0.9516
-> Start epoch 12


Batch num: 1439. Avg. batch proc. time: 0.3603s, loss: 0.0101: 100%|██████████| 1439/1439 [08:44<00:00,  2.86it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:524.6754s, loss: 0.0101, accuracy: 99.6091%, f1_score: 0.9922, auc: 0.9999


100%|██████████| 34/34 [00:38<00:00,  1.08s/it]


-> Validation loss: 0.5356, accuracy: 92.4021%, f1_score: 0.8451, best_thres: 0.0000, auc: 0.9521
-> Early stopping: patience limit reached, stopping...
	* Start 2 fold
	* Building dataset...
	* Building model...


  0%|          | 0/34 [00:00<?, ?it/s]

	* Building model time:7.1827s


100%|██████████| 34/34 [00:41<00:00,  1.06s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 4.7942, accuracy:25.1564, f1_score: 0.4020, best_thres: 0.0000, auc: 0.4466

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.3657s, loss: 0.4335: 100%|██████████| 1439/1439 [08:52<00:00,  2.82it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:532.3565s, loss: 0.4335, accuracy: 84.0331%, f1_score: 0.6638, auc: 0.8735


100%|██████████| 34/34 [00:41<00:00,  1.07s/it]


-> Validation loss: 0.2726, accuracy: 89.0086%, f1_score: 0.7762, best_thres: 0.0000, auc: 0.9355


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3645s, loss: 0.2249: 100%|██████████| 1439/1439 [08:50<00:00,  2.78it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:530.7317s, loss: 0.2249, accuracy: 91.2318%, f1_score: 0.8203, auc: 0.9577


100%|██████████| 34/34 [00:41<00:00,  1.07s/it]


-> Validation loss: 0.2870, accuracy: 89.8772%, f1_score: 0.8015, best_thres: 0.0000, auc: 0.9435


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3641s, loss: 0.1429: 100%|██████████| 1439/1439 [08:50<00:00,  2.82it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:530.2088s, loss: 0.1429, accuracy: 94.7704%, f1_score: 0.8948, auc: 0.9824


100%|██████████| 34/34 [00:41<00:00,  1.07s/it]


-> Validation loss: 0.3172, accuracy: 90.5953%, f1_score: 0.8183, best_thres: 0.0000, auc: 0.9467


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3652s, loss: 0.0938: 100%|██████████| 1439/1439 [08:51<00:00,  2.82it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:531.7847s, loss: 0.0938, accuracy: 96.6526%, f1_score: 0.9328, auc: 0.9921


100%|██████████| 34/34 [00:41<00:00,  1.07s/it]


-> Validation loss: 0.3300, accuracy: 90.6764%, f1_score: 0.8220, best_thres: 0.0000, auc: 0.9474


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3592s, loss: 0.0616: 100%|██████████| 1439/1439 [08:42<00:00,  2.94it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:523.0529s, loss: 0.0616, accuracy: 97.8514%, f1_score: 0.9569, auc: 0.9966


100%|██████████| 34/34 [00:41<00:00,  1.07s/it]


-> Validation loss: 0.4166, accuracy: 91.1049%, f1_score: 0.8305, best_thres: 0.0000, auc: 0.9496


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.3600s, loss: 0.0475: 100%|██████████| 1439/1439 [08:44<00:00,  2.82it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:524.2784s, loss: 0.0475, accuracy: 98.4247%, f1_score: 0.9684, auc: 0.9977


100%|██████████| 34/34 [00:41<00:00,  1.07s/it]


-> Validation loss: 0.3990, accuracy: 91.7420%, f1_score: 0.8385, best_thres: 0.0000, auc: 0.9511


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.3582s, loss: 0.0369: 100%|██████████| 1439/1439 [08:41<00:00,  2.81it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:521.7450s, loss: 0.0369, accuracy: 98.8070%, f1_score: 0.9761, auc: 0.9986


100%|██████████| 34/34 [00:41<00:00,  1.09s/it]


-> Validation loss: 0.4535, accuracy: 91.0470%, f1_score: 0.8311, best_thres: 0.0000, auc: 0.9519


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.3600s, loss: 0.0286: 100%|██████████| 1439/1439 [08:43<00:00,  2.86it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:523.8615s, loss: 0.0286, accuracy: 99.0502%, f1_score: 0.9810, auc: 0.9991


100%|██████████| 34/34 [00:41<00:00,  1.09s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4569, accuracy: 91.5566%, f1_score: 0.8325, best_thres: 0.0000, auc: 0.9517
-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.3591s, loss: 0.0233: 100%|██████████| 1439/1439 [08:42<00:00,  2.84it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:522.4740s, loss: 0.0233, accuracy: 99.2326%, f1_score: 0.9846, auc: 0.9995


100%|██████████| 34/34 [00:41<00:00,  1.09s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.5053, accuracy: 92.1126%, f1_score: 0.8370, best_thres: 0.0000, auc: 0.9500
-> Start epoch 10


Batch num: 1439. Avg. batch proc. time: 0.3586s, loss: 0.0170: 100%|██████████| 1439/1439 [08:41<00:00,  2.87it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:521.7485s, loss: 0.0170, accuracy: 99.3861%, f1_score: 0.9877, auc: 0.9997


100%|██████████| 34/34 [00:41<00:00,  1.07s/it]


-> Validation loss: 0.5397, accuracy: 92.2052%, f1_score: 0.8421, best_thres: 0.0000, auc: 0.9509
-> Early stopping: patience limit reached, stopping...
	* Start 3 fold
	* Building dataset...
	* Building model...


  0%|          | 0/34 [00:00<?, ?it/s]

	* Building model time:7.0599s


100%|██████████| 34/34 [00:39<00:00,  1.11s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 1.2952, accuracy:74.8291, f1_score: 0.0000, best_thres: 0.0000, auc: 0.5518

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.3660s, loss: 0.3929: 100%|██████████| 1439/1439 [08:52<00:00,  2.84it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:532.5552s, loss: 0.3929, accuracy: 84.1523%, f1_score: 0.6556, auc: 0.8687


100%|██████████| 34/34 [00:40<00:00,  1.11s/it]


-> Validation loss: 0.2753, accuracy: 88.9494%, f1_score: 0.7841, best_thres: 0.0000, auc: 0.9361


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3661s, loss: 0.2361: 100%|██████████| 1439/1439 [08:52<00:00,  2.86it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:532.5772s, loss: 0.2361, accuracy: 90.6037%, f1_score: 0.8077, auc: 0.9535


100%|██████████| 34/34 [00:40<00:00,  1.12s/it]


-> Validation loss: 0.2823, accuracy: 89.6444%, f1_score: 0.8026, best_thres: 0.0000, auc: 0.9451


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3639s, loss: 0.1432: 100%|██████████| 1439/1439 [08:49<00:00,  2.76it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:529.5446s, loss: 0.1432, accuracy: 94.6634%, f1_score: 0.8930, auc: 0.9827


100%|██████████| 34/34 [00:40<00:00,  1.11s/it]


-> Validation loss: 0.3002, accuracy: 90.6521%, f1_score: 0.8183, best_thres: 0.0000, auc: 0.9486


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3616s, loss: 0.0908: 100%|██████████| 1439/1439 [08:46<00:00,  2.93it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:526.5880s, loss: 0.0908, accuracy: 96.7193%, f1_score: 0.9345, auc: 0.9926


100%|██████████| 34/34 [00:39<00:00,  1.11s/it]


-> Validation loss: 0.3536, accuracy: 91.3935%, f1_score: 0.8290, best_thres: 0.0000, auc: 0.9493


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3622s, loss: 0.0625: 100%|██████████| 1439/1439 [08:47<00:00,  2.82it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:527.3941s, loss: 0.0625, accuracy: 97.8457%, f1_score: 0.9569, auc: 0.9963


100%|██████████| 34/34 [00:40<00:00,  1.11s/it]


-> Validation loss: 0.3660, accuracy: 91.6367%, f1_score: 0.8345, best_thres: 0.0000, auc: 0.9521


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.3637s, loss: 0.0457: 100%|██████████| 1439/1439 [08:49<00:00,  2.83it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:529.2730s, loss: 0.0457, accuracy: 98.4364%, f1_score: 0.9687, auc: 0.9980


100%|██████████| 34/34 [00:39<00:00,  1.11s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4155, accuracy: 91.6252%, f1_score: 0.8321, best_thres: 0.0000, auc: 0.9513
-> Start epoch 7


Batch num: 1439. Avg. batch proc. time: 0.3612s, loss: 0.0349: 100%|██████████| 1439/1439 [08:45<00:00,  2.79it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:525.8468s, loss: 0.0349, accuracy: 98.8331%, f1_score: 0.9766, auc: 0.9987


100%|██████████| 34/34 [00:40<00:00,  1.14s/it]


-> Validation loss: 0.3890, accuracy: 91.8916%, f1_score: 0.8444, best_thres: 0.0000, auc: 0.9553


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 8


Batch num: 1439. Avg. batch proc. time: 0.3607s, loss: 0.0289: 100%|██████████| 1439/1439 [08:45<00:00,  2.79it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:525.2496s, loss: 0.0289, accuracy: 99.0387%, f1_score: 0.9808, auc: 0.9990


100%|██████████| 34/34 [00:39<00:00,  1.11s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4462, accuracy: 91.9263%, f1_score: 0.8421, best_thres: 0.0000, auc: 0.9531
-> Start epoch 9


Batch num: 1439. Avg. batch proc. time: 0.3610s, loss: 0.0216: 100%|██████████| 1439/1439 [08:45<00:00,  2.88it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:525.6216s, loss: 0.0216, accuracy: 99.2935%, f1_score: 0.9859, auc: 0.9995


100%|██████████| 34/34 [00:39<00:00,  1.11s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4748, accuracy: 91.6367%, f1_score: 0.8369, best_thres: 0.0000, auc: 0.9511
-> Start epoch 10


Batch num: 1439. Avg. batch proc. time: 0.3595s, loss: 0.0166: 100%|██████████| 1439/1439 [08:42<00:00,  2.85it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:523.0601s, loss: 0.0166, accuracy: 99.4325%, f1_score: 0.9886, auc: 0.9997


100%|██████████| 34/34 [00:39<00:00,  1.11s/it]


-> Validation loss: 0.5109, accuracy: 92.1464%, f1_score: 0.8429, best_thres: 0.0000, auc: 0.9513
-> Early stopping: patience limit reached, stopping...
	* Start 4 fold
	* Building dataset...
	* Building model...


  0%|          | 0/34 [00:00<?, ?it/s]

	* Building model time:7.2090s


100%|██████████| 34/34 [00:40<00:00,  1.05s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

	* Validation loss before training: 1.4945, accuracy:74.4585, f1_score: 0.0000, best_thres: 0.0000, auc: 0.5294

-> Start epoch 1


Batch num: 1439. Avg. batch proc. time: 0.3673s, loss: 0.3833: 100%|██████████| 1439/1439 [08:54<00:00,  2.61it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:534.3232s, loss: 0.3833, accuracy: 84.4824%, f1_score: 0.6600, auc: 0.8724


100%|██████████| 34/34 [00:40<00:00,  1.05s/it]


-> Validation loss: 0.2831, accuracy: 88.7177%, f1_score: 0.7609, best_thres: 0.0000, auc: 0.9353


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 2


Batch num: 1439. Avg. batch proc. time: 0.3656s, loss: 0.2400: 100%|██████████| 1439/1439 [08:51<00:00,  2.67it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:532.0178s, loss: 0.2400, accuracy: 90.5342%, f1_score: 0.8052, auc: 0.9511


100%|██████████| 34/34 [00:40<00:00,  1.06s/it]


-> Validation loss: 0.3050, accuracy: 89.1463%, f1_score: 0.7594, best_thres: 0.0000, auc: 0.9449


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 3


Batch num: 1439. Avg. batch proc. time: 0.3655s, loss: 0.1476: 100%|██████████| 1439/1439 [08:51<00:00,  2.77it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:531.8012s, loss: 0.1476, accuracy: 94.4202%, f1_score: 0.8872, auc: 0.9816


100%|██████████| 34/34 [00:39<00:00,  1.03s/it]


-> Validation loss: 0.2846, accuracy: 90.9533%, f1_score: 0.8219, best_thres: 0.0000, auc: 0.9543


  0%|          | 0/1439 [00:00<?, ?it/s]

-> Start epoch 4


Batch num: 1439. Avg. batch proc. time: 0.3631s, loss: 0.0934: 100%|██████████| 1439/1439 [08:48<00:00,  2.69it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:528.3225s, loss: 0.0934, accuracy: 96.6121%, f1_score: 0.9322, auc: 0.9925


100%|██████████| 34/34 [00:40<00:00,  1.03s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.3454, accuracy: 90.8027%, f1_score: 0.8268, best_thres: 0.0000, auc: 0.9481
-> Start epoch 5


Batch num: 1439. Avg. batch proc. time: 0.3626s, loss: 0.0610: 100%|██████████| 1439/1439 [08:47<00:00,  2.70it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:527.7138s, loss: 0.0610, accuracy: 97.8746%, f1_score: 0.9574, auc: 0.9967


100%|██████████| 34/34 [00:40<00:00,  1.06s/it]
  0%|          | 0/1439 [00:00<?, ?it/s]

-> Validation loss: 0.4040, accuracy: 90.7796%, f1_score: 0.8293, best_thres: 0.0000, auc: 0.9480
-> Start epoch 6


Batch num: 1439. Avg. batch proc. time: 0.3611s, loss: 0.0462: 100%|██████████| 1439/1439 [08:45<00:00,  2.69it/s]
  0%|          | 0/34 [00:00<?, ?it/s]

-> Training time:525.7449s, loss: 0.0462, accuracy: 98.4740%, f1_score: 0.9693, auc: 0.9979


100%|██████████| 34/34 [00:40<00:00,  1.03s/it]


-> Validation loss: 0.3713, accuracy: 91.0923%, f1_score: 0.8327, best_thres: 0.0000, auc: 0.9534
-> Early stopping: patience limit reached, stopping...
k_best_score : [0.95331582 0.95619852 0.95188428 0.95534269 0.95431516]
k weights : [[[0.19981231]]

 [[0.20041652]]

 [[0.19951226]]

 [[0.20023714]]

 [[0.20002177]]]
0.8331430137486867 0.53
dev auc:  0.9491976911023325
	* Saving dev result...
	* Predicting...
	* Saving test result...


In [24]:
# Collect the best validation score saved in each of the 5 fold checkpoints.
# Only the scalar 'best_score' entry is read here, so each checkpoint is mapped
# onto the CPU: the rest of the checkpoint (presumably model weights) never has
# to occupy GPU memory, and the cell no longer depends on a specific CUDA
# device index being visible to this process.
best_scores = []
for fold in range(5):
    checkpoint_path = os.path.join(root_path, 'model/fine-tune/best-fine-tune-V4.5-k'+str(fold)+'.bin')
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    best_scores.append(checkpoint['best_score'])
    # Drop the reference so this fold's checkpoint can be freed before the next load.
    del checkpoint

In [28]:
# Sanity check: the loading loop appends one score per fold, so 5 are expected.
len(best_scores)

5

In [29]:
# Turn the per-fold best scores into ensemble weights: each weight is the
# fold's score divided by the sum of all scores, so the weights sum to 1.
best_scores = np.array(best_scores)
weighted = np.divide(best_scores, best_scores.sum())
weighted

array([0.20040982, 0.19991843, 0.19951245, 0.20091086, 0.19924843])

In [19]:
# Toy fold weights (they sum to 1) for checking the weighted average by hand.
# Note that this rebinds `weighted`, replacing any weights computed from the
# real fold scores.
weighted = np.array([
    0.1,   # fold 0
    0.4,   # fold 1
    0.15,  # fold 2
    0.2,   # fold 3
    0.15,  # fold 4
])

In [10]:
# Toy stack of class probabilities: 5 entries along the first axis, each a
# 2 x 2 matrix whose rows sum to 1. Used to check the weighted sum by hand.
k_probas = torch.tensor([
    [[0.1, 0.9], [0.2, 0.8]],
    [[0.3, 0.7], [0.4, 0.6]],
    [[0.5, 0.5], [0.6, 0.4]],
    [[0.7, 0.3], [0.8, 0.2]],
    [[0.9, 0.1], [0.2, 0.8]],
])

In [52]:
# Display the toy probability stack to eyeball its values.
k_probas

tensor([[[0.1000, 0.9000],
         [0.2000, 0.8000]],

        [[0.3000, 0.7000],
         [0.4000, 0.6000]],

        [[0.5000, 0.5000],
         [0.6000, 0.4000]],

        [[0.7000, 0.3000],
         [0.8000, 0.2000]],

        [[0.9000, 0.1000],
         [0.2000, 0.8000]]])

In [31]:
# Check the dimensions of the probability stack.
# NOTE(review): the saved output is a plain tuple rather than torch.Size, so
# this cell was presumably last run while k_probas was not a torch tensor;
# re-run to confirm.
k_probas.shape

(5, 2, 2)

In [48]:
# Give the 1-D weight vector two trailing singleton axes, (n,) -> (n, 1, 1),
# so that it broadcasts against the 3-D `k_probas` stack (5 x 2 x 2 in this demo).
# reshape(-1, 1, 1) yields the same array as two expand_dims(…, 1) calls on a
# 1-D input, but unlike them it is idempotent: re-running this cell leaves an
# already reshaped `weighted` unchanged instead of appending two more axes.
weighted = np.reshape(weighted, (-1, 1, 1))

In [49]:
# Confirm the weights now have two trailing singleton axes.
weighted.shape

(5, 1, 1)

In [54]:
# Scale each matrix along the first axis of `k_probas` by its weight, store the
# product in `c`, and display it.
# NOTE(review): assumes `weighted` is the (5, 1, 1) NumPy array built earlier so
# that it broadcasts over the last two axes of `k_probas` — confirm.
# The saved output shows the product as a float64 torch tensor.
c = k_probas * weighted
c

tensor([[[0.0100, 0.0900],
         [0.0200, 0.0800]],

        [[0.1200, 0.2800],
         [0.1600, 0.2400]],

        [[0.0750, 0.0750],
         [0.0900, 0.0600]],

        [[0.1400, 0.0600],
         [0.1600, 0.0400]],

        [[0.1350, 0.0150],
         [0.0300, 0.1200]]], dtype=torch.float64)

In [55]:
# The element-wise product should keep the shape of `k_probas`.
c.shape

torch.Size([5, 2, 2])

In [59]:
# Collapse the first axis: adding the weighted matrices gives the
# weighted-average probability matrix.
c.sum(dim=0)

tensor([[0.4800, 0.5200],
        [0.4600, 0.5400]], dtype=torch.float64)