## 医学論文の自動仕分けチャレンジ 推論2

In [1]:
import os
import sys
import gc

#from google.colab import drive
#drive.mount('/gdrive')

#!cp /gdrive/MyDrive/Datasets/signate-471/train.csv .
#!cp /gdrive/MyDrive/Datasets/signate-471/test.csv .
#!cp /gdrive/MyDrive/Datasets/signate-471/sample_submit.csv .

In [2]:
#!pip install -q transformers

In [3]:
import math
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers as T
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModel


In [4]:
class CFG:
    batch_size = 5 #16
    num_workers = 3 #4
    max_length =  256 #72
    n_splits = 5
    version = 121
    model = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext" 
    tokenizer = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
    
    epochs = 4


In [5]:
DATA_DIR = "../input/"
OUTPUT_DIR = "../output/"

In [6]:
warnings.filterwarnings("ignore")

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = 'cpu'

In [8]:
def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

In [9]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 471
seed_torch(seed)

## データ読み込み

In [10]:
train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")
sub = pd.read_csv(DATA_DIR + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]
TARGET = "judgement"

In [11]:
# この値を境に、モデルの出力を 0 と 1 にします。
border = len(train[train[TARGET] == 1]) / len(train[TARGET])
print(border)
init_border = border

0.023282372444280715


## 前処理

In [12]:
 # preprocess
train["text"] = train["title"] + " " + train["abstract"].fillna("")
test["text"] = test["title"] + " " + test["abstract"].fillna("")

In [13]:
def get_train_data(train):

    # 交差検証 用の番号を振ります。
    Fold = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=seed) #5
    for n, (train_index, val_index) in enumerate(Fold.split(train, train[TARGET])):
        train.loc[val_index, "fold"] = int(n)
    train["fold"] = train["fold"].astype(np.uint8)

    return train

In [14]:
def get_test_data(test):
    return test

In [15]:
train = get_train_data(train)

## データセット定義

In [20]:
class BaseDataset(Dataset):
    def __init__(self, df, model_name, include_labels=True):
        #tokenizer = T.BertTokenizer.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.df = df
        self.include_labels = include_labels

        self.title = df["text"].tolist()
        self.encoded = tokenizer.batch_encode_plus(
            self.title,
            padding = 'max_length',            
            max_length = CFG.max_length,
            truncation = True,
            return_attention_mask=True
        )
        
        if self.include_labels:
            self.labels = df[TARGET].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.encoded['input_ids'][idx])
        attention_mask = torch.tensor(self.encoded['attention_mask'][idx])

        if self.include_labels:
            label = torch.tensor(self.labels[idx]).float()
            return input_ids, attention_mask, label

        return input_ids, attention_mask

## モデル定義

In [21]:
class BaseModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        #self.model = AutoModel.from_pretrained(model_name)
        self.model = T.BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids, attention_mask=attention_mask) #,labels=labels)
        logits = out.logits
        #out = self.sigmoid(out.logits).squeeze()
        out = self.sigmoid(logits).squeeze()
        #out = logits.squeeze()
        return out

## ツール

In [22]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

## 学習補助関数

In [23]:
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    start = end = time.time()
    losses = AverageMeter()
    
    # switch to train mode
    model.train()
    
    for step, (input_ids, attention_mask, labels) in enumerate(train_loader):
        
        optimizer.zero_grad()
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        batch_size = labels.size(0)
        
        y_preds = model(input_ids, attention_mask)
        
        loss = criterion(y_preds, labels)
        
        # record loss
        losses.update(loss.item(), batch_size)
        loss.backward()
        
        optimizer.step()
        
        if step % 100 == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    return losses.avg

## 評価 補助関数

In [24]:
def valid_fn(valid_loader, model, criterion, device):
    start = end = time.time()
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []

    for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(input_ids, attention_mask)

        loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)

        # record score
        preds.append(y_preds.to("cpu").numpy())

        if step % 100 == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    predictions = np.concatenate(preds)
    return losses.avg, predictions

## 推論関数

In [25]:
def inference():
    predictions = []

    test_dataset = BaseDataset(test, CFG.model, include_labels=False)
    test_loader = DataLoader(
        test_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True
    )

    for fold in range(CFG.n_splits): #5
    #for fold in [1,3,4,5]: #5
        LOGGER.info(f"========== model: bert-base-uncased fold: {fold} inference ==========")
        model = BaseModel(CFG.model)
        model.to(device)
        model.load_state_dict(torch.load(OUTPUT_DIR + f"{CFG.version}_fold{fold}_best.pth")["model"])
        model.eval()
        preds = []
        
        for i, (input_ids, attention_mask) in tqdm(enumerate(test_loader), total=len(test_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            with torch.no_grad():
                y_preds = model(input_ids, attention_mask)
            preds.append(y_preds.to("cpu").numpy())
            
        preds = np.concatenate(preds)
        predictions.append(preds)
        
    predictions = np.mean(predictions, axis=0)

    return predictions

## 学習

In [26]:
def train_loop(train, fold):

    LOGGER.info(f"=*========= fold: {fold} training ==========")
    
    # ====================================================
    # Data Loader
    # ====================================================
    trn_idx = train[train["fold"] != fold].index
    val_idx = train[train["fold"] == fold].index
    
    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)
    
    train_dataset = BaseDataset(train_folds, CFG.tokenizer) #, CFG.model) #
    valid_dataset = BaseDataset(valid_folds, CFG.tokenizer) #, CFG.model) #

    #print("DataLoader")
    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size, #16
        shuffle=True,
        num_workers=CFG.num_workers, #4, 
        pin_memory=True,
        drop_last=True,
    )
    
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size, #16
        shuffle=False,
        num_workers=CFG.num_workers, #4,
        pin_memory=True,
        drop_last=False,
    )
    #print("Model")
    # ====================================================
    # Model
    # ====================================================
    model = BaseModel(CFG.model)
    model.to(device)

    optimizer = T.AdamW(model.parameters(), lr=2e-5)

    criterion = nn.BCELoss()
    #criterion = nn.BCEWithLogitsLoss()
    # ====================================================
    # Loop
    # ====================================================
    best_score = -1
    best_loss = np.inf
    #print("Loop")
    for epoch in range(CFG.epochs):
        start_time = time.time()
        
        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)
        
        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds[TARGET].values
        
        # scoring
        score = fbeta_score(valid_labels, np.where(preds < init_border, 0, 1), beta=7.0)
        LOGGER.info(f"score1 = {score}, thresh={init_border}")
        
        border = opt_fbeta_threshold(valid_labels, preds) ##最適化
        
        score = fbeta_score(valid_labels, np.where(preds < border, 0, 1), beta=7.0)
        LOGGER.info(f"score2 = {score},  thresh={border}")
        
        elapsed = time.time() - start_time
        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score}")

        #if score >= best_score: ##
        if avg_val_loss <= best_loss:
        #if True:
            best_loss = avg_val_loss
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} - Best Loss: {best_loss:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "preds": preds}, OUTPUT_DIR + f"{CFG.version}_fold{fold}_best.pth"
            )

    check_point = torch.load(OUTPUT_DIR + f"{CFG.version}_fold{fold}_best.pth")

    valid_folds["preds"] = check_point["preds"]

    return valid_folds

## メイン

In [27]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["judgement"].values
    
    score = fbeta_score(labels, np.where(preds < border, 0, 1), beta=7.0)
    LOGGER.info(f"Score1: {score:<.5f}")
    
    best_border = opt_fbeta_threshold(labels, preds)
    LOGGER.info(f"best border: {best_border:<.5f}")
    
    score = fbeta_score(labels, np.where(preds < best_border, 0, 1), beta=7.0)
    LOGGER.info(f"Score2: {score:<.5f}")
    
    return score, best_border

In [28]:
from scipy.optimize import minimize, minimize_scalar
def opt_fbeta_threshold(y_true, y_pred):
    """fbeta score計算時のthresholdを最適化"""
    def opt_(x): 
        return -fbeta_score(y_true, y_pred >= x, beta=7)
    
    #result = minimize(opt_, x0=np.array([0.1]), method='Powell')
    result = minimize_scalar(opt_, bounds=(0, 0.5), method='bounded')
    
    best_threshold = result['x'].item()
    return best_threshold

In [29]:
def visualize_confusion_matrix(
        y_true,
        pred_label,
        height=.6,
        labels=None):
    """混合行列をプロット 
    (https://www.guruguru.science/competitions/11/discussions/2fb11851-67d0-4e96-a4b1-5629b944f363/)"""
    
    conf = confusion_matrix(y_true=y_true,
                            y_pred=pred_label,
                            normalize='true')

    n_labels = len(conf)
    size = n_labels * height
    fig, ax = plt.subplots(figsize=(size * 4, size * 3))
    sns.heatmap(conf, cmap='Blues', ax=ax, annot=True, fmt='.2f')
    ax.set_ylabel('Label')
    ax.set_xlabel('Predict')

    if labels is not None:
        ax.set_yticklabels(labels)
        ax.set_xticklabels(labels)
        ax.tick_params('y', labelrotation=0)
        ax.tick_params('x', labelrotation=90)

    plt.show()
    return fig

In [30]:
##main

oof_df = pd.DataFrame()
for fold in range(CFG.n_splits): 
    _oof_df = train_loop(train, fold)
    oof_df = pd.concat([oof_df, _oof_df])
    LOGGER.info(f"========== fold: {fold} result ==========")
    get_result(_oof_df)
        
# CV result
LOGGER.info(f"========== CV ==========")
score, best_border = get_result(oof_df)
    
# Save OOF result
oof_df.to_csv(OUTPUT_DIR + f"oof_df_{CFG.version}_{score:<.5f}.csv", index=False)


Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

Epoch: [1][0/4343] Elapsed 0m 0s (remain 47m 44s) Loss: 0.7081 
Epoch: [1][100/4343] Elapsed 0m 10s (remain 7m 20s) Loss: 0.1202 
Epoch: [1][200/4343] Elapsed 0m 20s (remain 6m 54s) Loss: 0.1147 
Epoch: [1][300/4343] Elapsed 0m 29s (remain 6m 39s) Loss: 0.1171 
Epoch: [1][400/4343] Elapsed 0m 39s (remain 6m 28s) Loss: 0.1089 
Epoch: [1][500/4343] Elapsed 0m 49s (remain 6m 16s) Loss: 0.1028 
Epoch: [1][600/4343] Elapsed 0m 58s (remain 6m 5s) Loss: 0.1003 
Epoch: [1][700/4343] Elapsed 1m 8s (remain 5m 54s) Loss: 0.0951 
Epoch: [1][800/4343] Elapsed 1m 17s (remain 5m 44s) Loss: 0.0932 
Epoch: [1][900/4343] Elapsed 1m 27s (remain 5m 34s) Loss: 0.0933 
Epoch: [1][1000/4343] Elapsed 1m 37s (remain 5m 24s) Loss: 0.0894 
Epoch: [1][1100/4343] Elapsed 1m 46s (remain 5m 14s) Loss: 0.0868 
Epoch: [1][1200/4343] Elapsed 1m 56s (remain 5m 4s) Loss: 0.0863 
Epoch: [1][1300/4343] Elapsed 2m 6s (remain 4m 54s) Loss: 0.0865 
Epoch: [1][1400/4343] Elapsed 2m 15s (remain 4m 45s) Loss: 0.0871 
Epoch: [1][

score1 = 0.8281249999999999, thresh=0.023282372444280715
score2 = 0.8670072766682149,  thresh=0.0132469846281435
Epoch 1 - avg_train_loss: 0.0657  avg_val_loss: 0.0508  time: 445s
Epoch 1 - Score: 0.8670072766682149
Epoch 1 - Save Best Score: 0.8670 - Best Loss: 0.0508 Model


EVAL: [1085/1086] Elapsed 0m 27s (remain 0m 0s) Loss: 0.0508 
Epoch: [2][0/4343] Elapsed 0m 0s (remain 18m 49s) Loss: 0.0014 
Epoch: [2][100/4343] Elapsed 0m 9s (remain 6m 54s) Loss: 0.0287 
Epoch: [2][200/4343] Elapsed 0m 19s (remain 6m 46s) Loss: 0.0394 
Epoch: [2][300/4343] Elapsed 0m 29s (remain 6m 34s) Loss: 0.0397 
Epoch: [2][400/4343] Elapsed 0m 38s (remain 6m 22s) Loss: 0.0384 
Epoch: [2][500/4343] Elapsed 0m 48s (remain 6m 12s) Loss: 0.0415 
Epoch: [2][600/4343] Elapsed 0m 58s (remain 6m 2s) Loss: 0.0417 
Epoch: [2][700/4343] Elapsed 1m 7s (remain 5m 52s) Loss: 0.0403 
Epoch: [2][800/4343] Elapsed 1m 17s (remain 5m 42s) Loss: 0.0433 
Epoch: [2][900/4343] Elapsed 1m 26s (remain 5m 32s) Loss: 0.0416 
Epoch: [2][1000/4343] Elapsed 1m 36s (remain 5m 22s) Loss: 0.0415 
Epoch: [2][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.0406 
Epoch: [2][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.0418 
Epoch: [2][1300/4343] Elapsed 2m 5s (remain 4m 53s) Loss: 0.0404 
Epoch: [2][1400/4

score1 = 0.8556316116988177, thresh=0.023282372444280715
score2 = 0.8792225821378991,  thresh=0.010550906942656556
Epoch 2 - avg_train_loss: 0.0384  avg_val_loss: 0.0449  time: 444s
Epoch 2 - Score: 0.8792225821378991
Epoch 2 - Save Best Score: 0.8792 - Best Loss: 0.0449 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0449 
Epoch: [3][0/4343] Elapsed 0m 0s (remain 19m 9s) Loss: 0.0019 
Epoch: [3][100/4343] Elapsed 0m 9s (remain 6m 55s) Loss: 0.0184 
Epoch: [3][200/4343] Elapsed 0m 19s (remain 6m 42s) Loss: 0.0126 
Epoch: [3][300/4343] Elapsed 0m 29s (remain 6m 31s) Loss: 0.0181 
Epoch: [3][400/4343] Elapsed 0m 38s (remain 6m 21s) Loss: 0.0194 
Epoch: [3][500/4343] Elapsed 0m 48s (remain 6m 11s) Loss: 0.0228 
Epoch: [3][600/4343] Elapsed 0m 58s (remain 6m 1s) Loss: 0.0220 
Epoch: [3][700/4343] Elapsed 1m 7s (remain 5m 51s) Loss: 0.0222 
Epoch: [3][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.0228 
Epoch: [3][900/4343] Elapsed 1m 26s (remain 5m 32s) Loss: 0.0226 
Epoch: [3][1000/4343] Elapsed 1m 36s (remain 5m 22s) Loss: 0.0239 
Epoch: [3][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.0241 
Epoch: [3][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.0235 
Epoch: [3][1300/4343] Elapsed 2m 5s (remain 4m 53s) Loss: 0.0231 
Epoch: [3][1400/43

score1 = 0.8354153653966273, thresh=0.023282372444280715
score2 = 0.8315029808597426,  thresh=0.10137670235796244
Epoch 3 - avg_train_loss: 0.0271  avg_val_loss: 0.0586  time: 445s
Epoch 3 - Score: 0.8315029808597426


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0586 
Epoch: [4][0/4343] Elapsed 0m 0s (remain 16m 35s) Loss: 0.0003 
Epoch: [4][100/4343] Elapsed 0m 9s (remain 6m 53s) Loss: 0.0082 
Epoch: [4][200/4343] Elapsed 0m 19s (remain 6m 41s) Loss: 0.0149 
Epoch: [4][300/4343] Elapsed 0m 29s (remain 6m 30s) Loss: 0.0145 
Epoch: [4][400/4343] Elapsed 0m 38s (remain 6m 20s) Loss: 0.0126 
Epoch: [4][500/4343] Elapsed 0m 48s (remain 6m 10s) Loss: 0.0138 
Epoch: [4][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.0173 
Epoch: [4][700/4343] Elapsed 1m 7s (remain 5m 51s) Loss: 0.0167 
Epoch: [4][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.0168 
Epoch: [4][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.0164 
Epoch: [4][1000/4343] Elapsed 1m 36s (remain 5m 21s) Loss: 0.0168 
Epoch: [4][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.0165 
Epoch: [4][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.0174 
Epoch: [4][1300/4343] Elapsed 2m 5s (remain 4m 52s) Loss: 0.0181 
Epoch: [4][1400/4

score1 = 0.8137715179968701, thresh=0.023282372444280715
score2 = 0.8658666261582865,  thresh=0.0018956184300904487
Epoch 4 - avg_train_loss: 0.0187  avg_val_loss: 0.0520  time: 445s
Epoch 4 - Score: 0.8658666261582865


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0520 


Score1: 0.85563
best border: 0.01055
Score2: 0.87922
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Bert

Epoch: [1][0/4343] Elapsed 0m 0s (remain 18m 0s) Loss: 0.7411 
Epoch: [1][100/4343] Elapsed 0m 9s (remain 6m 51s) Loss: 0.1548 
Epoch: [1][200/4343] Elapsed 0m 19s (remain 6m 38s) Loss: 0.1355 
Epoch: [1][300/4343] Elapsed 0m 28s (remain 6m 28s) Loss: 0.1205 
Epoch: [1][400/4343] Elapsed 0m 38s (remain 6m 18s) Loss: 0.1135 
Epoch: [1][500/4343] Elapsed 0m 48s (remain 6m 8s) Loss: 0.1053 
Epoch: [1][600/4343] Elapsed 0m 57s (remain 5m 59s) Loss: 0.1013 
Epoch: [1][700/4343] Elapsed 1m 7s (remain 5m 49s) Loss: 0.0973 
Epoch: [1][800/4343] Elapsed 1m 16s (remain 5m 39s) Loss: 0.0920 
Epoch: [1][900/4343] Elapsed 1m 26s (remain 5m 30s) Loss: 0.0922 
Epoch: [1][1000/4343] Elapsed 1m 36s (remain 5m 20s) Loss: 0.0887 
Epoch: [1][1100/4343] Elapsed 1m 45s (remain 5m 11s) Loss: 0.0867 
Epoch: [1][1200/4343] Elapsed 1m 55s (remain 5m 1s) Loss: 0.0855 
Epoch: [1][1300/4343] Elapsed 2m 4s (remain 4m 52s) Loss: 0.0823 
Epoch: [1][1400/4343] Elapsed 2m 14s (remain 4m 42s) Loss: 0.0809 
Epoch: [1][15

score1 = 0.9005019191024505, thresh=0.023282372444280715
score2 = 0.8469403326720586,  thresh=0.058920625865344925
Epoch 1 - avg_train_loss: 0.0637  avg_val_loss: 0.0467  time: 445s
Epoch 1 - Score: 0.8469403326720586
Epoch 1 - Save Best Score: 0.8469 - Best Loss: 0.0467 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0467 
Epoch: [2][0/4343] Elapsed 0m 0s (remain 17m 29s) Loss: 0.0397 
Epoch: [2][100/4343] Elapsed 0m 9s (remain 6m 54s) Loss: 0.0457 
Epoch: [2][200/4343] Elapsed 0m 19s (remain 6m 41s) Loss: 0.0401 
Epoch: [2][300/4343] Elapsed 0m 29s (remain 6m 30s) Loss: 0.0383 
Epoch: [2][400/4343] Elapsed 0m 38s (remain 6m 20s) Loss: 0.0368 
Epoch: [2][500/4343] Elapsed 0m 48s (remain 6m 10s) Loss: 0.0377 
Epoch: [2][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.0367 
Epoch: [2][700/4343] Elapsed 1m 7s (remain 5m 51s) Loss: 0.0361 
Epoch: [2][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.0368 
Epoch: [2][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.0376 
Epoch: [2][1000/4343] Elapsed 1m 36s (remain 5m 21s) Loss: 0.0381 
Epoch: [2][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.0401 
Epoch: [2][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.0394 
Epoch: [2][1300/4343] Elapsed 2m 5s (remain 4m 52s) Loss: 0.0393 
Epoch: [2][1400/4

score1 = 0.8107682619647355, thresh=0.023282372444280715
score2 = 0.8182533438237607,  thresh=0.016057012542063424
Epoch 2 - avg_train_loss: 0.0355  avg_val_loss: 0.0508  time: 445s
Epoch 2 - Score: 0.8182533438237607


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0508 
Epoch: [3][0/4343] Elapsed 0m 0s (remain 17m 33s) Loss: 0.0008 
Epoch: [3][100/4343] Elapsed 0m 9s (remain 6m 53s) Loss: 0.0345 
Epoch: [3][200/4343] Elapsed 0m 19s (remain 6m 41s) Loss: 0.0267 
Epoch: [3][300/4343] Elapsed 0m 29s (remain 6m 30s) Loss: 0.0254 
Epoch: [3][400/4343] Elapsed 0m 38s (remain 6m 20s) Loss: 0.0227 
Epoch: [3][500/4343] Elapsed 0m 48s (remain 6m 10s) Loss: 0.0249 
Epoch: [3][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.0279 
Epoch: [3][700/4343] Elapsed 1m 7s (remain 5m 51s) Loss: 0.0282 
Epoch: [3][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.0271 
Epoch: [3][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.0264 
Epoch: [3][1000/4343] Elapsed 1m 36s (remain 5m 21s) Loss: 0.0316 
Epoch: [3][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.0335 
Epoch: [3][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.0340 
Epoch: [3][1300/4343] Elapsed 2m 5s (remain 4m 52s) Loss: 0.0349 
Epoch: [3][1400/4

score1 = 0.15246348900657997, thresh=0.023282372444280715
score2 = 0.1205206492045637,  thresh=0.17733922702865548
Epoch 3 - avg_train_loss: 0.0342  avg_val_loss: 0.1425  time: 445s
Epoch 3 - Score: 0.1205206492045637


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1425 
Epoch: [4][0/4343] Elapsed 0m 0s (remain 17m 30s) Loss: 0.0021 
Epoch: [4][100/4343] Elapsed 0m 9s (remain 6m 53s) Loss: 0.1659 
Epoch: [4][200/4343] Elapsed 0m 19s (remain 6m 41s) Loss: 0.1211 
Epoch: [4][300/4343] Elapsed 0m 29s (remain 6m 30s) Loss: 0.0904 
Epoch: [4][400/4343] Elapsed 0m 38s (remain 6m 20s) Loss: 0.0743 
Epoch: [4][500/4343] Elapsed 0m 48s (remain 6m 10s) Loss: 0.0672 
Epoch: [4][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.0616 
Epoch: [4][700/4343] Elapsed 1m 7s (remain 5m 50s) Loss: 0.0564 
Epoch: [4][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.0537 
Epoch: [4][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.0515 
Epoch: [4][1000/4343] Elapsed 1m 36s (remain 5m 21s) Loss: 0.0508 
Epoch: [4][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.0486 
Epoch: [4][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.0484 
Epoch: [4][1300/4343] Elapsed 2m 5s (remain 4m 52s) Loss: 0.0510 
Epoch: [4][1400/4

score1 = 0.8345031976290749, thresh=0.023282372444280715
score2 = 0.8351545426162973,  thresh=0.027870357653506476
Epoch 4 - avg_train_loss: 0.0450  avg_val_loss: 0.0527  time: 445s
Epoch 4 - Score: 0.8351545426162973


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0527 


Score1: 0.90050
best border: 0.05892
Score2: 0.84694
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Bert

Epoch: [1][0/4343] Elapsed 0m 0s (remain 18m 51s) Loss: 0.8152 
Epoch: [1][100/4343] Elapsed 0m 9s (remain 6m 51s) Loss: 0.2090 
Epoch: [1][200/4343] Elapsed 0m 19s (remain 6m 39s) Loss: 0.1680 
Epoch: [1][300/4343] Elapsed 0m 28s (remain 6m 29s) Loss: 0.1361 
Epoch: [1][400/4343] Elapsed 0m 38s (remain 6m 19s) Loss: 0.1175 
Epoch: [1][500/4343] Elapsed 0m 48s (remain 6m 9s) Loss: 0.1078 
Epoch: [1][600/4343] Elapsed 0m 57s (remain 5m 59s) Loss: 0.1014 
Epoch: [1][700/4343] Elapsed 1m 7s (remain 5m 49s) Loss: 0.1048 
Epoch: [1][800/4343] Elapsed 1m 16s (remain 5m 40s) Loss: 0.0986 
Epoch: [1][900/4343] Elapsed 1m 26s (remain 5m 30s) Loss: 0.1022 
Epoch: [1][1000/4343] Elapsed 1m 36s (remain 5m 20s) Loss: 0.1008 
Epoch: [1][1100/4343] Elapsed 1m 45s (remain 5m 11s) Loss: 0.0981 
Epoch: [1][1200/4343] Elapsed 1m 55s (remain 5m 1s) Loss: 0.0976 
Epoch: [1][1300/4343] Elapsed 2m 4s (remain 4m 51s) Loss: 0.0942 
Epoch: [1][1400/4343] Elapsed 2m 14s (remain 4m 42s) Loss: 0.0935 
Epoch: [1][1

score1 = 0.8255451713395641, thresh=0.023282372444280715
score2 = 0.8464047212300049,  thresh=0.019357529155368074
Epoch 1 - avg_train_loss: 0.0687  avg_val_loss: 0.0491  time: 443s
Epoch 1 - Score: 0.8464047212300049
Epoch 1 - Save Best Score: 0.8464 - Best Loss: 0.0491 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0491 
Epoch: [2][0/4343] Elapsed 0m 0s (remain 18m 44s) Loss: 0.0015 
Epoch: [2][100/4343] Elapsed 0m 9s (remain 6m 53s) Loss: 0.0426 
Epoch: [2][200/4343] Elapsed 0m 19s (remain 6m 40s) Loss: 0.0427 
Epoch: [2][300/4343] Elapsed 0m 29s (remain 6m 29s) Loss: 0.0359 
Epoch: [2][400/4343] Elapsed 0m 38s (remain 6m 19s) Loss: 0.0365 
Epoch: [2][500/4343] Elapsed 0m 48s (remain 6m 9s) Loss: 0.0359 
Epoch: [2][600/4343] Elapsed 0m 57s (remain 5m 59s) Loss: 0.0351 
Epoch: [2][700/4343] Elapsed 1m 7s (remain 5m 49s) Loss: 0.0348 
Epoch: [2][800/4343] Elapsed 1m 16s (remain 5m 40s) Loss: 0.0367 
Epoch: [2][900/4343] Elapsed 1m 26s (remain 5m 30s) Loss: 0.0393 
Epoch: [2][1000/4343] Elapsed 1m 36s (remain 5m 20s) Loss: 0.0391 
Epoch: [2][1100/4343] Elapsed 1m 45s (remain 5m 11s) Loss: 0.0389 
Epoch: [2][1200/4343] Elapsed 1m 55s (remain 5m 1s) Loss: 0.0386 
Epoch: [2][1300/4343] Elapsed 2m 4s (remain 4m 52s) Loss: 0.0372 
Epoch: [2][1400/4

score1 = 0.8847515002308046, thresh=0.023282372444280715
score2 = 0.8672758246863869,  thresh=0.06531067092868038
Epoch 2 - avg_train_loss: 0.0386  avg_val_loss: 0.0465  time: 443s
Epoch 2 - Score: 0.8672758246863869
Epoch 2 - Save Best Score: 0.8673 - Best Loss: 0.0465 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0465 
Epoch: [3][0/4343] Elapsed 0m 0s (remain 20m 13s) Loss: 0.0011 
Epoch: [3][100/4343] Elapsed 0m 9s (remain 6m 54s) Loss: 0.0290 
Epoch: [3][200/4343] Elapsed 0m 19s (remain 6m 40s) Loss: 0.0338 
Epoch: [3][300/4343] Elapsed 0m 29s (remain 6m 29s) Loss: 0.0368 
Epoch: [3][400/4343] Elapsed 0m 38s (remain 6m 19s) Loss: 0.0350 
Epoch: [3][500/4343] Elapsed 0m 48s (remain 6m 9s) Loss: 0.0332 
Epoch: [3][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.0366 
Epoch: [3][700/4343] Elapsed 1m 7s (remain 5m 50s) Loss: 0.0342 
Epoch: [3][800/4343] Elapsed 1m 16s (remain 5m 40s) Loss: 0.0334 
Epoch: [3][900/4343] Elapsed 1m 26s (remain 5m 30s) Loss: 0.0321 
Epoch: [3][1000/4343] Elapsed 1m 36s (remain 5m 21s) Loss: 0.0304 
Epoch: [3][1100/4343] Elapsed 1m 45s (remain 5m 11s) Loss: 0.0324 
Epoch: [3][1200/4343] Elapsed 1m 55s (remain 5m 1s) Loss: 0.0309 
Epoch: [3][1300/4343] Elapsed 2m 4s (remain 4m 52s) Loss: 0.0301 
Epoch: [3][1400/43

score1 = 0.8877566774741392, thresh=0.023282372444280715
score2 = 0.8923029174425824,  thresh=0.049739736929296945
Epoch 3 - avg_train_loss: 0.0257  avg_val_loss: 0.0487  time: 444s
Epoch 3 - Score: 0.8923029174425824


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0487 
Epoch: [4][0/4343] Elapsed 0m 0s (remain 18m 13s) Loss: 0.0442 
Epoch: [4][100/4343] Elapsed 0m 9s (remain 6m 53s) Loss: 0.0108 
Epoch: [4][200/4343] Elapsed 0m 19s (remain 6m 40s) Loss: 0.0122 
Epoch: [4][300/4343] Elapsed 0m 29s (remain 6m 29s) Loss: 0.0109 
Epoch: [4][400/4343] Elapsed 0m 38s (remain 6m 19s) Loss: 0.0133 
Epoch: [4][500/4343] Elapsed 0m 48s (remain 6m 9s) Loss: 0.0148 
Epoch: [4][600/4343] Elapsed 0m 57s (remain 5m 59s) Loss: 0.0152 
Epoch: [4][700/4343] Elapsed 1m 7s (remain 5m 49s) Loss: 0.0142 
Epoch: [4][800/4343] Elapsed 1m 16s (remain 5m 40s) Loss: 0.0153 
Epoch: [4][900/4343] Elapsed 1m 26s (remain 5m 30s) Loss: 0.0165 
Epoch: [4][1000/4343] Elapsed 1m 36s (remain 5m 20s) Loss: 0.0154 
Epoch: [4][1100/4343] Elapsed 1m 45s (remain 5m 11s) Loss: 0.0158 
Epoch: [4][1200/4343] Elapsed 1m 55s (remain 5m 1s) Loss: 0.0162 
Epoch: [4][1300/4343] Elapsed 2m 4s (remain 4m 52s) Loss: 0.0165 
Epoch: [4][1400/4

score1 = 0.859643638637074, thresh=0.023282372444280715
score2 = 0.860450563204005,  thresh=0.031225795555332475
Epoch 4 - avg_train_loss: 0.0198  avg_val_loss: 0.0446  time: 443s
Epoch 4 - Score: 0.860450563204005
Epoch 4 - Save Best Score: 0.8605 - Best Loss: 0.0446 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0446 


Score1: 0.85964
best border: 0.03123
Score2: 0.86045
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Bert

Epoch: [1][0/4343] Elapsed 0m 0s (remain 18m 11s) Loss: 0.8129 
Epoch: [1][100/4343] Elapsed 0m 9s (remain 6m 51s) Loss: 0.1385 
Epoch: [1][200/4343] Elapsed 0m 19s (remain 6m 38s) Loss: 0.1257 
Epoch: [1][300/4343] Elapsed 0m 28s (remain 6m 28s) Loss: 0.1356 
Epoch: [1][400/4343] Elapsed 0m 38s (remain 6m 18s) Loss: 0.1236 
Epoch: [1][500/4343] Elapsed 0m 48s (remain 6m 8s) Loss: 0.1211 
Epoch: [1][600/4343] Elapsed 0m 57s (remain 5m 59s) Loss: 0.1167 
Epoch: [1][700/4343] Elapsed 1m 7s (remain 5m 49s) Loss: 0.1175 
Epoch: [1][800/4343] Elapsed 1m 16s (remain 5m 39s) Loss: 0.1116 
Epoch: [1][900/4343] Elapsed 1m 26s (remain 5m 30s) Loss: 0.1069 
Epoch: [1][1000/4343] Elapsed 1m 36s (remain 5m 20s) Loss: 0.1027 
Epoch: [1][1100/4343] Elapsed 1m 45s (remain 5m 11s) Loss: 0.0989 
Epoch: [1][1200/4343] Elapsed 1m 55s (remain 5m 1s) Loss: 0.0954 
Epoch: [1][1300/4343] Elapsed 2m 4s (remain 4m 51s) Loss: 0.0932 
Epoch: [1][1400/4343] Elapsed 2m 14s (remain 4m 42s) Loss: 0.0912 
Epoch: [1][1

score1 = 0.0, thresh=0.023282372444280715
score2 = 0.0,  thresh=0.49999553788108364
Epoch 1 - avg_train_loss: 0.0845  avg_val_loss: 0.1092  time: 443s
Epoch 1 - Score: 0.0
Epoch 1 - Save Best Score: 0.0000 - Best Loss: 0.1092 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1092 
Epoch: [2][0/4343] Elapsed 0m 0s (remain 17m 49s) Loss: 0.0183 
Epoch: [2][100/4343] Elapsed 0m 9s (remain 6m 55s) Loss: 0.1008 
Epoch: [2][200/4343] Elapsed 0m 19s (remain 6m 42s) Loss: 0.1075 
Epoch: [2][300/4343] Elapsed 0m 29s (remain 6m 31s) Loss: 0.1089 
Epoch: [2][400/4343] Elapsed 0m 38s (remain 6m 20s) Loss: 0.1075 
Epoch: [2][500/4343] Elapsed 0m 48s (remain 6m 10s) Loss: 0.1002 
Epoch: [2][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.0931 
Epoch: [2][700/4343] Elapsed 1m 7s (remain 5m 51s) Loss: 0.0948 
Epoch: [2][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.0964 
Epoch: [2][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.0982 
Epoch: [2][1000/4343] Elapsed 1m 36s (remain 5m 21s) Loss: 0.0993 
Epoch: [2][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.1034 
Epoch: [2][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.1034 
Epoch: [2][1300/4343] Elapsed 2m 5s (remain 4m 53s) Loss: 0.1023 
Epoch: [2][1400/4

score1 = 0.8735564110156945, thresh=0.023282372444280715
score2 = 0.8811866647084741,  thresh=0.015273631093868614
Epoch 2 - avg_train_loss: 0.0712  avg_val_loss: 0.0553  time: 445s
Epoch 2 - Score: 0.8811866647084741
Epoch 2 - Save Best Score: 0.8812 - Best Loss: 0.0553 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0553 
Epoch: [3][0/4343] Elapsed 0m 0s (remain 19m 59s) Loss: 0.0010 
Epoch: [3][100/4343] Elapsed 0m 9s (remain 6m 55s) Loss: 0.0406 
Epoch: [3][200/4343] Elapsed 0m 19s (remain 6m 42s) Loss: 0.0440 
Epoch: [3][300/4343] Elapsed 0m 29s (remain 6m 31s) Loss: 0.0404 
Epoch: [3][400/4343] Elapsed 0m 38s (remain 6m 21s) Loss: 0.0396 
Epoch: [3][500/4343] Elapsed 0m 48s (remain 6m 11s) Loss: 0.0402 
Epoch: [3][600/4343] Elapsed 0m 58s (remain 6m 1s) Loss: 0.0434 
Epoch: [3][700/4343] Elapsed 1m 7s (remain 5m 51s) Loss: 0.0475 
Epoch: [3][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.0521 
Epoch: [3][900/4343] Elapsed 1m 26s (remain 5m 32s) Loss: 0.0516 
Epoch: [3][1000/4343] Elapsed 1m 36s (remain 5m 22s) Loss: 0.0509 
Epoch: [3][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.0509 
Epoch: [3][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.0494 
Epoch: [3][1300/4343] Elapsed 2m 5s (remain 4m 53s) Loss: 0.0487 
Epoch: [3][1400/4

score1 = 0.7554225878833208, thresh=0.023282372444280715
score2 = 0.7446968557243869,  thresh=0.050804882689202344
Epoch 3 - avg_train_loss: 0.0473  avg_val_loss: 0.0721  time: 445s
Epoch 3 - Score: 0.7446968557243869


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0721 
Epoch: [4][0/4343] Elapsed 0m 0s (remain 17m 49s) Loss: 0.0009 
Epoch: [4][100/4343] Elapsed 0m 9s (remain 6m 53s) Loss: 0.0473 
Epoch: [4][200/4343] Elapsed 0m 19s (remain 6m 41s) Loss: 0.0648 
Epoch: [4][300/4343] Elapsed 0m 29s (remain 6m 30s) Loss: 0.0587 
Epoch: [4][400/4343] Elapsed 0m 38s (remain 6m 20s) Loss: 0.0545 
Epoch: [4][500/4343] Elapsed 0m 48s (remain 6m 10s) Loss: 0.0511 
Epoch: [4][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.0496 
Epoch: [4][700/4343] Elapsed 1m 7s (remain 5m 51s) Loss: 0.0476 
Epoch: [4][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.0448 
Epoch: [4][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.0444 
Epoch: [4][1000/4343] Elapsed 1m 36s (remain 5m 22s) Loss: 0.0429 
Epoch: [4][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.0432 
Epoch: [4][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.0413 
Epoch: [4][1300/4343] Elapsed 2m 5s (remain 4m 52s) Loss: 0.0413 
Epoch: [4][1400/4

score1 = 0.7240734973528495, thresh=0.023282372444280715
score2 = 0.7109374999999999,  thresh=0.07294496276038648
Epoch 4 - avg_train_loss: 0.0420  avg_val_loss: 0.0610  time: 445s
Epoch 4 - Score: 0.7109374999999999


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0610 


Score1: 0.87356
best border: 0.01527
Score2: 0.88119
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Bert

Epoch: [1][0/4343] Elapsed 0m 0s (remain 18m 27s) Loss: 0.6240 
Epoch: [1][100/4343] Elapsed 0m 9s (remain 6m 51s) Loss: 0.1141 
Epoch: [1][200/4343] Elapsed 0m 19s (remain 6m 38s) Loss: 0.1299 
Epoch: [1][300/4343] Elapsed 0m 28s (remain 6m 28s) Loss: 0.1319 
Epoch: [1][400/4343] Elapsed 0m 38s (remain 6m 18s) Loss: 0.1325 
Epoch: [1][500/4343] Elapsed 0m 48s (remain 6m 8s) Loss: 0.1298 
Epoch: [1][600/4343] Elapsed 0m 57s (remain 5m 59s) Loss: 0.1300 
Epoch: [1][700/4343] Elapsed 1m 7s (remain 5m 49s) Loss: 0.1244 
Epoch: [1][800/4343] Elapsed 1m 16s (remain 5m 39s) Loss: 0.1234 
Epoch: [1][900/4343] Elapsed 1m 26s (remain 5m 30s) Loss: 0.1207 
Epoch: [1][1000/4343] Elapsed 1m 36s (remain 5m 20s) Loss: 0.1248 
Epoch: [1][1100/4343] Elapsed 1m 45s (remain 5m 11s) Loss: 0.1248 
Epoch: [1][1200/4343] Elapsed 1m 55s (remain 5m 1s) Loss: 0.1214 
Epoch: [1][1300/4343] Elapsed 2m 4s (remain 4m 52s) Loss: 0.1198 
Epoch: [1][1400/4343] Elapsed 2m 14s (remain 4m 42s) Loss: 0.1197 
Epoch: [1][1

score1 = 0.3937007874015748, thresh=0.023282372444280715
score2 = 0.39382482671707625,  thresh=0.4934285039048734
Epoch 1 - avg_train_loss: 0.1122  avg_val_loss: 0.1123  time: 444s
Epoch 1 - Score: 0.39382482671707625
Epoch 1 - Save Best Score: 0.3938 - Best Loss: 0.1123 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1123 
Epoch: [2][0/4343] Elapsed 0m 0s (remain 18m 0s) Loss: 0.0171 
Epoch: [2][100/4343] Elapsed 0m 9s (remain 6m 55s) Loss: 0.0796 
Epoch: [2][200/4343] Elapsed 0m 19s (remain 6m 41s) Loss: 0.0890 
Epoch: [2][300/4343] Elapsed 0m 29s (remain 6m 30s) Loss: 0.0954 
Epoch: [2][400/4343] Elapsed 0m 38s (remain 6m 20s) Loss: 0.1040 
Epoch: [2][500/4343] Elapsed 0m 48s (remain 6m 10s) Loss: 0.1181 
Epoch: [2][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.1152 
Epoch: [2][700/4343] Elapsed 1m 7s (remain 5m 50s) Loss: 0.1118 
Epoch: [2][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.1105 
Epoch: [2][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.1118 
Epoch: [2][1000/4343] Elapsed 1m 36s (remain 5m 21s) Loss: 0.1122 
Epoch: [2][1100/4343] Elapsed 1m 45s (remain 5m 11s) Loss: 0.1111 
Epoch: [2][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.1130 
Epoch: [2][1300/4343] Elapsed 2m 5s (remain 4m 52s) Loss: 0.1138 
Epoch: [2][1400/43

score1 = 0.0, thresh=0.023282372444280715
score2 = 0.0,  thresh=0.49999553788108364
Epoch 2 - avg_train_loss: 0.1044  avg_val_loss: 0.1123  time: 444s
Epoch 2 - Score: 0.0
Epoch 2 - Save Best Score: 0.0000 - Best Loss: 0.1123 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1123 
Epoch: [3][0/4343] Elapsed 0m 0s (remain 20m 12s) Loss: 0.0206 
Epoch: [3][100/4343] Elapsed 0m 9s (remain 6m 55s) Loss: 0.1370 
Epoch: [3][200/4343] Elapsed 0m 19s (remain 6m 41s) Loss: 0.1100 
Epoch: [3][300/4343] Elapsed 0m 29s (remain 6m 31s) Loss: 0.1037 
Epoch: [3][400/4343] Elapsed 0m 38s (remain 6m 21s) Loss: 0.1105 
Epoch: [3][500/4343] Elapsed 0m 48s (remain 6m 11s) Loss: 0.0999 
Epoch: [3][600/4343] Elapsed 0m 58s (remain 6m 1s) Loss: 0.0984 
Epoch: [3][700/4343] Elapsed 1m 7s (remain 5m 51s) Loss: 0.0964 
Epoch: [3][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.1000 
Epoch: [3][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.1025 
Epoch: [3][1000/4343] Elapsed 1m 36s (remain 5m 22s) Loss: 0.1038 
Epoch: [3][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.1093 
Epoch: [3][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.1073 
Epoch: [3][1300/4343] Elapsed 2m 5s (remain 4m 53s) Loss: 0.1086 
Epoch: [3][1400/4

score1 = 0.0, thresh=0.023282372444280715
score2 = 0.0,  thresh=0.49999553788108364
Epoch 3 - avg_train_loss: 0.1115  avg_val_loss: 0.1115  time: 445s
Epoch 3 - Score: 0.0
Epoch 3 - Save Best Score: 0.0000 - Best Loss: 0.1115 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.1115 
Epoch: [4][0/4343] Elapsed 0m 0s (remain 18m 2s) Loss: 0.0207 
Epoch: [4][100/4343] Elapsed 0m 9s (remain 6m 54s) Loss: 0.1135 
Epoch: [4][200/4343] Elapsed 0m 19s (remain 6m 41s) Loss: 0.0944 
Epoch: [4][300/4343] Elapsed 0m 29s (remain 6m 30s) Loss: 0.0989 
Epoch: [4][400/4343] Elapsed 0m 38s (remain 6m 20s) Loss: 0.1125 
Epoch: [4][500/4343] Elapsed 0m 48s (remain 6m 10s) Loss: 0.1131 
Epoch: [4][600/4343] Elapsed 0m 57s (remain 6m 0s) Loss: 0.1110 
Epoch: [4][700/4343] Elapsed 1m 7s (remain 5m 50s) Loss: 0.1047 
Epoch: [4][800/4343] Elapsed 1m 17s (remain 5m 41s) Loss: 0.1050 
Epoch: [4][900/4343] Elapsed 1m 26s (remain 5m 31s) Loss: 0.1035 
Epoch: [4][1000/4343] Elapsed 1m 36s (remain 5m 21s) Loss: 0.1038 
Epoch: [4][1100/4343] Elapsed 1m 46s (remain 5m 12s) Loss: 0.1046 
Epoch: [4][1200/4343] Elapsed 1m 55s (remain 5m 2s) Loss: 0.1029 
Epoch: [4][1300/4343] Elapsed 2m 5s (remain 4m 52s) Loss: 0.1020 
Epoch: [4][1400/43

score1 = 0.36421219319081555, thresh=0.023282372444280715
score2 = 0.36421219319081555,  thresh=0.49999553788108364
Epoch 4 - avg_train_loss: 0.0959  avg_val_loss: 0.0894  time: 445s
Epoch 4 - Score: 0.36421219319081555
Epoch 4 - Save Best Score: 0.3642 - Best Loss: 0.0894 Model


EVAL: [1085/1086] Elapsed 0m 26s (remain 0m 0s) Loss: 0.0894 


Score1: 0.36421
best border: 0.50000
Score2: 0.36421
Score1: 0.77443
best border: 0.01511
Score2: 0.77987


In [31]:
border, best_border

(0.023282372444280715, 0.015106569612934143)

In [32]:
# Inference
predictions = inference()

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

  0%|          | 0/8167 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

  0%|          | 0/8167 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

  0%|          | 0/8167 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

  0%|          | 0/8167 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

  0%|          | 0/8167 [00:00<?, ?it/s]

In [33]:
# for ensemble
test["judgement"] = predictions
test.to_csv(OUTPUT_DIR + f"predictions_{CFG.version}.csv", index=False, header=False)

In [34]:
# submission
predictions2 = np.where(predictions < border, 0, 1)
sub["judgement"] = predictions2
sub.to_csv(OUTPUT_DIR + f"submission_{CFG.version}.csv", index=False, header=False)