<a href="https://colab.research.google.com/github/nkimoto/signate/blob/main/SRWS-PSG/notebooks/bert_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#!pip uninstall torch-xla -y
#!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.6-cp37-cp37m-linux_x86_64.whl
#!pip uninstall torch -y
#!pip install torch==1.6

In [6]:
# Mount Google Drive at /content/drive (Colab-only helper); DATA_DIR below points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Install runtime dependencies into the Colab environment.
# NOTE(review): no versions are pinned here; the recorded output of this cell shows
# transformers 4.10.2 being installed — consider pinning to keep the run reproducible.
!pip install torch
!pip install transformers
!pip install scikit-learn
!pip install tqdm

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 13.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 63.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 66.1 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 65.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
 

In [38]:
import torch
#import torch_xla
#import torch_xla.core.xla_model as xm
#device = xm.xla_device()
#print(device)

In [9]:
import os
import sys
import math
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers as T
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

In [39]:
# --- Training hyper-parameters ---
BATCH_SIZE = 16
NUM_WORKERS = 4
NUM_EPOCH = 4
NUM_FOLD = 5

# --- Pretrained checkpoint (alternatives kept for quick switching) ---
#BERT_MODEL_NAME = "bert-base-uncased"
#BERT_MODEL_NAME = "allenai/scibert_scivocab_uncased"
BERT_MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# Prefix used for checkpoint files and the results directory.
#SAVE_MODEL_NAME = "scibert_scivocab_uncased"
SAVE_MODEL_NAME = "pubmedbert_uncased"

# --- Input text selection ---
# TEXT_TYPE names the DataFrame column fed to the tokenizer; each choice has its own
# maximum token length.
TEXT_TYPE = "title+abstract"
_MAX_TEXT_LENGTH_BY_TYPE = {
    "title": 72,
    "abstract": 512,
    "title+abstract": 512,
}
if TEXT_TYPE not in _MAX_TEXT_LENGTH_BY_TYPE:
    # Fail here with a clear message instead of a NameError on MAX_TEXT_LENGTH later.
    raise ValueError(
        f"Unsupported TEXT_TYPE {TEXT_TYPE!r}; expected one of {sorted(_MAX_TEXT_LENGTH_BY_TYPE)}"
    )
MAX_TEXT_LENGTH = _MAX_TEXT_LENGTH_BY_TYPE[TEXT_TYPE]

# --- Model settings ---
ITERS_TO_ACCUMULATE = 2    # batches accumulated per optimizer step; 1 = ordinary training
FIX_PARAM = False    # whether to freeze parameters
CHANGE_LEARNING_RATE = False    # whether to use different learning rates per layer group

In [40]:
# Input CSVs are read from DATA_DIR; logs, checkpoints and submissions go to OUTPUT_DIR.
DATA_DIR = "./drive/MyDrive/signate/rawdata/"
OUTPUT_DIR = f"./results_{SAVE_MODEL_NAME}_{TEXT_TYPE}/"
# exist_ok=True keeps re-running this cell harmless when the directory already exists,
# while genuine failures (permissions, invalid path) are raised instead of swallowed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [41]:
#warnings.filterwarnings("ignore")

In [42]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [43]:
def init_logger(log_file=OUTPUT_DIR + "train.log"):
    """Create the notebook logger that writes plain messages to the console and a file.

    Args:
        log_file: Path of the log file; defaults to ``train.log`` inside OUTPUT_DIR.

    Returns:
        The logger registered under ``__name__``, at INFO level, with exactly one
        stream handler and one file handler attached.
    """
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so re-running this cell would
    # stack another pair of handlers and emit each record multiple times. Remove any
    # handlers left over from earlier calls first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # Keep records from also being emitted by handlers installed on the root logger.
    logger.propagate = False

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

In [44]:
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with the same value.

    Also exports the seed as PYTHONHASHSEED and switches cuDNN to deterministic mode.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# Global seed; also reused as random_state for the cross-validation split.
seed = 471
seed_torch(seed)

## データ読み込み

In [45]:
# Read the competition files from DATA_DIR. The sample submission has no header row,
# so its two columns are named explicitly.
train, test = (pd.read_csv(DATA_DIR + csv_name) for csv_name in ("train.csv", "test.csv"))
sub = pd.read_csv(DATA_DIR + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [46]:
# Model outputs are binarised to 0 / 1 at this threshold: the fraction of training
# rows whose judgement is 1.
num_positive = len(train[train["judgement"] == 1])
num_total = len(train["judgement"])
border = num_positive / num_total
print(border)

0.023282372444280715


## 前処理

In [51]:
def get_train_data(train):
    """Prepare the training frame for the configured TEXT_TYPE and assign CV folds.

    Args:
        train: Raw training DataFrame. It is read for the "judgement" column and,
            depending on TEXT_TYPE, the "title" / "abstract" columns.

    Returns:
        A new DataFrame (the input is not modified) with:
          * TEXT_TYPE == "abstract": rows containing any missing value dropped and
            the index reset;
          * TEXT_TYPE == "title+abstract": an extra "title+abstract" column holding
            the title, extended with the abstract for rows that have no missing value;
          * a uint8 "fold" column with the stratified validation fold of each row.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    train = train.copy()

    if TEXT_TYPE == "abstract":
        train = train.dropna().reset_index(drop=True)

    elif TEXT_TYPE == "title+abstract":
        train["title+abstract"] = train["title"]
        complete_rows = train.notnull().all(axis=1)
        train.loc[complete_rows, "title+abstract"] = train["title"] + "Abstract: " + train["abstract"]

    # Assign cross-validation fold numbers. NUM_FOLD is used so the split always matches
    # the number of folds the training loop iterates over.
    splitter = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=seed)
    fold_ids = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(splitter.split(train, train["judgement"])):
        # split() yields positional indices, so fill a positional array rather than
        # relying on the frame's index labels.
        fold_ids[val_index] = n
    train["fold"] = fold_ids

    return train


def get_test_data(test):
    """Prepare the test frame for the configured TEXT_TYPE.

    Args:
        test: Raw test DataFrame; read for the "title" / "abstract" columns depending
            on TEXT_TYPE.

    Returns:
        A new DataFrame (the input is not modified):
          * TEXT_TYPE == "abstract": rows containing any missing value dropped and
            the index reset;
          * TEXT_TYPE == "title+abstract": an extra "title+abstract" column holding
            the title, extended with the abstract for rows that have no missing value.

    Side effects:
        For TEXT_TYPE == "abstract" the original index labels of the rows that were
        kept are stored in the module-level ``test_remain_index``, which the
        submission step reads to place predictions back on the right rows.
    """
    global test_remain_index

    test = test.copy()

    if TEXT_TYPE == "abstract":
        test = test.dropna()
        # Record the surviving row labels before the index is reset.
        test_remain_index = test.index.tolist()
        test = test.reset_index(drop=True)

    elif TEXT_TYPE == "title+abstract":
        # Build the combined column on the *test* frame.
        test["title+abstract"] = test["title"]
        complete_rows = test.notnull().all(axis=1)
        test.loc[complete_rows, "title+abstract"] = test["title"] + "Abstract: " + test["abstract"]

    return test


# Rebind the module-level train / test frames to their preprocessed versions.
train = get_train_data(train)
test = get_test_data(test)

## データセット定義

In [52]:
class BaseDataset(Dataset):
    """Dataset that tokenizes the TEXT_TYPE column of a DataFrame once, up front.

    Each item is ``(input_ids, attention_mask)`` and, when ``include_labels`` is true,
    additionally a float label taken from the "judgement" column.
    """

    def __init__(self, df, model_name, include_labels=True):
        bert_tokenizer = T.BertTokenizer.from_pretrained(model_name)

        self.df = df
        self.include_labels = include_labels

        # Every text is padded / truncated to MAX_TEXT_LENGTH tokens.
        self.title = df[TEXT_TYPE].tolist()
        self.encoded = bert_tokenizer.batch_encode_plus(
            self.title,
            padding="max_length",
            max_length=MAX_TEXT_LENGTH,
            truncation=True,
            return_attention_mask=True,
        )

        if self.include_labels:
            self.labels = df["judgement"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        features = tuple(
            torch.tensor(self.encoded[field][idx])
            for field in ("input_ids", "attention_mask")
        )
        if not self.include_labels:
            return features

        target = torch.tensor(self.labels[idx]).float()
        return features + (target,)

## モデル定義

In [53]:
class BaseModel(nn.Module):
    """BERT sequence classifier with one output unit followed by a sigmoid.

    Args:
        model_name: Name or path of the pretrained checkpoint to load.
    """

    def __init__(self, model_name):
        super().__init__()

        self.model = T.BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid score per sample as a 1-D tensor.

        The logits are expected to have a trailing label axis of size 1
        (``num_labels=1``). Only that axis is removed: a bare ``squeeze()`` would also
        drop the batch axis for a single-sample batch and return a 0-d tensor, which
        no longer matches the label shape or concatenates with other batches.
        """
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        out = self.sigmoid(out.logits).squeeze(-1)

        return out

## ツール

In [54]:
class AverageMeter(object):
    """Tracks the latest value and the weighted running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the running mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"`` (both truncated)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed time since ``since`` and the projected remaining time.

    ``percent`` is the completed fraction; the total duration is extrapolated as
    elapsed / percent and the remainder is the difference to the elapsed time.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

## 学習補助関数

In [55]:
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    """Run one training epoch with an optimizer step after every batch.

    NOTE(review): the next cell defines ``train_fn`` again (gradient-accumulation
    variant); when the notebook is run top to bottom that later definition replaces
    this one.

    Returns:
        The sample-weighted mean training loss over the epoch.
    """
    started_at = time.time()
    loss_meter = AverageMeter()

    # switch to train mode
    model.train()

    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        y_preds = model(input_ids, attention_mask)
        loss = criterion(y_preds, labels)

        # record loss, weighted by the number of samples in the batch
        loss_meter.update(loss.item(), labels.size(0))

        loss.backward()
        optimizer.step()

        # progress line every 100 steps and on the final step
        total_steps = len(train_loader)
        on_last_step = step == (total_steps - 1)
        if step % 100 == 0 or on_last_step:
            progress = float(step + 1) / total_steps
            print(
                f"Epoch: [{epoch + 1}][{step}/{total_steps}] "
                f"Elapsed {timeSince(started_at, progress):s} "
                f"Loss: {loss_meter.avg:.4f} "
            )

    return loss_meter.avg

In [56]:
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    """Run one training epoch with gradient accumulation.

    Gradients from ITERS_TO_ACCUMULATE consecutive batches are accumulated before each
    optimizer step (ITERS_TO_ACCUMULATE == 1 is ordinary per-batch training).

    Args:
        train_loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(y_preds, labels)``.
        optimizer: Optimizer updating the model parameters.
        epoch: Zero-based epoch number, used only in the progress output.
        device: Device the batch tensors are moved to.

    Returns:
        The sample-weighted mean of the unscaled batch losses over the epoch.
    """
    start = time.time()
    losses = AverageMeter()

    # switch to train mode
    model.train()

    num_steps = len(train_loader)
    optimizer.zero_grad()
    for step, (input_ids, attention_mask, labels) in enumerate(train_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        y_preds = model(input_ids, attention_mask)

        loss = criterion(y_preds, labels)

        # Record the unscaled loss so the reported value does not depend on
        # ITERS_TO_ACCUMULATE and stays comparable with the validation loss.
        losses.update(loss.item(), batch_size)

        # Scale before backward so the accumulated gradient is the mean over the
        # accumulated batches rather than their sum.
        (loss / ITERS_TO_ACCUMULATE).backward()

        # Step after every full accumulation group, and also on the final batch so a
        # trailing partial group is applied instead of being discarded by the
        # zero_grad() at the start of the next epoch.
        is_last_step = step == (num_steps - 1)
        if (step + 1) % ITERS_TO_ACCUMULATE == 0 or is_last_step:
            optimizer.step()
            optimizer.zero_grad()

        if step % 100 == 0 or is_last_step:
            print(
                f"Epoch: [{epoch + 1}][{step}/{num_steps}] "
                f"Elapsed {timeSince(start, float(step + 1) / num_steps):s} "
                f"Loss: {losses.avg:.4f} "
            )
    return losses.avg

## 評価補助関数

In [57]:
def valid_fn(valid_loader, model, criterion, device):
    """Evaluate the model on a validation loader without updating it.

    Args:
        valid_loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(y_preds, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, predictions)``: the sample-weighted mean loss and a 1-D NumPy
        array with the model outputs of all batches concatenated in loader order.
    """
    start = time.time()
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []

    num_steps = len(valid_loader)
    for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        # compute predictions and loss without tracking gradients
        with torch.no_grad():
            # reshape(-1) guarantees a 1-D tensor even if the model returns a 0-d
            # value for a single-sample final batch (this loader keeps the last,
            # possibly smaller, batch); a 0-d output would match neither the label
            # shape nor np.concatenate.
            y_preds = model(input_ids, attention_mask).reshape(-1)
            loss = criterion(y_preds, labels)

        losses.update(loss.item(), batch_size)

        # record score
        preds.append(y_preds.to("cpu").numpy())

        if step % 100 == 0 or step == (num_steps - 1):
            print(
                f"EVAL: [{step}/{num_steps}] "
                f"Elapsed {timeSince(start, float(step + 1) / num_steps):s} "
                f"Loss: {losses.avg:.4f} "
            )

    predictions = np.concatenate(preds)
    return losses.avg, predictions

## 推論関数

In [58]:
def inference():
    """Predict on the test set with every fold's best checkpoint and average them.

    Reads the module-level ``test`` frame and loads, for each fold in
    ``range(NUM_FOLD)``, the checkpoint ``{SAVE_MODEL_NAME}_fold{fold}_best.pth`` from
    OUTPUT_DIR.

    Returns:
        NumPy array with the per-sample mean of the fold predictions.
    """
    predictions = []

    test_dataset = BaseDataset(test, BERT_MODEL_NAME, include_labels=False)
    test_loader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )

    for fold in range(NUM_FOLD):
        LOGGER.info(f"========== model: {SAVE_MODEL_NAME} fold: {fold} inference ==========")
        model = BaseModel(BERT_MODEL_NAME)
        model.to(device)
        # map_location lets a checkpoint saved on GPU be restored on whatever device
        # is in use now (e.g. a CPU-only runtime).
        checkpoint = torch.load(
            OUTPUT_DIR + f"{SAVE_MODEL_NAME}_fold{fold}_best.pth", map_location=device
        )
        model.load_state_dict(checkpoint["model"])
        model.eval()
        preds = []
        for input_ids, attention_mask in tqdm(test_loader, total=len(test_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            with torch.no_grad():
                # reshape(-1) keeps a 1-D result even if a single-sample final batch
                # comes back as a 0-d tensor, which np.concatenate cannot join.
                y_preds = model(input_ids, attention_mask).reshape(-1)
            preds.append(y_preds.to("cpu").numpy())
        preds = np.concatenate(preds)
        predictions.append(preds)
    predictions = np.mean(predictions, axis=0)

    return predictions

## 学習

In [59]:
def train_loop(train, fold):
    """Train and validate one cross-validation fold, keeping the best-scoring epoch.

    Rows whose "fold" column equals ``fold`` form the validation set; all other rows
    form the training set. After every epoch the validation predictions are binarised
    at ``border`` and scored with F-beta (beta=7); whenever the score improves, the
    model weights and the validation predictions are saved to
    ``{SAVE_MODEL_NAME}_fold{fold}_best.pth`` in OUTPUT_DIR.

    Args:
        train: Training DataFrame with "fold" and "judgement" columns plus the text
            column read by BaseDataset.
        fold: Fold number used as the validation fold.

    Returns:
        The validation rows (index reset) with an added "preds" column holding the
        predictions stored in the best checkpoint.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    trn_idx = train[train["fold"] != fold].index
    val_idx = train[train["fold"] == fold].index

    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds, BERT_MODEL_NAME)
    valid_dataset = BaseDataset(valid_folds, BERT_MODEL_NAME)

    # Training batches are shuffled and the last incomplete batch is dropped;
    # validation keeps order and every sample.
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Model
    # ====================================================
    model = BaseModel(BERT_MODEL_NAME)
    model.to(device)

    # Freeze weights
    if FIX_PARAM:
      model_params = list(model.named_parameters())
      # Freeze the BERT weights: only parameters whose name does not contain "bert"
      # are handed to the optimizer (requires_grad is not changed here).
      params = [p for n, p in model_params if not "bert" in n]
      optimizer = T.AdamW(params, lr=2e-5, correct_bias=True)

    # Change learning rate
    # NOTE(review): this is an independent `if`; when FIX_PARAM is also True the
    # optimizer built here replaces the one created above.
    if CHANGE_LEARNING_RATE:
      model_params = list(model.named_parameters())
      # Use a different learning rate for the BERT parameters and the remaining ones
      bert_params = [p for n, p in model_params if "bert" in n]
      other_params = [p for n, p in model_params if not "bert" in n]
      params = [
          {'params': bert_params, 'lr': 2e-5},
          {'params': other_params, 'lr': 2e-5 * 500}
      ]
      optimizer = T.AdamW(params)

    # Default: all parameters share one learning rate.
    if not (FIX_PARAM or CHANGE_LEARNING_RATE):
      optimizer = T.AdamW(model.parameters(), lr=2e-5, correct_bias=True)

    criterion = nn.BCELoss()

    # ====================================================
    # Loop
    # ====================================================
    best_score = -1
    # NOTE(review): best_loss is assigned but never read in this function.
    best_loss = np.inf

    for epoch in range(NUM_EPOCH):
        start_time = time.time()

        # Train all layers from the 2nd epoch onward
        if epoch == 1:
          if FIX_PARAM:
            model_params = list(model.named_parameters())
            # Unfreeze the BERT weights by adding them to the optimizer as a new group
            params = [p for n, p in model_params if "bert" in n]
            optimizer.add_param_group({'params': params})

        
        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds["judgement"].values

        # scoring: outputs below `border` count as class 0, the rest as class 1
        score = fbeta_score(valid_labels, np.where(preds < border, 0, 1), beta=7.0)

        elapsed = time.time() - start_time
        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score}")

        # Keep only the checkpoint of the best-scoring epoch (overwritten on improvement).
        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "preds": preds}, OUTPUT_DIR + f"{SAVE_MODEL_NAME}_fold{fold}_best.pth"
            )

    # Reload the best checkpoint to attach its validation predictions.
    check_point = torch.load(OUTPUT_DIR + f"{SAVE_MODEL_NAME}_fold{fold}_best.pth")

    valid_folds["preds"] = check_point["preds"]

    return valid_folds

## メイン

In [60]:
def get_result(result_df):
    """Log the F-beta (beta=7) score of a frame holding "preds" and "judgement".

    Predictions below ``border`` are treated as class 0, all others as class 1.
    """
    predicted_probs = result_df["preds"].values
    true_labels = result_df["judgement"].values
    predicted_labels = np.where(predicted_probs < border, 0, 1)
    score = fbeta_score(true_labels, predicted_labels, beta=7.0)
    LOGGER.info(f"Score: {score:<.5f}")

In [61]:
def main():
    """Train all folds, report CV scores, run test inference and write the outputs.

    Writes ``oof_df.csv`` (out-of-fold predictions) and ``submission.csv`` into
    OUTPUT_DIR. The module-level ``sub`` frame is updated in place with the
    binarised test predictions.
    """
    # Training: collect per-fold frames and concatenate once at the end.
    fold_frames = []
    for fold in range(NUM_FOLD):
        _oof_df = train_loop(train, fold)
        fold_frames.append(_oof_df)
        LOGGER.info(f"========== fold: {fold} result ==========")
        get_result(_oof_df)
    oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()

    # CV result
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)

    # Save OOF result
    oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    # Inference: average fold predictions, then binarise at `border`.
    predictions = inference()
    predictions = np.where(predictions < border, 0, 1)

    # submission
    if TEXT_TYPE in ("title", "title+abstract"):
        # No test rows are dropped for these text types, so predictions line up with
        # the submission rows one to one.
        sub["judgement"] = predictions
    elif TEXT_TYPE == "abstract":
        # Test rows with missing values were dropped before inference; predictions go
        # back to the surviving rows and the rest stay NaN.
        # NOTE(review): test_remain_index is read as a module-level global here —
        # confirm it is defined at module scope before running in this mode.
        sub["judgement"] = np.nan
        sub.loc[test_remain_index, "judgement"] = predictions
    sub.to_csv(OUTPUT_DIR + "submission.csv", index=False, header=False)

In [62]:
#torch.multiprocessing.set_sharing_strategy("file_system")

In [63]:
# CV
# pubmed_bert_abstract: 0.919
# pubmed_bert_title: 0.879
# scibert_abstract: approx. 0.901
# scibert_title: approx. 0.852

In [None]:
# Entry point: runs cross-validated training, inference and submission writing via main().
if __name__ == "__main__":
    main()

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSeque

Epoch: [1][0/1357] Elapsed 0m 0s (remain 16m 24s) Loss: 0.3514 
Epoch: [1][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0744 
Epoch: [1][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0620 
Epoch: [1][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0546 
Epoch: [1][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0513 
Epoch: [1][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0482 
Epoch: [1][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0451 
Epoch: [1][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0427 
Epoch: [1][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0403 
Epoch: [1][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0389 
Epoch: [1][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0368 
Epoch: [1][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0358 
Epoch: [1][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0351 
Epoch: [1][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0341 
Epoch: [1][1356/1357] Elapsed 10m 31s (remain 0m 0s) Loss: 0.0337 
EVAL: [0/

Epoch 1 - avg_train_loss: 0.0337  avg_val_loss: 0.0398  time: 685s
Epoch 1 - avg_train_loss: 0.0337  avg_val_loss: 0.0398  time: 685s
Epoch 1 - Score: 0.9067745803357314
Epoch 1 - Score: 0.9067745803357314
Epoch 1 - Save Best Score: 0.9068 Model
Epoch 1 - Save Best Score: 0.9068 Model


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0398 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 14m 29s) Loss: 0.0101 
Epoch: [2][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0170 
Epoch: [2][200/1357] Elapsed 1m 33s (remain 8m 59s) Loss: 0.0187 
Epoch: [2][300/1357] Elapsed 2m 20s (remain 8m 12s) Loss: 0.0183 
Epoch: [2][400/1357] Elapsed 3m 6s (remain 7m 25s) Loss: 0.0180 
Epoch: [2][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0177 
Epoch: [2][600/1357] Elapsed 4m 39s (remain 5m 52s) Loss: 0.0169 
Epoch: [2][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0164 
Epoch: [2][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0168 
Epoch: [2][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0169 
Epoch: [2][1000/1357] Elapsed 7m 46s (remain 2m 45s) Loss: 0.0163 
Epoch: [2][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0162 
Epoch: [2][1200/1357] Elapsed 9m 19s (remain 1m 12s) Loss: 0.0158 
Epoch: [2][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0159 
Epoch: [2][1356/

Epoch 2 - avg_train_loss: 0.0159  avg_val_loss: 0.0353  time: 686s
Epoch 2 - avg_train_loss: 0.0159  avg_val_loss: 0.0353  time: 686s
Epoch 2 - Score: 0.8876196356900277
Epoch 2 - Score: 0.8876196356900277


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0353 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 14m 33s) Loss: 0.0051 
Epoch: [3][100/1357] Elapsed 0m 47s (remain 9m 47s) Loss: 0.0050 
Epoch: [3][200/1357] Elapsed 1m 33s (remain 8m 59s) Loss: 0.0087 
Epoch: [3][300/1357] Elapsed 2m 20s (remain 8m 12s) Loss: 0.0090 
Epoch: [3][400/1357] Elapsed 3m 6s (remain 7m 25s) Loss: 0.0089 
Epoch: [3][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0087 
Epoch: [3][600/1357] Elapsed 4m 39s (remain 5m 52s) Loss: 0.0088 
Epoch: [3][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0090 
Epoch: [3][800/1357] Elapsed 6m 13s (remain 4m 18s) Loss: 0.0091 
Epoch: [3][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0095 
Epoch: [3][1000/1357] Elapsed 7m 46s (remain 2m 45s) Loss: 0.0091 
Epoch: [3][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0089 
Epoch: [3][1200/1357] Elapsed 9m 19s (remain 1m 12s) Loss: 0.0090 
Epoch: [3][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0091 
Epoch: [3][1356/

Epoch 3 - avg_train_loss: 0.0090  avg_val_loss: 0.0425  time: 686s
Epoch 3 - avg_train_loss: 0.0090  avg_val_loss: 0.0425  time: 686s
Epoch 3 - Score: 0.9081030735796336
Epoch 3 - Score: 0.9081030735796336
Epoch 3 - Save Best Score: 0.9081 Model
Epoch 3 - Save Best Score: 0.9081 Model


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0425 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 14m 14s) Loss: 0.0001 
Epoch: [4][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0049 
Epoch: [4][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0050 
Epoch: [4][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0045 
Epoch: [4][400/1357] Elapsed 3m 6s (remain 7m 25s) Loss: 0.0041 
Epoch: [4][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0040 
Epoch: [4][600/1357] Elapsed 4m 39s (remain 5m 52s) Loss: 0.0041 
Epoch: [4][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0041 
Epoch: [4][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0043 
Epoch: [4][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0044 
Epoch: [4][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0046 
Epoch: [4][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0050 
Epoch: [4][1200/1357] Elapsed 9m 19s (remain 1m 12s) Loss: 0.0050 
Epoch: [4][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0049 
Epoch: [4][1356/

Epoch 4 - avg_train_loss: 0.0048  avg_val_loss: 0.0712  time: 686s
Epoch 4 - avg_train_loss: 0.0048  avg_val_loss: 0.0712  time: 686s
Epoch 4 - Score: 0.7057890563045202
Epoch 4 - Score: 0.7057890563045202


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0712 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 14m 28s) Loss: 0.1966 
Epoch: [5][100/1357] Elapsed 0m 47s (remain 9m 47s) Loss: 0.0030 
Epoch: [5][200/1357] Elapsed 1m 33s (remain 8m 59s) Loss: 0.0021 
Epoch: [5][300/1357] Elapsed 2m 20s (remain 8m 12s) Loss: 0.0021 
Epoch: [5][400/1357] Elapsed 3m 6s (remain 7m 25s) Loss: 0.0021 
Epoch: [5][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0023 
Epoch: [5][600/1357] Elapsed 4m 39s (remain 5m 52s) Loss: 0.0036 
Epoch: [5][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0044 
Epoch: [5][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0045 
Epoch: [5][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0043 
Epoch: [5][1000/1357] Elapsed 7m 46s (remain 2m 45s) Loss: 0.0042 
Epoch: [5][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0040 
Epoch: [5][1200/1357] Elapsed 9m 19s (remain 1m 12s) Loss: 0.0040 
Epoch: [5][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0039 
Epoch: [5][1356/

Epoch 5 - avg_train_loss: 0.0039  avg_val_loss: 0.0599  time: 686s
Epoch 5 - avg_train_loss: 0.0039  avg_val_loss: 0.0599  time: 686s
Epoch 5 - Score: 0.758893280632411
Epoch 5 - Score: 0.758893280632411


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0599 


Score: 0.90810
Score: 0.90810
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Epoch: [1][0/1357] Elapsed 0m 0s (remain 14m 53s) Loss: 0.2336 
Epoch: [1][100/1357] Elapsed 0m 47s (remain 9m 44s) Loss: 0.0687 
Epoch: [1][200/1357] Elapsed 1m 33s (remain 8m 57s) Loss: 0.0558 
Epoch: [1][300/1357] Elapsed 2m 19s (remain 8m 11s) Loss: 0.0494 
Epoch: [1][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0459 
Epoch: [1][500/1357] Elapsed 3m 52s (remain 6m 38s) Loss: 0.0424 
Epoch: [1][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0396 
Epoch: [1][700/1357] Elapsed 5m 25s (remain 5m 5s) Loss: 0.0378 
Epoch: [1][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0363 
Epoch: [1][900/1357] Elapsed 6m 58s (remain 3m 32s) Loss: 0.0346 
Epoch: [1][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0332 
Epoch: [1][1100/1357] Elapsed 8m 31s (remain 1m 59s) Loss: 0.0322 
Epoch: [1][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0320 
Epoch: [1][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0313 
Epoch: [1][1356/1357] Elapsed 10m 30s (remain 0m 0s) Loss: 0.0313 
EVAL: [0/

Epoch 1 - avg_train_loss: 0.0313  avg_val_loss: 0.0459  time: 685s
Epoch 1 - avg_train_loss: 0.0313  avg_val_loss: 0.0459  time: 685s
Epoch 1 - Score: 0.8849557522123894
Epoch 1 - Score: 0.8849557522123894
Epoch 1 - Save Best Score: 0.8850 Model
Epoch 1 - Save Best Score: 0.8850 Model


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0459 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 14m 30s) Loss: 0.0058 
Epoch: [2][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0145 
Epoch: [2][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0172 
Epoch: [2][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0173 
Epoch: [2][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0183 
Epoch: [2][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0170 
Epoch: [2][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0173 
Epoch: [2][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0170 
Epoch: [2][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0172 
Epoch: [2][900/1357] Elapsed 6m 58s (remain 3m 32s) Loss: 0.0168 
Epoch: [2][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0167 
Epoch: [2][1100/1357] Elapsed 8m 31s (remain 1m 59s) Loss: 0.0165 
Epoch: [2][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0162 
Epoch: [2][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0163 
Epoch: [2][1356/

Epoch 2 - avg_train_loss: 0.0161  avg_val_loss: 0.0437  time: 685s
Epoch 2 - avg_train_loss: 0.0161  avg_val_loss: 0.0437  time: 685s
Epoch 2 - Score: 0.8510306058713305
Epoch 2 - Score: 0.8510306058713305


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0437 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 14m 22s) Loss: 0.0083 
Epoch: [3][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0104 
Epoch: [3][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0085 
Epoch: [3][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0087 
Epoch: [3][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0098 
Epoch: [3][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0096 
Epoch: [3][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0100 
Epoch: [3][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0102 
Epoch: [3][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0101 
Epoch: [3][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0103 
Epoch: [3][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0100 
Epoch: [3][1100/1357] Elapsed 8m 31s (remain 1m 59s) Loss: 0.0101 
Epoch: [3][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0101 
Epoch: [3][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0101 
Epoch: [3][1356/

Epoch 3 - avg_train_loss: 0.0101  avg_val_loss: 0.0429  time: 685s
Epoch 3 - avg_train_loss: 0.0101  avg_val_loss: 0.0429  time: 685s
Epoch 3 - Score: 0.9229595728451565
Epoch 3 - Score: 0.9229595728451565
Epoch 3 - Save Best Score: 0.9230 Model
Epoch 3 - Save Best Score: 0.9230 Model


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0429 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 14m 20s) Loss: 0.0027 
Epoch: [4][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0043 
Epoch: [4][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0042 
Epoch: [4][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0040 
Epoch: [4][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0046 
Epoch: [4][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0052 
Epoch: [4][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0052 
Epoch: [4][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0047 
Epoch: [4][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0058 
Epoch: [4][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0060 
Epoch: [4][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0059 
Epoch: [4][1100/1357] Elapsed 8m 31s (remain 1m 59s) Loss: 0.0058 
Epoch: [4][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0058 
Epoch: [4][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0060 
Epoch: [4][1356/

Epoch 4 - avg_train_loss: 0.0059  avg_val_loss: 0.0503  time: 685s
Epoch 4 - avg_train_loss: 0.0059  avg_val_loss: 0.0503  time: 685s
Epoch 4 - Score: 0.7965299684542587
Epoch 4 - Score: 0.7965299684542587


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0503 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 14m 25s) Loss: 0.0005 
Epoch: [5][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0039 
Epoch: [5][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0031 
Epoch: [5][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0045 
Epoch: [5][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0044 
Epoch: [5][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0039 
Epoch: [5][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0035 
Epoch: [5][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0040 
Epoch: [5][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0041 
Epoch: [5][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0041 
Epoch: [5][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0040 
Epoch: [5][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0042 
Epoch: [5][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0044 
Epoch: [5][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0044 
Epoch: [5][1356/

Epoch 5 - avg_train_loss: 0.0045  avg_val_loss: 0.0646  time: 685s
Epoch 5 - avg_train_loss: 0.0045  avg_val_loss: 0.0646  time: 685s
Epoch 5 - Score: 0.8637976245565324
Epoch 5 - Score: 0.8637976245565324


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0646 


Score: 0.92296
Score: 0.92296
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Epoch: [1][0/1357] Elapsed 0m 0s (remain 14m 36s) Loss: 0.3041 
Epoch: [1][100/1357] Elapsed 0m 46s (remain 9m 44s) Loss: 0.0746 
Epoch: [1][200/1357] Elapsed 1m 33s (remain 8m 57s) Loss: 0.0661 
Epoch: [1][300/1357] Elapsed 2m 19s (remain 8m 10s) Loss: 0.0574 
Epoch: [1][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0530 
Epoch: [1][500/1357] Elapsed 3m 52s (remain 6m 37s) Loss: 0.0488 
Epoch: [1][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0453 
Epoch: [1][700/1357] Elapsed 5m 25s (remain 5m 4s) Loss: 0.0432 
Epoch: [1][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0413 
Epoch: [1][900/1357] Elapsed 6m 58s (remain 3m 31s) Loss: 0.0388 
Epoch: [1][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0370 
Epoch: [1][1100/1357] Elapsed 8m 31s (remain 1m 58s) Loss: 0.0358 
Epoch: [1][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0347 
Epoch: [1][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0341 
Epoch: [1][1356/1357] Elapsed 10m 30s (remain 0m 0s) Loss: 0.0336 
EVAL: [0/

Epoch 1 - avg_train_loss: 0.0336  avg_val_loss: 0.0401  time: 685s
Epoch 1 - avg_train_loss: 0.0336  avg_val_loss: 0.0401  time: 685s
Epoch 1 - Score: 0.900175644028103
Epoch 1 - Score: 0.900175644028103
Epoch 1 - Save Best Score: 0.9002 Model
Epoch 1 - Save Best Score: 0.9002 Model


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0401 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 14m 20s) Loss: 0.0056 
Epoch: [2][100/1357] Elapsed 0m 47s (remain 9m 45s) Loss: 0.0142 
Epoch: [2][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0168 
Epoch: [2][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0144 
Epoch: [2][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0141 
Epoch: [2][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0148 
Epoch: [2][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0136 
Epoch: [2][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0144 
Epoch: [2][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0142 
Epoch: [2][900/1357] Elapsed 6m 58s (remain 3m 32s) Loss: 0.0145 
Epoch: [2][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0151 
Epoch: [2][1100/1357] Elapsed 8m 31s (remain 1m 59s) Loss: 0.0159 
Epoch: [2][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0158 
Epoch: [2][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0157 
Epoch: [2][1356/

Epoch 2 - avg_train_loss: 0.0159  avg_val_loss: 0.0388  time: 685s
Epoch 2 - avg_train_loss: 0.0159  avg_val_loss: 0.0388  time: 685s
Epoch 2 - Score: 0.9220839096357767
Epoch 2 - Score: 0.9220839096357767
Epoch 2 - Save Best Score: 0.9221 Model
Epoch 2 - Save Best Score: 0.9221 Model


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0388 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 14m 14s) Loss: 0.0196 
Epoch: [3][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0093 
Epoch: [3][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0089 
Epoch: [3][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0094 
Epoch: [3][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0085 
Epoch: [3][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0087 
Epoch: [3][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0089 
Epoch: [3][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0083 
Epoch: [3][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0091 
Epoch: [3][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0095 
Epoch: [3][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0103 
Epoch: [3][1100/1357] Elapsed 8m 31s (remain 1m 59s) Loss: 0.0103 
Epoch: [3][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0101 
Epoch: [3][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0100 
Epoch: [3][1356/

Epoch 3 - avg_train_loss: 0.0099  avg_val_loss: 0.0450  time: 685s
Epoch 3 - avg_train_loss: 0.0099  avg_val_loss: 0.0450  time: 685s
Epoch 3 - Score: 0.8777429467084639
Epoch 3 - Score: 0.8777429467084639


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0450 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 14m 13s) Loss: 0.0002 
Epoch: [4][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0085 
Epoch: [4][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0080 
Epoch: [4][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0064 
Epoch: [4][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0061 
Epoch: [4][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0065 
Epoch: [4][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0062 
Epoch: [4][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0058 
Epoch: [4][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0059 
Epoch: [4][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0058 
Epoch: [4][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0061 
Epoch: [4][1100/1357] Elapsed 8m 31s (remain 1m 59s) Loss: 0.0062 
Epoch: [4][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0064 
Epoch: [4][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0063 
Epoch: [4][1356/

Epoch 4 - avg_train_loss: 0.0062  avg_val_loss: 0.0502  time: 685s
Epoch 4 - avg_train_loss: 0.0062  avg_val_loss: 0.0502  time: 685s
Epoch 4 - Score: 0.8250825082508251
Epoch 4 - Score: 0.8250825082508251


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0502 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 14m 30s) Loss: 0.0001 
Epoch: [5][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0033 
Epoch: [5][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0035 
Epoch: [5][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0036 
Epoch: [5][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0039 
Epoch: [5][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0041 
Epoch: [5][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0044 
Epoch: [5][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0046 
Epoch: [5][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0047 
Epoch: [5][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0046 
Epoch: [5][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0044 
Epoch: [5][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0044 
Epoch: [5][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0044 
Epoch: [5][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0043 
Epoch: [5][1356/

Epoch 5 - avg_train_loss: 0.0042  avg_val_loss: 0.0580  time: 685s
Epoch 5 - avg_train_loss: 0.0042  avg_val_loss: 0.0580  time: 685s
Epoch 5 - Score: 0.8483896307934014
Epoch 5 - Score: 0.8483896307934014


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0580 


Score: 0.92208
Score: 0.92208
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Epoch: [1][0/1357] Elapsed 0m 0s (remain 14m 37s) Loss: 0.3745 
Epoch: [1][100/1357] Elapsed 0m 46s (remain 9m 44s) Loss: 0.0805 
Epoch: [1][200/1357] Elapsed 1m 33s (remain 8m 57s) Loss: 0.0672 
Epoch: [1][300/1357] Elapsed 2m 19s (remain 8m 10s) Loss: 0.0596 
Epoch: [1][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0524 
Epoch: [1][500/1357] Elapsed 3m 52s (remain 6m 37s) Loss: 0.0487 
Epoch: [1][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0437 
Epoch: [1][700/1357] Elapsed 5m 25s (remain 5m 4s) Loss: 0.0415 
Epoch: [1][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0399 
Epoch: [1][900/1357] Elapsed 6m 58s (remain 3m 31s) Loss: 0.0388 
Epoch: [1][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0372 
Epoch: [1][1100/1357] Elapsed 8m 31s (remain 1m 59s) Loss: 0.0357 
Epoch: [1][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0347 
Epoch: [1][1300/1357] Elapsed 10m 4s (remain 0m 26s) Loss: 0.0337 
Epoch: [1][1356/1357] Elapsed 10m 30s (remain 0m 0s) Loss: 0.0332 
EVAL: [0/

Epoch 1 - avg_train_loss: 0.0332  avg_val_loss: 0.0472  time: 685s
Epoch 1 - avg_train_loss: 0.0332  avg_val_loss: 0.0472  time: 685s
Epoch 1 - Score: 0.8773256693389805
Epoch 1 - Score: 0.8773256693389805
Epoch 1 - Save Best Score: 0.8773 Model
Epoch 1 - Save Best Score: 0.8773 Model


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0472 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 14m 19s) Loss: 0.0080 
Epoch: [2][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0165 
Epoch: [2][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0152 
Epoch: [2][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0171 
Epoch: [2][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0182 
Epoch: [2][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0169 
Epoch: [2][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0159 
Epoch: [2][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0160 
Epoch: [2][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0160 
Epoch: [2][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0159 
Epoch: [2][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0156 
Epoch: [2][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0153 
Epoch: [2][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0154 
Epoch: [2][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0156 
Epoch: [2][1356/

Epoch 2 - avg_train_loss: 0.0156  avg_val_loss: 0.0403  time: 685s
Epoch 2 - avg_train_loss: 0.0156  avg_val_loss: 0.0403  time: 685s
Epoch 2 - Score: 0.8844874508618084
Epoch 2 - Score: 0.8844874508618084
Epoch 2 - Save Best Score: 0.8845 Model
Epoch 2 - Save Best Score: 0.8845 Model


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0403 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 14m 12s) Loss: 0.0058 
Epoch: [3][100/1357] Elapsed 0m 47s (remain 9m 45s) Loss: 0.0077 
Epoch: [3][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0087 
Epoch: [3][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0084 
Epoch: [3][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0085 
Epoch: [3][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0089 
Epoch: [3][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0090 
Epoch: [3][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0089 
Epoch: [3][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0106 
Epoch: [3][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0104 
Epoch: [3][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0103 
Epoch: [3][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0099 
Epoch: [3][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0104 
Epoch: [3][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0105 
Epoch: [3][1356/

Epoch 3 - avg_train_loss: 0.0105  avg_val_loss: 0.0525  time: 685s
Epoch 3 - avg_train_loss: 0.0105  avg_val_loss: 0.0525  time: 685s
Epoch 3 - Score: 0.8225707257072569
Epoch 3 - Score: 0.8225707257072569


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0525 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 14m 23s) Loss: 0.0080 
Epoch: [4][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0064 
Epoch: [4][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0078 
Epoch: [4][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0071 
Epoch: [4][400/1357] Elapsed 3m 6s (remain 7m 24s) Loss: 0.0073 
Epoch: [4][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0067 
Epoch: [4][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0065 
Epoch: [4][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0064 
Epoch: [4][800/1357] Elapsed 6m 12s (remain 4m 18s) Loss: 0.0065 
Epoch: [4][900/1357] Elapsed 6m 59s (remain 3m 32s) Loss: 0.0064 
Epoch: [4][1000/1357] Elapsed 7m 45s (remain 2m 45s) Loss: 0.0066 
Epoch: [4][1100/1357] Elapsed 8m 32s (remain 1m 59s) Loss: 0.0066 
Epoch: [4][1200/1357] Elapsed 9m 18s (remain 1m 12s) Loss: 0.0068 
Epoch: [4][1300/1357] Elapsed 10m 5s (remain 0m 26s) Loss: 0.0069 
Epoch: [4][1356/

Epoch 4 - avg_train_loss: 0.0070  avg_val_loss: 0.0443  time: 685s
Epoch 4 - avg_train_loss: 0.0070  avg_val_loss: 0.0443  time: 685s
Epoch 4 - Score: 0.8393654705066996
Epoch 4 - Score: 0.8393654705066996


EVAL: [339/340] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0443 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 14m 27s) Loss: 0.0014 
Epoch: [5][100/1357] Elapsed 0m 47s (remain 9m 46s) Loss: 0.0025 
Epoch: [5][200/1357] Elapsed 1m 33s (remain 8m 58s) Loss: 0.0023 
Epoch: [5][300/1357] Elapsed 2m 20s (remain 8m 11s) Loss: 0.0037 
Epoch: [5][400/1357] Elapsed 3m 6s (remain 7m 25s) Loss: 0.0037 
Epoch: [5][500/1357] Elapsed 3m 53s (remain 6m 38s) Loss: 0.0036 
Epoch: [5][600/1357] Elapsed 4m 39s (remain 5m 51s) Loss: 0.0033 
Epoch: [5][700/1357] Elapsed 5m 26s (remain 5m 5s) Loss: 0.0042 


In [None]:
from google.colab import files

# Fetch each output file from OUTPUT_DIR to the local machine via Colab's
# download helper. The paths are built by plain string concatenation, so
# OUTPUT_DIR is expected to end with a path separator.
# NOTE(review): these files are presumably produced by main(); run this cell
# only after the training cell has finished - confirm.
for artifact in ("train.log", "oof_df.csv", "submission.csv"):
    files.download(OUTPUT_DIR + artifact)

In [28]:
# Entry point: invoke main() (defined elsewhere in this notebook) when this
# cell is executed as the top-level module. The output captured below this
# cell shows per-epoch training/eval progress and a score summary per run.
if __name__ == "__main__":
    main()



Downloading:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSeque

Epoch: [1][0/1357] Elapsed 0m 0s (remain 6m 49s) Loss: 0.3569 
Epoch: [1][100/1357] Elapsed 0m 11s (remain 2m 28s) Loss: 0.0780 
Epoch: [1][200/1357] Elapsed 0m 23s (remain 2m 15s) Loss: 0.0640 
Epoch: [1][300/1357] Elapsed 0m 35s (remain 2m 3s) Loss: 0.0569 
Epoch: [1][400/1357] Elapsed 0m 46s (remain 1m 51s) Loss: 0.0532 
Epoch: [1][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0500 
Epoch: [1][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0475 
Epoch: [1][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0450 
Epoch: [1][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0429 
Epoch: [1][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0415 
Epoch: [1][1000/1357] Elapsed 1m 56s (remain 0m 41s) Loss: 0.0394 
Epoch: [1][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0387 
Epoch: [1][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0380 
Epoch: [1][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0368 
Epoch: [1][1356/1357] Elapsed 2m 37s (remain 0m 0s) Loss: 0.0364 
EVAL: [0/340] 

Epoch 1 - avg_train_loss: 0.0364  avg_val_loss: 0.0458  time: 170s
Epoch 1 - Score: 0.8858858858858859
Epoch 1 - Save Best Score: 0.8859 Model


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0458 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 3m 57s) Loss: 0.0032 
Epoch: [2][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0206 
Epoch: [2][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0195 
Epoch: [2][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0188 
Epoch: [2][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0184 
Epoch: [2][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0193 
Epoch: [2][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0187 
Epoch: [2][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0198 
Epoch: [2][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0203 
Epoch: [2][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0202 
Epoch: [2][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0194 
Epoch: [2][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0194 
Epoch: [2][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0191 
Epoch: [2][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0195 
Epoch: [2][1356/1357

Epoch 2 - avg_train_loss: 0.0194  avg_val_loss: 0.0427  time: 170s
Epoch 2 - Score: 0.8894341358687318
Epoch 2 - Save Best Score: 0.8894 Model


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0427 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 3m 56s) Loss: 0.0013 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0088 
Epoch: [3][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0091 
Epoch: [3][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0106 
Epoch: [3][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0110 
Epoch: [3][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0105 
Epoch: [3][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0106 
Epoch: [3][700/1357] Elapsed 1m 21s (remain 1m 15s) Loss: 0.0105 
Epoch: [3][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0109 
Epoch: [3][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0110 
Epoch: [3][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0105 
Epoch: [3][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0104 
Epoch: [3][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0105 
Epoch: [3][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0106 
Epoch: [3][1356/1357

Epoch 3 - avg_train_loss: 0.0106  avg_val_loss: 0.0537  time: 169s
Epoch 3 - Score: 0.8338480543545398


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0537 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 3m 59s) Loss: 0.0011 
Epoch: [4][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0081 
Epoch: [4][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0073 
Epoch: [4][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0061 
Epoch: [4][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0065 
Epoch: [4][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0067 
Epoch: [4][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0069 
Epoch: [4][700/1357] Elapsed 1m 21s (remain 1m 15s) Loss: 0.0064 
Epoch: [4][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0065 
Epoch: [4][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0069 
Epoch: [4][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0071 
Epoch: [4][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0074 
Epoch: [4][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0074 
Epoch: [4][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0075 
Epoch: [4][1356/1357

Epoch 4 - avg_train_loss: 0.0076  avg_val_loss: 0.0629  time: 169s
Epoch 4 - Score: 0.7035573122529645


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0629 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 3m 57s) Loss: 0.0339 
Epoch: [5][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0054 
Epoch: [5][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0045 
Epoch: [5][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0047 
Epoch: [5][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0043 
Epoch: [5][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0039 
Epoch: [5][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0045 
Epoch: [5][700/1357] Elapsed 1m 21s (remain 1m 15s) Loss: 0.0046 
Epoch: [5][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0049 
Epoch: [5][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0050 
Epoch: [5][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0048 
Epoch: [5][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0045 
Epoch: [5][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0044 
Epoch: [5][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0046 
Epoch: [5][1356/1357

Epoch 5 - avg_train_loss: 0.0045  avg_val_loss: 0.0716  time: 170s
Epoch 5 - Score: 0.7335541883577852


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0716 


Score: 0.88943
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 4s) Loss: 0.2031 
Epoch: [1][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0697 
Epoch: [1][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0554 
Epoch: [1][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0495 
Epoch: [1][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0461 
Epoch: [1][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0430 
Epoch: [1][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0402 
Epoch: [1][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0389 
Epoch: [1][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0375 
Epoch: [1][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0361 
Epoch: [1][1000/1357] Elapsed 1m 56s (remain 0m 41s) Loss: 0.0347 
Epoch: [1][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0342 
Epoch: [1][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0340 
Epoch: [1][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0335 
Epoch: [1][1356/1357] Elapsed 2m 37s (remain 0m 0s) Loss: 0.0335 
EVAL: [0/340] E

Epoch 1 - avg_train_loss: 0.0335  avg_val_loss: 0.0486  time: 170s
Epoch 1 - Score: 0.8807686708400058
Epoch 1 - Save Best Score: 0.8808 Model


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0486 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 4m 9s) Loss: 0.0058 
Epoch: [2][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0167 
Epoch: [2][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0188 
Epoch: [2][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0190 
Epoch: [2][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0202 
Epoch: [2][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0199 
Epoch: [2][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0193 
Epoch: [2][700/1357] Elapsed 1m 21s (remain 1m 15s) Loss: 0.0187 
Epoch: [2][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0185 
Epoch: [2][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0182 
Epoch: [2][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0181 
Epoch: [2][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0176 
Epoch: [2][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0175 
Epoch: [2][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0175 
Epoch: [2][1356/1357]

Epoch 2 - avg_train_loss: 0.0173  avg_val_loss: 0.0538  time: 170s
Epoch 2 - Score: 0.839494762784966


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0538 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 3m 59s) Loss: 0.0019 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0129 
Epoch: [3][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0097 
Epoch: [3][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0102 
Epoch: [3][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0111 
Epoch: [3][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0107 
Epoch: [3][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0100 
Epoch: [3][700/1357] Elapsed 1m 21s (remain 1m 15s) Loss: 0.0105 
Epoch: [3][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0107 
Epoch: [3][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0104 
Epoch: [3][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0101 
Epoch: [3][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0104 
Epoch: [3][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0104 
Epoch: [3][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0102 
Epoch: [3][1356/1357

Epoch 3 - avg_train_loss: 0.0103  avg_val_loss: 0.0492  time: 169s
Epoch 3 - Score: 0.8651320871311602


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0492 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 4m 13s) Loss: 0.0005 
Epoch: [4][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0070 
Epoch: [4][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0049 
Epoch: [4][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0064 
Epoch: [4][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0064 
Epoch: [4][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0071 
Epoch: [4][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0069 
Epoch: [4][700/1357] Elapsed 1m 21s (remain 1m 15s) Loss: 0.0068 
Epoch: [4][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0071 
Epoch: [4][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0073 
Epoch: [4][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0072 
Epoch: [4][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0070 
Epoch: [4][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0071 
Epoch: [4][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0073 
Epoch: [4][1356/1357

Epoch 4 - avg_train_loss: 0.0073  avg_val_loss: 0.0590  time: 170s
Epoch 4 - Score: 0.7988721804511277


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0590 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 4m 2s) Loss: 0.0008 
Epoch: [5][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0041 
Epoch: [5][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0044 
Epoch: [5][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0052 
Epoch: [5][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0045 
Epoch: [5][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0044 
Epoch: [5][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0040 
Epoch: [5][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0049 
Epoch: [5][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0050 
Epoch: [5][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0049 
Epoch: [5][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0048 
Epoch: [5][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0047 
Epoch: [5][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0048 
Epoch: [5][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0048 
Epoch: [5][1356/1357]

Epoch 5 - avg_train_loss: 0.0048  avg_val_loss: 0.0728  time: 170s
Epoch 5 - Score: 0.7794048181388757


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0728 


Score: 0.88077
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 0s) Loss: 0.2413 
Epoch: [1][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0731 
Epoch: [1][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0606 
Epoch: [1][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0516 
Epoch: [1][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0481 
Epoch: [1][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0453 
Epoch: [1][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0420 
Epoch: [1][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0410 
Epoch: [1][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0399 
Epoch: [1][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0383 
Epoch: [1][1000/1357] Elapsed 1m 56s (remain 0m 41s) Loss: 0.0371 
Epoch: [1][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0363 
Epoch: [1][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0353 
Epoch: [1][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0350 
Epoch: [1][1356/1357] Elapsed 2m 37s (remain 0m 0s) Loss: 0.0346 
EVAL: [0/340] E

Epoch 1 - avg_train_loss: 0.0346  avg_val_loss: 0.0453  time: 170s
Epoch 1 - Score: 0.872836719337848
Epoch 1 - Save Best Score: 0.8728 Model


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0453 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 4m 4s) Loss: 0.0070 
Epoch: [2][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0177 
Epoch: [2][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0201 
Epoch: [2][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0198 
Epoch: [2][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0190 
Epoch: [2][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0190 
Epoch: [2][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0174 
Epoch: [2][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0183 
Epoch: [2][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0182 
Epoch: [2][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0185 
Epoch: [2][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0193 
Epoch: [2][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0198 
Epoch: [2][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0194 
Epoch: [2][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0190 
Epoch: [2][1356/1357]

Epoch 2 - avg_train_loss: 0.0191  avg_val_loss: 0.0459  time: 170s
Epoch 2 - Score: 0.8827893175074184
Epoch 2 - Save Best Score: 0.8828 Model


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0459 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 4m 5s) Loss: 0.0086 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0090 
Epoch: [3][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0078 
Epoch: [3][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0080 
Epoch: [3][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0082 
Epoch: [3][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0099 
Epoch: [3][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0105 
Epoch: [3][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0106 
Epoch: [3][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0112 
Epoch: [3][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0110 
Epoch: [3][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0115 
Epoch: [3][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0113 
Epoch: [3][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0117 
Epoch: [3][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0116 
Epoch: [3][1356/1357]

Epoch 3 - avg_train_loss: 0.0113  avg_val_loss: 0.0594  time: 170s
Epoch 3 - Score: 0.8222153273347811


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0594 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 3m 55s) Loss: 0.0188 
Epoch: [4][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0083 
Epoch: [4][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0062 
Epoch: [4][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0059 
Epoch: [4][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0053 
Epoch: [4][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0061 
Epoch: [4][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0069 
Epoch: [4][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0067 
Epoch: [4][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0067 
Epoch: [4][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0072 
Epoch: [4][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0071 
Epoch: [4][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0077 
Epoch: [4][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0074 
Epoch: [4][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0073 
Epoch: [4][1356/1357

Epoch 4 - avg_train_loss: 0.0073  avg_val_loss: 0.0646  time: 170s
Epoch 4 - Score: 0.8161044613710554


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0646 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 3m 57s) Loss: 0.0002 
Epoch: [5][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0041 
Epoch: [5][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0048 
Epoch: [5][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0055 
Epoch: [5][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0047 
Epoch: [5][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0047 
Epoch: [5][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0044 
Epoch: [5][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0043 
Epoch: [5][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0044 
Epoch: [5][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0044 
Epoch: [5][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0042 
Epoch: [5][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0039 
Epoch: [5][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0042 
Epoch: [5][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0045 
Epoch: [5][1356/1357

Epoch 5 - avg_train_loss: 0.0046  avg_val_loss: 0.0698  time: 170s
Epoch 5 - Score: 0.7545983335953467


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0698 


Score: 0.88279
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 6s) Loss: 0.3686 
Epoch: [1][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0866 
Epoch: [1][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0718 
Epoch: [1][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0644 
Epoch: [1][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0571 
Epoch: [1][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0538 
Epoch: [1][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0491 
Epoch: [1][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0464 
Epoch: [1][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0443 
Epoch: [1][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0429 
Epoch: [1][1000/1357] Elapsed 1m 56s (remain 0m 41s) Loss: 0.0415 
Epoch: [1][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0402 
Epoch: [1][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0389 
Epoch: [1][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0377 
Epoch: [1][1356/1357] Elapsed 2m 37s (remain 0m 0s) Loss: 0.0371 
EVAL: [0/340] E

Epoch 1 - avg_train_loss: 0.0371  avg_val_loss: 0.0535  time: 170s
Epoch 1 - Score: 0.8475836431226764
Epoch 1 - Save Best Score: 0.8476 Model


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0535 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 4m 13s) Loss: 0.0195 
Epoch: [2][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0226 
Epoch: [2][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0194 
Epoch: [2][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0201 
Epoch: [2][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0204 
Epoch: [2][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0204 
Epoch: [2][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0199 
Epoch: [2][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0200 
Epoch: [2][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0202 
Epoch: [2][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0197 
Epoch: [2][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0197 
Epoch: [2][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0193 
Epoch: [2][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0193 
Epoch: [2][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0195 
Epoch: [2][1356/1357

Epoch 2 - avg_train_loss: 0.0193  avg_val_loss: 0.0536  time: 170s
Epoch 2 - Score: 0.7747775391224302


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0536 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 4m 12s) Loss: 0.0058 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0093 
Epoch: [3][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0097 
Epoch: [3][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0103 
Epoch: [3][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0098 
Epoch: [3][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0103 
Epoch: [3][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0117 
Epoch: [3][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0117 
Epoch: [3][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0125 
Epoch: [3][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0122 
Epoch: [3][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0121 
Epoch: [3][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0119 
Epoch: [3][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0121 
Epoch: [3][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0119 
Epoch: [3][1356/1357

Epoch 3 - avg_train_loss: 0.0118  avg_val_loss: 0.0653  time: 170s
Epoch 3 - Score: 0.8083140877598151


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0653 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 4m 12s) Loss: 0.0125 
Epoch: [4][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0047 
Epoch: [4][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0075 
Epoch: [4][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0079 
Epoch: [4][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0077 
Epoch: [4][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0070 
Epoch: [4][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0069 
Epoch: [4][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0066 
Epoch: [4][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0070 
Epoch: [4][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0071 
Epoch: [4][1000/1357] Elapsed 1m 56s (remain 0m 41s) Loss: 0.0071 
Epoch: [4][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0071 
Epoch: [4][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0073 
Epoch: [4][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0073 
Epoch: [4][1356/1357

Epoch 4 - avg_train_loss: 0.0073  avg_val_loss: 0.0601  time: 170s
Epoch 4 - Score: 0.782460489618841


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0601 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 4m 11s) Loss: 0.0022 
Epoch: [5][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0022 
Epoch: [5][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0020 
Epoch: [5][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0053 
Epoch: [5][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0050 
Epoch: [5][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0050 
Epoch: [5][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0055 
Epoch: [5][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0060 
Epoch: [5][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0059 
Epoch: [5][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0062 
Epoch: [5][1000/1357] Elapsed 1m 56s (remain 0m 41s) Loss: 0.0060 
Epoch: [5][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0060 
Epoch: [5][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0058 
Epoch: [5][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0057 
Epoch: [5][1356/1357

Epoch 5 - avg_train_loss: 0.0057  avg_val_loss: 0.0821  time: 170s
Epoch 5 - Score: 0.6291286568103177


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0821 


Score: 0.84758
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 9s) Loss: 0.4665 
Epoch: [1][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0934 
Epoch: [1][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0653 
Epoch: [1][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0581 
Epoch: [1][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0533 
Epoch: [1][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0498 
Epoch: [1][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0477 
Epoch: [1][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0444 
Epoch: [1][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0431 
Epoch: [1][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0419 
Epoch: [1][1000/1357] Elapsed 1m 56s (remain 0m 41s) Loss: 0.0403 
Epoch: [1][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0404 
Epoch: [1][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0400 
Epoch: [1][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0387 
Epoch: [1][1356/1357] Elapsed 2m 37s (remain 0m 0s) Loss: 0.0386 
EVAL: [0/340] E

Epoch 1 - avg_train_loss: 0.0386  avg_val_loss: 0.0494  time: 170s
Epoch 1 - Score: 0.8509345084333688
Epoch 1 - Save Best Score: 0.8509 Model


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0494 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 4m 6s) Loss: 0.0017 
Epoch: [2][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0232 
Epoch: [2][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0209 
Epoch: [2][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0203 
Epoch: [2][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0212 
Epoch: [2][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0217 
Epoch: [2][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0210 
Epoch: [2][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0211 
Epoch: [2][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0210 
Epoch: [2][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0206 
Epoch: [2][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0204 
Epoch: [2][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0204 
Epoch: [2][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0201 
Epoch: [2][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0203 
Epoch: [2][1356/1357]

Epoch 2 - avg_train_loss: 0.0201  avg_val_loss: 0.0771  time: 170s
Epoch 2 - Score: 0.8991745283018867
Epoch 2 - Save Best Score: 0.8992 Model


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0771 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 4m 8s) Loss: 0.0994 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0134 
Epoch: [3][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0125 
Epoch: [3][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0107 
Epoch: [3][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0111 
Epoch: [3][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0116 
Epoch: [3][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0122 
Epoch: [3][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0125 
Epoch: [3][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0123 
Epoch: [3][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0126 
Epoch: [3][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0125 
Epoch: [3][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0127 
Epoch: [3][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0127 
Epoch: [3][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0130 
Epoch: [3][1356/1357]

Epoch 3 - avg_train_loss: 0.0129  avg_val_loss: 0.0533  time: 170s
Epoch 3 - Score: 0.8806861693980701


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0533 
Epoch: [4][0/1357] Elapsed 0m 0s (remain 4m 4s) Loss: 0.0004 
Epoch: [4][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0056 
Epoch: [4][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0067 
Epoch: [4][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0068 
Epoch: [4][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0071 
Epoch: [4][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0070 
Epoch: [4][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0070 
Epoch: [4][700/1357] Elapsed 1m 21s (remain 1m 15s) Loss: 0.0078 
Epoch: [4][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0077 
Epoch: [4][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0079 
Epoch: [4][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0080 
Epoch: [4][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0079 
Epoch: [4][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0079 
Epoch: [4][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0082 
Epoch: [4][1356/1357]

Epoch 4 - avg_train_loss: 0.0083  avg_val_loss: 0.0588  time: 170s
Epoch 4 - Score: 0.7858699035169623


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0588 
Epoch: [5][0/1357] Elapsed 0m 0s (remain 4m 4s) Loss: 0.0007 
Epoch: [5][100/1357] Elapsed 0m 11s (remain 2m 26s) Loss: 0.0083 
Epoch: [5][200/1357] Elapsed 0m 23s (remain 2m 14s) Loss: 0.0065 
Epoch: [5][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0069 
Epoch: [5][400/1357] Elapsed 0m 46s (remain 1m 50s) Loss: 0.0070 
Epoch: [5][500/1357] Elapsed 0m 58s (remain 1m 39s) Loss: 0.0066 
Epoch: [5][600/1357] Elapsed 1m 9s (remain 1m 27s) Loss: 0.0063 
Epoch: [5][700/1357] Elapsed 1m 21s (remain 1m 16s) Loss: 0.0060 
Epoch: [5][800/1357] Elapsed 1m 32s (remain 1m 4s) Loss: 0.0056 
Epoch: [5][900/1357] Elapsed 1m 44s (remain 0m 52s) Loss: 0.0052 
Epoch: [5][1000/1357] Elapsed 1m 55s (remain 0m 41s) Loss: 0.0049 
Epoch: [5][1100/1357] Elapsed 2m 7s (remain 0m 29s) Loss: 0.0054 
Epoch: [5][1200/1357] Elapsed 2m 19s (remain 0m 18s) Loss: 0.0054 
Epoch: [5][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0051 
Epoch: [5][1356/1357]

Epoch 5 - avg_train_loss: 0.0051  avg_val_loss: 0.0772  time: 170s
Epoch 5 - Score: 0.8423493044822257


EVAL: [339/340] Elapsed 0m 12s (remain 0m 0s) Loss: 0.0772 


Score: 0.89917
Score: 0.87993
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

In [34]:
# Notebook entry point: invoke main() when this cell executes as the top-level module.
# NOTE(review): main() is not defined in this cell — presumably in an earlier cell; confirm
# it has been run on a fresh kernel before this one (Restart & Run All).
# NOTE(review): the output recorded under this cell prints every per-epoch summary and
# score line twice; that looks like a logger with a duplicated handler (e.g. the logger
# setup being executed more than once in the same kernel) — confirm and deduplicate there.
if __name__ == "__main__":
    main()



Downloading:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSeque

Epoch: [1][0/1137] Elapsed 0m 0s (remain 13m 32s) Loss: 0.3552 
Epoch: [1][100/1137] Elapsed 0m 47s (remain 8m 3s) Loss: 0.0767 
Epoch: [1][200/1137] Elapsed 1m 33s (remain 7m 16s) Loss: 0.0683 
Epoch: [1][300/1137] Elapsed 2m 20s (remain 6m 29s) Loss: 0.0613 
Epoch: [1][400/1137] Elapsed 3m 6s (remain 5m 43s) Loss: 0.0565 
Epoch: [1][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0496 
Epoch: [1][600/1137] Elapsed 4m 40s (remain 4m 9s) Loss: 0.0450 
Epoch: [1][700/1137] Elapsed 5m 26s (remain 3m 23s) Loss: 0.0420 
Epoch: [1][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0389 
Epoch: [1][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0373 
Epoch: [1][1000/1137] Elapsed 7m 46s (remain 1m 3s) Loss: 0.0353 
Epoch: [1][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0340 
Epoch: [1][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0334 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0461 
EVAL: [100/285] Elapsed 0m 16s (remain 0m 29s) Loss: 0.0369 
EVAL: [200/285] Elapsed 0m 3

Epoch 1 - avg_train_loss: 0.0334  avg_val_loss: 0.0418  time: 576s
Epoch 1 - avg_train_loss: 0.0334  avg_val_loss: 0.0418  time: 576s
Epoch 1 - Score: 0.9128946367440093
Epoch 1 - Score: 0.9128946367440093
Epoch 1 - Save Best Score: 0.9129 Model
Epoch 1 - Save Best Score: 0.9129 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0418 
Epoch: [2][0/1137] Elapsed 0m 0s (remain 11m 50s) Loss: 0.0007 
Epoch: [2][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0168 
Epoch: [2][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0156 
Epoch: [2][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0165 
Epoch: [2][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0163 
Epoch: [2][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0171 
Epoch: [2][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0177 
Epoch: [2][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0176 
Epoch: [2][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0169 
Epoch: [2][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0165 
Epoch: [2][1000/1137] Elapsed 7m 46s (remain 1m 3s) Loss: 0.0170 
Epoch: [2][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0167 
Epoch: [2][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0164 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0030 
EVAL: [100/285] Elapsed 0m 1

Epoch 2 - avg_train_loss: 0.0164  avg_val_loss: 0.0373  time: 576s
Epoch 2 - avg_train_loss: 0.0164  avg_val_loss: 0.0373  time: 576s
Epoch 2 - Score: 0.8678881388621021
Epoch 2 - Score: 0.8678881388621021


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0373 
Epoch: [3][0/1137] Elapsed 0m 0s (remain 11m 49s) Loss: 0.0247 
Epoch: [3][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0086 
Epoch: [3][200/1137] Elapsed 1m 33s (remain 7m 16s) Loss: 0.0076 
Epoch: [3][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0087 
Epoch: [3][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0101 
Epoch: [3][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0103 
Epoch: [3][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0116 
Epoch: [3][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0118 
Epoch: [3][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0114 
Epoch: [3][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0112 
Epoch: [3][1000/1137] Elapsed 7m 46s (remain 1m 3s) Loss: 0.0108 
Epoch: [3][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0113 
Epoch: [3][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0112 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0013 
EVAL: [100/285] Elapsed 0m 1

Epoch 3 - avg_train_loss: 0.0112  avg_val_loss: 0.0379  time: 576s
Epoch 3 - avg_train_loss: 0.0112  avg_val_loss: 0.0379  time: 576s
Epoch 3 - Score: 0.8639099204038052
Epoch 3 - Score: 0.8639099204038052


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0379 
Epoch: [4][0/1137] Elapsed 0m 0s (remain 12m 2s) Loss: 0.0002 
Epoch: [4][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0052 
Epoch: [4][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0048 
Epoch: [4][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0056 
Epoch: [4][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0054 
Epoch: [4][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0052 
Epoch: [4][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0050 
Epoch: [4][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0051 
Epoch: [4][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0052 
Epoch: [4][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0058 
Epoch: [4][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0061 
Epoch: [4][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0061 
Epoch: [4][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0060 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0305 
EVAL: [100/285] Elapsed 0m 16

Epoch 4 - avg_train_loss: 0.0060  avg_val_loss: 0.0367  time: 576s
Epoch 4 - avg_train_loss: 0.0060  avg_val_loss: 0.0367  time: 576s
Epoch 4 - Score: 0.9117082533589252
Epoch 4 - Score: 0.9117082533589252


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0367 
Epoch: [5][0/1137] Elapsed 0m 0s (remain 11m 49s) Loss: 0.0002 
Epoch: [5][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0031 
Epoch: [5][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0028 
Epoch: [5][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0035 
Epoch: [5][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0030 
Epoch: [5][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0029 
Epoch: [5][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0035 
Epoch: [5][700/1137] Elapsed 5m 26s (remain 3m 23s) Loss: 0.0038 
Epoch: [5][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0037 
Epoch: [5][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0039 
Epoch: [5][1000/1137] Elapsed 7m 46s (remain 1m 3s) Loss: 0.0038 
Epoch: [5][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0039 
Epoch: [5][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0038 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0757 
EVAL: [100/285] Elapsed 0m 1

Epoch 5 - avg_train_loss: 0.0038  avg_val_loss: 0.0480  time: 576s
Epoch 5 - avg_train_loss: 0.0038  avg_val_loss: 0.0480  time: 576s
Epoch 5 - Score: 0.9205983889528194
Epoch 5 - Score: 0.9205983889528194
Epoch 5 - Save Best Score: 0.9206 Model
Epoch 5 - Save Best Score: 0.9206 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0480 


Score: 0.92060
Score: 0.92060
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Epoch: [1][0/1137] Elapsed 0m 0s (remain 12m 29s) Loss: 0.2305 
Epoch: [1][100/1137] Elapsed 0m 47s (remain 8m 3s) Loss: 0.0642 
Epoch: [1][200/1137] Elapsed 1m 33s (remain 7m 16s) Loss: 0.0518 
Epoch: [1][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0483 
Epoch: [1][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0444 
Epoch: [1][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0418 
Epoch: [1][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0388 
Epoch: [1][700/1137] Elapsed 5m 26s (remain 3m 23s) Loss: 0.0355 
Epoch: [1][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0342 
Epoch: [1][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0331 
Epoch: [1][1000/1137] Elapsed 7m 46s (remain 1m 3s) Loss: 0.0320 
Epoch: [1][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0310 
Epoch: [1][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0307 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0171 
EVAL: [100/285] Elapsed 0m 16s (remain 0m 29s) Loss: 0.0469 
EVAL: [200/285] Elapsed 0m 

Epoch 1 - avg_train_loss: 0.0307  avg_val_loss: 0.0392  time: 576s
Epoch 1 - avg_train_loss: 0.0307  avg_val_loss: 0.0392  time: 576s
Epoch 1 - Score: 0.8836247414927617
Epoch 1 - Score: 0.8836247414927617
Epoch 1 - Save Best Score: 0.8836 Model
Epoch 1 - Save Best Score: 0.8836 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0392 
Epoch: [2][0/1137] Elapsed 0m 0s (remain 12m 5s) Loss: 0.0025 
Epoch: [2][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0179 
Epoch: [2][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0230 
Epoch: [2][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0195 
Epoch: [2][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0186 
Epoch: [2][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0176 
Epoch: [2][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0174 
Epoch: [2][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0175 
Epoch: [2][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0168 
Epoch: [2][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0173 
Epoch: [2][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0170 
Epoch: [2][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0170 
Epoch: [2][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0169 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0533 
EVAL: [100/285] Elapsed 0m 16

Epoch 2 - avg_train_loss: 0.0169  avg_val_loss: 0.0322  time: 576s
Epoch 2 - avg_train_loss: 0.0169  avg_val_loss: 0.0322  time: 576s
Epoch 2 - Score: 0.938566552901024
Epoch 2 - Score: 0.938566552901024
Epoch 2 - Save Best Score: 0.9386 Model
Epoch 2 - Save Best Score: 0.9386 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0322 
Epoch: [3][0/1137] Elapsed 0m 0s (remain 12m 2s) Loss: 0.0032 
Epoch: [3][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0045 
Epoch: [3][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0069 
Epoch: [3][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0077 
Epoch: [3][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0079 
Epoch: [3][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0086 
Epoch: [3][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0091 
Epoch: [3][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0092 
Epoch: [3][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0095 
Epoch: [3][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0094 
Epoch: [3][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0096 
Epoch: [3][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0094 
Epoch: [3][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0099 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0374 
EVAL: [100/285] Elapsed 0m 16

Epoch 3 - avg_train_loss: 0.0099  avg_val_loss: 0.0446  time: 576s
Epoch 3 - avg_train_loss: 0.0099  avg_val_loss: 0.0446  time: 576s
Epoch 3 - Score: 0.8347893612890701
Epoch 3 - Score: 0.8347893612890701


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0446 
Epoch: [4][0/1137] Elapsed 0m 0s (remain 12m 1s) Loss: 0.0065 
Epoch: [4][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0051 
Epoch: [4][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0046 
Epoch: [4][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0052 
Epoch: [4][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0062 
Epoch: [4][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0062 
Epoch: [4][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0063 
Epoch: [4][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0062 
Epoch: [4][800/1137] Elapsed 6m 14s (remain 2m 36s) Loss: 0.0059 
Epoch: [4][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0064 
Epoch: [4][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0067 
Epoch: [4][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0069 
Epoch: [4][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0069 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0065 
EVAL: [100/285] Elapsed 0m 16

Epoch 4 - avg_train_loss: 0.0069  avg_val_loss: 0.0365  time: 576s
Epoch 4 - avg_train_loss: 0.0069  avg_val_loss: 0.0365  time: 576s
Epoch 4 - Score: 0.8778699594829251
Epoch 4 - Score: 0.8778699594829251


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0365 
Epoch: [5][0/1137] Elapsed 0m 0s (remain 12m 13s) Loss: 0.0075 
Epoch: [5][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0041 
Epoch: [5][200/1137] Elapsed 1m 34s (remain 7m 17s) Loss: 0.0043 
Epoch: [5][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0038 
Epoch: [5][400/1137] Elapsed 3m 7s (remain 5m 44s) Loss: 0.0039 
Epoch: [5][500/1137] Elapsed 3m 54s (remain 4m 57s) Loss: 0.0042 
Epoch: [5][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0044 
Epoch: [5][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0045 
Epoch: [5][800/1137] Elapsed 6m 14s (remain 2m 37s) Loss: 0.0050 
Epoch: [5][900/1137] Elapsed 7m 1s (remain 1m 50s) Loss: 0.0049 
Epoch: [5][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0047 
Epoch: [5][1100/1137] Elapsed 8m 34s (remain 0m 16s) Loss: 0.0045 
Epoch: [5][1136/1137] Elapsed 8m 51s (remain 0m 0s) Loss: 0.0044 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0021 
EVAL: [100/285] Elapsed 0m 1

Epoch 5 - avg_train_loss: 0.0044  avg_val_loss: 0.0600  time: 577s
Epoch 5 - avg_train_loss: 0.0044  avg_val_loss: 0.0600  time: 577s
Epoch 5 - Score: 0.7898285494124446
Epoch 5 - Score: 0.7898285494124446


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0600 


Score: 0.93857
Score: 0.93857
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Epoch: [1][0/1137] Elapsed 0m 0s (remain 12m 30s) Loss: 0.3382 
Epoch: [1][100/1137] Elapsed 0m 47s (remain 8m 3s) Loss: 0.0721 
Epoch: [1][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0613 
Epoch: [1][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0533 
Epoch: [1][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0464 
Epoch: [1][500/1137] Elapsed 3m 53s (remain 4m 57s) Loss: 0.0416 
Epoch: [1][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0390 
Epoch: [1][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0368 
Epoch: [1][800/1137] Elapsed 6m 14s (remain 2m 36s) Loss: 0.0353 
Epoch: [1][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0336 
Epoch: [1][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0320 
Epoch: [1][1100/1137] Elapsed 8m 34s (remain 0m 16s) Loss: 0.0309 
Epoch: [1][1136/1137] Elapsed 8m 51s (remain 0m 0s) Loss: 0.0306 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 36s) Loss: 0.1417 
EVAL: [100/285] Elapsed 0m 16s (remain 0m 29s) Loss: 0.0474 
EVAL: [200/285] Elapsed 0m 

Epoch 1 - avg_train_loss: 0.0306  avg_val_loss: 0.0568  time: 577s
Epoch 1 - avg_train_loss: 0.0306  avg_val_loss: 0.0568  time: 577s
Epoch 1 - Score: 0.8715759516186411
Epoch 1 - Score: 0.8715759516186411
Epoch 1 - Save Best Score: 0.8716 Model
Epoch 1 - Save Best Score: 0.8716 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0568 
Epoch: [2][0/1137] Elapsed 0m 0s (remain 12m 0s) Loss: 0.0010 
Epoch: [2][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0150 
Epoch: [2][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0138 
Epoch: [2][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0152 
Epoch: [2][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0154 
Epoch: [2][500/1137] Elapsed 3m 54s (remain 4m 57s) Loss: 0.0155 
Epoch: [2][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0160 
Epoch: [2][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0158 
Epoch: [2][800/1137] Elapsed 6m 14s (remain 2m 37s) Loss: 0.0160 
Epoch: [2][900/1137] Elapsed 7m 1s (remain 1m 50s) Loss: 0.0165 
Epoch: [2][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0162 
Epoch: [2][1100/1137] Elapsed 8m 34s (remain 0m 16s) Loss: 0.0160 
Epoch: [2][1136/1137] Elapsed 8m 51s (remain 0m 0s) Loss: 0.0158 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0120 
EVAL: [100/285] Elapsed 0m 16

Epoch 2 - avg_train_loss: 0.0158  avg_val_loss: 0.0414  time: 577s
Epoch 2 - avg_train_loss: 0.0158  avg_val_loss: 0.0414  time: 577s
Epoch 2 - Score: 0.9184629803186505
Epoch 2 - Score: 0.9184629803186505
Epoch 2 - Save Best Score: 0.9185 Model
Epoch 2 - Save Best Score: 0.9185 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0414 
Epoch: [3][0/1137] Elapsed 0m 0s (remain 12m 7s) Loss: 0.0032 
Epoch: [3][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0073 
Epoch: [3][200/1137] Elapsed 1m 34s (remain 7m 17s) Loss: 0.0072 
Epoch: [3][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0094 
Epoch: [3][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0093 
Epoch: [3][500/1137] Elapsed 3m 54s (remain 4m 57s) Loss: 0.0099 
Epoch: [3][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0096 
Epoch: [3][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0097 
Epoch: [3][800/1137] Elapsed 6m 14s (remain 2m 36s) Loss: 0.0097 
Epoch: [3][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0096 
Epoch: [3][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0100 
Epoch: [3][1100/1137] Elapsed 8m 34s (remain 0m 16s) Loss: 0.0098 
Epoch: [3][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0098 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0042 
EVAL: [100/285] Elapsed 0m 16

Epoch 3 - avg_train_loss: 0.0098  avg_val_loss: 0.0459  time: 576s
Epoch 3 - avg_train_loss: 0.0098  avg_val_loss: 0.0459  time: 576s
Epoch 3 - Score: 0.87248322147651
Epoch 3 - Score: 0.87248322147651


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0459 
Epoch: [4][0/1137] Elapsed 0m 0s (remain 12m 7s) Loss: 0.0004 
Epoch: [4][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0060 
Epoch: [4][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0051 
Epoch: [4][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0050 
Epoch: [4][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0052 
Epoch: [4][500/1137] Elapsed 3m 53s (remain 4m 57s) Loss: 0.0051 
Epoch: [4][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0051 
Epoch: [4][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0051 
Epoch: [4][800/1137] Elapsed 6m 14s (remain 2m 36s) Loss: 0.0055 
Epoch: [4][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0057 
Epoch: [4][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0057 
Epoch: [4][1100/1137] Elapsed 8m 34s (remain 0m 16s) Loss: 0.0058 
Epoch: [4][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0059 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 34s) Loss: 0.0122 
EVAL: [100/285] Elapsed 0m 16

Epoch 4 - avg_train_loss: 0.0059  avg_val_loss: 0.0430  time: 576s
Epoch 4 - avg_train_loss: 0.0059  avg_val_loss: 0.0430  time: 576s
Epoch 4 - Score: 0.8978032473734479
Epoch 4 - Score: 0.8978032473734479


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0430 
Epoch: [5][0/1137] Elapsed 0m 0s (remain 12m 4s) Loss: 0.0024 
Epoch: [5][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0023 
Epoch: [5][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0029 
Epoch: [5][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0028 
Epoch: [5][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0031 
Epoch: [5][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0039 
Epoch: [5][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0042 
Epoch: [5][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0046 
Epoch: [5][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0044 
Epoch: [5][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0042 
Epoch: [5][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0039 
Epoch: [5][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0039 
Epoch: [5][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0040 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0017 
EVAL: [100/285] Elapsed 0m 16

Epoch 5 - avg_train_loss: 0.0040  avg_val_loss: 0.0558  time: 576s
Epoch 5 - avg_train_loss: 0.0040  avg_val_loss: 0.0558  time: 576s
Epoch 5 - Score: 0.777907429015947
Epoch 5 - Score: 0.777907429015947


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0558 


Score: 0.91846
Score: 0.91846
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Epoch: [1][0/1137] Elapsed 0m 0s (remain 12m 18s) Loss: 0.3734 
Epoch: [1][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0682 
Epoch: [1][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0591 
Epoch: [1][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0527 
Epoch: [1][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0470 
Epoch: [1][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0447 
Epoch: [1][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0419 
Epoch: [1][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0391 
Epoch: [1][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0367 
Epoch: [1][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0353 
Epoch: [1][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0338 
Epoch: [1][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0330 
Epoch: [1][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0327 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0482 
EVAL: [100/285] Elapsed 0m 16s (remain 0m 29s) Loss: 0.0444 
EVAL: [200/285] Elapsed 0m 

Epoch 1 - avg_train_loss: 0.0327  avg_val_loss: 0.0426  time: 576s
Epoch 1 - avg_train_loss: 0.0327  avg_val_loss: 0.0426  time: 576s
Epoch 1 - Score: 0.9037255625230541
Epoch 1 - Score: 0.9037255625230541
Epoch 1 - Save Best Score: 0.9037 Model
Epoch 1 - Save Best Score: 0.9037 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0426 
Epoch: [2][0/1137] Elapsed 0m 0s (remain 12m 0s) Loss: 0.1755 
Epoch: [2][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0214 
Epoch: [2][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0163 
Epoch: [2][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0173 
Epoch: [2][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0164 
Epoch: [2][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0154 
Epoch: [2][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0162 
Epoch: [2][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0168 
Epoch: [2][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0164 
Epoch: [2][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0153 
Epoch: [2][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0153 
Epoch: [2][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0157 
Epoch: [2][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0157 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0107 
EVAL: [100/285] Elapsed 0m 16

Epoch 2 - avg_train_loss: 0.0157  avg_val_loss: 0.0325  time: 576s
Epoch 2 - avg_train_loss: 0.0157  avg_val_loss: 0.0325  time: 576s
Epoch 2 - Score: 0.9347541596560105
Epoch 2 - Score: 0.9347541596560105
Epoch 2 - Save Best Score: 0.9348 Model
Epoch 2 - Save Best Score: 0.9348 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0325 
Epoch: [3][0/1137] Elapsed 0m 0s (remain 12m 0s) Loss: 0.0012 
Epoch: [3][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0058 
Epoch: [3][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0083 
Epoch: [3][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0085 
Epoch: [3][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0081 
Epoch: [3][500/1137] Elapsed 3m 53s (remain 4m 57s) Loss: 0.0091 
Epoch: [3][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0094 
Epoch: [3][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0093 
Epoch: [3][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0092 
Epoch: [3][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0096 
Epoch: [3][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0095 
Epoch: [3][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0097 
Epoch: [3][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0096 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0840 
EVAL: [100/285] Elapsed 0m 16

Epoch 3 - avg_train_loss: 0.0096  avg_val_loss: 0.0464  time: 576s
Epoch 3 - avg_train_loss: 0.0096  avg_val_loss: 0.0464  time: 576s
Epoch 3 - Score: 0.8354138659496833
Epoch 3 - Score: 0.8354138659496833


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0464 
Epoch: [4][0/1137] Elapsed 0m 0s (remain 12m 13s) Loss: 0.0033 
Epoch: [4][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0074 
Epoch: [4][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0075 
Epoch: [4][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0065 
Epoch: [4][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0059 
Epoch: [4][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0062 
Epoch: [4][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0066 
Epoch: [4][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0061 
Epoch: [4][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0061 
Epoch: [4][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0064 
Epoch: [4][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0064 
Epoch: [4][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0065 
Epoch: [4][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0064 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.1042 
EVAL: [100/285] Elapsed 0m 1

Epoch 4 - avg_train_loss: 0.0064  avg_val_loss: 0.0556  time: 576s
Epoch 4 - avg_train_loss: 0.0064  avg_val_loss: 0.0556  time: 576s
Epoch 4 - Score: 0.8036405886909374
Epoch 4 - Score: 0.8036405886909374


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0556 
Epoch: [5][0/1137] Elapsed 0m 0s (remain 12m 9s) Loss: 0.0001 
Epoch: [5][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0019 
Epoch: [5][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0039 
Epoch: [5][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0029 
Epoch: [5][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0024 
Epoch: [5][500/1137] Elapsed 3m 54s (remain 4m 57s) Loss: 0.0026 
Epoch: [5][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0033 
Epoch: [5][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0039 
Epoch: [5][800/1137] Elapsed 6m 14s (remain 2m 36s) Loss: 0.0038 
Epoch: [5][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0039 
Epoch: [5][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0040 
Epoch: [5][1100/1137] Elapsed 8m 34s (remain 0m 16s) Loss: 0.0042 
Epoch: [5][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0044 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0019 
EVAL: [100/285] Elapsed 0m 16

Epoch 5 - avg_train_loss: 0.0044  avg_val_loss: 0.0467  time: 576s
Epoch 5 - avg_train_loss: 0.0044  avg_val_loss: 0.0467  time: 576s
Epoch 5 - Score: 0.841461082424938
Epoch 5 - Score: 0.841461082424938


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0467 


Score: 0.93475
Score: 0.93475
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Epoch: [1][0/1137] Elapsed 0m 0s (remain 12m 21s) Loss: 0.4143 
Epoch: [1][100/1137] Elapsed 0m 47s (remain 8m 3s) Loss: 0.0653 
Epoch: [1][200/1137] Elapsed 1m 33s (remain 7m 16s) Loss: 0.0525 
Epoch: [1][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0469 
Epoch: [1][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0422 
Epoch: [1][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0393 
Epoch: [1][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0373 
Epoch: [1][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0362 
Epoch: [1][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0345 
Epoch: [1][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0334 
Epoch: [1][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0321 
Epoch: [1][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0311 
Epoch: [1][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0307 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0044 
EVAL: [100/285] Elapsed 0m 16s (remain 0m 29s) Loss: 0.0424 
EVAL: [200/285] Elapsed 0m 

Epoch 1 - avg_train_loss: 0.0307  avg_val_loss: 0.0403  time: 576s
Epoch 1 - avg_train_loss: 0.0307  avg_val_loss: 0.0403  time: 576s
Epoch 1 - Score: 0.8839779005524863
Epoch 1 - Score: 0.8839779005524863
Epoch 1 - Save Best Score: 0.8840 Model
Epoch 1 - Save Best Score: 0.8840 Model


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0403 
Epoch: [2][0/1137] Elapsed 0m 0s (remain 11m 55s) Loss: 0.0073 
Epoch: [2][100/1137] Elapsed 0m 47s (remain 8m 4s) Loss: 0.0217 
Epoch: [2][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0205 
Epoch: [2][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0208 
Epoch: [2][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0198 
Epoch: [2][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0184 
Epoch: [2][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0177 
Epoch: [2][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0176 
Epoch: [2][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0178 
Epoch: [2][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0168 
Epoch: [2][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0167 
Epoch: [2][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0161 
Epoch: [2][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0158 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0007 
EVAL: [100/285] Elapsed 0m 1

Epoch 2 - avg_train_loss: 0.0158  avg_val_loss: 0.0364  time: 576s
Epoch 2 - avg_train_loss: 0.0158  avg_val_loss: 0.0364  time: 576s
Epoch 2 - Score: 0.8531441717791411
Epoch 2 - Score: 0.8531441717791411


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0364 
Epoch: [3][0/1137] Elapsed 0m 0s (remain 12m 25s) Loss: 0.0003 
Epoch: [3][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0073 
Epoch: [3][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0087 
Epoch: [3][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0085 
Epoch: [3][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0080 
Epoch: [3][500/1137] Elapsed 3m 53s (remain 4m 56s) Loss: 0.0082 
Epoch: [3][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0076 
Epoch: [3][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0099 
Epoch: [3][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0105 
Epoch: [3][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0102 
Epoch: [3][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0103 
Epoch: [3][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0109 
Epoch: [3][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0109 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0003 
EVAL: [100/285] Elapsed 0m 1

Epoch 3 - avg_train_loss: 0.0109  avg_val_loss: 0.0410  time: 576s
Epoch 3 - avg_train_loss: 0.0109  avg_val_loss: 0.0410  time: 576s
Epoch 3 - Score: 0.813165537270087
Epoch 3 - Score: 0.813165537270087


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0410 
Epoch: [4][0/1137] Elapsed 0m 0s (remain 12m 7s) Loss: 0.0002 
Epoch: [4][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0068 
Epoch: [4][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0057 
Epoch: [4][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0060 
Epoch: [4][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0064 
Epoch: [4][500/1137] Elapsed 3m 53s (remain 4m 57s) Loss: 0.0067 
Epoch: [4][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0073 
Epoch: [4][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0073 
Epoch: [4][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0068 
Epoch: [4][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0073 
Epoch: [4][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0078 
Epoch: [4][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0077 
Epoch: [4][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0077 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0026 
EVAL: [100/285] Elapsed 0m 16

Epoch 4 - avg_train_loss: 0.0077  avg_val_loss: 0.0405  time: 576s
Epoch 4 - avg_train_loss: 0.0077  avg_val_loss: 0.0405  time: 576s
Epoch 4 - Score: 0.863060989643268
Epoch 4 - Score: 0.863060989643268


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0405 
Epoch: [5][0/1137] Elapsed 0m 0s (remain 12m 4s) Loss: 0.0082 
Epoch: [5][100/1137] Elapsed 0m 47s (remain 8m 5s) Loss: 0.0045 
Epoch: [5][200/1137] Elapsed 1m 33s (remain 7m 17s) Loss: 0.0043 
Epoch: [5][300/1137] Elapsed 2m 20s (remain 6m 30s) Loss: 0.0046 
Epoch: [5][400/1137] Elapsed 3m 7s (remain 5m 43s) Loss: 0.0049 
Epoch: [5][500/1137] Elapsed 3m 53s (remain 4m 57s) Loss: 0.0048 
Epoch: [5][600/1137] Elapsed 4m 40s (remain 4m 10s) Loss: 0.0049 
Epoch: [5][700/1137] Elapsed 5m 27s (remain 3m 23s) Loss: 0.0046 
Epoch: [5][800/1137] Elapsed 6m 13s (remain 2m 36s) Loss: 0.0046 
Epoch: [5][900/1137] Elapsed 7m 0s (remain 1m 50s) Loss: 0.0044 
Epoch: [5][1000/1137] Elapsed 7m 47s (remain 1m 3s) Loss: 0.0047 
Epoch: [5][1100/1137] Elapsed 8m 33s (remain 0m 16s) Loss: 0.0048 
Epoch: [5][1136/1137] Elapsed 8m 50s (remain 0m 0s) Loss: 0.0048 
EVAL: [0/285] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0001 
EVAL: [100/285] Elapsed 0m 16

Epoch 5 - avg_train_loss: 0.0048  avg_val_loss: 0.0540  time: 576s
Epoch 5 - avg_train_loss: 0.0048  avg_val_loss: 0.0540  time: 576s
Epoch 5 - Score: 0.7427677873338546
Epoch 5 - Score: 0.7427677873338546


EVAL: [284/285] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0540 


Score: 0.88398
Score: 0.88398
Score: 0.91910
Score: 0.91910
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFo

In [27]:
ZIP_OUTPUT_DIR = OUTPUT_DIR[:-1]+".zip"
!zip -r $ZIP_OUTPUT_DIR $OUTPUT_DIR

  adding: results_scibert_scivocab_uncased_title/ (stored 0%)
  adding: results_scibert_scivocab_uncased_title/scibert_scivocab_uncased_fold0_best.pth (deflated 7%)
  adding: results_scibert_scivocab_uncased_title/scibert_scivocab_uncased_fold3_best.pth (deflated 7%)
  adding: results_scibert_scivocab_uncased_title/oof_df.csv (deflated 65%)
  adding: results_scibert_scivocab_uncased_title/scibert_scivocab_uncased_fold4_best.pth (deflated 7%)
  adding: results_scibert_scivocab_uncased_title/scibert_scivocab_uncased_fold1_best.pth (deflated 7%)
  adding: results_scibert_scivocab_uncased_title/scibert_scivocab_uncased_fold2_best.pth (deflated 7%)
  adding: results_scibert_scivocab_uncased_title/submission.csv (deflated 72%)
  adding: results_scibert_scivocab_uncased_title/train.log (deflated 80%)


In [28]:
# Colab helper used to move files between the runtime and the user's machine.
from google.colab import files
# Send the zip archive (path held in ZIP_OUTPUT_DIR) to the browser as a download.
files.download(ZIP_OUTPUT_DIR)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

【TODO】

- (きもとん) PUBMED等の他のBERTを使ってみる。
- (きもとん) ABSTRACTモデルを作ってみる。
- (がみ) BERTの後ろにつけるモデルをリッチ（複数層/LightGBM）にしてみる。
- (がみ) 今後の方針を真剣に考える。

In [None]:
# Diagnostic only: print the NVIDIA GPU status of the current runtime.
!nvidia-smi