# Version v11b
* ELECTRA-large backbone (with ITPT, no PL)

In [None]:
!nvidia-smi

Wed Apr 13 12:27:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    42W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets



# Packages

In [None]:
import os, gc, pickle, math, time, random, copy, shutil, ast, itertools, re
from glob import glob
from tqdm.notebook import tqdm
from pylab import cm, matplotlib

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import f1_score
from scipy.special import softmax
from spacy import displacy

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict

# Mixed precision in Pytorch
from torch.cuda.amp import autocast, GradScaler

# Transformers
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, AdamW
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForTokenClassification, DebertaTokenizer
from transformers import DefaultDataCollator, DataCollatorForTokenClassification

from transformers.utils.logging import set_verbosity, WARNING
set_verbosity(WARNING)

import warnings
warnings.filterwarnings('ignore')

In [None]:
pd.__version__

'1.3.5'

# Initial settings and configuration

In [None]:
class config(object):
    """Central experiment configuration for the v11b ELECTRA-large run.

    Everything is stored as class attributes. The class body is ordinary code that
    runs once, when the class is defined, so it also performs side effects: it
    loads the tokenizer, creates the model output directory and reads the
    `AutoConfig` of the pretrained (v11a) checkpoint from disk.

    Several attributes are only defined inside `if` branches (`aug_*`, `adv_*`,
    pseudo-label settings, `num_cycles`); they do not exist when the guarding
    switch is off.
    """
    # General settings
    env = 'colab'    # selects the data/output paths below: 'colab', 'kaggle' or 'jarvis'
    seed = 7777
    use_tqdm = True
    apex = True    # presumably toggles mixed-precision training; the consumer is not in this part of the notebook
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    debug = False    # when True, `train` is cut down to its first 1000 rows
    mode = 'train'    # 'train', 'valid'
    # Validation
    nfolds = 5    # number of GroupKFold / StratifiedKFold splits
    verbose = 750    # presumably a logging interval in steps; TODO confirm against the training loop
    # Dataset and DataLoader
    # Enabled augmentations. 'mask_texts' is read by create_labels and 'change_feature_ratio'
    # by preparing_dataset; the author also lists None, 'shuffling' and 'mask_labels' as options.
    aug = ['mask_texts', 'change_feature_ratio'] # None 'shuffling', 'mask_labels', 
    if aug is not None:
        aug_ratio = 0.5    # per-example probability of applying the token-masking augmentation
        mask_text_ratio = 0.1    # fraction of token positions replaced by the mask token
        mask_label_ratio = 0.02    # not referenced in the visible part of the notebook
        change_feature_ratio = 0.5    # per-example probability of swapping the feature text for a variant
        shuffling_window = [2, 3]    # candidate chunk sizes (in sentences) for aug_shuffle
    max_len = 512    # tokenizer max_length
    batch_size = 4
    num_workers = os.cpu_count()
    # Model
    backbone = 'google/electra-large-discriminator'
    tokenizer = AutoTokenizer.from_pretrained(backbone, trim_offsets = False)
    model_name = 'v11b_electra_large'
    pretrained_model_name = 'v11a_electra_large'
    do_pl = False    # pseudo-labelling switch; the settings below exist only when it is True
    if do_pl:
        filter_thres = 0.5    # If this value is <= 0.5, then we don't filter anything
        pl_model_name = 'v8d_roberta_large'
        if debug:
            sample_ratio = 50
        else:
            sample_ratio = 50_000
    dropout = 0.
    # Training
    training_folds = [0, 1, 2, 3, 4]
    if do_pl:
        nepochs = 1
        val_check_interval = 0.05
    else:
        nepochs = 5
        val_check_interval = 0.2
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    # Optimizer
    lr = 2e-5
    weight_decay = 1e-4
    encoder_lr = 2e-5
    decoder_lr = 1e-3
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    # For AWP
    use_awp = True
    if use_awp:
        adv_lr = 1
        adv_eps = 1e-3
        adv_step = 1
    # Scheduler
    scheduler_type = 'linear'    # 'linear' or 'cosine' (num_cycles is only defined for 'cosine')
    if scheduler_type == 'cosine':
        num_cycles = 0.5
    num_warmup_steps = 0.
    batch_scheduler = True
    # Data paths
    if env == 'colab':
        data_dir = '/content/drive/My Drive/Kaggle competitions/NBME/data'
        output_dir = '/content/drive/My Drive/Kaggle competitions/NBME/model'
    elif env == 'kaggle':
        data_dir = ...    # placeholder (Ellipsis): must be filled in before running on Kaggle
        output_dir = os.getcwd()
    elif env == 'jarvis':
        data_dir = 'data'
        output_dir = os.getcwd()
    # Side effect at class-definition time: creates <output_dir>/<version>/<variant>,
    # i.e. 'v11b' is split into the directory 'v11' and the sub-directory 'b'.
    os.makedirs(os.path.join(output_dir, model_name.split('_')[0][:-1], model_name.split('_')[0][-1]), exist_ok = True)
    # Pretrained model
    # Same directory scheme for the checkpoint to start from ('v11a' -> 'v11'/'a');
    # that directory must already contain a model config or the next line fails.
    pretrained_model_dir = os.path.join(output_dir, pretrained_model_name.split('_')[0][:-1], pretrained_model_name.split('_')[0][-1])
    # NOTE(review): this attribute has the same name as the class itself; inside the
    # class body it simply becomes the class attribute `config.config`.
    config = AutoConfig.from_pretrained(pretrained_model_dir)
    config.output_hidden_states = True
    freeze_until = -1    # presumably "freeze nothing"; the consumer is not in this part of the notebook
    
cfg = config()

In [None]:
def set_random_seed(seed, use_cuda = True):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch's CPU
    generator, and exports PYTHONHASHSEED. With `use_cuda` the CUDA generators
    are seeded as well and cuDNN is switched to deterministic, non-benchmarking
    mode.

    Args:
        seed: integer seed shared by all generators.
        use_cuda: also seed the CUDA generators and configure cuDNN.
    """
    # Exported for child processes; it does not re-hash the running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if not use_cuda:
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade speed for reproducible convolution algorithm selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set up logs

In [None]:
import logging
# `imp` is deprecated and removed in Python 3.12; importlib.reload is the
# supported equivalent.
from importlib import reload

# Reloading resets the logging module so that basicConfig takes effect even if
# handlers were already installed earlier in this kernel session.
reload(logging)
logging.basicConfig(
    level = logging.INFO,
    format = '%(asctime)s %(message)s',
    datefmt = '%H:%M:%S',
    handlers = [
        # One log file per run, named after the model, the start time and the seed;
        # it is created in the current working directory.
        logging.FileHandler(f"train_{cfg.model_name}_{time.strftime('%m%d_%H%M', time.localtime())}_{cfg.seed}.log"),
        logging.StreamHandler()
    ]
)

# Record the key hyper-parameters at the top of the log.
logging.info(
    '\nmodel_name: {}\n'
    'env: {}\n'
    'seed: {}\n'
    'nfolds: {}\n'
    'max_len: {}\n'
    'batch_size: {}\n'
    'num_workers: {}\n'
    'nepochs: {}\n'
    'lr: {}\n'
    'weight_decay: {}\n'
    'gradient_accumulation_steps: {}'.format(cfg.model_name, cfg.env, cfg.seed, cfg.nfolds, 
                                            cfg.max_len, cfg.batch_size, cfg.num_workers, cfg.nepochs, 
                                            cfg.lr, cfg.weight_decay, cfg.gradient_accumulation_steps)
)

12:27:54 
model_name: v11b_electra_large
env: colab
seed: 7777
nfolds: 5
max_len: 512
batch_size: 4
num_workers: 12
nepochs: 5
lr: 2e-05
weight_decay: 0.0001
gradient_accumulation_steps: 1


# Import data

* Train data

In [None]:
# Load the labelled rows. The 'annotation' and 'location' columns are read from
# the CSV as strings and parsed into Python objects with ast.literal_eval.
train = pd.read_csv(os.path.join(cfg.data_dir, 'train.csv'))
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
train

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]
...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[]
14296,95333_913,9,95333,913,[],[]
14297,95333_914,9,95333,914,[photobia],[274 282]
14298,95333_915,9,95333,915,[no sick contacts],[421 437]


* Features and encode features

In [None]:
features = pd.read_csv(os.path.join(cfg.data_dir, 'features.csv'))
def preprocess_features(features):
    """Overwrite the feature_text of row label 27 with 'Last-Pap-smear-1-year-ago'.

    Mutates `features` in place and returns the same object.
    NOTE(review): this function is defined here but never called in the visible
    part of the notebook.
    """
    features.loc[27, 'feature_text'] = 'Last-Pap-smear-1-year-ago'
    return features

# First use of `feature_map`: feature_num -> consecutive integer code, in sorted feature_num order.
feature_map = dict(zip(np.sort(features.feature_num.unique()), range(features.feature_num.nunique())))
features['encoded_feature_text'] = features['feature_num'].map(feature_map)
# Second use: the same name is rebound to a different mapping, feature_num -> feature_text.
feature_map = features[['feature_num', 'feature_text']].set_index('feature_num').to_dict()['feature_text']
# One label per distinct feature_num.
cfg.num_labels = len(feature_map)
features

Unnamed: 0,feature_num,case_num,feature_text,encoded_feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...,0
1,1,0,Family-history-of-thyroid-disorder,1
2,2,0,Chest-pressure,2
3,3,0,Intermittent-symptoms,3
4,4,0,Lightheaded,4
...,...,...,...,...
138,912,9,Family-history-of-migraines,138
139,913,9,Female,139
140,914,9,Photophobia,140
141,915,9,No-known-illness-contacts,141


* Patient's note

In [None]:
# Load every patient note (pn_num, case_num, pn_history), labelled or not.
patient_notes = pd.read_csv(os.path.join(cfg.data_dir, 'patient_notes.csv'))
patient_notes

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


* Merge train data

In [None]:
# Attach the feature text (plus its integer code) and the note text to the
# labelled rows. Left joins keep every row of `train`, even without a match.
train = train.merge(features, on = ['feature_num', 'case_num'], how = 'left')
train = train.merge(patient_notes, on = ['pn_num', 'case_num'], how = 'left')
display(train)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,encoded_feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,0,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,1,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,2,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,3,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,4,HPI: 17yo M presents with palpitations. Patien...
...,...,...,...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,138,Stephanie madden is a 20 year old woman compla...
14296,95333_913,9,95333,913,[],[],Female,139,Stephanie madden is a 20 year old woman compla...
14297,95333_914,9,95333,914,[photobia],[274 282],Photophobia,140,Stephanie madden is a 20 year old woman compla...
14298,95333_915,9,95333,915,[no sick contacts],[421 437],No-known-illness-contacts,141,Stephanie madden is a 20 year old woman compla...


* Some annotations are wrong; Nakama fixes them by manually overwriting the affected rows, as in the cell below.

In [None]:
# Manual corrections for incorrect annotations. Each pair of statements in this
# cell overwrites the 'annotation' and 'location' cells of one row, addressed by
# its index label in `train`. Inside a location string, ';' separates the
# character ranges of a discontinuous span (create_labels splits on it).
# NOTE(review): the literals are nested one level deeper than the lists parsed
# from the CSV; verify what pandas actually stores in the cell on the version in use.
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [None]:
# Number of annotations stored per row; rows with an empty list get 0.
train['annotation_len'] = train['annotation'].apply(len)
display(train['annotation_len'].value_counts())

1    8185
0    4399
2    1292
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_len, dtype: int64

# Encode feature_text

In [None]:
# Map every distinct feature_text (in sorted order) to a consecutive integer.
# NOTE(review): the map is only built here; nothing in the visible part of the
# notebook applies it to `train`.
feature_text_map = dict(zip(np.sort(train.feature_text.unique()), range(train.feature_text.nunique())))
train

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,encoded_feature_text,pn_history,annotation_len
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,0,HPI: 17yo M presents with palpitations. Patien...,1
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,1,HPI: 17yo M presents with palpitations. Patien...,1
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,2,HPI: 17yo M presents with palpitations. Patien...,1
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,3,HPI: 17yo M presents with palpitations. Patien...,2
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,4,HPI: 17yo M presents with palpitations. Patien...,1
...,...,...,...,...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,138,Stephanie madden is a 20 year old woman compla...,0
14296,95333_913,9,95333,913,[],[],Female,139,Stephanie madden is a 20 year old woman compla...,0
14297,95333_914,9,95333,914,[photobia],[274 282],Photophobia,140,Stephanie madden is a 20 year old woman compla...,1
14298,95333_915,9,95333,915,[no sick contacts],[421 437],No-known-illness-contacts,141,Stephanie madden is a 20 year old woman compla...,1


In [None]:
# Debug mode: keep only the first 1000 rows (presumably for a quick smoke run).
if cfg.debug:
    train = train.iloc[:1000]

# CV split

* We use GroupKFold grouped by "pn_num", so all rows that belong to the same patient note fall into the same fold.

In [None]:
# Assign each row the index of the fold in which it is a validation row.
# Rows are grouped by pn_num; the second argument (train['location']) is only
# passed positionally as `y`.
kfold = GroupKFold(n_splits = cfg.nfolds)
train['kfold'] = -1    # -1 = not assigned yet
for i, (trn_idx, val_idx) in enumerate(kfold.split(train, train['location'], train['pn_num'])):
    # val_idx holds positions; using them with .loc assumes `train` still has its
    # default 0..n-1 index.
    train.loc[val_idx, 'kfold'] = i
train

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,encoded_feature_text,pn_history,annotation_len,kfold
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,0,HPI: 17yo M presents with palpitations. Patien...,1,4
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,1,HPI: 17yo M presents with palpitations. Patien...,1,4
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,2,HPI: 17yo M presents with palpitations. Patien...,1,4
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,3,HPI: 17yo M presents with palpitations. Patien...,2,4
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,4,HPI: 17yo M presents with palpitations. Patien...,1,4
...,...,...,...,...,...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,138,Stephanie madden is a 20 year old woman compla...,0,4
14296,95333_913,9,95333,913,[],[],Female,139,Stephanie madden is a 20 year old woman compla...,0,4
14297,95333_914,9,95333,914,[photobia],[274 282],Photophobia,140,Stephanie madden is a 20 year old woman compla...,1,4
14298,95333_915,9,95333,915,[no sick contacts],[421 437],No-known-illness-contacts,141,Stephanie madden is a 20 year old woman compla...,1,4


In [None]:
def process_feature_text(text):
    """Turn a hyphenated feature name into plain words.

    Applies three substitutions in order: 'I-year' becomes '1-year', '-OR-'
    becomes ' or ', and every remaining '-' becomes a space.
    """
    substitutions = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_spaces(txt):
    """Replace every newline, tab and carriage return with a single space.

    Each character is swapped one-for-one, so the text length and all character
    positions stay the same.
    """
    for whitespace_char in ('\n', '\t', '\r'):
        txt = re.sub(whitespace_char, ' ', txt)
    return txt

# Feature names become plain words (hyphens -> spaces, '-OR-' -> ' or ').
train['feature_text'] = train['feature_text'].apply(process_feature_text)
train['feature_text'] = train['feature_text'].apply(clean_spaces)
# clean_spaces swaps characters one-for-one, so the character offsets stored in
# 'location' remain valid for the cleaned notes.
train['pn_history'] = train['pn_history'].apply(clean_spaces)

# Unlabeled data

In [None]:
# Patient notes whose pn_num never appears in `train`, i.e. notes without labels.
remaining_pn_num = np.array(list(set(patient_notes.pn_num.unique()) - set(train.pn_num.unique())))
# np.isin replaces np.in1d, which is deprecated in NumPy 2.0; for 1-D input the
# resulting boolean mask is the same.
remaining_pn_num_idx = np.where(np.isin(patient_notes.pn_num.values, remaining_pn_num))[0]
# The positions are used as labels with .loc, which relies on `patient_notes`
# still carrying its default 0..n-1 index.
remaining_pn_history = patient_notes.loc[remaining_pn_num_idx].reset_index(drop = True)
remaining_pn_history['pn_history'] = remaining_pn_history['pn_history'].apply(clean_spaces)
remaining_pn_history

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; ...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
41141,95328,9,20 YO F C/O DULL HEADACHE SINCE YESTERDAY. SHE...
41142,95329,9,20 y/o F c/o headache that started yesterday m...
41143,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
41144,95332,9,Ms. Madden is a 20yo female who presents with ...


#### CV split for unlabeled data - use StratifiedKFold, stratified by "case_num"

In [None]:
# Shuffled, seeded folds for the unlabeled notes, stratified by case_num.
kfold = StratifiedKFold(n_splits = cfg.nfolds, shuffle = True, random_state = cfg.seed)
remaining_pn_history['kfold'] = -1    # -1 = not assigned yet
for i, (trn_idx, val_idx) in enumerate(kfold.split(remaining_pn_history, remaining_pn_history['case_num'])):
    # val_idx holds positions; valid as .loc labels because the index was reset above.
    remaining_pn_history.loc[val_idx, 'kfold'] = i
remaining_pn_history

Unnamed: 0,pn_num,case_num,pn_history,kfold
0,0,0,"17-year-old male, has come to the student heal...",2
1,1,0,17 yo male with recurrent palpitations for the...,3
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...,4
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; ...,1
4,4,0,17yo male with no pmh here for evaluation of p...,1
...,...,...,...,...
41141,95328,9,20 YO F C/O DULL HEADACHE SINCE YESTERDAY. SHE...,1
41142,95329,9,20 y/o F c/o headache that started yesterday m...,1
41143,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...,2
41144,95332,9,Ms. Madden is a 20yo female who presents with ...,0


# Dataset

#### Data augmentation

In [None]:
def aug_mask_texts(cfg, input_ids, mask):
    """Return a copy of `input_ids` with a random share of attended tokens masked.

    Only positions whose attention-mask value is non-zero are eligible, so
    padding is never overwritten. `cfg.mask_text_ratio` of those positions
    (rounded down) are picked without replacement and set to
    `cfg.tokenizer.mask_token_id`. The input list itself is not modified.

    Previously the positions were drawn with replacement from the whole padded
    sequence, and the `[:sum(mask)]` slice only truncated the list of draws, so
    padding positions could be overwritten and real tokens were under-masked.

    NOTE(review): every attended position is eligible, which includes special
    tokens and the feature-text segment; confirm whether those should be excluded.

    Args:
        cfg: object exposing `mask_text_ratio` and `tokenizer.mask_token_id`.
        input_ids: list of token ids of one example.
        mask: attention mask of the same example (non-zero = real token).

    Returns:
        A new list of token ids.
    """
    candidate_positions = [pos for pos, attended in enumerate(mask) if attended]
    num_to_mask = int(len(candidate_positions) * cfg.mask_text_ratio)

    aug_input_ids = input_ids.copy()
    for pos in random.sample(candidate_positions, num_to_mask):
        aug_input_ids[pos] = cfg.tokenizer.mask_token_id
    return aug_input_ids

def aug_shuffle(cfg, input_ids, labels):
    """Shuffle sentences (and their labels) inside small consecutive chunks.

    Token id 4 is treated as the sentence delimiter. Sentences are grouped into
    chunks of `ws` consecutive sentences, where `ws` is drawn from
    `cfg.shuffling_window`, and the order inside each chunk is shuffled. The
    result is right-padded to `cfg.max_len` with the pad token id (labels with
    -100) when it is shorter.

    NOTE(review): 4 is hard-coded; presumably it is the '.' id of the tokenizer
    this was written for. Confirm that it matches `cfg.tokenizer`.
    NOTE(review): tokens after the last delimiter and the sentences left over
    when the sentence count is not a multiple of `ws` are not copied to the
    output, and nothing is trimmed when the result exceeds `cfg.max_len`.

    Returns:
        (aug_input_ids, aug_labels) as flat lists.
    """
    # A leading sentinel 4 makes position 0 the start of the first sentence.
    boundaries = np.where(np.array([4] + input_ids) == 4)[0]
    spans = [(boundaries[k], boundaries[k + 1]) for k in range(len(boundaries) - 1)]

    window = np.random.choice(cfg.shuffling_window)
    order = np.arange(0, len(spans))

    shuffled_ids = []
    shuffled_labels = []
    for chunk in range(len(spans) // window):
        chunk_order = order[chunk * window: (chunk + 1) * window]
        np.random.shuffle(chunk_order)    # shuffles this slice of `order` in place
        for sentence in chunk_order:
            lo, hi = spans[sentence]
            shuffled_ids.extend(input_ids[lo:hi])
            shuffled_labels.extend(labels[lo:hi])

    # Right-pad short results; longer ones are returned as they are.
    if len(shuffled_ids) < cfg.max_len:
        shuffled_ids = shuffled_ids + [cfg.tokenizer.pad_token_id] * (cfg.max_len - len(shuffled_ids))
        shuffled_labels = shuffled_labels + [-100] * (cfg.max_len - len(shuffled_labels))

    return shuffled_ids, shuffled_labels

def aug_mask_labels(cfg, labels, encoded_feature_text):
    """Zero out the positive labels for a fixed set of encoded features.

    For encoded features 0, 4, 5, 8, 9 and 32 every label equal to 1 is set to
    0; for any other feature the labels are returned unchanged. `cfg` is not
    used.

    Returns:
        The labels as a plain Python list.
    """
    maskable_features = (0, 4, 5, 8, 9, 32)
    label_array = np.asarray(labels)
    if encoded_feature_text not in maskable_features:
        return label_array.tolist()
    label_array[label_array == 1] = 0
    return label_array.tolist()

def aug_replace_feature_text(feature_text):
    """Swap a feature name for a randomly chosen synonym or common misspelling.

    Features without an entry in the synonym table are returned unchanged.

    The table is keyed by the raw, hyphenated feature names. In this notebook
    `train['feature_text']` is passed through `process_feature_text` before
    tokenization, which turns '-OR-' into ' or ' and '-' into ' ', so the raw
    keys alone never matched and the augmentation silently did nothing. The
    lookup therefore also accepts the processed spelling and, in that case,
    returns the replacement in the same processed form.

    Args:
        feature_text: feature name, raw ('Chest-pressure') or processed
            ('Chest pressure').

    Returns:
        A replacement string in the same form as the input, or the input itself.
    """
    def _as_processed(text):
        # Same hyphen handling as process_feature_text.
        return text.replace('-OR-', ' or ').replace('-', ' ')

    aug_feature_text = dict()
    # For 'Adderall-use'
    Adderall_use = ['adderrall', 'aderall', 'aderrall', 'aderol', 'adderal', 'amphetimine', 'amphetamine', 'Addral']
    aug_feature_text['Adderall-use'] = [f'{i}-use' for i in Adderall_use] + [f'take-{i}' for i in Adderall_use]
    # For 'heart-pounding-OR-heart-racing'
    aug_feature_text['heart-pounding-OR-heart-racing'] = ['heart-pounding-OR-heart-racing-OR-palpitations'] + ['heart-pounding-OR-heart-racing-OR-palpitaions']
    # For 'Photophobia'
    aug_feature_text['Photophobia'] = ['photobia']
    # For 'Caffeine-use', 'Heavy-caffeine-use'
    aug_feature_text['Caffeine-use'] = ['Caffeine-consumption', 'Caffeine-consume', 'use-caffeine', 'consume-caffeine']
    aug_feature_text['Heavy-caffeine-use'] = ['Heavy-caffeine-consumption', 'Heavy-caffeine-consume']
    # For 'Chest-pressure'
    aug_feature_text['Chest-pressure'] = ['Chest-tightness', 'Chest-throbbing']
    # For 'Shortness-of-breath'
    aug_feature_text['Shortness-of-breath'] = ['dyspnea']

    # Raw (hyphenated) spelling: unchanged behaviour.
    if feature_text in aug_feature_text:
        return random.choice(aug_feature_text[feature_text])

    # Processed spelling: match on the processed key and answer in kind.
    for raw_key, candidates in aug_feature_text.items():
        if _as_processed(raw_key) == feature_text:
            return _as_processed(random.choice(candidates))

    return feature_text

#### Helper functions

In [None]:
def create_labels(cfg, tokenized_text, annotation_len, locations, mode = 'train'):
    """Build token-level labels for a tokenized batch and optionally mask tokens.

    For every example a label vector with one entry per token is created:
    1 for tokens covered by an annotated character span, 0 otherwise, and -100
    for tokens whose id is 0, 1 or 2. In 'train' mode, when 'mask_texts' is
    enabled in `cfg.aug`, each example's input ids are replaced by a
    token-masked copy with probability `cfg.aug_ratio`.

    `cfg.aug` may be None (augmentation disabled); the membership test is
    guarded so that this no longer raises a TypeError.

    NOTE(review): the ids 0, 1 and 2 are hard-coded; presumably they are the
    special-token ids of the tokenizer this code was written for. Confirm that
    they match `cfg.tokenizer`.
    NOTE(review): the -100 marking only runs for examples that have at least one
    annotation; examples without annotations keep 0 on every position.
    NOTE(review): when no later token starts after the span start, `start_idx`
    falls back to `end_idx` and the slice is empty, so that span is not labelled.

    Args:
        cfg: configuration; reads `aug`, `aug_ratio` and whatever
            `aug_mask_texts` needs.
        tokenized_text: batch output of the tokenizer with 'offset_mapping',
            'input_ids' and 'attention_mask'.
        annotation_len: number of annotations per example.
        locations: per example, a list of location strings such as
            '70 91' or '285 292;301 312' (';' separates discontinuous parts).
        mode: augmentation is only applied for 'train'.

    Returns:
        `tokenized_text` with 'input_ids' written back and 'labels' added.
    """
    offset_mapping = tokenized_text['offset_mapping']
    input_ids = tokenized_text['input_ids']
    attention_mask = tokenized_text['attention_mask']

    # Decide once whether token masking is enabled; cfg.aug can be None.
    use_mask_texts = (mode == 'train') and (cfg.aug is not None) and ('mask_texts' in cfg.aug)

    labels = []
    for i, example_locations in enumerate(locations):
        label = np.zeros(len(offset_mapping[i]))
        if annotation_len[i] != 0:
            for location in example_locations:
                for loc in [s.split() for s in location.split(';')]:
                    start_idx = -1
                    end_idx = -1
                    start_, end_ = [int(j) for j in loc]
                    for j, (token_start, token_end) in enumerate(offset_mapping[i]):
                        # First token that starts after the span start: the span
                        # begins in the token before it.
                        if start_idx == -1 and start_ < token_start:
                            start_idx = j - 1
                        # First token that reaches the span end: exclusive end is j + 1.
                        if end_idx == -1 and end_ <= token_end:
                            end_idx = j + 1

                    if start_idx == -1:
                        start_idx = end_idx
                    if start_idx != -1 and end_idx != -1:
                        label[start_idx: end_idx] = 1

            for k, token_id in enumerate(input_ids[i]):
                if token_id in [0, 1, 2]:
                    label[k] = -100    # Ignore index

        # Augmentation
        if use_mask_texts:
            if random.random() < cfg.aug_ratio:
                input_ids[i] = aug_mask_texts(cfg, input_ids[i], attention_mask[i])

        labels.append(label.tolist())

    tokenized_text['input_ids'] = input_ids
    tokenized_text['labels'] = labels

    return tokenized_text

def preparing_dataset(example, cfg = cfg, mode = 'train'):
    """Tokenize a batch of (patient note, feature text) pairs and label it.

    Args:
        example: batch with 'pn_history' and 'feature_text' and, for the
            'train' / 'valid' modes, 'annotation_len' and 'location'.
        cfg: configuration; its tokenizer, max_len and augmentation settings
            are read.
        mode: 'train', 'valid', 'infer' or 'test'. Training pads to
            `cfg.max_len` and may replace feature texts with variants; the
            other modes leave padding to the collate function.

    Returns:
        The tokenizer output, extended with 'labels' for 'train' and 'valid'.

    NOTE(review): the feature-text augmentation writes into the
    `example['feature_text']` list itself.
    """
    assert mode in ['train', 'valid', 'infer', 'test']
    is_training = mode == 'train'

    # Extract data
    pn_history = example['pn_history']
    feature_text = example['feature_text']

    # Randomly swap feature names for their variants (training only).
    if is_training and cfg.aug is not None and 'change_feature_ratio' in cfg.aug:
        for position in range(len(feature_text)):
            if random.random() < cfg.change_feature_ratio:
                feature_text[position] = aug_replace_feature_text(feature_text[position])

    # Tokenize texts: the note is the first segment and is never truncated.
    tokenized_text = cfg.tokenizer(pn_history, feature_text, 
                                   return_token_type_ids = True, 
                                   return_offsets_mapping = True,
                                   return_attention_mask = True,
                                   truncation = 'only_second',
                                   padding = 'max_length' if is_training else False,
                                   max_length = cfg.max_len,
                                   return_length = True)

    # Create labels
    if mode not in ['train', 'valid']:
        return tokenized_text

    annotation_len = example['annotation_len']
    location = example['location']
    return create_labels(cfg, tokenized_text, annotation_len, location, mode = mode)

def generate_tokenized_dataset(cfg, df, shuffle = False, mode = 'train'):
    """Convert a DataFrame into a raw and a tokenized `Dataset`.

    Args:
        cfg: configuration forwarded to `preparing_dataset`; `cfg.seed` seeds
            the optional shuffle.
        df: pandas DataFrame with the columns `preparing_dataset` expects.
        shuffle: shuffle the tokenized dataset (the raw one keeps its order).
        mode: 'train', 'valid' or 'infer'.

    Returns:
        (raw_dataset, tokenized_dataset)

    NOTE(review): the random augmentations run inside `map`, i.e. once when the
    dataset is built; confirm this is intended rather than per epoch.
    """
    assert mode in ['train', 'valid', 'infer'], "The 'mode' argument only takes 'train', 'valid', or 'infer' value!"

    raw_dataset = Dataset.from_pandas(df)

    # Tokenize in large batches; cfg and mode reach preparing_dataset as keyword arguments.
    map_options = dict(fn_kwargs = {'cfg': cfg, 'mode': mode}, batched = True, batch_size = 20_000)
    tokenized_dataset = raw_dataset.map(preparing_dataset, **map_options)

    if not shuffle:
        return raw_dataset, tokenized_dataset
    return raw_dataset, tokenized_dataset.shuffle(seed = cfg.seed)

#### Collate function

In [None]:
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union

def pad_sequence(sequences: List[List[int]], max_len, padding_side = 'right'):
    """Pad every sequence to `max_len` with the ignore value -100.

    Args:
        sequences: list of label lists.
        max_len: target length; sequences that are already this long or longer
            are returned unchanged (nothing is truncated).
        padding_side: 'right' appends the filler, any other value prepends it.

    Returns:
        A new list of padded lists.
    """
    padded = []
    for seq in sequences:
        filler = [-100] * (max_len - len(seq))
        padded.append(seq + filler if padding_side == 'right' else filler + seq)
    return padded

def collate_fn(batch):
    """
    Collate tokenized samples into padded tensors for a DataLoader.

    Inputs are padded with the module-level `cfg.tokenizer`; labels (when the samples carry them)
    are padded with -100 to the same width and on the same side as the inputs, so that label
    positions always line up with token positions.

    Args:
        batch (list of dicts): samples with 'input_ids', 'attention_mask', 'token_type_ids'
            and optionally 'labels'

    Returns:
        dict: long tensors 'input_ids', 'attention_mask', 'token_type_ids', plus a float tensor
        'labels' when at least one sample provided labels
    """
    input_ids, attention_mask, token_type_ids, labels = [], [], [], []

    for sample in batch:
        input_ids.append(sample['input_ids'])
        attention_mask.append(sample['attention_mask'])
        token_type_ids.append(sample['token_type_ids'])

        if 'labels' in sample:
            labels.append(sample['labels'])

    # Pad the inputs
    padded = cfg.tokenizer.pad({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
    })

    collated = {
        key: torch.tensor(padded[key], dtype = torch.long)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    }

    if len(labels) != 0:
        # Follow whatever the tokenizer actually did (width and side) instead of assuming
        # right-padding to the longest raw sample.
        padding_side = getattr(cfg.tokenizer, 'padding_side', 'right')
        labels = pad_sequence(labels, max_len = collated['input_ids'].shape[1], padding_side = padding_side)
        collated['labels'] = torch.tensor(labels, dtype = torch.float)

    return collated

# Model

In [None]:
class DiceLoss(nn.Module):
    """
    Dice-style loss with squared terms in the denominator, skipping ignored targets.

    Computes `1 - sum(p * t) / (sum(p * p) + sum(t * t) + smooth)` over the positions
    where the target differs from `ignore_index`.

    NOTE(review): the numerator has no factor 2 and `pred` is used exactly as passed in
    (no sigmoid is applied here) - confirm this is the intended formulation.
    """
    def __init__(self, ignore_index = -100, smooth = 1.):
        super().__init__()
        self.ignore_index = ignore_index
        self.smooth = smooth

    def forward(self, pred, true):
        # Drop every position whose target carries the ignore marker
        keep = true != self.ignore_index
        kept_pred = torch.masked_select(pred, keep)
        kept_true = torch.masked_select(true, keep)

        overlap = torch.sum(kept_pred * kept_true)
        pred_energy = torch.sum(kept_pred * kept_pred)
        true_energy = torch.sum(kept_true * kept_true)
        denominator = pred_energy + true_energy + self.smooth
        return 1 - overlap / denominator

In [None]:
class NBME_Model(nn.Module):
    """
    Token-level binary classifier: a pretrained backbone followed by a single linear head.

    The head scores every token from the mean of the last four hidden layers of the backbone.
    """
    def __init__(self, cfg):
        """
        Args:
            cfg: the general configuration object; reads `pretrained_model_dir`, `config`
                (the backbone config) and `freeze_until`
        """
        super(NBME_Model, self).__init__()
        self.cfg = cfg
        self.backbone = AutoModel.from_pretrained(cfg.pretrained_model_dir, config = cfg.config)

        # freeze_until == 0 freezes nothing; a negative value skips the call entirely
        if cfg.freeze_until >= 0:
            self.freeze_backbone(freeze_until = cfg.freeze_until)

        # Head
        self.labels_classifier = nn.Linear(cfg.config.hidden_size, 1)

        # Initialization
        self._init_weights(self.labels_classifier)
        
    def criterion(self, pred, true):
        """
        Weighted BCE-with-logits plus Dice loss; targets equal to -100 are ignored.

        Args:
            pred: raw logits
            true: targets (0, 1, or -100 for ignored positions), same number of elements as `pred`

        Returns:
            scalar tensor: mean weighted BCE over non-ignored positions + Dice loss
        """
        bce_loss = nn.BCEWithLogitsLoss(reduction = 'none')(pred.view(-1,1), true.view(-1,1))
        # Negatives count half as much as positives; -100 targets match neither mask and become 0
        bce_loss = (true.view(-1,1) == 0) * bce_loss * 0.5 + (true.view(-1,1) == 1) * bce_loss
        bce_loss = torch.masked_select(bce_loss, true.view(-1,1) != -100).mean()
        # NOTE(review): the Dice term receives the raw logits, not probabilities - confirm this is intended
        dice_loss = DiceLoss(smooth = 1., ignore_index = -100)(pred.view(-1,1), true.view(-1,1))
        return bce_loss + dice_loss

    def _init_weights(self, module):
        """Initialize Linear / Embedding / LayerNorm modules using `cfg.config.initializer_range`."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.cfg.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.cfg.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask = None, token_type_ids = None, labels = None):
        """
        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the backbone
            labels: optional targets; when given, the loss is computed with `criterion`

        Returns:
            tuple: (per-token logits with the trailing size-1 dimension squeezed, loss or None)
        """
        output_backbone = self.backbone(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        # Presumably requires the backbone config to enable hidden-state outputs - verify in the config cell
        hidden_states = output_backbone.hidden_states
        hidden_states = torch.mean(torch.stack(hidden_states[-4:]), dim = 0)    # Take the mean of the last 4 hidden layers
        output_labels = self.labels_classifier(hidden_states).squeeze(-1)
        
        if labels is not None:
            loss = self.criterion(output_labels, labels)
        else:
            loss = None
        return output_labels, loss

    def freeze_backbone(self, freeze_until: int = 6):
        """
        Freeze encoder layers with index 0 .. freeze_until - 1.

        `freeze_until` is clamped to [0, num_hidden_layers - 1] so the last layer always stays
        trainable (falls back to a 12-layer assumption when the config has no `num_hidden_layers`).
        """
        num_layers = getattr(self.cfg.config, 'num_hidden_layers', 12)
        freeze_until = max(min(freeze_until, num_layers - 1), 0)
        for name, para in self.backbone.encoder.named_parameters():
            # The layer index is the first purely numeric component of the dotted parameter name
            # (e.g. 'layer.10.attention...' -> 10). Comparing it as a number avoids the substring
            # trap where '1' would also match layers 10-19, 21, ...
            layer_idx = next((int(part) for part in name.split('.') if part.isdigit()), None)
            if layer_idx is not None and layer_idx < freeze_until:
                para.requires_grad = False
    
    def unfreeze_backbone(self):
        """Make every backbone parameter trainable again."""
        for para in self.backbone.parameters():
            para.requires_grad = True

# Metrics

#### Helper functions for scoring, Nakama notebook

In [None]:
def micro_f1(preds, truths):
    """
    Micro-averaged F1 over binary arrays.

    Every per-sample array is flattened into one long vector before scoring, so each
    element weighs the same regardless of which sample it came from.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    return f1_score(flat_truths, flat_preds)


def spans_to_binary(spans, length = None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans as [start, end) pairs.
        length (int, optional): Size of the output. Defaults to the largest value found in
            `spans`, or 0 when `spans` is empty.

    Returns:
        np array [length]: Binarized spans.
    """
    if length is None:
        # np.max raises on an empty sequence, so an empty span list maps to an empty array
        length = int(np.max(spans)) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Each (prediction, truth) pair is binarized to a common length - the largest offset
    present in either - and the pairs are scored together. Pairs where both sides are
    empty are left out.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            continue
        pred_extent = np.max(pred) if len(pred) else 0
        truth_extent = np.max(truth) if len(truth) else 0
        length = max(pred_extent, truth_extent)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

def create_labels_for_scoring(df):
    """
    Build ground-truth character spans from the 'location' column.

    Side effect: writes the helper column 'location_for_create_labels' into `df`; each cell
    is `[]` for a row without locations, otherwise a one-element list holding all of the
    row's locations joined by ';' (example: ['0 1', '3 4'] -> ['0 1;3 4']).

    The column is assigned in one index-aligned step, so the function works for any
    dataframe index (no 0..n-1 index is required and no rows can be added by accident).

    Args:
        df: dataframe with a 'location' column of lists of 'start end' strings (a single
            string may itself contain several ';'-separated spans)

    Returns:
        list of lists of [start, end]: one list of integer spans per row, in row order
    """
    joined = [[';'.join(lst)] if lst else [] for lst in df['location'].tolist()]
    df['location_for_create_labels'] = pd.Series(joined, index = df.index, dtype = object)

    # create labels
    truths = []
    for location_list in joined:
        truth = []
        if len(location_list) > 0:
            for chunk in location_list[0].split(';'):
                loc = chunk.split()
                truth.append([int(loc[0]), int(loc[1])])
        truths.append(truth)
    return truths

def get_char_probs(cfg, texts, predictions):
    """
    Spread token-level probabilities over the characters of each text.

    Every text is re-tokenized on its own (with special tokens) to obtain character offsets.
    Tokens with an empty offset range are skipped. Each remaining token writes its probability
    from the end of the previously written token (or its own start, if earlier) up to its own
    end, so characters lying between two tokens take the probability of the token that follows.

    Args:
        cfg: the general configuration object (provides `cfg.tokenizer`)
        texts: the plain texts
        predictions: per-token predictions from the model, one sequence per text

    Returns:
        list of np arrays: one array of length len(text) per text
    """
    char_probs = [np.zeros(len(text)) for text in texts]
    for idx, (text, token_probs) in enumerate(zip(texts, predictions)):
        offsets = cfg.tokenizer(text, add_special_tokens = True, return_offsets_mapping = True)['offset_mapping']
        cursor = 0
        for span, prob in zip(offsets, token_probs):
            tok_start, tok_end = span[0], span[1]
            if tok_start == tok_end:
                continue
            fill_from = min([cursor, tok_start])
            char_probs[idx][fill_from:tok_end] = prob
            cursor = tok_end
    return char_probs

def get_results(char_probs, texts, th = 0.5, ignore_chars = [' ']):
    """
    Turn character probabilities into span strings and per-span scores.

    Characters with probability >= `th` are grouped into runs of consecutive positions.
    Characters listed in `ignore_chars` are removed from each run, and runs left empty are
    dropped. A surviving run becomes 'start end' (end exclusive) and is scored with the mean
    probability over that range.

    Args:
        char_probs: per-text arrays of character probabilities
        texts: the plain texts
        th (float): probability threshold
        ignore_chars: characters that can never start or end a span

    Returns:
        tuple: (list of ';'-joined span strings, list of lists of span scores)
    """
    results, scores = [], []
    for char_prob, text in zip(char_probs, texts):
        hot_positions = np.where(char_prob >= th)[0]

        # Split the (sorted) positions into runs of consecutive indices
        runs = []
        for pos in hot_positions:
            if runs and pos == runs[-1][-1] + 1:
                runs[-1].append(pos)
            else:
                runs.append([pos])

        span_strings, span_scores = [], []
        for run in runs:
            kept = [i for i in run if not text[i] in ignore_chars]
            if kept == []:
                continue
            lo, hi = min(kept), max(kept) + 1
            span_scores.append(np.mean(char_prob[lo:hi]))
            span_strings.append(f'{lo} {hi}')

        results.append(';'.join(span_strings))
        scores.append(span_scores)
    return results, scores

def get_predictions(results):
    """
    Parse span strings into integer spans.

    Args:
        results (list of str): one string per sample, spans written as 'start end' and
            separated by ';'; an empty string means no span

    Returns:
        list of lists of [start, end]: parsed spans per sample
    """
    predictions = []
    for result in results:
        if result == '':
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            parts = chunk.split()
            spans.append([int(parts[0]), int(parts[1])])
        predictions.append(spans)
    return predictions

def get_score(y_true, y_pred):
    """
    Span-level micro F1 between ground-truth spans and predicted spans.

    The two arguments are handed to `span_micro_f1` positionally, in the order received.
    """
    return span_micro_f1(y_true, y_pred)

# Training/Validating functions

#### Helper

In [None]:
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s' (both truncated to integers)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """
    Report elapsed time and a linear estimate of the time remaining.

    Args:
        since (float): start timestamp, as returned by `time.time()`
        percent (float): fraction of the work completed so far (must be non-zero)

    Returns:
        str: '<elapsed> (remain <remaining>)', both parts formatted by `asMinutes`
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

#### Per-epoch training function

In [None]:
def train_fn(cfg, model, train_dataloader, optimizer, epoch, num_train_steps, scheduler, 
             valid_dataloader, val_df, valid_texts, best_score = -np.inf, fold = 0):
    """
    Train for one epoch, validating at regular intervals and checkpointing on every new best F1.

    Args:
        cfg: the general configuration object
        model: model returning (logits, loss) when called with labels
        train_dataloader: batches with 'input_ids', 'attention_mask', 'token_type_ids', 'labels'
        optimizer: optimizer updated every `cfg.gradient_accumulation_steps` batches
        epoch (int): zero-based epoch index
        num_train_steps (int): total number of training steps (used to derive the AWP start step)
        scheduler: LR scheduler, stepped after each optimizer step when `cfg.batch_scheduler` is set
        valid_dataloader: validation batches passed to `valid_fn`
        val_df: validation dataframe; receives 'pred_location' and 'pred_annotation' columns
        valid_texts: plain validation texts, aligned with `val_df`
        best_score (float): best validation F1 seen so far
        fold (int): fold index, used in the checkpoint file name

    Returns:
        tuple: (updated best score, `val_df` with the latest predictions)
    """
    # Set up for training
    scaler = GradScaler(enabled = cfg.apex)   # Loss scaling for mixed precision (toggled by cfg.apex)
    loss = 0
    total_samples = 0
    global_step = 0
    start = time.time()

    # Optimizer steps taken in earlier epochs. `global_step` restarts at 0 on every call, so
    # without this offset the AWP start step (one epoch's worth of steps) is never reached.
    steps_before_epoch = epoch * (len(train_dataloader) // cfg.gradient_accumulation_steps)

    if cfg.use_awp:
        # Initialize AWP
        awp = AWP(model, optimizer, adv_lr = cfg.adv_lr, adv_eps = cfg.adv_eps, start_step = num_train_steps / cfg.nepochs, scaler = scaler)

    if cfg.use_tqdm:
        tbar = tqdm(train_dataloader)
    else:
        tbar = train_dataloader
    
    # At least 1, otherwise a tiny interval or dataloader would make the modulo below divide by zero
    eval_every = max(1, int(cfg.val_check_interval * len(tbar)))

    for i, item in enumerate(tbar):
        model.train()
        # Set up inputs
        input_ids = item['input_ids'].to(cfg.device)
        attention_mask = item['attention_mask'].to(cfg.device)
        token_type_ids = item['token_type_ids'].to(cfg.device)
        labels = item['labels'].to(cfg.device)

        batch_size = input_ids.shape[0]

        # Forward
        with autocast(enabled = cfg.apex):
            batch_logits, batch_loss = model(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, 
                                             labels = labels)
        
        # Report the loss as computed by the model, not the copy scaled down for accumulation
        batch_loss_value = batch_loss.item()

        if cfg.gradient_accumulation_steps > 1:
            batch_loss = batch_loss / cfg.gradient_accumulation_steps

        # Backward
        scaler.scale(batch_loss).backward()

        is_update_step = (i + 1) % cfg.gradient_accumulation_steps == 0
        
        if cfg.use_awp and is_update_step:
            awp.attack_backward(item, steps_before_epoch + global_step)

        # Update loss
        loss += batch_loss_value * batch_size
        total_samples += batch_size

        if cfg.use_tqdm:
            tbar.set_description('Batch loss: {:.4f} - Avg loss: {:.4f}'.format(batch_loss_value, loss / total_samples))

        if is_update_step:
            # Gradients carry the AMP loss scale until they are unscaled; clipping them before
            # that would compare scaled norms against cfg.max_grad_norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if cfg.batch_scheduler:
                scheduler.step()

        # Evaluate
        if (i + 1) % eval_every == 0 or (i + 1) == len(tbar):
            logging.info('Epoch: [{0}][{1}/{2}] - Start evaluating...'.format(epoch + 1, i + 1, len(tbar)))
            val_loss, prediction = valid_fn(cfg, model, valid_dataloader, epoch)
            char_probs = get_char_probs(cfg, valid_texts, prediction)
            results, _ = get_results(char_probs, valid_texts, th = 0.5)
            preds = get_predictions(results)
            val_df['pred_location'] = preds
            val_df['pred_annotation'] = val_df.apply(predspan_extract, axis = 1)
            score = get_score(val_df['location_for_create_labels'].values, preds)

            logging.info('Epoch: [{0}][{1}/{2}] - '
                         'Elapsed {remain:s} - '
                         'Train Loss: {train_loss:.4f} - '
                         'Val Loss: {val_loss:.4f} - '
                         'F1: {score:.4f} - '
                         'LR: {lr:.8f}'
                         .format(epoch + 1, i + 1, len(tbar), 
                                 remain = timeSince(start, float(i + 1) / len(tbar)),
                                 train_loss = loss / total_samples,
                                 val_loss = val_loss,
                                 score = score,
                                 lr = scheduler.get_last_lr()[0]))
            if score > best_score:
                best_score = score
                logging.info(f'Epoch [{epoch + 1}][{i + 1}/{len(tbar)}] - The Best Score Updated to: {best_score:.4f} Model')
                # The validation probabilities are stored next to the weights so the OOF frame
                # can be rebuilt from the checkpoint later
                model_state_dict = {
                    'state_dict': model.state_dict(),
                    'prediction': prediction
                }
                ckp = os.path.join(cfg.output_dir, cfg.model_name.split('_')[0][:-1], cfg.model_name.split('_')[0][-1], f'fold_{fold}.pt')
                torch.save(model_state_dict, ckp)
            else:
                logging.info(f'Epoch [{epoch + 1}][{i + 1}/{len(tbar)}] - Not The Best Score ({score:.4f}), Current Best Score: {best_score:.4f} Model')
            
    return best_score, val_df

#### Per-epoch validating function

In [None]:
def valid_fn(cfg, model, valid_dataloader, epoch):
    """
    Run the model over the validation set without gradient tracking.

    Args:
        cfg: the general configuration object (reads `device` and `max_len`)
        model: model returning (logits, loss) when called with labels
        valid_dataloader: batches with 'input_ids', 'attention_mask', 'token_type_ids', 'labels'
        epoch: accepted for interface compatibility; not used here

    Returns:
        tuple: (sample-weighted mean loss, np array of sigmoid probabilities with the last
        dimension zero-padded on the right up to `cfg.max_len`)
    """
    model.eval()

    running_loss = 0
    seen_samples = 0
    collected = []

    for batch in valid_dataloader:
        # Move the batch to the target device
        input_ids = batch['input_ids'].to(cfg.device)
        attention_mask = batch['attention_mask'].to(cfg.device)
        token_type_ids = batch['token_type_ids'].to(cfg.device)
        labels = batch['labels'].to(cfg.device)

        with torch.no_grad():
            logits, batch_loss = model(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, labels = labels)

        n_rows = input_ids.shape[0]
        running_loss += batch_loss.item() * n_rows
        seen_samples += n_rows

        # Batches may be narrower than cfg.max_len; pad so that all of them can be stacked
        probs = logits.sigmoid()
        missing = cfg.max_len - probs.shape[-1]
        probs = F.pad(probs, (0, missing), 'constant', 0.)
        collected.append(probs.to('cpu').numpy())

    all_probs = np.concatenate(collected)
    return running_loss / seen_samples, all_probs

#### Per-epoch inferring function

In [None]:
def infer_fn(cfg, model, infer_dataloader):
    """
    Predict token probabilities for unlabeled batches.

    Args:
        cfg: the general configuration object (reads `device`, `max_len`, `use_tqdm`)
        model: model returning (logits, loss); called without labels
        infer_dataloader: batches with 'input_ids', 'attention_mask', 'token_type_ids'

    Returns:
        np array: sigmoid probabilities, last dimension zero-padded on the right to `cfg.max_len`
    """
    model.eval()

    batches = tqdm(infer_dataloader) if cfg.use_tqdm else infer_dataloader

    collected = []
    for batch in batches:
        # Move the batch to the target device
        input_ids = batch['input_ids'].to(cfg.device)
        attention_mask = batch['attention_mask'].to(cfg.device)
        token_type_ids = batch['token_type_ids'].to(cfg.device)

        with torch.no_grad():
            logits, _ = model(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)

        # Pad to a common width so batches of different lengths can be stacked
        probs = logits.sigmoid()
        missing = cfg.max_len - probs.shape[-1]
        probs = F.pad(probs, (0, missing), 'constant', 0.)
        collected.append(probs.to('cpu').numpy())

    return np.concatenate(collected)

#### Optimizer and scheduler

In [None]:
def get_optimizer(cfg, model):
    """
    Build an AdamW optimizer with three parameter groups.

    Groups, in order:
        1. backbone parameters subject to weight decay (`cfg.encoder_lr`, `cfg.weight_decay`)
        2. backbone biases and LayerNorm parameters (`cfg.encoder_lr`, no decay)
        3. every parameter whose name does not contain 'backbone' (`cfg.decoder_lr`, no decay)

    Args:
        cfg: the general configuration object (also supplies `lr`, `eps` and `betas`)
        model: model exposing a `backbone` submodule

    Returns:
        the configured optimizer
    """
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    def _skips_decay(param_name):
        return any(nd in param_name for nd in no_decay)

    backbone_decayed, backbone_plain = [], []
    for n, p in model.backbone.named_parameters():
        if _skips_decay(n):
            backbone_plain.append(p)
        else:
            backbone_decayed.append(p)

    head_params = [p for n, p in model.named_parameters() if 'backbone' not in n]

    optimizer_parameters = [
        {'params': backbone_decayed, 'lr': cfg.encoder_lr, 'weight_decay': cfg.weight_decay},
        {'params': backbone_plain, 'lr': cfg.encoder_lr, 'weight_decay': 0.0},
        {'params': head_params, 'lr': cfg.decoder_lr, 'weight_decay': 0.0},
    ]
    return AdamW(optimizer_parameters, lr = cfg.lr, eps = cfg.eps, betas = cfg.betas)

def get_scheduler(cfg, optimizer, num_train_steps):
    """
    Create the warmup learning-rate scheduler selected by `cfg.scheduler_type`.

    Args:
        cfg: the general configuration object (reads `scheduler_type`, `num_warmup_steps`
            and, for the cosine schedule, `num_cycles`)
        optimizer: optimizer to schedule
        num_train_steps (int): total number of scheduler steps

    Returns:
        the scheduler

    Raises:
        ValueError: if `cfg.scheduler_type` is neither 'linear' nor 'cosine'
    """
    if cfg.scheduler_type == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps = cfg.num_warmup_steps, num_training_steps = num_train_steps
        )
    elif cfg.scheduler_type == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps = cfg.num_warmup_steps, num_training_steps = num_train_steps, num_cycles = cfg.num_cycles
        )
    else:
        # Fail with a readable message instead of an UnboundLocalError on the return below
        raise ValueError(f"Unsupported scheduler_type {cfg.scheduler_type!r}; expected 'linear' or 'cosine'.")
    return scheduler

#### AWP (Adversarial Weight Perturbation), code adapted from https://www.kaggle.com/code/wht1996/feedback-nn-train/notebook

In [None]:
class AWP:
    """
    Adversarial Weight Perturbation helper.

    On each attack, the selected parameters are nudged along their current gradient (bounded
    relative to their magnitude), the loss is back-propagated once more on the same batch with
    the perturbed weights, and the original weights are put back. The gradients left on the
    model are therefore those of the adversarial pass.

    Relies on the module-level `cfg` for the device and the autocast switch.
    """
    def __init__(
        self,
        model,
        optimizer,
        adv_param = 'weight',
        adv_lr = 1,
        adv_eps = 0.2,
        start_step = 0,
        adv_step = 1,
        scaler = None
    ):
        """
        Args:
            model: model to perturb; must return (logits, loss) when called with labels
            optimizer: optimizer whose gradients are reset before the adversarial backward pass
            adv_param (str): only parameters whose name contains this string are perturbed
            adv_lr (float): perturbation step size; 0 disables the attack
            adv_eps (float): bound on the perturbation, relative to each weight's absolute value
            start_step: attacks are skipped while the counter passed to `attack_backward` is below this
            adv_step (int): number of perturbation / backward rounds per attack
            scaler: optional GradScaler used for the adversarial backward pass; when None the
                loss is back-propagated unscaled
        """
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_step = start_step
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.scaler = scaler

    def attack_backward(self, batch, epoch):
        """
        Run the adversarial pass on `batch`.

        Args:
            batch (dict): tensors 'input_ids', 'attention_mask', 'token_type_ids', 'labels'
            epoch: progress counter compared against `start_step`

        Returns:
            None
        """
        if (self.adv_lr == 0) or (epoch < self.start_step):
            return None

        self._save()
        for i in range(self.adv_step):
            self._attack_step() 
            with autocast(enabled = cfg.apex):
                input_ids = batch['input_ids'].to(cfg.device)
                attention_mask = batch['attention_mask'].to(cfg.device)
                token_type_ids = batch['token_type_ids'].to(cfg.device)
                labels = batch['labels'].to(cfg.device)
                tr_logits, adv_loss = self.model(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, labels = labels)
                adv_loss = adv_loss.mean()
            # Discard the clean gradients: only the adversarial ones reach the optimizer step
            self.optimizer.zero_grad()
            if self.scaler is not None:
                self.scaler.scale(adv_loss).backward()
            else:
                # No scaler supplied (the constructor default): plain backward
                adv_loss.backward()
            
        self._restore()

    def _attack_step(self):
        """Move each selected parameter along its gradient, then clamp it to the saved bounds."""
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # Unit-norm gradient direction rescaled by the parameter norm
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                    
    def _save(self):
        """Back up the selected parameters and record their allowed range (value +/- adv_eps * |value|)."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self,):
        """Put the backed-up values back and clear the backups."""
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

# Pseudo-labeling function

In [None]:
# Helper functions
def generate_id_fn(x):
    """
    Build the sample id '<pn_num>_<feature_num>'.

    `pn_num` is left-padded with zeros to 5 characters and `feature_num` to 3; longer
    values are kept as they are.
    """
    patient_part = str(x['pn_num']).rjust(5, '0')
    feature_part = str(x['feature_num']).rjust(3, '0')
    return patient_part + '_' + feature_part

def predspan_extract(x):
    """
    Slice `x['pn_history']` with every (start, end) pair found in `x['location']`.

    NOTE(review): the `__main__` cell defines another `predspan_extract` that reads
    'pred_location' instead; once that cell has run, the name refers to that version.

    Returns:
        list of str: one substring per location pair
    """
    history = x['pn_history']
    return [history[start_:end_] for start_, end_ in x['location']]

def filter_pseudo_label_fn(pl_df, thres = 0.8):
    """
    Keep only confident pseudo-labels.

    For each row, the locations whose score exceeds `thres` are retained. A row survives when
    it had no location to begin with, or when at least one location is left after filtering;
    rows losing all of their locations are discarded.

    Side effect: a 'new_location' column is added to `pl_df`.

    Args:
        pl_df: dataframe with 'location' (list per row) and 'scores' (list of floats per row)
        thres (float): minimum score (exclusive) for a location to be kept

    Returns:
        dataframe: filtered copy with the pruned 'location' and without 'new_location' / 'scores'
    """
    def _is_empty(seq):
        return len(seq) == 0

    def _confident_locations(row):
        return np.array(row['location'])[np.array(row['scores']) > thres]

    # Rows that predicted nothing at all (true negatives)
    empty_before = pl_df['scores'].apply(_is_empty)

    pl_df['new_location'] = pl_df[['location', 'scores']].apply(_confident_locations, axis = 1)
    empty_after = pl_df['new_location'].apply(_is_empty)

    # A row whose emptiness status flipped lost every span to the threshold: drop it
    kept = pl_df[empty_before == empty_after].copy()
    kept['location'] = kept['new_location'].apply(lambda arr: arr.tolist())
    return kept.drop(columns = ['new_location', 'scores'])

# Main function
def pseudo_label_fn(cfg, unlabeled_data, features, fold = 0):
    """
    Pseudo-label the unlabeled notes of one fold with that fold's saved checkpoint.

    Args:
        cfg: the general configuration object
        unlabeled_data: dataframe of unlabeled notes with 'kfold', 'case_num', 'pn_num', 'pn_history'
        features: feature dataframe, joined on 'case_num'
        fold (int): fold whose unlabeled rows and checkpoint are used

    Returns:
        dataframe: confident pseudo-labeled rows, restricted to the columns of the global `train`
    """
    logging.info('Generating the unlabeled sampled dataframe...')
    fold_rows = unlabeled_data[unlabeled_data.kfold == fold]
    pl_df = fold_rows.merge(features, right_on = 'case_num', left_on = 'case_num')
    pl_df['feature_text'] = pl_df['feature_text'].apply(process_feature_text).apply(clean_spaces)
    pl_df['pn_history'] = pl_df['pn_history'].apply(clean_spaces)
    if cfg.debug:
        pl_df = pl_df.sample(n = 1000, random_state = cfg.seed + fold)
    infer_texts = pl_df.pn_history.values
    logging.info(f'Use {len(infer_texts)} samples for pseudo-labeling...')

    logging.info('Preparing dataloader for pseudo-labeling...')
    _, tokenized_infer_dataset = generate_tokenized_dataset(cfg, pl_df, mode = 'infer')
    infer_dataloader = DataLoader(tokenized_infer_dataset, batch_size = 4 * cfg.batch_size, num_workers = cfg.num_workers, shuffle = False, collate_fn = collate_fn)

    logging.info('Loading the pretrained model...')
    # Checkpoint layout: <output_dir>/<tag minus last char>/<last char of tag>/fold_<k>.pt
    version_tag = cfg.pl_model_name.split('_')[0]
    ckp_path = os.path.join(cfg.output_dir, version_tag[:-1], version_tag[-1], f'fold_{fold}.pt')
    ckp = torch.load(ckp_path, map_location = cfg.device)
    logging.info(f"Checkpoint: {ckp_path}")
    model = NBME_Model(cfg).to(cfg.device)
    model.load_state_dict(ckp['state_dict'])

    logging.info('Pseudo-labeling...')
    preds = infer_fn(cfg, model, infer_dataloader)

    logging.info('Converting raw predictions into spans...')
    char_probs = get_char_probs(cfg, infer_texts, preds)
    results, scores = get_results(char_probs, infer_texts, th = 0.5)
    pl_df['location'] = get_predictions(results)
    pl_df['annotation_len'] = pl_df['location'].apply(len)
    # Back to the 'start end' string form
    pl_df['location'] = pl_df['location'].apply(lambda spans: [f'{start_} {end_}' for (start_, end_) in spans])
    pl_df['scores'] = scores

    pl_df['id'] = pl_df[['pn_num', 'feature_num']].apply(generate_id_fn, axis = 1)
    pl_df['annotation'] = [['too lazy to derive!']] * pl_df.shape[0]

    # Filter out low-probability labels
    confident_df = filter_pseudo_label_fn(pl_df, thres = cfg.filter_thres)
    return confident_df[train.columns]

# Training-loop function

In [None]:
def training_loop(cfg, fold = 0, pl_args = None):
    """
    Train (or reload) one fold and return its validation frame with out-of-fold predictions.

    Args:
        cfg: the general configuration object
        fold (int): validation fold; rows of the global `train` with a different `kfold` are used for training
        pl_args (dict, optional): {'unlabeled_data': ..., 'features': ...}, required when `cfg.do_pl` is set

    Returns:
        dataframe: validation rows, extended with integer columns 0 .. cfg.max_len - 1 holding the
        token probabilities stored in the fold's checkpoint
    """
    logging.info(f' Fold {fold} '.center(50, '*'))
    set_random_seed(cfg.seed + fold)
    
    logging.info('Preparing training and validating dataloader...')
    trn = train[train.kfold != fold].reset_index(drop = True)
    val = train[train.kfold == fold].reset_index(drop = True)
    valid_texts = val.pn_history.values
    val['location_for_create_labels'] = create_labels_for_scoring(val)

    if cfg.do_pl:
        assert pl_args is not None, "Arguments for the pseudo-labeling function needed! They are 'unlabeled_data' and 'features'."
        pl_df = pseudo_label_fn(cfg, pl_args['unlabeled_data'], pl_args['features'], fold = fold)
        trn = pd.concat([trn, pl_df])

    _, train_dataset = generate_tokenized_dataset(cfg, trn, shuffle = True, mode = 'train')
    _, valid_dataset = generate_tokenized_dataset(cfg, val, shuffle = False, mode = 'valid')

    # The training set is shuffled once above, hence shuffle = False in the loader
    train_dataloader = DataLoader(train_dataset, batch_size = cfg.batch_size, num_workers = cfg.num_workers, shuffle = False, collate_fn = collate_fn)
    valid_dataloader = DataLoader(valid_dataset, batch_size = cfg.batch_size * 4, num_workers = cfg.num_workers, shuffle = False, collate_fn = collate_fn)

    logging.info('Preparing model, optimizer, and scheduler...')
    model = NBME_Model(cfg).to(cfg.device)    # Layer freezing, if any, is controlled by cfg.freeze_until
    optimizer = get_optimizer(cfg, model)
    # The scheduler advances once per optimizer step, i.e. once every
    # `gradient_accumulation_steps` batches, so the schedule length has to shrink accordingly.
    num_training_steps = int(len(trn) / cfg.batch_size / cfg.gradient_accumulation_steps * cfg.nepochs)
    scheduler = get_scheduler(cfg, optimizer, num_training_steps)

    best_score = -np.inf
    if cfg.mode == 'train':
        for epoch in range(cfg.nepochs):
            # Train
            best_score, val = train_fn(cfg, model, train_dataloader, optimizer, epoch, num_training_steps, scheduler, 
                                       valid_dataloader, val, valid_texts, best_score = best_score, fold = fold)
                
    # Always read the predictions back from the best checkpoint (also covers cfg.mode != 'train')
    prediction = torch.load(os.path.join(cfg.output_dir, cfg.model_name.split('_')[0][:-1], cfg.model_name.split('_')[0][-1], f'fold_{fold}.pt'), 
                            map_location = torch.device('cpu'))['prediction']
    val[[i for i in range(cfg.max_len)]] = prediction

    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()
    
    return val

# Main

In [None]:
if __name__ == '__main__':
    def get_result(oof_df):
        """Score the out-of-fold probabilities stored in `oof_df`, log the F1 and return the predicted spans."""
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(cfg.max_len)]].values
        char_probs = get_char_probs(cfg, oof_df.pn_history.values, predictions)
        results, _ = get_results(char_probs, oof_df.pn_history.values, th = 0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        logging.info(f'Score: {score:<.4f}')
        return preds
    
    def predspan_extract(x):
        """Slice `x['pn_history']` with every (start, end) pair in `x['pred_location']`."""
        pred_text = []
        for location in x['pred_location']:
            start_, end_ = location
            pred_text.append(x['pn_history'][start_:end_])
        return pred_text

    # Train every requested fold and collect the per-fold validation frames
    fold_frames = []
    for fold in cfg.training_folds:
        _oof_df = training_loop(cfg, fold = fold, pl_args = {'unlabeled_data': remaining_pn_history, 'features': features})
        fold_frames.append(_oof_df)
        logging.info(f' Fold: {fold} result '.center(50, '*'))
        _ = get_result(_oof_df)
    oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
    oof_df = oof_df.reset_index(drop = True)
    if set(cfg.training_folds) == {0, 1, 2, 3, 4}:
        # Re-arange back to the original order.
        # One pass builds id -> first row position; the lookup is then linear overall instead of
        # rescanning the whole id list for every training row.
        first_position = {}
        for position, sample_id in enumerate(oof_df.id.tolist()):
            first_position.setdefault(sample_id, position)
        oof_df = oof_df.loc[[first_position[sample_id] for sample_id in train.id.tolist()]].reset_index(drop = True)
    logging.info(f' CV '.center(50, '*'))
    pred_location = get_result(oof_df)
    oof_df['pred_location'] = pred_location
    oof_df['pred_annotation'] = oof_df.apply(predspan_extract, axis = 1)
    oof_df.to_pickle(os.path.join(cfg.output_dir, cfg.model_name.split('_')[0][:-1], cfg.model_name.split('_')[0][-1], f"oof_df_{''.join([str(i) for i in cfg.training_folds])}.pkl"))

12:27:58 ********************* Fold 0 *********************
12:27:58 Preparing training and validating dataloader...


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

12:30:28 Preparing model, optimizer, and scheduler...
12:30:28 loading weights file /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a/pytorch_model.bin
12:30:40 Some weights of the model checkpoint at /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a were not used when initializing ElectraModel: ['generator_lm_head.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.weight', 'generator_predictions.dense.bias', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
12:30:40 Al

  0%|          | 0/2860 [00:00<?, ?it/s]

12:32:19 Epoch: [1][572/2860] - Start evaluating...
12:32:33 Epoch: [1][572/2860] - Elapsed 1m 41s (remain 6m 46s) - Train Loss: 1.0308 - Val Loss: 1.0162 - F1: 0.7665 - LR: 0.00001920
12:32:33 Epoch [1][572/2860] - The Best Score Updated to: 0.7665 Model
12:34:04 Epoch: [1][1144/2860] - Start evaluating...
12:34:17 Epoch: [1][1144/2860] - Elapsed 3m 26s (remain 5m 9s) - Train Loss: 1.0217 - Val Loss: 1.0132 - F1: 0.8250 - LR: 0.00001840
12:34:17 Epoch [1][1144/2860] - The Best Score Updated to: 0.8250 Model
12:35:49 Epoch: [1][1716/2860] - Start evaluating...
12:36:02 Epoch: [1][1716/2860] - Elapsed 5m 11s (remain 3m 27s) - Train Loss: 1.0178 - Val Loss: 1.0103 - F1: 0.8359 - LR: 0.00001760
12:36:02 Epoch [1][1716/2860] - The Best Score Updated to: 0.8359 Model
12:37:34 Epoch: [1][2288/2860] - Start evaluating...
12:37:48 Epoch: [1][2288/2860] - Elapsed 6m 56s (remain 1m 44s) - Train Loss: 1.0156 - Val Loss: 1.0097 - F1: 0.8579 - LR: 0.00001680
12:37:48 Epoch [1][2288/2860] - The Best

  0%|          | 0/2860 [00:00<?, ?it/s]

12:41:06 Epoch: [2][572/2860] - Start evaluating...
12:41:19 Epoch: [2][572/2860] - Elapsed 1m 42s (remain 6m 48s) - Train Loss: 1.0074 - Val Loss: 1.0122 - F1: 0.8690 - LR: 0.00001520
12:41:19 Epoch [2][572/2860] - The Best Score Updated to: 0.8690 Model
12:42:51 Epoch: [2][1144/2860] - Start evaluating...
12:43:04 Epoch: [2][1144/2860] - Elapsed 3m 27s (remain 5m 11s) - Train Loss: 1.0069 - Val Loss: 1.0232 - F1: 0.8627 - LR: 0.00001440
12:43:04 Epoch [2][1144/2860] - Not The Best Score (0.8627), Current Best Score: 0.8690 Model
12:44:32 Epoch: [2][1716/2860] - Start evaluating...
12:44:45 Epoch: [2][1716/2860] - Elapsed 5m 7s (remain 3m 25s) - Train Loss: 1.0067 - Val Loss: 1.0173 - F1: 0.8688 - LR: 0.00001360
12:44:45 Epoch [2][1716/2860] - Not The Best Score (0.8688), Current Best Score: 0.8690 Model
12:46:13 Epoch: [2][2288/2860] - Start evaluating...
12:46:26 Epoch: [2][2288/2860] - Elapsed 6m 48s (remain 1m 42s) - Train Loss: 1.0066 - Val Loss: 1.0230 - F1: 0.8601 - LR: 0.00001

  0%|          | 0/2860 [00:00<?, ?it/s]

12:49:35 Epoch: [3][572/2860] - Start evaluating...
12:49:49 Epoch: [3][572/2860] - Elapsed 1m 41s (remain 6m 47s) - Train Loss: 1.0072 - Val Loss: 1.0248 - F1: 0.8705 - LR: 0.00001120
12:49:49 Epoch [3][572/2860] - The Best Score Updated to: 0.8705 Model
12:51:21 Epoch: [3][1144/2860] - Start evaluating...
12:51:35 Epoch: [3][1144/2860] - Elapsed 3m 27s (remain 5m 11s) - Train Loss: 1.0061 - Val Loss: 1.0334 - F1: 0.8719 - LR: 0.00001040
12:51:35 Epoch [3][1144/2860] - The Best Score Updated to: 0.8719 Model
12:53:07 Epoch: [3][1716/2860] - Start evaluating...
12:53:21 Epoch: [3][1716/2860] - Elapsed 5m 13s (remain 3m 29s) - Train Loss: 1.0056 - Val Loss: 1.0238 - F1: 0.8731 - LR: 0.00000960
12:53:21 Epoch [3][1716/2860] - The Best Score Updated to: 0.8731 Model
12:54:54 Epoch: [3][2288/2860] - Start evaluating...
12:55:07 Epoch: [3][2288/2860] - Elapsed 6m 59s (remain 1m 44s) - Train Loss: 1.0054 - Val Loss: 1.0281 - F1: 0.8683 - LR: 0.00000880
12:55:07 Epoch [3][2288/2860] - Not The

  0%|          | 0/2860 [00:00<?, ?it/s]

12:58:21 Epoch: [4][572/2860] - Start evaluating...
12:58:34 Epoch: [4][572/2860] - Elapsed 1m 42s (remain 6m 49s) - Train Loss: 1.0055 - Val Loss: 1.0229 - F1: 0.8712 - LR: 0.00000720
12:58:34 Epoch [4][572/2860] - Not The Best Score (0.8712), Current Best Score: 0.8753 Model
13:00:02 Epoch: [4][1144/2860] - Start evaluating...
13:00:15 Epoch: [4][1144/2860] - Elapsed 3m 23s (remain 5m 5s) - Train Loss: 1.0035 - Val Loss: 1.0357 - F1: 0.8751 - LR: 0.00000640
13:00:15 Epoch [4][1144/2860] - Not The Best Score (0.8751), Current Best Score: 0.8753 Model
13:01:43 Epoch: [4][1716/2860] - Start evaluating...
13:01:56 Epoch: [4][1716/2860] - Elapsed 5m 4s (remain 3m 22s) - Train Loss: 1.0030 - Val Loss: 1.0320 - F1: 0.8787 - LR: 0.00000560
13:01:56 Epoch [4][1716/2860] - The Best Score Updated to: 0.8787 Model
13:03:28 Epoch: [4][2288/2860] - Start evaluating...
13:03:41 Epoch: [4][2288/2860] - Elapsed 6m 48s (remain 1m 42s) - Train Loss: 1.0028 - Val Loss: 1.0405 - F1: 0.8702 - LR: 0.000004

  0%|          | 0/2860 [00:00<?, ?it/s]

13:06:50 Epoch: [5][572/2860] - Start evaluating...
13:07:03 Epoch: [5][572/2860] - Elapsed 1m 41s (remain 6m 46s) - Train Loss: 1.0019 - Val Loss: 1.0311 - F1: 0.8789 - LR: 0.00000320
13:07:03 Epoch [5][572/2860] - The Best Score Updated to: 0.8789 Model
13:08:34 Epoch: [5][1144/2860] - Start evaluating...
13:08:48 Epoch: [5][1144/2860] - Elapsed 3m 26s (remain 5m 9s) - Train Loss: 1.0005 - Val Loss: 1.0346 - F1: 0.8766 - LR: 0.00000240
13:08:48 Epoch [5][1144/2860] - Not The Best Score (0.8766), Current Best Score: 0.8789 Model
13:10:15 Epoch: [5][1716/2860] - Start evaluating...
13:10:28 Epoch: [5][1716/2860] - Elapsed 5m 7s (remain 3m 24s) - Train Loss: 1.0000 - Val Loss: 1.0288 - F1: 0.8780 - LR: 0.00000160
13:10:28 Epoch [5][1716/2860] - Not The Best Score (0.8780), Current Best Score: 0.8789 Model
13:11:56 Epoch: [5][2288/2860] - Start evaluating...
13:12:09 Epoch: [5][2288/2860] - Elapsed 6m 48s (remain 1m 42s) - Train Loss: 1.0000 - Val Loss: 1.0335 - F1: 0.8774 - LR: 0.000000

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

13:16:25 Preparing model, optimizer, and scheduler...
13:16:25 loading weights file /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a/pytorch_model.bin
13:16:28 Some weights of the model checkpoint at /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a were not used when initializing ElectraModel: ['generator_lm_head.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.weight', 'generator_predictions.dense.bias', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
13:16:28 Al

  0%|          | 0/2860 [00:00<?, ?it/s]

13:17:57 Epoch: [1][572/2860] - Start evaluating...
13:18:10 Epoch: [1][572/2860] - Elapsed 1m 41s (remain 6m 46s) - Train Loss: 1.0319 - Val Loss: 1.0145 - F1: 0.7763 - LR: 0.00001920
13:18:10 Epoch [1][572/2860] - The Best Score Updated to: 0.7763 Model
13:19:42 Epoch: [1][1144/2860] - Start evaluating...
13:19:55 Epoch: [1][1144/2860] - Elapsed 3m 26s (remain 5m 9s) - Train Loss: 1.0219 - Val Loss: 1.0126 - F1: 0.8096 - LR: 0.00001840
13:19:55 Epoch [1][1144/2860] - The Best Score Updated to: 0.8096 Model
13:21:26 Epoch: [1][1716/2860] - Start evaluating...
13:21:39 Epoch: [1][1716/2860] - Elapsed 5m 10s (remain 3m 26s) - Train Loss: 1.0194 - Val Loss: 1.0137 - F1: 0.7948 - LR: 0.00001760
13:21:39 Epoch [1][1716/2860] - Not The Best Score (0.7948), Current Best Score: 0.8096 Model
13:23:06 Epoch: [1][2288/2860] - Start evaluating...
13:23:19 Epoch: [1][2288/2860] - Elapsed 6m 50s (remain 1m 42s) - Train Loss: 1.0173 - Val Loss: 1.0103 - F1: 0.8165 - LR: 0.00001680
13:23:19 Epoch [1]

  0%|          | 0/2860 [00:00<?, ?it/s]

13:26:37 Epoch: [2][572/2860] - Start evaluating...
13:26:50 Epoch: [2][572/2860] - Elapsed 1m 42s (remain 6m 49s) - Train Loss: 1.0065 - Val Loss: 1.0091 - F1: 0.8642 - LR: 0.00001520
13:26:50 Epoch [2][572/2860] - The Best Score Updated to: 0.8642 Model
13:28:22 Epoch: [2][1144/2860] - Start evaluating...
13:28:35 Epoch: [2][1144/2860] - Elapsed 3m 27s (remain 5m 11s) - Train Loss: 1.0064 - Val Loss: 1.0101 - F1: 0.8649 - LR: 0.00001440
13:28:35 Epoch [2][1144/2860] - The Best Score Updated to: 0.8649 Model
13:30:07 Epoch: [2][1716/2860] - Start evaluating...
13:30:20 Epoch: [2][1716/2860] - Elapsed 5m 11s (remain 3m 27s) - Train Loss: 1.0061 - Val Loss: 1.0117 - F1: 0.8679 - LR: 0.00001360
13:30:20 Epoch [2][1716/2860] - The Best Score Updated to: 0.8679 Model
13:31:51 Epoch: [2][2288/2860] - Start evaluating...
13:32:04 Epoch: [2][2288/2860] - Elapsed 6m 55s (remain 1m 43s) - Train Loss: 1.0058 - Val Loss: 1.0182 - F1: 0.8660 - LR: 0.00001280
13:32:04 Epoch [2][2288/2860] - Not The

  0%|          | 0/2860 [00:00<?, ?it/s]

13:35:13 Epoch: [3][572/2860] - Start evaluating...
13:35:26 Epoch: [3][572/2860] - Elapsed 1m 42s (remain 6m 48s) - Train Loss: 1.0077 - Val Loss: 1.0171 - F1: 0.8702 - LR: 0.00001120
13:35:26 Epoch [3][572/2860] - The Best Score Updated to: 0.8702 Model
13:36:59 Epoch: [3][1144/2860] - Start evaluating...
13:37:12 Epoch: [3][1144/2860] - Elapsed 3m 27s (remain 5m 11s) - Train Loss: 1.0078 - Val Loss: 1.0214 - F1: 0.8705 - LR: 0.00001040
13:37:12 Epoch [3][1144/2860] - The Best Score Updated to: 0.8705 Model
13:38:44 Epoch: [3][1716/2860] - Start evaluating...
13:38:57 Epoch: [3][1716/2860] - Elapsed 5m 12s (remain 3m 28s) - Train Loss: 1.0067 - Val Loss: 1.0175 - F1: 0.8606 - LR: 0.00000960
13:38:57 Epoch [3][1716/2860] - Not The Best Score (0.8606), Current Best Score: 0.8705 Model
13:40:25 Epoch: [3][2288/2860] - Start evaluating...
13:40:38 Epoch: [3][2288/2860] - Elapsed 6m 53s (remain 1m 43s) - Train Loss: 1.0060 - Val Loss: 1.0248 - F1: 0.8671 - LR: 0.00000880
13:40:38 Epoch [3

  0%|          | 0/2860 [00:00<?, ?it/s]

13:43:47 Epoch: [4][572/2860] - Start evaluating...
13:44:00 Epoch: [4][572/2860] - Elapsed 1m 42s (remain 6m 48s) - Train Loss: 1.0063 - Val Loss: 1.0237 - F1: 0.8757 - LR: 0.00000720
13:44:00 Epoch [4][572/2860] - The Best Score Updated to: 0.8757 Model
13:45:33 Epoch: [4][1144/2860] - Start evaluating...
13:45:46 Epoch: [4][1144/2860] - Elapsed 3m 27s (remain 5m 11s) - Train Loss: 1.0064 - Val Loss: 1.0230 - F1: 0.8779 - LR: 0.00000640
13:45:46 Epoch [4][1144/2860] - The Best Score Updated to: 0.8779 Model
13:47:18 Epoch: [4][1716/2860] - Start evaluating...
13:47:31 Epoch: [4][1716/2860] - Elapsed 5m 12s (remain 3m 28s) - Train Loss: 1.0050 - Val Loss: 1.0243 - F1: 0.8763 - LR: 0.00000560
13:47:31 Epoch [4][1716/2860] - Not The Best Score (0.8763), Current Best Score: 0.8779 Model
13:48:58 Epoch: [4][2288/2860] - Start evaluating...
13:49:11 Epoch: [4][2288/2860] - Elapsed 6m 52s (remain 1m 43s) - Train Loss: 1.0040 - Val Loss: 1.0285 - F1: 0.8747 - LR: 0.00000480
13:49:11 Epoch [4

  0%|          | 0/2860 [00:00<?, ?it/s]

13:52:20 Epoch: [5][572/2860] - Start evaluating...
13:52:33 Epoch: [5][572/2860] - Elapsed 1m 41s (remain 6m 45s) - Train Loss: 1.0062 - Val Loss: 1.0255 - F1: 0.8812 - LR: 0.00000320
13:52:33 Epoch [5][572/2860] - The Best Score Updated to: 0.8812 Model
13:54:04 Epoch: [5][1144/2860] - Start evaluating...
13:54:17 Epoch: [5][1144/2860] - Elapsed 3m 26s (remain 5m 9s) - Train Loss: 1.0045 - Val Loss: 1.0266 - F1: 0.8814 - LR: 0.00000240
13:54:17 Epoch [5][1144/2860] - The Best Score Updated to: 0.8814 Model
13:55:49 Epoch: [5][1716/2860] - Start evaluating...
13:56:02 Epoch: [5][1716/2860] - Elapsed 5m 10s (remain 3m 27s) - Train Loss: 1.0029 - Val Loss: 1.0262 - F1: 0.8801 - LR: 0.00000160
13:56:02 Epoch [5][1716/2860] - Not The Best Score (0.8801), Current Best Score: 0.8814 Model
13:57:30 Epoch: [5][2288/2860] - Start evaluating...
13:57:43 Epoch: [5][2288/2860] - Elapsed 6m 51s (remain 1m 42s) - Train Loss: 1.0020 - Val Loss: 1.0304 - F1: 0.8803 - LR: 0.00000080
13:57:43 Epoch [5]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

14:02:01 Preparing model, optimizer, and scheduler...
14:02:01 loading weights file /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a/pytorch_model.bin
14:02:05 Some weights of the model checkpoint at /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a were not used when initializing ElectraModel: ['generator_lm_head.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.weight', 'generator_predictions.dense.bias', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
14:02:05 Al

  0%|          | 0/2860 [00:00<?, ?it/s]

14:03:34 Epoch: [1][572/2860] - Start evaluating...
14:03:47 Epoch: [1][572/2860] - Elapsed 1m 41s (remain 6m 47s) - Train Loss: 1.0305 - Val Loss: 1.0151 - F1: 0.7616 - LR: 0.00001920
14:03:47 Epoch [1][572/2860] - The Best Score Updated to: 0.7616 Model
14:05:18 Epoch: [1][1144/2860] - Start evaluating...
14:05:31 Epoch: [1][1144/2860] - Elapsed 3m 26s (remain 5m 9s) - Train Loss: 1.0210 - Val Loss: 1.0111 - F1: 0.8218 - LR: 0.00001840
14:05:31 Epoch [1][1144/2860] - The Best Score Updated to: 0.8218 Model
14:07:03 Epoch: [1][1716/2860] - Start evaluating...
14:07:16 Epoch: [1][1716/2860] - Elapsed 5m 10s (remain 3m 26s) - Train Loss: 1.0174 - Val Loss: 1.0134 - F1: 0.7834 - LR: 0.00001760
14:07:16 Epoch [1][1716/2860] - Not The Best Score (0.7834), Current Best Score: 0.8218 Model
14:08:43 Epoch: [1][2288/2860] - Start evaluating...
14:08:56 Epoch: [1][2288/2860] - Elapsed 6m 50s (remain 1m 42s) - Train Loss: 1.0158 - Val Loss: 1.0116 - F1: 0.8173 - LR: 0.00001680
14:08:56 Epoch [1]

  0%|          | 0/2860 [00:00<?, ?it/s]

14:12:12 Epoch: [2][572/2860] - Start evaluating...
14:12:25 Epoch: [2][572/2860] - Elapsed 1m 45s (remain 7m 1s) - Train Loss: 1.0070 - Val Loss: 1.0152 - F1: 0.8519 - LR: 0.00001520
14:12:25 Epoch [2][572/2860] - The Best Score Updated to: 0.8519 Model
14:13:57 Epoch: [2][1144/2860] - Start evaluating...
14:14:10 Epoch: [2][1144/2860] - Elapsed 3m 30s (remain 5m 15s) - Train Loss: 1.0070 - Val Loss: 1.0138 - F1: 0.8564 - LR: 0.00001440
14:14:10 Epoch [2][1144/2860] - The Best Score Updated to: 0.8564 Model
14:15:42 Epoch: [2][1716/2860] - Start evaluating...
14:15:56 Epoch: [2][1716/2860] - Elapsed 5m 15s (remain 3m 30s) - Train Loss: 1.0069 - Val Loss: 1.0186 - F1: 0.8561 - LR: 0.00001360
14:15:56 Epoch [2][1716/2860] - Not The Best Score (0.8561), Current Best Score: 0.8564 Model
14:17:23 Epoch: [2][2288/2860] - Start evaluating...
14:17:36 Epoch: [2][2288/2860] - Elapsed 6m 56s (remain 1m 44s) - Train Loss: 1.0068 - Val Loss: 1.0229 - F1: 0.8639 - LR: 0.00001280
14:17:36 Epoch [2]

  0%|          | 0/2860 [00:00<?, ?it/s]

14:20:50 Epoch: [3][572/2860] - Start evaluating...
14:21:03 Epoch: [3][572/2860] - Elapsed 1m 41s (remain 6m 45s) - Train Loss: 1.0074 - Val Loss: 1.0230 - F1: 0.8637 - LR: 0.00001120
14:21:03 Epoch [3][572/2860] - Not The Best Score (0.8637), Current Best Score: 0.8639 Model
14:22:30 Epoch: [3][1144/2860] - Start evaluating...
14:22:43 Epoch: [3][1144/2860] - Elapsed 3m 21s (remain 5m 2s) - Train Loss: 1.0066 - Val Loss: 1.0238 - F1: 0.8645 - LR: 0.00001040
14:22:43 Epoch [3][1144/2860] - The Best Score Updated to: 0.8645 Model
14:24:15 Epoch: [3][1716/2860] - Start evaluating...
14:24:28 Epoch: [3][1716/2860] - Elapsed 5m 6s (remain 3m 24s) - Train Loss: 1.0062 - Val Loss: 1.0237 - F1: 0.8657 - LR: 0.00000960
14:24:28 Epoch [3][1716/2860] - The Best Score Updated to: 0.8657 Model
14:26:00 Epoch: [3][2288/2860] - Start evaluating...
14:26:13 Epoch: [3][2288/2860] - Elapsed 6m 51s (remain 1m 42s) - Train Loss: 1.0059 - Val Loss: 1.0263 - F1: 0.8592 - LR: 0.00000880
14:26:13 Epoch [3][

  0%|          | 0/2860 [00:00<?, ?it/s]

14:29:23 Epoch: [4][572/2860] - Start evaluating...
14:29:36 Epoch: [4][572/2860] - Elapsed 1m 41s (remain 6m 45s) - Train Loss: 1.0056 - Val Loss: 1.0298 - F1: 0.8695 - LR: 0.00000720
14:29:36 Epoch [4][572/2860] - The Best Score Updated to: 0.8695 Model
14:31:08 Epoch: [4][1144/2860] - Start evaluating...
14:31:21 Epoch: [4][1144/2860] - Elapsed 3m 26s (remain 5m 10s) - Train Loss: 1.0045 - Val Loss: 1.0300 - F1: 0.8720 - LR: 0.00000640
14:31:21 Epoch [4][1144/2860] - The Best Score Updated to: 0.8720 Model
14:32:53 Epoch: [4][1716/2860] - Start evaluating...
14:33:07 Epoch: [4][1716/2860] - Elapsed 5m 12s (remain 3m 28s) - Train Loss: 1.0039 - Val Loss: 1.0299 - F1: 0.8757 - LR: 0.00000560
14:33:07 Epoch [4][1716/2860] - The Best Score Updated to: 0.8757 Model
14:34:39 Epoch: [4][2288/2860] - Start evaluating...
14:34:52 Epoch: [4][2288/2860] - Elapsed 6m 57s (remain 1m 44s) - Train Loss: 1.0033 - Val Loss: 1.0345 - F1: 0.8678 - LR: 0.00000480
14:34:52 Epoch [4][2288/2860] - Not The

  0%|          | 0/2860 [00:00<?, ?it/s]

14:38:07 Epoch: [5][572/2860] - Start evaluating...
14:38:20 Epoch: [5][572/2860] - Elapsed 1m 42s (remain 6m 51s) - Train Loss: 1.0033 - Val Loss: 1.0316 - F1: 0.8789 - LR: 0.00000320
14:38:20 Epoch [5][572/2860] - The Best Score Updated to: 0.8789 Model
14:39:54 Epoch: [5][1144/2860] - Start evaluating...
14:40:07 Epoch: [5][1144/2860] - Elapsed 3m 29s (remain 5m 14s) - Train Loss: 1.0021 - Val Loss: 1.0325 - F1: 0.8757 - LR: 0.00000240
14:40:07 Epoch [5][1144/2860] - Not The Best Score (0.8757), Current Best Score: 0.8789 Model
14:41:35 Epoch: [5][1716/2860] - Start evaluating...
14:41:48 Epoch: [5][1716/2860] - Elapsed 5m 10s (remain 3m 27s) - Train Loss: 1.0015 - Val Loss: 1.0350 - F1: 0.8751 - LR: 0.00000160
14:41:48 Epoch [5][1716/2860] - Not The Best Score (0.8751), Current Best Score: 0.8789 Model
14:43:16 Epoch: [5][2288/2860] - Start evaluating...
14:43:29 Epoch: [5][2288/2860] - Elapsed 6m 52s (remain 1m 43s) - Train Loss: 1.0009 - Val Loss: 1.0359 - F1: 0.8764 - LR: 0.0000

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

14:47:47 Preparing model, optimizer, and scheduler...
14:47:47 loading weights file /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a/pytorch_model.bin
14:47:50 Some weights of the model checkpoint at /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a were not used when initializing ElectraModel: ['generator_lm_head.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.weight', 'generator_predictions.dense.bias', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
14:47:50 Al

  0%|          | 0/2860 [00:00<?, ?it/s]

14:49:20 Epoch: [1][572/2860] - Start evaluating...
14:49:33 Epoch: [1][572/2860] - Elapsed 1m 42s (remain 6m 50s) - Train Loss: 1.0326 - Val Loss: 1.0166 - F1: 0.7655 - LR: 0.00001920
14:49:33 Epoch [1][572/2860] - The Best Score Updated to: 0.7655 Model
14:51:06 Epoch: [1][1144/2860] - Start evaluating...
14:51:19 Epoch: [1][1144/2860] - Elapsed 3m 28s (remain 5m 12s) - Train Loss: 1.0227 - Val Loss: 1.0136 - F1: 0.8167 - LR: 0.00001840
14:51:19 Epoch [1][1144/2860] - The Best Score Updated to: 0.8167 Model
14:52:52 Epoch: [1][1716/2860] - Start evaluating...
14:53:05 Epoch: [1][1716/2860] - Elapsed 5m 13s (remain 3m 29s) - Train Loss: 1.0186 - Val Loss: 1.0097 - F1: 0.8354 - LR: 0.00001760
14:53:05 Epoch [1][1716/2860] - The Best Score Updated to: 0.8354 Model
14:54:37 Epoch: [1][2288/2860] - Start evaluating...
14:54:50 Epoch: [1][2288/2860] - Elapsed 6m 59s (remain 1m 44s) - Train Loss: 1.0161 - Val Loss: 1.0106 - F1: 0.8480 - LR: 0.00001680
14:54:50 Epoch [1][2288/2860] - The Bes

  0%|          | 0/2860 [00:00<?, ?it/s]

14:58:09 Epoch: [2][572/2860] - Start evaluating...
14:58:22 Epoch: [2][572/2860] - Elapsed 1m 42s (remain 6m 50s) - Train Loss: 1.0074 - Val Loss: 1.0140 - F1: 0.8616 - LR: 0.00001520
14:58:22 Epoch [2][572/2860] - Not The Best Score (0.8616), Current Best Score: 0.8634 Model
14:59:50 Epoch: [2][1144/2860] - Start evaluating...
15:00:02 Epoch: [2][1144/2860] - Elapsed 3m 23s (remain 5m 4s) - Train Loss: 1.0083 - Val Loss: 1.0166 - F1: 0.8593 - LR: 0.00001440
15:00:02 Epoch [2][1144/2860] - Not The Best Score (0.8593), Current Best Score: 0.8634 Model
15:01:30 Epoch: [2][1716/2860] - Start evaluating...
15:01:43 Epoch: [2][1716/2860] - Elapsed 5m 3s (remain 3m 22s) - Train Loss: 1.0076 - Val Loss: 1.0157 - F1: 0.8622 - LR: 0.00001360
15:01:43 Epoch [2][1716/2860] - Not The Best Score (0.8622), Current Best Score: 0.8634 Model
15:03:11 Epoch: [2][2288/2860] - Start evaluating...
15:03:24 Epoch: [2][2288/2860] - Elapsed 6m 44s (remain 1m 41s) - Train Loss: 1.0073 - Val Loss: 1.0252 - F1:

  0%|          | 0/2860 [00:00<?, ?it/s]

15:06:37 Epoch: [3][572/2860] - Start evaluating...
15:06:50 Epoch: [3][572/2860] - Elapsed 1m 42s (remain 6m 48s) - Train Loss: 1.0074 - Val Loss: 1.0246 - F1: 0.8695 - LR: 0.00001120
15:06:50 Epoch [3][572/2860] - The Best Score Updated to: 0.8695 Model
15:08:22 Epoch: [3][1144/2860] - Start evaluating...
15:08:35 Epoch: [3][1144/2860] - Elapsed 3m 26s (remain 5m 10s) - Train Loss: 1.0054 - Val Loss: 1.0261 - F1: 0.8681 - LR: 0.00001040
15:08:35 Epoch [3][1144/2860] - Not The Best Score (0.8681), Current Best Score: 0.8695 Model
15:10:02 Epoch: [3][1716/2860] - Start evaluating...
15:10:15 Epoch: [3][1716/2860] - Elapsed 5m 6s (remain 3m 24s) - Train Loss: 1.0044 - Val Loss: 1.0276 - F1: 0.8703 - LR: 0.00000960
15:10:15 Epoch [3][1716/2860] - The Best Score Updated to: 0.8703 Model
15:11:47 Epoch: [3][2288/2860] - Start evaluating...
15:11:59 Epoch: [3][2288/2860] - Elapsed 6m 51s (remain 1m 42s) - Train Loss: 1.0038 - Val Loss: 1.0284 - F1: 0.8711 - LR: 0.00000880
15:11:59 Epoch [3]

  0%|          | 0/2860 [00:00<?, ?it/s]

15:15:13 Epoch: [4][572/2860] - Start evaluating...
15:15:26 Epoch: [4][572/2860] - Elapsed 1m 41s (remain 6m 45s) - Train Loss: 1.0049 - Val Loss: 1.0310 - F1: 0.8713 - LR: 0.00000720
15:15:26 Epoch [4][572/2860] - The Best Score Updated to: 0.8713 Model
15:16:58 Epoch: [4][1144/2860] - Start evaluating...
15:17:11 Epoch: [4][1144/2860] - Elapsed 3m 26s (remain 5m 9s) - Train Loss: 1.0028 - Val Loss: 1.0321 - F1: 0.8725 - LR: 0.00000640
15:17:11 Epoch [4][1144/2860] - The Best Score Updated to: 0.8725 Model
15:18:43 Epoch: [4][1716/2860] - Start evaluating...
15:18:56 Epoch: [4][1716/2860] - Elapsed 5m 11s (remain 3m 27s) - Train Loss: 1.0020 - Val Loss: 1.0254 - F1: 0.8723 - LR: 0.00000560
15:18:56 Epoch [4][1716/2860] - Not The Best Score (0.8723), Current Best Score: 0.8725 Model
15:20:24 Epoch: [4][2288/2860] - Start evaluating...
15:20:38 Epoch: [4][2288/2860] - Elapsed 6m 53s (remain 1m 43s) - Train Loss: 1.0013 - Val Loss: 1.0306 - F1: 0.8741 - LR: 0.00000480
15:20:38 Epoch [4]

  0%|          | 0/2860 [00:00<?, ?it/s]

15:23:55 Epoch: [5][572/2860] - Start evaluating...
15:24:08 Epoch: [5][572/2860] - Elapsed 1m 41s (remain 6m 45s) - Train Loss: 1.0033 - Val Loss: 1.0302 - F1: 0.8740 - LR: 0.00000320
15:24:08 Epoch [5][572/2860] - Not The Best Score (0.8740), Current Best Score: 0.8741 Model
15:25:35 Epoch: [5][1144/2860] - Start evaluating...
15:25:48 Epoch: [5][1144/2860] - Elapsed 3m 21s (remain 5m 2s) - Train Loss: 1.0011 - Val Loss: 1.0312 - F1: 0.8761 - LR: 0.00000240
15:25:48 Epoch [5][1144/2860] - The Best Score Updated to: 0.8761 Model
15:27:20 Epoch: [5][1716/2860] - Start evaluating...
15:27:33 Epoch: [5][1716/2860] - Elapsed 5m 6s (remain 3m 24s) - Train Loss: 1.0000 - Val Loss: 1.0290 - F1: 0.8781 - LR: 0.00000160
15:27:33 Epoch [5][1716/2860] - The Best Score Updated to: 0.8781 Model
15:29:05 Epoch: [5][2288/2860] - Start evaluating...
15:29:18 Epoch: [5][2288/2860] - Elapsed 6m 51s (remain 1m 42s) - Train Loss: 0.9993 - Val Loss: 1.0300 - F1: 0.8799 - LR: 0.00000080
15:29:18 Epoch [5][

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

15:33:38 Preparing model, optimizer, and scheduler...
15:33:38 loading weights file /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a/pytorch_model.bin
15:33:41 Some weights of the model checkpoint at /content/drive/My Drive/Kaggle competitions/NBME/model/v11/a were not used when initializing ElectraModel: ['generator_lm_head.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.weight', 'generator_predictions.dense.bias', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
15:33:41 Al

  0%|          | 0/2860 [00:00<?, ?it/s]

15:35:10 Epoch: [1][572/2860] - Start evaluating...
15:35:24 Epoch: [1][572/2860] - Elapsed 1m 41s (remain 6m 47s) - Train Loss: 1.0324 - Val Loss: 1.0162 - F1: 0.7792 - LR: 0.00001920
15:35:24 Epoch [1][572/2860] - The Best Score Updated to: 0.7792 Model
15:36:56 Epoch: [1][1144/2860] - Start evaluating...
15:37:10 Epoch: [1][1144/2860] - Elapsed 3m 27s (remain 5m 11s) - Train Loss: 1.0228 - Val Loss: 1.0110 - F1: 0.8154 - LR: 0.00001840
15:37:10 Epoch [1][1144/2860] - The Best Score Updated to: 0.8154 Model
15:38:41 Epoch: [1][1716/2860] - Start evaluating...
15:38:54 Epoch: [1][1716/2860] - Elapsed 5m 12s (remain 3m 28s) - Train Loss: 1.0190 - Val Loss: 1.0098 - F1: 0.8439 - LR: 0.00001760
15:38:54 Epoch [1][1716/2860] - The Best Score Updated to: 0.8439 Model
15:40:26 Epoch: [1][2288/2860] - Start evaluating...
15:40:39 Epoch: [1][2288/2860] - Elapsed 6m 57s (remain 1m 44s) - Train Loss: 1.0165 - Val Loss: 1.0096 - F1: 0.8592 - LR: 0.00001680
15:40:39 Epoch [1][2288/2860] - The Bes

  0%|          | 0/2860 [00:00<?, ?it/s]

15:43:53 Epoch: [2][572/2860] - Start evaluating...
15:44:07 Epoch: [2][572/2860] - Elapsed 1m 41s (remain 6m 47s) - Train Loss: 1.0072 - Val Loss: 1.0154 - F1: 0.8632 - LR: 0.00001520
15:44:07 Epoch [2][572/2860] - The Best Score Updated to: 0.8632 Model
15:45:39 Epoch: [2][1144/2860] - Start evaluating...
15:45:52 Epoch: [2][1144/2860] - Elapsed 3m 26s (remain 5m 10s) - Train Loss: 1.0080 - Val Loss: 1.0149 - F1: 0.8661 - LR: 0.00001440
15:45:52 Epoch [2][1144/2860] - The Best Score Updated to: 0.8661 Model
15:47:23 Epoch: [2][1716/2860] - Start evaluating...
15:47:37 Epoch: [2][1716/2860] - Elapsed 5m 11s (remain 3m 27s) - Train Loss: 1.0083 - Val Loss: 1.0140 - F1: 0.8580 - LR: 0.00001360
15:47:37 Epoch [2][1716/2860] - Not The Best Score (0.8580), Current Best Score: 0.8661 Model
15:49:04 Epoch: [2][2288/2860] - Start evaluating...
15:49:17 Epoch: [2][2288/2860] - Elapsed 6m 52s (remain 1m 43s) - Train Loss: 1.0083 - Val Loss: 1.0214 - F1: 0.8550 - LR: 0.00001280
15:49:17 Epoch [2

  0%|          | 0/2860 [00:00<?, ?it/s]

15:52:26 Epoch: [3][572/2860] - Start evaluating...
15:52:40 Epoch: [3][572/2860] - Elapsed 1m 41s (remain 6m 47s) - Train Loss: 1.0083 - Val Loss: 1.0287 - F1: 0.8711 - LR: 0.00001120
15:52:40 Epoch [3][572/2860] - The Best Score Updated to: 0.8711 Model
15:54:12 Epoch: [3][1144/2860] - Start evaluating...
15:54:26 Epoch: [3][1144/2860] - Elapsed 3m 27s (remain 5m 11s) - Train Loss: 1.0078 - Val Loss: 1.0242 - F1: 0.8692 - LR: 0.00001040
15:54:26 Epoch [3][1144/2860] - Not The Best Score (0.8692), Current Best Score: 0.8711 Model
15:55:54 Epoch: [3][1716/2860] - Start evaluating...
15:56:07 Epoch: [3][1716/2860] - Elapsed 5m 9s (remain 3m 26s) - Train Loss: 1.0068 - Val Loss: 1.0230 - F1: 0.8729 - LR: 0.00000960
15:56:07 Epoch [3][1716/2860] - The Best Score Updated to: 0.8729 Model
15:57:39 Epoch: [3][2288/2860] - Start evaluating...
15:57:53 Epoch: [3][2288/2860] - Elapsed 6m 55s (remain 1m 43s) - Train Loss: 1.0062 - Val Loss: 1.0270 - F1: 0.8700 - LR: 0.00000880
15:57:53 Epoch [3]

  0%|          | 0/2860 [00:00<?, ?it/s]

16:01:08 Epoch: [4][572/2860] - Start evaluating...
16:01:21 Epoch: [4][572/2860] - Elapsed 1m 42s (remain 6m 51s) - Train Loss: 1.0046 - Val Loss: 1.0328 - F1: 0.8750 - LR: 0.00000720
16:01:21 Epoch [4][572/2860] - Not The Best Score (0.8750), Current Best Score: 0.8755 Model
16:02:49 Epoch: [4][1144/2860] - Start evaluating...
16:03:02 Epoch: [4][1144/2860] - Elapsed 3m 24s (remain 5m 6s) - Train Loss: 1.0041 - Val Loss: 1.0223 - F1: 0.8733 - LR: 0.00000640
16:03:02 Epoch [4][1144/2860] - Not The Best Score (0.8733), Current Best Score: 0.8755 Model
16:04:30 Epoch: [4][1716/2860] - Start evaluating...
16:04:43 Epoch: [4][1716/2860] - Elapsed 5m 5s (remain 3m 23s) - Train Loss: 1.0029 - Val Loss: 1.0306 - F1: 0.8701 - LR: 0.00000560
16:04:43 Epoch [4][1716/2860] - Not The Best Score (0.8701), Current Best Score: 0.8755 Model
16:06:11 Epoch: [4][2288/2860] - Start evaluating...
16:06:24 Epoch: [4][2288/2860] - Elapsed 6m 46s (remain 1m 41s) - Train Loss: 1.0026 - Val Loss: 1.0250 - F1:

  0%|          | 0/2860 [00:00<?, ?it/s]

16:09:38 Epoch: [5][572/2860] - Start evaluating...
16:09:52 Epoch: [5][572/2860] - Elapsed 1m 42s (remain 6m 49s) - Train Loss: 1.0036 - Val Loss: 1.0288 - F1: 0.8819 - LR: 0.00000320
16:09:52 Epoch [5][572/2860] - The Best Score Updated to: 0.8819 Model
16:11:24 Epoch: [5][1144/2860] - Start evaluating...
16:11:37 Epoch: [5][1144/2860] - Elapsed 3m 27s (remain 5m 11s) - Train Loss: 1.0015 - Val Loss: 1.0285 - F1: 0.8815 - LR: 0.00000240
16:11:37 Epoch [5][1144/2860] - Not The Best Score (0.8815), Current Best Score: 0.8819 Model
16:13:05 Epoch: [5][1716/2860] - Start evaluating...
16:13:18 Epoch: [5][1716/2860] - Elapsed 5m 8s (remain 3m 25s) - Train Loss: 1.0005 - Val Loss: 1.0320 - F1: 0.8834 - LR: 0.00000160
16:13:18 Epoch [5][1716/2860] - The Best Score Updated to: 0.8834 Model
16:14:50 Epoch: [5][2288/2860] - Start evaluating...
16:15:03 Epoch: [5][2288/2860] - Elapsed 6m 54s (remain 1m 43s) - Train Loss: 1.0001 - Val Loss: 1.0303 - F1: 0.8824 - LR: 0.00000080
16:15:03 Epoch [5]