In [1]:
!nvidia-smi

Wed Apr 27 05:20:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    50W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# Init

In [3]:
# !pip install wandb > /dev/null

In [4]:
# %cd "/content/drive/MyDrive/Colab Notebooks/nbme/code/exp076"

In [5]:
import os
import random
import re
import yaml
import numpy as np
from numpy import ndarray
import pandas as pd
import torch
from logging import Logger, getLogger, INFO, StreamHandler, FileHandler, Formatter
import wandb
from pathlib import Path
from wandb.sdk.wandb_config import Config

def init_pandas() -> None:    
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

def get_logger(filename:str) -> Logger:
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

def seed_everything(seed:int=42) -> None:
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def init_wandb(wandb_key:str) -> Config:
    #from kaggle_secrets import UserSecretsClient
    #user_secrets = UserSecretsClient()
    secret_value_0 = wandb_key
    wandb.login(key=secret_value_0)

    INPUT_DIR = Path("../input") #'../input/raiii-nbme'
    loader = yaml.SafeLoader
    loader.add_implicit_resolver(
        u'tag:yaml.org,2002:float',
        re.compile(u'''^(?:
         [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
        |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
        |\\.[0-9_]+(?:[eE][-+][0-9]+)?
        |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
        |[-+]?\\.(?:inf|Inf|INF)
        |\\.(?:nan|NaN|NAN))$''', re.X),
        list(u'-+0123456789.'))
    with open(INPUT_DIR / 'exp076_config.yml') as f:
        param = yaml.load(f, Loader=loader)
    wandb.init(
        project=param['project'],
        config=param
    )
    wandb.config.update(param)
    print(f'run name: {wandb.run.name}')    
    return wandb.config

def mk_output_dir(path:str) -> None:
    if not os.path.exists(path):
        os.makedirs(path)

In [6]:
from getpass import getpass
wandb_key = getpass()

········


In [7]:
import warnings

#wandb_key = ""
config = init_wandb(wandb_key=wandb_key)
mk_output_dir(path=config.output_dir)
LOGGER = get_logger(
    filename=config.output_dir+'train'
)
seed_everything(seed=config.seed)
init_pandas()
warnings.filterwarnings("ignore")

[34m[1mwandb[0m: Currently logged in as: [33mmpeg[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


run name: quiet-snow-26


# Helper functions for scoring

In [8]:
from sklearn.metrics import f1_score

def get_score(y_true:ndarray, y_pred:ndarray) -> float:
    score = span_micro_f1(y_true, y_pred)
    return score

def micro_f1(preds:list, truths:list) -> float:
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans:list, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [9]:
# !pip install transformers[sentencepiece] > /dev/null

In [10]:
import itertools
import ast
from pandas import DataFrame
from transformers.tokenization_utils import PreTrainedTokenizer

def create_labels_for_scoring(df:DataFrame):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


#↓こっちのほうがちょっといい
def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        prev_pred = 0
        prev_end = -1
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
            if start != prev_end:
                results[i][prev_end:start] = (pred+prev_pred)/2
            prev_pred = pred
            prev_end = end
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

def get_result(df_oof:DataFrame, tokenizer:PreTrainedTokenizer, max_len:int) -> None:
    labels = create_labels_for_scoring(df_oof)
    predictions = df_oof[[i for i in range(max_len)]].values
    char_probs = get_char_probs(df_oof['pn_history'].values, predictions, tokenizer)
    
    score=-100
    for th in np.arange(0.3,0.7,0.005):
        th = np.round(th,4)
        results = get_results(char_probs, th=th)
        preds = get_predictions(results)
        tmp_score = get_score(labels, preds)
        if tmp_score > score:
            best_th=th
            score=tmp_score
    LOGGER.info(f'Score: {score:<.4f} Best threshold:: {best_th}')

# Data Loading

In [11]:
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features

df_train = pd.read_csv('../input/train.csv')
df_train['annotation'] = df_train['annotation'].map(lambda x: ast.literal_eval(x))
df_train['location'] = df_train['location'].map(lambda x: ast.literal_eval(x))

features = pd.read_csv('../input/features.csv')
features = preprocess_features(features)

patient_notes = pd.read_csv('../input/patient_notes.csv')

print(f"df_train.shape: {df_train.shape}")
display(df_train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

df_train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [12]:
df_train = df_train.merge(features, on=['feature_num', 'case_num'], how='left')
df_train = df_train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(df_train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [13]:
from pandas import DataFrame

def correct_annotation(df_train:DataFrame) -> None:
    df_train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
    df_train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

    df_train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
    df_train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

    df_train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
    df_train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

    df_train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
    df_train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

    df_train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
    df_train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

    df_train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
    df_train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

    df_train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
    df_train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

    df_train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
    df_train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

    df_train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
    df_train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

    df_train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
    df_train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

    df_train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
    df_train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

    df_train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
    df_train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

    df_train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
    df_train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

    df_train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
    df_train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

    df_train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
    df_train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

    df_train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
    df_train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

    df_train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
    df_train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

    df_train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
    df_train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

    df_train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
    df_train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

    df_train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
    df_train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

    df_train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
    df_train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

    df_train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
    df_train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

    df_train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
    df_train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

    df_train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
    df_train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

    df_train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
    df_train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

    df_train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
    df_train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

    df_train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
    df_train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

    df_train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
    df_train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

    df_train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
    df_train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

    df_train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
    df_train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

    df_train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
    df_train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

    df_train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
    df_train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

    df_train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
    df_train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

    df_train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
    df_train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

    df_train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
    df_train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

    df_train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
    df_train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

    df_train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
    df_train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

    df_train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
    df_train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

    df_train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
    df_train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

    df_train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
    df_train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

    df_train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
    df_train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

    df_train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
    df_train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [14]:
df_train['annotation_length'] = df_train['annotation'].map(lambda x: len(x))
display(df_train['annotation_length'].value_counts())

1    8181
0    4399
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

# CV split

In [15]:
from sklearn.model_selection import GroupKFold
kf = GroupKFold(n_splits=config.n_folds)
groups = df_train['pn_num'].to_numpy()
df_train.loc[:, 'fold'] = -1
for n, (train_index, val_index) in enumerate(kf.split(df_train, df_train['location'], groups)):
    df_train.loc[val_index, 'fold'] = n
display(df_train.groupby('fold').size())

fold
0    3575
1    3575
2    3575
3    3575
dtype: int64

In [16]:
if config.debug:
    display(df_train.groupby('fold').size())
    df_train = df_train.sample(n=500, random_state=0).reset_index(drop=True)
    display(df_train.groupby('fold').size())

# tokenizer

In [17]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
import shutil
from pathlib import Path
import transformers

if 'deberta' in config.model:

    transformers_path = Path(transformers.__file__[:-12])

    input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

    convert_file = input_dir / "convert_slow_tokenizer.py"
    conversion_path = transformers_path/convert_file.name

    if conversion_path.exists():
        conversion_path.unlink()

    shutil.copy(convert_file, transformers_path)
    deberta_v2_path = transformers_path / "models" / "deberta_v2"

    for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py', "deberta__init__.py"]:
        if str(filename).startswith("deberta"):
            filepath = deberta_v2_path/str(filename).replace("deberta", "")
        else:
            filepath = deberta_v2_path/filename
        if filepath.exists():
            filepath.unlink()

        shutil.copy(input_dir/filename, filepath)
    from transformers.models.deberta_v2 import DebertaV2TokenizerFast

In [18]:
from transformers import AutoTokenizer

%env TOKENIZERS_PARALLELISM=true
if 'deberta' in config.model:
    tokenizer = DebertaV2TokenizerFast.from_pretrained(config.model)
else:
    tokenizer = AutoTokenizer.from_pretrained('roberta-large', trim_offsets=False)
tokenizer.save_pretrained(config.output_dir+'tokenizer/')

env: TOKENIZERS_PARALLELISM=true


('../output/exp076/tokenizer/tokenizer_config.json',
 '../output/exp076/tokenizer/special_tokens_map.json',
 '../output/exp076/tokenizer/vocab.json',
 '../output/exp076/tokenizer/merges.txt',
 '../output/exp076/tokenizer/added_tokens.json',
 '../output/exp076/tokenizer/tokenizer.json')

In [19]:
pn_history_lengths = []
for text in patient_notes['pn_history'].fillna("").to_list():
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    pn_history_lengths.append(length)
pn_history_max_len = max(pn_history_lengths)
LOGGER.info(f'pn_history max(lengths): {pn_history_max_len}')

features_lengths = []
for text in features['feature_text'].fillna("").to_list():
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    features_lengths.append(length)
feature_text_max_len = max(features_lengths)
LOGGER.info(f'feature_text max(lengths): {feature_text_max_len}')

config.max_len = pn_history_max_len+feature_text_max_len + 3
LOGGER.info(f"max_len: {config.max_len}")

pn_history max(lengths): 433
feature_text max(lengths): 30
max_len: 466


# Dataset

In [20]:
from pandas import DataFrame
import torch
from torch import Tensor
from torch.utils.data import Dataset
from transformers.tokenization_utils import PreTrainedTokenizer

class TrainDataset(Dataset):
    def __init__(
        self, 
        tokenizer:PreTrainedTokenizer, 
        max_len:int,
        feature_text_max_len:int, 
        pn_history_max_len:int, 
        df:DataFrame
    ) -> None:
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.feature_text_max_len = feature_text_max_len
        self.pn_history_max_len = pn_history_max_len
        self.feature_texts = df['feature_text'].to_numpy()
        self.pn_historys = df['pn_history'].to_numpy()
        self.annotation_lengths = df['annotation_length'].to_numpy()
        self.locations = df['location'].to_numpy()

    def prepare_input_with_fixed_position(self, pn_history:str, feature_text:str) -> dict:

        pn_history_token = self.tokenizer(
            pn_history, 
            add_special_tokens=True,
            max_length=self.pn_history_max_len+2, 
            padding='max_length',
            return_offsets_mapping=False)
        
        feature_text_token = self.tokenizer(
            feature_text, 
            add_special_tokens=True,
            max_length=self.feature_text_max_len+2, 
            padding='max_length',
            return_offsets_mapping=False)
        for k,v in feature_text_token.items():
            feature_text_token[k] = v[1:]

        token = {
            'input_ids': pn_history_token['input_ids']+feature_text_token['input_ids'],
            'attention_mask': pn_history_token['attention_mask']+feature_text_token['attention_mask'],
            #'token_type_ids': pn_history_token['token_type_ids']+torch.ones_like(feature_text_token['token_type_ids'])
        }
        for k, v in token.items():
            token[k] = torch.tensor(v[:self.max_len], dtype=torch.long)
        return token
    
    def prepare_input(self, text:str, feature_text:str) -> dict:
        token = self.tokenizer(text, feature_text, 
                               add_special_tokens=True,
                               max_length=self.max_len,
                               padding="max_length",
                               return_offsets_mapping=False)
        for k, v in token.items():
            token[k] = torch.tensor(v[:self.max_len], dtype=torch.long)
        return token
    
    def create_label(self, text:str, annotation_length:int, location_list:list) -> Tensor:
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True)
        offset_mapping = encoded['offset_mapping']
        ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
        label = np.zeros(len(offset_mapping))
        label[ignore_idxes] = -1
        if annotation_length != 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(';')]:
                    start_idx = -1
                    end_idx = -1
                    start, end = int(loc[0]), int(loc[1])
                    for idx in range(len(offset_mapping)):
                        if (start_idx == -1) & (start < offset_mapping[idx][0]):
                            start_idx = idx - 1
                        if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                            end_idx = idx + 1
                    if start_idx == -1:
                        start_idx = end_idx
                    if (start_idx != -1) & (end_idx != -1):
                        label[start_idx:end_idx] = 1
        return torch.tensor(label[:self.max_len], dtype=torch.float)

    def __len__(self) -> int:
        return len(self.feature_texts)

    def __getitem__(self, item:int) -> tuple:
        inputs = self.prepare_input_with_fixed_position(
            self.pn_historys[item],
            self.feature_texts[item])
        label = self.create_label(
            self.pn_historys[item], 
            self.annotation_lengths[item], 
            self.locations[item])
        return inputs, label

In [21]:
ds = TrainDataset(
    tokenizer=tokenizer, 
    max_len=config.max_len,
    feature_text_max_len=feature_text_max_len, 
    pn_history_max_len=pn_history_max_len, 
    df=df_train
)

# Model

In [22]:
# %cd "/content/drive/MyDrive/Colab Notebooks/nbme/code/exp076"

In [23]:
# !pip install transformers[sentencepiece] > /dev/null

In [24]:
from transformers import AutoModel
model = AutoModel.from_pretrained('../output/roberta-large-self-supervised-learning-9epoch')

Some weights of the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream tas

In [25]:
type(model)

transformers.models.roberta.modeling_roberta.RobertaModel

In [26]:
from torch import Tensor
from torch import nn
from torch.nn import Module
from transformers import AutoModel, AutoConfig


class CustomModel(Module):
    def __init__(
        self, model_name: str, config_path: str = None, pretrained: bool = False
    ) -> None:
        super().__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                model_name, output_hidden_states=True
            )
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(config.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module) -> None:
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs: Tensor) -> Tensor:
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs: Tensor) -> Tensor:
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output


class AWP:
    def __init__(
        self,
        model,
        criterion,
        optimizer,
        adv_param="weight",
        adv_lr=1,
        adv_eps=0.2,
        start_epoch=0,
        adv_step=1,
        scaler=None,
    ):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.scaler = scaler

    def attack_backward(self, inputs, labels, epoch):
        if (self.adv_lr == 0) or (epoch < self.start_epoch):
            return None

        self._save()
        for i in range(self.adv_step):
            self._attack_step()
            with torch.cuda.amp.autocast():
                y_preds = self.model(inputs)
                adv_loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
                adv_loss = torch.masked_select(
                    adv_loss, labels.view(-1, 1) != -1
                ).mean()
                adv_loss = adv_loss.mean()
            self.optimizer.zero_grad()
            self.scaler.scale(adv_loss).backward()

        self._restore()

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if (
                param.requires_grad
                and param.grad is not None
                and self.adv_param in name
            ):
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]),
                        self.backup_eps[name][1],
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def _save(self):
        for name, param in self.model.named_parameters():
            if (
                param.requires_grad
                and param.grad is not None
                and self.adv_param in name
            ):
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(
        self,
    ):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

# Logging

In [27]:
import time
from math import floor
from torch import inference_mode

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val:float, n=1) -> None:
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s) -> str:
    m = floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent) -> str:
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

# Trainer

In [28]:
import time
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import Module
from torch.optim import AdamW
from torch import cuda
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm.auto import tqdm
from wandb.sdk.wandb_config import Config

def get_optimizer_params(model:Module, encoder_lr:float, decoder_lr:float, weight_decay:float=0.0) -> list:
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
        'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
        'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
        'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

def get_scheduler(scheduler:str, optimizer, num_warmup_steps:int, num_train_steps:int, num_cycles:int):
    if scheduler=='linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=num_warmup_steps, 
            num_training_steps=num_train_steps
        )
    elif scheduler=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=num_warmup_steps, 
            num_training_steps=num_train_steps, 
            num_cycles=num_cycles
        )
    else:
        raise ValueError('Invalid Scheduler Name.')
    return scheduler

class Trainer:

    def __init__(self, config:Config, tokenizer:PreTrainedTokenizer) -> None:
        self.config = config
        self.tokenizer = tokenizer
        self.criterion = nn.BCEWithLogitsLoss(reduction="none")
        self.device = torch.device('cuda' if cuda.is_available() else 'cpu')

    def train(self, model:Module, fold:int, tr_dl:DataLoader, optimizer, epoch:int, scheduler):
        model.train()
        scaler = cuda.amp.GradScaler(enabled=self.config.apex)
        awp = AWP(model,
              self.criterion,
              optimizer,
              adv_lr=config.adv_lr,
              adv_eps=config.adv_eps,
              start_epoch=config.adv_start_epoch,
              scaler=scaler
             )
        losses = AverageMeter()
        start = end = time.time()
        global_step = 0
        for step, (inputs, labels) in enumerate(tr_dl):
            for k, v in inputs.items():
                inputs[k] = v.to(self.device)
            labels = labels.to(self.device)
            batch_size = labels.size(0)
            with cuda.amp.autocast(enabled=self.config.apex):
                y_preds = model(inputs)
            loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
            loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
            if self.config.gradient_accumulation_steps > 1:
                loss = loss / self.config.gradient_accumulation_steps
            losses.update(loss.item(), batch_size)
            scaler.scale(loss).backward()
            awp.attack_backward(inputs, labels, epoch)
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), self.config.max_grad_norm)
            if (step + 1) % self.config.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                global_step += 1
                if self.config.batch_scheduler:
                    scheduler.step()
            end = time.time()
            if step % self.config.print_freq == 0 or step == (len(tr_dl)-1):
                print('Epoch: [{0}][{1}/{2}] '
                    'Elapsed {remain:s} '
                    'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                    'Grad: {grad_norm:.4f}  '
                    'LR: {lr:.8f}  '
                    .format(epoch+1, step, len(tr_dl), 
                            remain=timeSince(start, float(step+1)/len(tr_dl)),
                            loss=losses,
                            grad_norm=grad_norm,
                            lr=scheduler.get_lr()[0]))
            wandb.log({f"[fold{fold}] loss": losses.val,
                    f"[fold{fold}] lr": scheduler.get_lr()[0]})
        return losses.avg

    @inference_mode()
    def validate(self, model:Module, vl_dl:DataLoader) -> tuple:
        model.eval()
        losses = AverageMeter()
        preds = []
        start = end = time.time()
        for step, (inputs, labels) in enumerate(vl_dl):
            for k, v in inputs.items():
                inputs[k] = v.to(self.device)
            labels = labels.to(self.device)
            batch_size = labels.size(0)
            y_preds = model(inputs)
            loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
            loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
            if self.config.gradient_accumulation_steps > 1:
                loss = loss / self.config.gradient_accumulation_steps
            losses.update(loss.item(), batch_size)
            preds.append(y_preds.sigmoid().to('cpu').numpy())
            end = time.time()
            if step % self.config.print_freq == 0 or step == (len(vl_dl)-1):
                print('EVAL: [{0}/{1}] '
                    'Elapsed {remain:s} '
                    'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                    .format(step, len(vl_dl),
                            loss=losses,
                            remain=timeSince(start, float(step+1)/len(vl_dl))))
        return losses.avg, np.concatenate(preds)

    def create_dl(self, df:DataFrame, feature_text_max_len:int, pn_history_max_len:int, is_train:bool) -> DataLoader:
        ds = TrainDataset(
            tokenizer=self.tokenizer,
            max_len=self.config.max_len,
            feature_text_max_len=feature_text_max_len,
            pn_history_max_len=pn_history_max_len,
            df=df)
        return DataLoader(
            ds,
            batch_size=self.config.batch_size if is_train else self.config.batch_size * 2,
            shuffle=is_train,
            num_workers=self.config.num_workers,
            pin_memory=True, 
            drop_last=is_train)
    
    def log_epoch_result(self, f:int, ep:int, avg_tr_loss:float, avg_vl_loss:float, elapsed:float, score:float, best_th:float) -> None:
        LOGGER.info(
            f'Epoch {ep} - avg_train_loss: {avg_tr_loss:.4f}  avg_val_loss: {avg_vl_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(
            f'Epoch {ep} - Score: {score:.4f} for th={best_th}')
        wandb.log(
            {
                f"[fold{f}] epoch": ep, 
                f"[fold{f}] avg_train_loss": avg_tr_loss, 
                f"[fold{f}] avg_val_loss": avg_vl_loss,
                f"[fold{f}] score": score,
                f"[fold{f}] best_th": best_th
            }
        )

    def run(
        self,
        df:DataFrame,
        feature_text_max_len:int, 
        pn_history_max_len:int) -> None:
    
        oof_df = pd.DataFrame()
        for f in range(self.config.n_folds):
            LOGGER.info(f"========== fold: {f} training ==========")
            
            model = CustomModel(
                self.config.model, 
                config_path=None, 
                pretrained=True).to(self.device)

            tr_df = df[df['fold'] != f].reset_index(drop=True)
            tr_dl = self.create_dl(
                df=tr_df, 
                feature_text_max_len=feature_text_max_len, 
                pn_history_max_len=pn_history_max_len, 
                is_train=True)
            num_train_steps = int(len(tr_df) / self.config.batch_size * self.config.epochs)
            
            vl_df = df[df['fold'] == f].reset_index(drop=True)
            vl_dl = self.create_dl(
                df=vl_df, 
                feature_text_max_len=feature_text_max_len, 
                pn_history_max_len=pn_history_max_len, 
                is_train=False)
            valid_texts = vl_df['pn_history'].to_numpy()
            valid_labels = create_labels_for_scoring(vl_df)

            optimizer_parameters = get_optimizer_params(
                model,
                encoder_lr=self.config.encoder_lr, 
                decoder_lr=self.config.decoder_lr,
                weight_decay=self.config.weight_decay)
            optimizer = AdamW(
                optimizer_parameters, 
                lr=self.config.encoder_lr, 
                eps=self.config.eps, 
                betas=self.config.betas)
            scheduler = get_scheduler(
                scheduler=self.config.scheduler, 
                optimizer=optimizer, 
                num_warmup_steps=self.config.num_warmup_steps,
                num_train_steps=num_train_steps,
                num_cycles=self.config.num_cycles)
            
            best_score = 0

            for epoch in range(self.config.epochs):
                
                start_time = time.time()

                avg_tr_loss = self.train(
                    model,
                    f, 
                    tr_dl, 
                    optimizer, 
                    epoch, 
                    scheduler)

                # eval
                avg_vl_loss, predictions = self.validate(
                    model, 
                    vl_dl
                )
                predictions = predictions.reshape(
                    (len(vl_df), 
                    self.config.max_len))
                
                # scoring
                char_probs = get_char_probs(
                    valid_texts, 
                    predictions, 
                    self.tokenizer)
                # ここをしきい値で探索した最適な値にする
                score=-100
                for th in np.arange(0.3,0.7,0.005):
                    th = np.round(th,4)
                    results = get_results(char_probs, th=th)
                    preds = get_predictions(results)
                    tmp_score = get_score(valid_labels, preds)
                    if tmp_score > score:
                        best_th=th
                        score=tmp_score
                
                self.log_epoch_result(
                    f=f,
                    ep=epoch+1, 
                    avg_tr_loss=avg_tr_loss, 
                    avg_vl_loss=avg_vl_loss, 
                    elapsed=time.time() - start_time, 
                    score=score, 
                    best_th=best_th)
                
                if score > best_score:
                    best_score = score
                    LOGGER.info(f'Epoch {epoch+1} - Save Score: {best_score:.4f} Model')
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'predictions': predictions},
                        f'{self.config.output_dir}{self.config.ckpt_name}_fold{f}_best.pth')

            predictions = torch.load(
                f'{self.config.output_dir}{self.config.ckpt_name}_fold{f}_best.pth', 
                map_location=torch.device('cpu'))['predictions']
            vl_df[[i for i in range(self.config.max_len)]] = predictions
            oof_df = pd.concat([oof_df, vl_df])
            LOGGER.info(f"========== fold: {f} result ==========")
            get_result(vl_df, self.tokenizer, self.config.max_len)
            oof_df.to_pickle(f'{self.config.output_dir}oof_df_fold{f}.pkl')
            wandb.alert(
                title=f"fold{f} Finished", 
                text=f'{self.config.model} has finished its fold{f} running.'
            )
        
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df, self.tokenizer, self.config.max_len)
        oof_df.to_pickle(self.config.output_dir+'oof_df.pkl')

In [29]:
# !pwd

In [30]:
config.model = '../output/roberta-large-self-supervised-learning-9epoch'
trainer = Trainer(
    config=config,
    tokenizer=tokenizer
)
trainer.run(
    df=df_train,
    feature_text_max_len=feature_text_max_len, 
    pn_history_max_len=pn_history_max_len
)

Some weights of the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream tas

Epoch: [1][0/536] Elapsed 0m 1s (remain 17m 21s) Loss: 1.0100(1.0100) Grad: inf  LR: 0.00002500  
Epoch: [1][100/536] Elapsed 0m 48s (remain 3m 27s) Loss: 0.0175(0.1018) Grad: 1823.3350  LR: 0.00002494  
Epoch: [1][200/536] Elapsed 1m 34s (remain 2m 37s) Loss: 0.0192(0.0605) Grad: 2266.5237  LR: 0.00002476  
Epoch: [1][300/536] Elapsed 2m 21s (remain 1m 50s) Loss: 0.0055(0.0452) Grad: 595.0005  LR: 0.00002446  
Epoch: [1][400/536] Elapsed 3m 7s (remain 1m 3s) Loss: 0.0121(0.0375) Grad: 1397.3009  LR: 0.00002405  
Epoch: [1][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0167(0.0328) Grad: 1205.3877  LR: 0.00002353  
Epoch: [1][535/536] Elapsed 4m 10s (remain 0m 0s) Loss: 0.0136(0.0315) Grad: 1594.2166  LR: 0.00002333  
EVAL: [0/90] Elapsed 0m 0s (remain 1m 3s) Loss: 0.0072(0.0072) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0062(0.0120) 


Epoch 1 - avg_train_loss: 0.0315  avg_val_loss: 0.0120  time: 312s
Epoch 1 - Score: 0.8455 for th=0.425
Epoch 1 - Save Score: 0.8455 Model


Epoch: [2][0/536] Elapsed 0m 0s (remain 6m 10s) Loss: 0.0082(0.0082) Grad: 26633.9746  LR: 0.00002332  
Epoch: [2][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0166(0.0096) Grad: 18584.1074  LR: 0.00002266  
Epoch: [2][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0098(0.0094) Grad: 20383.0352  LR: 0.00002190  
Epoch: [2][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0113(0.0095) Grad: 99047.1250  LR: 0.00002105  
Epoch: [2][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0054(0.0090) Grad: 10198.6309  LR: 0.00002012  
Epoch: [2][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0042(0.0087) Grad: 26291.7109  LR: 0.00001912  
Epoch: [2][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0067(0.0086) Grad: 14682.9043  LR: 0.00001875  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 43s) Loss: 0.0054(0.0054) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0040(0.0120) 


Epoch 2 - avg_train_loss: 0.0086  avg_val_loss: 0.0120  time: 310s
Epoch 2 - Score: 0.8685 for th=0.44
Epoch 2 - Save Score: 0.8685 Model


Epoch: [3][0/536] Elapsed 0m 0s (remain 6m 16s) Loss: 0.0056(0.0056) Grad: 8955.5449  LR: 0.00001874  
Epoch: [3][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0042(0.0072) Grad: 18308.7227  LR: 0.00001766  
Epoch: [3][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0112(0.0070) Grad: 29037.3203  LR: 0.00001652  
Epoch: [3][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0142(0.0071) Grad: 24376.8105  LR: 0.00001535  
Epoch: [3][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0038(0.0072) Grad: 11914.4951  LR: 0.00001415  
Epoch: [3][500/536] Elapsed 3m 52s (remain 0m 16s) Loss: 0.0020(0.0074) Grad: 6756.7559  LR: 0.00001293  
Epoch: [3][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0017(0.0074) Grad: 6487.9727  LR: 0.00001251  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 42s) Loss: 0.0066(0.0066) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0037(0.0120) 


Epoch 3 - avg_train_loss: 0.0074  avg_val_loss: 0.0120  time: 310s
Epoch 3 - Score: 0.8713 for th=0.69
Epoch 3 - Save Score: 0.8713 Model


Epoch: [4][0/536] Elapsed 0m 1s (remain 10m 37s) Loss: 0.0041(0.0041) Grad: 27950.6270  LR: 0.00001249  
Epoch: [4][100/536] Elapsed 1m 36s (remain 6m 53s) Loss: 0.0083(0.0061) Grad: 18009.5449  LR: 0.00001128  
Epoch: [4][200/536] Elapsed 3m 10s (remain 5m 18s) Loss: 0.0025(0.0068) Grad: 11856.3486  LR: 0.00001007  
Epoch: [4][300/536] Elapsed 4m 45s (remain 3m 43s) Loss: 0.0058(0.0070) Grad: 16031.7510  LR: 0.00000888  
Epoch: [4][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0049(0.0071) Grad: 13918.9570  LR: 0.00000773  
Epoch: [4][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0182(0.0073) Grad: 40378.0547  LR: 0.00000663  
Epoch: [4][535/536] Elapsed 8m 28s (remain 0m 0s) Loss: 0.0123(0.0074) Grad: 33532.6055  LR: 0.00000626  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 43s) Loss: 0.0064(0.0064) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0040(0.0102) 


Epoch 4 - avg_train_loss: 0.0074  avg_val_loss: 0.0102  time: 570s
Epoch 4 - Score: 0.8711 for th=0.42


Epoch: [5][0/536] Elapsed 0m 1s (remain 10m 48s) Loss: 0.0057(0.0057) Grad: 16562.6641  LR: 0.00000625  
Epoch: [5][100/536] Elapsed 1m 36s (remain 6m 53s) Loss: 0.0065(0.0079) Grad: 16298.3857  LR: 0.00000522  
Epoch: [5][200/536] Elapsed 3m 10s (remain 5m 18s) Loss: 0.0057(0.0078) Grad: 11654.2705  LR: 0.00000426  
Epoch: [5][300/536] Elapsed 4m 45s (remain 3m 43s) Loss: 0.0058(0.0077) Grad: 15684.2666  LR: 0.00000339  
Epoch: [5][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0066(0.0076) Grad: 18909.7441  LR: 0.00000260  
Epoch: [5][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0085(0.0074) Grad: 20489.3184  LR: 0.00000190  
Epoch: [5][535/536] Elapsed 8m 28s (remain 0m 0s) Loss: 0.0051(0.0074) Grad: 13307.4805  LR: 0.00000168  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 43s) Loss: 0.0064(0.0064) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0040(0.0103) 


Epoch 5 - avg_train_loss: 0.0074  avg_val_loss: 0.0103  time: 570s
Epoch 5 - Score: 0.8714 for th=0.43
Epoch 5 - Save Score: 0.8714 Model


Epoch: [6][0/536] Elapsed 0m 1s (remain 10m 50s) Loss: 0.0078(0.0078) Grad: 13731.1475  LR: 0.00000167  
Epoch: [6][100/536] Elapsed 1m 36s (remain 6m 53s) Loss: 0.0049(0.0076) Grad: 24871.3574  LR: 0.00000112  
Epoch: [6][200/536] Elapsed 3m 11s (remain 5m 18s) Loss: 0.0102(0.0077) Grad: 18878.1973  LR: 0.00000067  
Epoch: [6][300/536] Elapsed 4m 45s (remain 3m 43s) Loss: 0.0062(0.0075) Grad: 5232.0850  LR: 0.00000033  
Epoch: [6][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0060(0.0074) Grad: 2670.3279  LR: 0.00000011  
Epoch: [6][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0031(0.0073) Grad: 7684.1743  LR: 0.00000001  
Epoch: [6][535/536] Elapsed 8m 28s (remain 0m 0s) Loss: 0.0046(0.0073) Grad: 4979.0625  LR: 0.00000000  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 43s) Loss: 0.0065(0.0065) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0040(0.0103) 


Epoch 6 - avg_train_loss: 0.0073  avg_val_loss: 0.0103  time: 570s
Epoch 6 - Score: 0.8714 for th=0.48
Score: 0.8714 Best threshold:: 0.43
Some weights of the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch and ar

Epoch: [1][0/536] Elapsed 0m 0s (remain 6m 6s) Loss: 0.6834(0.6834) Grad: inf  LR: 0.00002500  
Epoch: [1][100/536] Elapsed 0m 47s (remain 3m 22s) Loss: 0.0215(0.0855) Grad: 2322.1880  LR: 0.00002494  
Epoch: [1][200/536] Elapsed 1m 33s (remain 2m 35s) Loss: 0.0067(0.0517) Grad: 1320.7733  LR: 0.00002476  
Epoch: [1][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0228(0.0396) Grad: 1902.7981  LR: 0.00002446  
Epoch: [1][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0117(0.0332) Grad: 1902.3706  LR: 0.00002405  
Epoch: [1][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0079(0.0292) Grad: 1072.1298  LR: 0.00002353  
Epoch: [1][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0129(0.0282) Grad: 2309.6772  LR: 0.00002333  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 46s) Loss: 0.0103(0.0103) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0085(0.0127) 


Epoch 1 - avg_train_loss: 0.0282  avg_val_loss: 0.0127  time: 312s
Epoch 1 - Score: 0.8536 for th=0.48
Epoch 1 - Save Score: 0.8536 Model


Epoch: [2][0/536] Elapsed 0m 0s (remain 6m 28s) Loss: 0.0096(0.0096) Grad: 14121.2314  LR: 0.00002332  
Epoch: [2][100/536] Elapsed 0m 47s (remain 3m 24s) Loss: 0.0109(0.0093) Grad: 18464.1543  LR: 0.00002266  
Epoch: [2][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0113(0.0088) Grad: 32455.6387  LR: 0.00002190  
Epoch: [2][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0153(0.0088) Grad: 14081.5410  LR: 0.00002105  
Epoch: [2][400/536] Elapsed 3m 7s (remain 1m 3s) Loss: 0.0049(0.0087) Grad: 6729.8667  LR: 0.00002012  
Epoch: [2][500/536] Elapsed 3m 54s (remain 0m 16s) Loss: 0.0218(0.0086) Grad: 23131.8926  LR: 0.00001912  
Epoch: [2][535/536] Elapsed 4m 10s (remain 0m 0s) Loss: 0.0052(0.0086) Grad: 6627.4390  LR: 0.00001875  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0087(0.0087) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0054(0.0129) 


Epoch 2 - avg_train_loss: 0.0086  avg_val_loss: 0.0129  time: 312s
Epoch 2 - Score: 0.8698 for th=0.54
Epoch 2 - Save Score: 0.8698 Model


Epoch: [3][0/536] Elapsed 0m 0s (remain 6m 34s) Loss: 0.0117(0.0117) Grad: 28640.3066  LR: 0.00001874  
Epoch: [3][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0109(0.0070) Grad: 69227.7031  LR: 0.00001766  
Epoch: [3][200/536] Elapsed 1m 34s (remain 2m 36s) Loss: 0.0103(0.0068) Grad: 9242.8389  LR: 0.00001652  
Epoch: [3][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0090(0.0070) Grad: 20590.9922  LR: 0.00001535  
Epoch: [3][400/536] Elapsed 3m 7s (remain 1m 3s) Loss: 0.0142(0.0073) Grad: 22723.6035  LR: 0.00001415  
Epoch: [3][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0111(0.0072) Grad: 11569.3740  LR: 0.00001293  
Epoch: [3][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0043(0.0071) Grad: 8940.6592  LR: 0.00001251  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0079(0.0079) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0063(0.0133) 


Epoch 3 - avg_train_loss: 0.0071  avg_val_loss: 0.0133  time: 311s
Epoch 3 - Score: 0.8734 for th=0.44
Epoch 3 - Save Score: 0.8734 Model


Epoch: [4][0/536] Elapsed 0m 1s (remain 10m 51s) Loss: 0.0003(0.0003) Grad: 10610.1562  LR: 0.00001249  
Epoch: [4][100/536] Elapsed 1m 36s (remain 6m 53s) Loss: 0.0059(0.0061) Grad: 25984.9629  LR: 0.00001128  
Epoch: [4][200/536] Elapsed 3m 10s (remain 5m 18s) Loss: 0.0049(0.0063) Grad: 23973.0352  LR: 0.00001007  
Epoch: [4][300/536] Elapsed 4m 45s (remain 3m 43s) Loss: 0.0024(0.0065) Grad: 12727.7061  LR: 0.00000888  
Epoch: [4][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0039(0.0065) Grad: 6477.3750  LR: 0.00000773  
Epoch: [4][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0078(0.0067) Grad: 11489.7334  LR: 0.00000663  
Epoch: [4][535/536] Elapsed 8m 28s (remain 0m 0s) Loss: 0.0062(0.0067) Grad: 5159.6460  LR: 0.00000626  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0080(0.0080) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0060(0.0116) 


Epoch 4 - avg_train_loss: 0.0067  avg_val_loss: 0.0116  time: 570s
Epoch 4 - Score: 0.8729 for th=0.4


Epoch: [5][0/536] Elapsed 0m 1s (remain 10m 53s) Loss: 0.0042(0.0042) Grad: 11215.3828  LR: 0.00000625  
Epoch: [5][100/536] Elapsed 1m 36s (remain 6m 54s) Loss: 0.0063(0.0070) Grad: 11363.3701  LR: 0.00000522  
Epoch: [5][200/536] Elapsed 3m 11s (remain 5m 18s) Loss: 0.0086(0.0070) Grad: 8985.7451  LR: 0.00000426  
Epoch: [5][300/536] Elapsed 4m 45s (remain 3m 43s) Loss: 0.0082(0.0070) Grad: 13181.6045  LR: 0.00000339  
Epoch: [5][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0029(0.0070) Grad: 6567.2407  LR: 0.00000260  


wandb: Network error (ReadTimeout), entering retry loop.


Epoch: [5][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0048(0.0069) Grad: 21793.7109  LR: 0.00000190  
Epoch: [5][535/536] Elapsed 8m 28s (remain 0m 0s) Loss: 0.0037(0.0069) Grad: 11456.5762  LR: 0.00000168  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 44s) Loss: 0.0079(0.0079) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0059(0.0115) 


Epoch 5 - avg_train_loss: 0.0069  avg_val_loss: 0.0115  time: 570s
Epoch 5 - Score: 0.8741 for th=0.405
Epoch 5 - Save Score: 0.8741 Model


Epoch: [6][0/536] Elapsed 0m 1s (remain 10m 57s) Loss: 0.0030(0.0030) Grad: 16280.0947  LR: 0.00000167  
Epoch: [6][100/536] Elapsed 1m 36s (remain 6m 54s) Loss: 0.0123(0.0069) Grad: 17403.9707  LR: 0.00000112  
Epoch: [6][200/536] Elapsed 3m 10s (remain 5m 18s) Loss: 0.0038(0.0068) Grad: 10376.6797  LR: 0.00000067  
Epoch: [6][300/536] Elapsed 4m 45s (remain 3m 43s) Loss: 0.0085(0.0070) Grad: 10646.8438  LR: 0.00000033  
Epoch: [6][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0090(0.0069) Grad: 11348.4824  LR: 0.00000011  
Epoch: [6][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0032(0.0068) Grad: 14211.6279  LR: 0.00000001  
Epoch: [6][535/536] Elapsed 8m 28s (remain 0m 0s) Loss: 0.0099(0.0068) Grad: 24715.3594  LR: 0.00000000  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0084(0.0084) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0057(0.0114) 


Epoch 6 - avg_train_loss: 0.0068  avg_val_loss: 0.0114  time: 570s
Epoch 6 - Score: 0.8742 for th=0.46
Epoch 6 - Save Score: 0.8742 Model
Score: 0.8742 Best threshold:: 0.46
Some weights of the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../output/roberta-large-sel

Epoch: [1][0/536] Elapsed 0m 0s (remain 6m 34s) Loss: 0.5208(0.5208) Grad: inf  LR: 0.00002500  
Epoch: [1][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0086(0.0703) Grad: 887.0882  LR: 0.00002494  
Epoch: [1][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0156(0.0444) Grad: 2012.3204  LR: 0.00002476  
Epoch: [1][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0075(0.0349) Grad: 4807.9292  LR: 0.00002446  
Epoch: [1][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0183(0.0299) Grad: 2409.5039  LR: 0.00002405  
Epoch: [1][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0093(0.0265) Grad: 808.6677  LR: 0.00002353  
Epoch: [1][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0154(0.0256) Grad: 1916.4004  LR: 0.00002333  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 46s) Loss: 0.0115(0.0115) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0032(0.0132) 


Epoch 1 - avg_train_loss: 0.0256  avg_val_loss: 0.0132  time: 311s
Epoch 1 - Score: 0.8510 for th=0.3
Epoch 1 - Save Score: 0.8510 Model


Epoch: [2][0/536] Elapsed 0m 0s (remain 6m 56s) Loss: 0.0153(0.0153) Grad: 26106.6484  LR: 0.00002332  
Epoch: [2][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0051(0.0099) Grad: 21137.3945  LR: 0.00002266  
Epoch: [2][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0065(0.0097) Grad: 28581.5176  LR: 0.00002190  
Epoch: [2][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0091(0.0097) Grad: 37987.3320  LR: 0.00002105  
Epoch: [2][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0014(0.0095) Grad: 4227.0176  LR: 0.00002012  
Epoch: [2][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0111(0.0093) Grad: 12070.3203  LR: 0.00001912  
Epoch: [2][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0049(0.0093) Grad: 38372.8086  LR: 0.00001875  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 44s) Loss: 0.0106(0.0106) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0028(0.0110) 


Epoch 2 - avg_train_loss: 0.0093  avg_val_loss: 0.0110  time: 311s
Epoch 2 - Score: 0.8709 for th=0.36
Epoch 2 - Save Score: 0.8709 Model


Epoch: [3][0/536] Elapsed 0m 0s (remain 7m 3s) Loss: 0.0175(0.0175) Grad: 23301.1348  LR: 0.00001874  
Epoch: [3][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0153(0.0084) Grad: 23306.2070  LR: 0.00001766  
Epoch: [3][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0043(0.0079) Grad: 14913.9502  LR: 0.00001652  
Epoch: [3][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0057(0.0080) Grad: 13773.3398  LR: 0.00001535  
Epoch: [3][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0060(0.0079) Grad: 15805.5195  LR: 0.00001415  
Epoch: [3][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0095(0.0079) Grad: 5287.1934  LR: 0.00001293  
Epoch: [3][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0078(0.0078) Grad: 13772.1709  LR: 0.00001251  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 46s) Loss: 0.0097(0.0097) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0024(0.0113) 


Epoch 3 - avg_train_loss: 0.0078  avg_val_loss: 0.0113  time: 311s
Epoch 3 - Score: 0.8722 for th=0.46
Epoch 3 - Save Score: 0.8722 Model


Epoch: [4][0/536] Elapsed 0m 1s (remain 11m 19s) Loss: 0.0080(0.0080) Grad: 46201.9219  LR: 0.00001249  
Epoch: [4][100/536] Elapsed 1m 36s (remain 6m 54s) Loss: 0.0125(0.0074) Grad: 22007.4141  LR: 0.00001128  
Epoch: [4][200/536] Elapsed 3m 11s (remain 5m 18s) Loss: 0.0055(0.0075) Grad: 8330.0840  LR: 0.00001007  
Epoch: [4][300/536] Elapsed 4m 46s (remain 3m 43s) Loss: 0.0071(0.0073) Grad: 12745.3447  LR: 0.00000888  
Epoch: [4][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0089(0.0075) Grad: 9868.7256  LR: 0.00000773  
Epoch: [4][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0107(0.0076) Grad: 17172.0156  LR: 0.00000663  
Epoch: [4][535/536] Elapsed 8m 28s (remain 0m 0s) Loss: 0.0049(0.0076) Grad: 4214.0073  LR: 0.00000626  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 46s) Loss: 0.0089(0.0089) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0028(0.0102) 


Epoch 4 - avg_train_loss: 0.0076  avg_val_loss: 0.0102  time: 571s
Epoch 4 - Score: 0.8764 for th=0.395
Epoch 4 - Save Score: 0.8764 Model


Epoch: [5][0/536] Elapsed 0m 1s (remain 11m 12s) Loss: 0.0038(0.0038) Grad: 12542.9844  LR: 0.00000625  
Epoch: [5][100/536] Elapsed 1m 36s (remain 6m 54s) Loss: 0.0078(0.0078) Grad: 23899.9160  LR: 0.00000522  
Epoch: [5][200/536] Elapsed 3m 11s (remain 5m 18s) Loss: 0.0255(0.0081) Grad: 25413.6191  LR: 0.00000426  
Epoch: [5][300/536] Elapsed 4m 46s (remain 3m 43s) Loss: 0.0034(0.0080) Grad: 8632.0850  LR: 0.00000339  
Epoch: [5][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0075(0.0077) Grad: 9481.4736  LR: 0.00000260  
Epoch: [5][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0090(0.0076) Grad: 25134.0977  LR: 0.00000190  
Epoch: [5][535/536] Elapsed 8m 28s (remain 0m 0s) Loss: 0.0045(0.0076) Grad: 11855.4316  LR: 0.00000168  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0090(0.0090) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0028(0.0100) 


Epoch 5 - avg_train_loss: 0.0076  avg_val_loss: 0.0100  time: 571s
Epoch 5 - Score: 0.8774 for th=0.445
Epoch 5 - Save Score: 0.8774 Model


Epoch: [6][0/536] Elapsed 0m 1s (remain 11m 28s) Loss: 0.0078(0.0078) Grad: 19204.2344  LR: 0.00000167  
Epoch: [6][100/536] Elapsed 1m 36s (remain 6m 54s) Loss: 0.0074(0.0071) Grad: 13970.5771  LR: 0.00000112  
Epoch: [6][200/536] Elapsed 3m 11s (remain 5m 18s) Loss: 0.0079(0.0074) Grad: 26371.4375  LR: 0.00000067  
Epoch: [6][300/536] Elapsed 4m 46s (remain 3m 43s) Loss: 0.0067(0.0075) Grad: 4257.5078  LR: 0.00000033  
Epoch: [6][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0032(0.0076) Grad: 4372.7925  LR: 0.00000011  
Epoch: [6][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0132(0.0075) Grad: 12269.8691  LR: 0.00000001  
Epoch: [6][535/536] Elapsed 8m 29s (remain 0m 0s) Loss: 0.0040(0.0075) Grad: 5337.6777  LR: 0.00000000  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 46s) Loss: 0.0090(0.0090) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0027(0.0100) 


Epoch 6 - avg_train_loss: 0.0075  avg_val_loss: 0.0100  time: 571s
Epoch 6 - Score: 0.8779 for th=0.475
Epoch 6 - Save Score: 0.8779 Model
Score: 0.8779 Best threshold:: 0.475
Some weights of the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../output/roberta-large-s

Epoch: [1][0/536] Elapsed 0m 0s (remain 6m 34s) Loss: 0.8639(0.8639) Grad: inf  LR: 0.00002500  
Epoch: [1][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0219(0.0992) Grad: 2306.2725  LR: 0.00002494  
Epoch: [1][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0105(0.0596) Grad: 1351.3083  LR: 0.00002476  
Epoch: [1][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0080(0.0448) Grad: 1068.5028  LR: 0.00002446  
Epoch: [1][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0327(0.0371) Grad: 5284.2358  LR: 0.00002405  
Epoch: [1][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0102(0.0326) Grad: 2643.2878  LR: 0.00002353  
Epoch: [1][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0100(0.0312) Grad: 1106.7229  LR: 0.00002333  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 48s) Loss: 0.0109(0.0109) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0030(0.0120) 


Epoch 1 - avg_train_loss: 0.0312  avg_val_loss: 0.0120  time: 312s
Epoch 1 - Score: 0.8593 for th=0.6
Epoch 1 - Save Score: 0.8593 Model


Epoch: [2][0/536] Elapsed 0m 0s (remain 7m 10s) Loss: 0.0087(0.0087) Grad: 19818.9727  LR: 0.00002332  
Epoch: [2][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0141(0.0100) Grad: 17341.0215  LR: 0.00002266  
Epoch: [2][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0251(0.0101) Grad: 79657.7031  LR: 0.00002190  
Epoch: [2][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0119(0.0097) Grad: 44633.3945  LR: 0.00002105  
Epoch: [2][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0115(0.0095) Grad: 15747.3145  LR: 0.00002012  
Epoch: [2][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0117(0.0093) Grad: 27500.1973  LR: 0.00001912  
Epoch: [2][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0033(0.0093) Grad: 21817.3672  LR: 0.00001875  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0103(0.0103) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0025(0.0104) 


Epoch 2 - avg_train_loss: 0.0093  avg_val_loss: 0.0104  time: 311s
Epoch 2 - Score: 0.8719 for th=0.55
Epoch 2 - Save Score: 0.8719 Model


Epoch: [3][0/536] Elapsed 0m 0s (remain 7m 9s) Loss: 0.0134(0.0134) Grad: 39109.5078  LR: 0.00001874  
Epoch: [3][100/536] Elapsed 0m 47s (remain 3m 23s) Loss: 0.0041(0.0079) Grad: 10171.5596  LR: 0.00001766  
Epoch: [3][200/536] Elapsed 1m 33s (remain 2m 36s) Loss: 0.0061(0.0081) Grad: 16644.5430  LR: 0.00001652  
Epoch: [3][300/536] Elapsed 2m 20s (remain 1m 49s) Loss: 0.0060(0.0082) Grad: 5233.3794  LR: 0.00001535  
Epoch: [3][400/536] Elapsed 3m 6s (remain 1m 2s) Loss: 0.0038(0.0079) Grad: 9373.9385  LR: 0.00001415  
Epoch: [3][500/536] Elapsed 3m 53s (remain 0m 16s) Loss: 0.0061(0.0081) Grad: 8099.7939  LR: 0.00001293  
Epoch: [3][535/536] Elapsed 4m 9s (remain 0m 0s) Loss: 0.0104(0.0080) Grad: 7092.3174  LR: 0.00001251  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0073(0.0073) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0024(0.0106) 


Epoch 3 - avg_train_loss: 0.0080  avg_val_loss: 0.0106  time: 311s
Epoch 3 - Score: 0.8782 for th=0.51
Epoch 3 - Save Score: 0.8782 Model


Epoch: [4][0/536] Elapsed 0m 1s (remain 11m 17s) Loss: 0.0013(0.0013) Grad: 55194.2383  LR: 0.00001249  
Epoch: [4][100/536] Elapsed 1m 36s (remain 6m 54s) Loss: 0.0169(0.0067) Grad: 39086.2617  LR: 0.00001128  
Epoch: [4][200/536] Elapsed 3m 11s (remain 5m 18s) Loss: 0.0068(0.0071) Grad: 27038.4961  LR: 0.00001007  
Epoch: [4][300/536] Elapsed 4m 46s (remain 3m 43s) Loss: 0.0086(0.0072) Grad: 28683.6152  LR: 0.00000888  
Epoch: [4][400/536] Elapsed 6m 21s (remain 2m 8s) Loss: 0.0160(0.0072) Grad: 27045.0566  LR: 0.00000773  
Epoch: [4][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0093(0.0072) Grad: 22729.2715  LR: 0.00000663  
Epoch: [4][535/536] Elapsed 8m 29s (remain 0m 0s) Loss: 0.0080(0.0072) Grad: 14457.7959  LR: 0.00000626  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 46s) Loss: 0.0066(0.0066) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0022(0.0094) 


Epoch 4 - avg_train_loss: 0.0072  avg_val_loss: 0.0094  time: 571s
Epoch 4 - Score: 0.8774 for th=0.47


Epoch: [5][0/536] Elapsed 0m 1s (remain 11m 31s) Loss: 0.0040(0.0040) Grad: 11441.0498  LR: 0.00000625  
Epoch: [5][100/536] Elapsed 1m 36s (remain 6m 54s) Loss: 0.0115(0.0077) Grad: 7952.9248  LR: 0.00000522  
Epoch: [5][200/536] Elapsed 3m 11s (remain 5m 18s) Loss: 0.0067(0.0074) Grad: 10385.5156  LR: 0.00000426  
Epoch: [5][300/536] Elapsed 4m 46s (remain 3m 43s) Loss: 0.0079(0.0075) Grad: 5481.4585  LR: 0.00000339  
Epoch: [5][400/536] Elapsed 6m 21s (remain 2m 8s) Loss: 0.0100(0.0076) Grad: 21617.3652  LR: 0.00000260  
Epoch: [5][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0107(0.0075) Grad: 6883.7407  LR: 0.00000190  
Epoch: [5][535/536] Elapsed 8m 29s (remain 0m 0s) Loss: 0.0137(0.0074) Grad: 23283.0840  LR: 0.00000168  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 45s) Loss: 0.0065(0.0065) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0022(0.0094) 


Epoch 5 - avg_train_loss: 0.0074  avg_val_loss: 0.0094  time: 571s
Epoch 5 - Score: 0.8773 for th=0.435


Epoch: [6][0/536] Elapsed 0m 1s (remain 11m 21s) Loss: 0.0059(0.0059) Grad: 65134.3828  LR: 0.00000167  
Epoch: [6][100/536] Elapsed 1m 36s (remain 6m 54s) Loss: 0.0093(0.0075) Grad: 14710.8936  LR: 0.00000112  
Epoch: [6][200/536] Elapsed 3m 11s (remain 5m 18s) Loss: 0.0093(0.0074) Grad: 12717.3047  LR: 0.00000067  
Epoch: [6][300/536] Elapsed 4m 46s (remain 3m 43s) Loss: 0.0058(0.0073) Grad: 30583.7031  LR: 0.00000033  
Epoch: [6][400/536] Elapsed 6m 20s (remain 2m 8s) Loss: 0.0192(0.0074) Grad: 13572.1455  LR: 0.00000011  
Epoch: [6][500/536] Elapsed 7m 55s (remain 0m 33s) Loss: 0.0081(0.0074) Grad: 16094.4346  LR: 0.00000001  
Epoch: [6][535/536] Elapsed 8m 29s (remain 0m 0s) Loss: 0.0056(0.0074) Grad: 26682.3398  LR: 0.00000000  
EVAL: [0/90] Elapsed 0m 0s (remain 0m 46s) Loss: 0.0065(0.0065) 
EVAL: [89/90] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0023(0.0094) 


Epoch 6 - avg_train_loss: 0.0074  avg_val_loss: 0.0094  time: 571s
Epoch 6 - Score: 0.8775 for th=0.45
Score: 0.8782 Best threshold:: 0.51
Score: 0.8749 Best threshold:: 0.48


In [31]:
'''
commit_msg = '"run_name: ' + wandb.run.name[:wandb.run.name.rfind('-')] + '"'
wandb.finish()
!git config --global user.email "taro.masuda.jp@gmail.com"
!git config --global user.name "taro-masuda"
!git add baseline-train.ipynb
!git add ../input/raiii-nbme/config.yml
!git commit -m $commit_msg
'''

'\ncommit_msg = \'"run_name: \' + wandb.run.name[:wandb.run.name.rfind(\'-\')] + \'"\'\nwandb.finish()\n!git config --global user.email "taro.masuda.jp@gmail.com"\n!git config --global user.name "taro-masuda"\n!git add baseline-train.ipynb\n!git add ../input/raiii-nbme/config.yml\n!git commit -m $commit_msg\n'

In [33]:
            model = CustomModel(
                config.model, 
                config_path=None, 
                pretrained=True).to(torch.device('cuda' if cuda.is_available() else 'cpu'))

Some weights of the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../output/roberta-large-self-supervised-learning-9epoch and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream tas

In [35]:
torch.save(model.config, "../output/exp076/config.pth")

wandb: Network error (ReadTimeout), entering retry loop.
