# Init

In [1]:
import os
import random
import re
import sys
import yaml
import numpy as np
from numpy import ndarray
import pandas as pd
import torch
from logging import Logger, getLogger, INFO, StreamHandler, FileHandler, Formatter
import wandb
from wandb.sdk.wandb_config import Config

# Colaboratory環境ならTrue
is_colab='google.colab' in sys.modules
# Kaggle Notebook環境ならTrue
is_kaggle='kaggle_web_client' in sys.modules

def init_pandas() -> None:    
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

def get_logger(filename:str) -> Logger:
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

def seed_everything(seed:int=42) -> None:
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def init_wandb(wandb_key:str) -> Config:
    #from kaggle_secrets import UserSecretsClient
    #user_secrets = UserSecretsClient()
    secret_value_0 = wandb_key
    wandb.login(key=secret_value_0)

    my_ds_path = '.'
    loader = yaml.SafeLoader
    loader.add_implicit_resolver(
        u'tag:yaml.org,2002:float',
        re.compile(u'''^(?:
         [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
        |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
        |\\.[0-9_]+(?:[eE][-+][0-9]+)?
        |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
        |[-+]?\\.(?:inf|Inf|INF)
        |\\.(?:nan|NaN|NAN))$''', re.X),
        list(u'-+0123456789.'))
    with open(f'{my_ds_path}/config.yml') as f:
        param = yaml.load(f, Loader=loader)
    wandb.init(
        project=param['project'],
        config=param
    )
    wandb.config.update(param)
    print(f'run name: {wandb.run.name}')    
    return wandb.config

def mk_output_dir(path:str) -> None:
    if not os.path.exists(path):
        os.makedirs(path)

In [2]:
import warnings
# if is_kaggle:
#     from kaggle_secrets import UserSecretsClient
#     user_secrets = UserSecretsClient()
#     secret_value_0 = user_secrets.get_secret("wandb_api")
# elif is_colab:
#     with open('/content/drive/MyDrive/dotfiles_for_colab/wandb_api.txt') as f:
#         secret_value_0=f.readline()
# else:
#     raise ValueError()
from getpass import getpass
wandb_key = getpass()
config = init_wandb(wandb_key=wandb_key)
mk_output_dir(path=config.output_dir)
LOGGER = get_logger(
    filename=config.output_dir+'train'
)
seed_everything(seed=config.seed)
init_pandas()
warnings.filterwarnings("ignore")

········


[34m[1mwandb[0m: Currently logged in as: [33mmpeg[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


run name: celestial-leaf-22


# Helper functions for scoring

In [3]:
from sklearn.metrics import f1_score

def get_score(y_true:ndarray, y_pred:ndarray) -> float:
    score = span_micro_f1(y_true, y_pred)
    return score

def micro_f1(preds:list, truths:list) -> float:
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans:list, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [4]:
import itertools
import ast
from pandas import DataFrame
from transformers.tokenization_utils import PreTrainedTokenizer

def create_labels_for_scoring(df:DataFrame):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

def get_result(df_oof:DataFrame, tokenizer:PreTrainedTokenizer, max_len:int) -> None:
    labels = create_labels_for_scoring(df_oof)
    predictions = df_oof[[i for i in range(max_len)]].values
    char_probs = get_char_probs(df_oof['pn_history'].values, predictions, tokenizer)
    
    score=-100
    for th in np.arange(0.3,0.7,0.005):
        th = np.round(th,4)
        results = get_results(char_probs, th=th)
        preds = get_predictions(results)
        tmp_score = get_score(labels, preds)
        if tmp_score > score:
            best_th=th
            score=tmp_score
    LOGGER.info(f'Score: {score:<.4f} Best threshold:: {best_th}')

# Data Loading

In [5]:
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features

df_train = pd.read_csv('../input/train.csv')
df_train['annotation'] = df_train['annotation'].map(lambda x: ast.literal_eval(x))
df_train['location'] = df_train['location'].map(lambda x: ast.literal_eval(x))

features = pd.read_csv('../input/features.csv')
features = preprocess_features(features)
features_retranslated = (pd.read_pickle('./df_retranslated.pkl')
                         .rename(columns={'ja': 'feature_text_ja',
                                          'ko': 'feature_text_ko',
                                          'ru': 'feature_text_ru',
                                          'ca': 'feature_text_ca'})
                         .apply(lambda x:x.str.replace(' ','-')))

patient_notes = pd.read_csv('../input/patient_notes.csv')

print(f"df_train.shape: {df_train.shape}")
display(df_train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

df_train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [6]:
df_train = df_train.merge(features, on=['feature_num', 'case_num'], how='left')
df_train = df_train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
df_train = df_train.merge(features_retranslated, on=['feature_text'], how='left')
display(df_train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,feature_text_ja,feature_text_ko,feature_text_ru,feature_text_ca
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,History-of-families-of-MI-or-family-of-myocard...,MI-in-myocardial-infarction-MI-or-the-history-...,Family-History-of-MI-or-Family-History-Myocard...,Family-History-of-Mi-or-Family-History-of-Myoc...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,History-of-family-of-thyroid-disorder,Family-force-of-thyroid-disorder,Family-history-of-thyroid-disorder,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,Chest-pressure,Chest-pressure,Chest-pressure,Chest-pressure
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms,Intermittent-symptoms,Intermittent-symptoms,Intermittent-symptoms
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,Light-head,Light,Lighthead.,Light-headed


In [7]:
from pandas import DataFrame

def correct_annotation(df_train:DataFrame) -> None:
    df_train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
    df_train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

    df_train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
    df_train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

    df_train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
    df_train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

    df_train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
    df_train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

    df_train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
    df_train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

    df_train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
    df_train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

    df_train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
    df_train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

    df_train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
    df_train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

    df_train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
    df_train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

    df_train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
    df_train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

    df_train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
    df_train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

    df_train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
    df_train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

    df_train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
    df_train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

    df_train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
    df_train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

    df_train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
    df_train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

    df_train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
    df_train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

    df_train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
    df_train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

    df_train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
    df_train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

    df_train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
    df_train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

    df_train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
    df_train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

    df_train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
    df_train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

    df_train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
    df_train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

    df_train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
    df_train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

    df_train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
    df_train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

    df_train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
    df_train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

    df_train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
    df_train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

    df_train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
    df_train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

    df_train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
    df_train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

    df_train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
    df_train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

    df_train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
    df_train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

    df_train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
    df_train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

    df_train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
    df_train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

    df_train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
    df_train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

    df_train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
    df_train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

    df_train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
    df_train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

    df_train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
    df_train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

    df_train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
    df_train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

    df_train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
    df_train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

    df_train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
    df_train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

    df_train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
    df_train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

    df_train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
    df_train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

    df_train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
    df_train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [8]:
df_train['annotation_length'] = df_train['annotation'].map(lambda x: len(x))
display(df_train['annotation_length'].value_counts())

1    8181
0    4399
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

# CV split

In [9]:
from sklearn.model_selection import GroupKFold
kf = GroupKFold(n_splits=config.n_folds)
groups = df_train['pn_num'].to_numpy()
df_train.loc[:, 'fold'] = -1
for n, (train_index, val_index) in enumerate(kf.split(df_train, df_train['location'], groups)):
    df_train.loc[val_index, 'fold'] = n
display(df_train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

In [10]:
if config.debug:
    display(df_train.groupby('fold').size())
    df_train = df_train.sample(n=500, random_state=0).reset_index(drop=True)
    display(df_train.groupby('fold').size())

# tokenizer

In [11]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
import shutil
from pathlib import Path
import transformers

transformers_path = Path(transformers.__file__[:-12])

input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py', "deberta__init__.py"]:
    if str(filename).startswith("deberta"):
        filepath = deberta_v2_path/str(filename).replace("deberta", "")
    else:
        filepath = deberta_v2_path/filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)

In [12]:
from transformers.models.deberta_v2 import DebertaV2TokenizerFast

%env TOKENIZERS_PARALLELISM=true
tokenizer = DebertaV2TokenizerFast.from_pretrained(config.model)
tokenizer.save_pretrained(config.output_dir+'tokenizer/')

env: TOKENIZERS_PARALLELISM=true


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('../output/exp067tokenizer/tokenizer_config.json',
 '../output/exp067tokenizer/special_tokens_map.json',
 '../output/exp067tokenizer/spm.model',
 '../output/exp067tokenizer/added_tokens.json',
 '../output/exp067tokenizer/tokenizer.json')

In [13]:
pn_history_lengths = []
for text in patient_notes['pn_history'].fillna("").to_list():
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    pn_history_lengths.append(length)
pn_history_max_len = max(pn_history_lengths)
LOGGER.info(f'pn_history max(lengths): {pn_history_max_len}')

features_lengths = []
for _, col in df_train.filter(like='feature_text').iteritems():
    for text in col.fillna("").to_list():
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
feature_text_max_len = max(features_lengths)
LOGGER.info(f'feature_text max(lengths): {feature_text_max_len}')

config.max_len = pn_history_max_len+feature_text_max_len + 3
LOGGER.info(f"max_len: {config.max_len}")

pn_history max(lengths): 323
feature_text max(lengths): 32
max_len: 358


# Dataset

In [14]:
from pandas import DataFrame
import torch
from torch import Tensor
from torch.utils.data import Dataset
from transformers.tokenization_utils import PreTrainedTokenizer

class TrainDataset(Dataset):
    def __init__(
        self, 
        tokenizer:PreTrainedTokenizer, 
        max_len:int,
        feature_text_max_len:int, 
        pn_history_max_len:int, 
        df:DataFrame,
        feature_retranslate:bool=True
    ) -> None:
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.feature_text_max_len = feature_text_max_len
        self.pn_history_max_len = pn_history_max_len
        self.feature_texts = df.filter(like='feature_text').to_numpy()
        self.pn_historys = df['pn_history'].to_numpy()
        self.annotation_lengths = df['annotation_length'].to_numpy()
        self.locations = df['location'].to_numpy()
        self.feature_retranslate = feature_retranslate

    def prepare_input_with_fixed_position(self, pn_history:str, feature_text:str) -> dict:

        pn_history_token = self.tokenizer(
            pn_history, 
            add_special_tokens=True,
            max_length=self.pn_history_max_len+2, 
            padding='max_length',
            return_offsets_mapping=False)
        
        feature_text_token = self.tokenizer(
            feature_text, 
            add_special_tokens=True,
            max_length=self.feature_text_max_len+2, 
            padding='max_length',
            return_offsets_mapping=False)
        for k,v in feature_text_token.items():
            feature_text_token[k] = v[1:]

        token = {
            'input_ids': pn_history_token['input_ids']+feature_text_token['input_ids'],
            'attention_mask': pn_history_token['attention_mask']+feature_text_token['attention_mask'],
            'token_type_ids': pn_history_token['token_type_ids']+feature_text_token['token_type_ids']
        }
        for k, v in token.items():
            token[k] = torch.tensor(v[:self.max_len], dtype=torch.long)
        return token
    
    def prepare_input(self, text:str, feature_text:str) -> dict:
        token = self.tokenizer(text, feature_text, 
                               add_special_tokens=True,
                               max_length=self.max_len,
                               padding="max_length",
                               return_offsets_mapping=False)
        for k, v in token.items():
            token[k] = torch.tensor(v[:self.max_len], dtype=torch.long)
        return token
    
    def create_label(self, text:str, annotation_length:int, location_list:list) -> Tensor:
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True)
        offset_mapping = encoded['offset_mapping']
        ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
        label = np.zeros(len(offset_mapping))
        label[ignore_idxes] = -1
        if annotation_length != 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(';')]:
                    start_idx = -1
                    end_idx = -1
                    start, end = int(loc[0]), int(loc[1])
                    for idx in range(len(offset_mapping)):
                        if (start_idx == -1) & (start < offset_mapping[idx][0]):
                            start_idx = idx - 1
                        if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                            end_idx = idx + 1
                    if start_idx == -1:
                        start_idx = end_idx
                    if (start_idx != -1) & (end_idx != -1):
                        label[start_idx:end_idx] = 1
        return torch.tensor(label[:self.max_len], dtype=torch.float)

    def __len__(self) -> int:
        return len(self.feature_texts)

    def __getitem__(self, item:int) -> tuple:
        if self.feature_retranslate:
            feature_text=np.random.choice(self.feature_texts[item],p=config.choice_weight)
        else:
            feature_text=self.feature_texts[item][0]
        inputs = self.prepare_input_with_fixed_position(
            self.pn_historys[item],
            feature_text)
        label = self.create_label(
            self.pn_historys[item], 
            self.annotation_lengths[item], 
            self.locations[item])
        return inputs, label

In [15]:
ds = TrainDataset(
    tokenizer=tokenizer, 
    max_len=config.max_len,
    feature_text_max_len=feature_text_max_len, 
    pn_history_max_len=pn_history_max_len, 
    df=df_train
)

# Model

In [16]:
from torch import Tensor
from torch import nn
from torch.nn import Module
from transformers import AutoModel, AutoConfig

class CustomModel(Module):
    def __init__(self, model_name:str, config_path:str=None, pretrained:bool=False) -> None:
        super().__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                model_name, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(
                config.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module) -> None:
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs:Tensor) -> Tensor:
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs:Tensor) -> Tensor:
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# Logging

In [17]:
import time
from math import floor
from torch import inference_mode

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val:float, n=1) -> None:
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s) -> str:
    m = floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent) -> str:
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

# Trainer

In [18]:
import time
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import Module
from torch.optim import AdamW
from torch import cuda
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm.auto import tqdm
from wandb.sdk.wandb_config import Config

def get_optimizer_params(model:Module, encoder_lr:float, decoder_lr:float, weight_decay:float=0.0) -> list:
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
        'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
        'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
        'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

def get_scheduler(scheduler:str, optimizer, num_warmup_steps:int, num_train_steps:int, num_cycles:int):
    if scheduler=='linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=num_warmup_steps, 
            num_training_steps=num_train_steps
        )
    elif scheduler=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=num_warmup_steps, 
            num_training_steps=num_train_steps, 
            num_cycles=num_cycles
        )
    else:
        raise ValueError('Invalid Scheduler Name.')
    return scheduler

class Trainer:

    def __init__(self, config:Config, tokenizer:PreTrainedTokenizer) -> None:
        self.config = config
        self.tokenizer = tokenizer
        self.criterion = nn.BCEWithLogitsLoss(reduction="none")
        self.device = torch.device('cuda' if cuda.is_available() else 'cpu')

    def train(self, model:Module, fold:int, tr_dl:DataLoader, optimizer, epoch:int, scheduler):
        model.train()
        scaler = cuda.amp.GradScaler(enabled=self.config.apex)
        losses = AverageMeter()
        start = end = time.time()
        global_step = 0
        for step, (inputs, labels) in enumerate(tr_dl):
            for k, v in inputs.items():
                inputs[k] = v.to(self.device)
            labels = labels.to(self.device)
            batch_size = labels.size(0)
            with cuda.amp.autocast(enabled=self.config.apex):
                y_preds = model(inputs)
            loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
            loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
            if self.config.gradient_accumulation_steps > 1:
                loss = loss / self.config.gradient_accumulation_steps
            losses.update(loss.item(), batch_size)
            scaler.scale(loss).backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), self.config.max_grad_norm)
            if (step + 1) % self.config.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                global_step += 1
                if self.config.batch_scheduler:
                    scheduler.step()
            end = time.time()
            if step % self.config.print_freq == 0 or step == (len(tr_dl)-1):
                print('Epoch: [{0}][{1}/{2}] '
                    'Elapsed {remain:s} '
                    'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                    'Grad: {grad_norm:.4f}  '
                    'LR: {lr:.8f}  '
                    .format(epoch+1, step, len(tr_dl), 
                            remain=timeSince(start, float(step+1)/len(tr_dl)),
                            loss=losses,
                            grad_norm=grad_norm,
                            lr=scheduler.get_lr()[0]))
            wandb.log({f"[fold{fold}] loss": losses.val,
                    f"[fold{fold}] lr": scheduler.get_lr()[0]})
        return losses.avg

    @inference_mode()
    def validate(self, model:Module, vl_dl:DataLoader) -> tuple:
        model.eval()
        losses = AverageMeter()
        preds = []
        start = end = time.time()
        for step, (inputs, labels) in enumerate(vl_dl):
            for k, v in inputs.items():
                inputs[k] = v.to(self.device)
            labels = labels.to(self.device)
            batch_size = labels.size(0)
            y_preds = model(inputs)
            loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
            loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
            if self.config.gradient_accumulation_steps > 1:
                loss = loss / self.config.gradient_accumulation_steps
            losses.update(loss.item(), batch_size)
            preds.append(y_preds.sigmoid().to('cpu').numpy())
            end = time.time()
            if step % self.config.print_freq == 0 or step == (len(vl_dl)-1):
                print('EVAL: [{0}/{1}] '
                    'Elapsed {remain:s} '
                    'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                    .format(step, len(vl_dl),
                            loss=losses,
                            remain=timeSince(start, float(step+1)/len(vl_dl))))
        return losses.avg, np.concatenate(preds)

    def create_dl(self, df:DataFrame, feature_text_max_len:int, pn_history_max_len:int, is_train:bool) -> DataLoader:
        ds = TrainDataset(
            tokenizer=self.tokenizer,
            max_len=self.config.max_len,
            feature_text_max_len=feature_text_max_len,
            pn_history_max_len=pn_history_max_len,
            df=df,
            feature_retranslate=is_train)
        return DataLoader(
            ds,
            batch_size=self.config.batch_size if is_train else self.config.batch_size * 2,
            shuffle=is_train,
            num_workers=self.config.num_workers,
            pin_memory=True, 
            drop_last=is_train)
    
    def log_epoch_result(self, f:int, ep:int, avg_tr_loss:float, avg_vl_loss:float, elapsed:float, score:float, best_th:float) -> None:
        LOGGER.info(
            f'Epoch {ep} - avg_train_loss: {avg_tr_loss:.4f}  avg_val_loss: {avg_vl_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(
            f'Epoch {ep} - Score: {score:.4f} for th={best_th}')
        wandb.log(
            {
                f"[fold{f}] epoch": ep, 
                f"[fold{f}] avg_train_loss": avg_tr_loss, 
                f"[fold{f}] avg_val_loss": avg_vl_loss,
                f"[fold{f}] score": score,
                f"[fold{f}] best_th": best_th
            }
        )

    def run(
        self,
        df:DataFrame,
        feature_text_max_len:int, 
        pn_history_max_len:int) -> None:
    
        oof_df = pd.DataFrame()
        for f in range(self.config.n_folds):
            LOGGER.info(f"========== fold: {f} training ==========")
            
            model = CustomModel(
                self.config.model, 
                config_path=None, 
                pretrained=True).to(self.device)

            tr_df = df[df['fold'] != f].reset_index(drop=True)
            tr_dl = self.create_dl(
                df=tr_df, 
                feature_text_max_len=feature_text_max_len, 
                pn_history_max_len=pn_history_max_len, 
                is_train=True)
            num_train_steps = int(len(tr_df) / self.config.batch_size * self.config.epochs)
            
            vl_df = df[df['fold'] == f].reset_index(drop=True)
            vl_dl = self.create_dl(
                df=vl_df, 
                feature_text_max_len=feature_text_max_len, 
                pn_history_max_len=pn_history_max_len, 
                is_train=False)
            valid_texts = vl_df['pn_history'].to_numpy()
            valid_labels = create_labels_for_scoring(vl_df)

            optimizer_parameters = get_optimizer_params(
                model,
                encoder_lr=self.config.encoder_lr, 
                decoder_lr=self.config.decoder_lr,
                weight_decay=self.config.weight_decay)
            optimizer = AdamW(
                optimizer_parameters, 
                lr=self.config.encoder_lr, 
                eps=self.config.eps, 
                betas=self.config.betas)
            scheduler = get_scheduler(
                scheduler=self.config.scheduler, 
                optimizer=optimizer, 
                num_warmup_steps=self.config.num_warmup_steps,
                num_train_steps=num_train_steps,
                num_cycles=self.config.num_cycles)
            
            best_score = -100.0

            for epoch in range(self.config.epochs):
                
                start_time = time.time()
                
                # train
                avg_tr_loss = self.train(
                    model,
                    f, 
                    tr_dl, 
                    optimizer, 
                    epoch, 
                    scheduler)

                # eval
                avg_vl_loss, predictions = self.validate(
                    model, 
                    vl_dl
                )
                predictions = predictions.reshape(
                    (len(vl_df), 
                    self.config.max_len))
                
                # scoring
                char_probs = get_char_probs(
                    valid_texts, 
                    predictions, 
                    self.tokenizer)
                # ここをしきい値で探索した最適な値にする
                score=-100
                for th in np.arange(0.3,0.7,0.005):
                    th = np.round(th,4)
                    results = get_results(char_probs, th=th)
                    preds = get_predictions(results)
                    tmp_score = get_score(valid_labels, preds)
                    if tmp_score > score:
                        best_th=th
                        score=tmp_score
                
                self.log_epoch_result(
                    f=f,
                    ep=epoch+1, 
                    avg_tr_loss=avg_tr_loss, 
                    avg_vl_loss=avg_vl_loss, 
                    elapsed=time.time() - start_time, 
                    score=score, 
                    best_th=best_th)
                
                if score > best_score:
                    best_score = score
                    LOGGER.info(f'Epoch {epoch+1} - Save Score: {best_score:.4f} Model')
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'predictions': predictions},
                        f'{self.config.output_dir}{self.config.ckpt_name}_fold{f}_best.pth')

            predictions = torch.load(
                f'{self.config.output_dir}{self.config.ckpt_name}_fold{f}_best.pth', 
                map_location=torch.device('cpu'))['predictions']
            vl_df[[i for i in range(self.config.max_len)]] = predictions
            oof_df = pd.concat([oof_df, vl_df])
            LOGGER.info(f"========== fold: {f} result ==========")
            get_result(vl_df, self.tokenizer, self.config.max_len)
            oof_df.to_pickle(f'{self.config.output_dir}oof_df_fold{f}.pkl')
            wandb.alert(
                title=f"fold{f} Finished", 
                text=f'{self.config.model} has finished its fold{f} running.'
            )
        
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df, self.tokenizer, self.config.max_len)
        oof_df.to_pickle(self.config.output_dir+'oof_df.pkl')

In [20]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
trainer = Trainer(
    config=config,
    tokenizer=tokenizer
)
trainer.run(
    df=df_train,
    feature_text_max_len=feature_text_max_len, 
    pn_history_max_len=pn_history_max_len
)

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1906] Elapsed 0m 1s (remain 58m 40s) Loss: 0.6006(0.6006) Grad: inf  LR: 0.00001500  
Epoch: [1][100/1906] Elapsed 0m 27s (remain 8m 14s) Loss: 0.0445(0.0880) Grad: 1529.0291  LR: 0.00001500  
Epoch: [1][200/1906] Elapsed 0m 53s (remain 7m 30s) Loss: 0.0224(0.0613) Grad: 1517.4312  LR: 0.00001499  
Epoch: [1][300/1906] Elapsed 1m 18s (remain 6m 55s) Loss: 0.0141(0.0496) Grad: 1541.2185  LR: 0.00001497  
Epoch: [1][400/1906] Elapsed 1m 42s (remain 6m 26s) Loss: 0.0156(0.0425) Grad: 864.7016  LR: 0.00001495  
Epoch: [1][500/1906] Elapsed 2m 8s (remain 5m 59s) Loss: 0.0051(0.0381) Grad: 758.2766  LR: 0.00001493  
Epoch: [1][600/1906] Elapsed 2m 33s (remain 5m 32s) Loss: 0.0075(0.0350) Grad: 1255.6080  LR: 0.00001490  
Epoch: [1][700/1906] Elapsed 2m 58s (remain 5m 6s) Loss: 0.0126(0.0328) Grad: 2487.5571  LR: 0.00001486  
Epoch: [1][800/1906] Elapsed 3m 23s (remain 4m 41s) Loss: 0.0011(0.0309) Grad: 475.2433  LR: 0.00001482  
Epoch: [1][900/1906] Elapsed 3m 49s (remain 4m 15s

Epoch 1 - avg_train_loss: 0.0225  avg_val_loss: 0.0143  time: 538s
Epoch 1 - Score: 0.8615 for th=0.3
Epoch 1 - Save Score: 0.8615 Model


Epoch: [2][0/1906] Elapsed 0m 0s (remain 14m 6s) Loss: 0.0037(0.0037) Grad: 37574.8633  LR: 0.00001399  
Epoch: [2][100/1906] Elapsed 0m 25s (remain 7m 40s) Loss: 0.0054(0.0139) Grad: 55776.1016  LR: 0.00001389  
Epoch: [2][200/1906] Elapsed 0m 50s (remain 7m 10s) Loss: 0.0124(0.0131) Grad: 21388.6934  LR: 0.00001378  
Epoch: [2][300/1906] Elapsed 1m 15s (remain 6m 43s) Loss: 0.0241(0.0130) Grad: 31872.4102  LR: 0.00001366  
Epoch: [2][400/1906] Elapsed 1m 40s (remain 6m 17s) Loss: 0.0111(0.0124) Grad: 15971.2617  LR: 0.00001354  
Epoch: [2][500/1906] Elapsed 2m 5s (remain 5m 51s) Loss: 0.0028(0.0120) Grad: 5655.6514  LR: 0.00001342  
Epoch: [2][600/1906] Elapsed 2m 30s (remain 5m 26s) Loss: 0.0009(0.0122) Grad: 3356.9697  LR: 0.00001329  
Epoch: [2][700/1906] Elapsed 2m 55s (remain 5m 2s) Loss: 0.0062(0.0121) Grad: 11371.4531  LR: 0.00001316  
Epoch: [2][800/1906] Elapsed 3m 20s (remain 4m 36s) Loss: 0.0029(0.0118) Grad: 13725.5498  LR: 0.00001302  
Epoch: [2][900/1906] Elapsed 3m 45s

Epoch 2 - avg_train_loss: 0.0107  avg_val_loss: 0.0126  time: 536s
Epoch 2 - Score: 0.8780 for th=0.47
Epoch 2 - Save Score: 0.8780 Model


Epoch: [3][0/1906] Elapsed 0m 0s (remain 14m 7s) Loss: 0.0058(0.0058) Grad: 13797.7178  LR: 0.00001125  
Epoch: [3][100/1906] Elapsed 0m 25s (remain 7m 38s) Loss: 0.0106(0.0078) Grad: 9635.3369  LR: 0.00001107  
Epoch: [3][200/1906] Elapsed 0m 50s (remain 7m 10s) Loss: 0.0066(0.0075) Grad: 8183.1704  LR: 0.00001089  
Epoch: [3][300/1906] Elapsed 1m 15s (remain 6m 43s) Loss: 0.0039(0.0080) Grad: 10828.4248  LR: 0.00001070  
Epoch: [3][400/1906] Elapsed 1m 40s (remain 6m 18s) Loss: 0.0198(0.0083) Grad: 21310.9043  LR: 0.00001052  
Epoch: [3][500/1906] Elapsed 2m 5s (remain 5m 53s) Loss: 0.0033(0.0087) Grad: 11003.6270  LR: 0.00001033  
Epoch: [3][600/1906] Elapsed 2m 31s (remain 5m 28s) Loss: 0.0044(0.0089) Grad: 12139.2471  LR: 0.00001013  
Epoch: [3][700/1906] Elapsed 2m 55s (remain 5m 2s) Loss: 0.0031(0.0090) Grad: 12724.6064  LR: 0.00000994  
Epoch: [3][800/1906] Elapsed 3m 20s (remain 4m 37s) Loss: 0.0055(0.0089) Grad: 47760.0703  LR: 0.00000975  
Epoch: [3][900/1906] Elapsed 3m 45s

Epoch 3 - avg_train_loss: 0.0093  avg_val_loss: 0.0122  time: 535s
Epoch 3 - Score: 0.8813 for th=0.355
Epoch 3 - Save Score: 0.8813 Model


Epoch: [4][0/1906] Elapsed 0m 0s (remain 14m 23s) Loss: 0.0053(0.0053) Grad: 10264.8193  LR: 0.00000750  
Epoch: [4][100/1906] Elapsed 0m 25s (remain 7m 38s) Loss: 0.0002(0.0078) Grad: 1251.3422  LR: 0.00000730  
Epoch: [4][200/1906] Elapsed 0m 51s (remain 7m 13s) Loss: 0.0025(0.0074) Grad: 8598.3770  LR: 0.00000709  
Epoch: [4][300/1906] Elapsed 1m 16s (remain 6m 45s) Loss: 0.0024(0.0078) Grad: 9560.1182  LR: 0.00000688  
Epoch: [4][400/1906] Elapsed 1m 40s (remain 6m 18s) Loss: 0.0004(0.0074) Grad: 3444.8555  LR: 0.00000668  
Epoch: [4][500/1906] Elapsed 2m 5s (remain 5m 52s) Loss: 0.0270(0.0071) Grad: 99898.1953  LR: 0.00000648  
Epoch: [4][600/1906] Elapsed 2m 30s (remain 5m 27s) Loss: 0.0289(0.0075) Grad: 46527.1367  LR: 0.00000627  
Epoch: [4][700/1906] Elapsed 2m 55s (remain 5m 2s) Loss: 0.0076(0.0076) Grad: 21915.9824  LR: 0.00000607  
Epoch: [4][800/1906] Elapsed 3m 20s (remain 4m 37s) Loss: 0.0001(0.0076) Grad: 363.9773  LR: 0.00000587  
Epoch: [4][900/1906] Elapsed 3m 45s (r

Epoch 4 - avg_train_loss: 0.0080  avg_val_loss: 0.0128  time: 535s
Epoch 4 - Score: 0.8844 for th=0.66
Epoch 4 - Save Score: 0.8844 Model


Epoch: [5][0/1906] Elapsed 0m 0s (remain 14m 27s) Loss: 0.0005(0.0005) Grad: 1707.2677  LR: 0.00000375  
Epoch: [5][100/1906] Elapsed 0m 25s (remain 7m 36s) Loss: 0.0047(0.0055) Grad: 9204.2168  LR: 0.00000358  
Epoch: [5][200/1906] Elapsed 0m 50s (remain 7m 8s) Loss: 0.0060(0.0069) Grad: 22812.1855  LR: 0.00000340  
Epoch: [5][300/1906] Elapsed 1m 15s (remain 6m 42s) Loss: 0.0008(0.0076) Grad: 3371.0845  LR: 0.00000323  
Epoch: [5][400/1906] Elapsed 1m 40s (remain 6m 17s) Loss: 0.0137(0.0075) Grad: 29222.9102  LR: 0.00000306  
Epoch: [5][500/1906] Elapsed 2m 5s (remain 5m 51s) Loss: 0.0014(0.0073) Grad: 15594.7295  LR: 0.00000290  
Epoch: [5][600/1906] Elapsed 2m 30s (remain 5m 25s) Loss: 0.0004(0.0071) Grad: 2866.6492  LR: 0.00000274  
Epoch: [5][700/1906] Elapsed 2m 54s (remain 5m 0s) Loss: 0.0146(0.0070) Grad: 36361.0508  LR: 0.00000258  
Epoch: [5][800/1906] Elapsed 3m 19s (remain 4m 35s) Loss: 0.0150(0.0068) Grad: 80437.5000  LR: 0.00000243  
Epoch: [5][900/1906] Elapsed 3m 44s (

Epoch 5 - avg_train_loss: 0.0070  avg_val_loss: 0.0149  time: 535s
Epoch 5 - Score: 0.8836 for th=0.575


Epoch: [6][0/1906] Elapsed 0m 0s (remain 14m 30s) Loss: 0.0000(0.0000) Grad: 52.7082  LR: 0.00000101  
Epoch: [6][100/1906] Elapsed 0m 25s (remain 7m 37s) Loss: 0.0422(0.0076) Grad: 37491.3086  LR: 0.00000091  
Epoch: [6][200/1906] Elapsed 0m 50s (remain 7m 9s) Loss: 0.0036(0.0072) Grad: 10368.0508  LR: 0.00000081  
Epoch: [6][300/1906] Elapsed 1m 15s (remain 6m 45s) Loss: 0.0028(0.0074) Grad: 71705.1797  LR: 0.00000072  
Epoch: [6][400/1906] Elapsed 1m 41s (remain 6m 19s) Loss: 0.0071(0.0070) Grad: 26276.8164  LR: 0.00000063  
Epoch: [6][500/1906] Elapsed 2m 5s (remain 5m 53s) Loss: 0.0037(0.0069) Grad: 22582.3438  LR: 0.00000055  
Epoch: [6][600/1906] Elapsed 2m 30s (remain 5m 27s) Loss: 0.0105(0.0067) Grad: 30523.7715  LR: 0.00000048  
Epoch: [6][700/1906] Elapsed 2m 55s (remain 5m 2s) Loss: 0.0068(0.0067) Grad: 47482.7812  LR: 0.00000041  
Epoch: [6][800/1906] Elapsed 3m 20s (remain 4m 36s) Loss: 0.0015(0.0067) Grad: 5983.4995  LR: 0.00000035  
Epoch: [6][900/1906] Elapsed 3m 45s (

Epoch 6 - avg_train_loss: 0.0062  avg_val_loss: 0.0150  time: 536s
Epoch 6 - Score: 0.8836 for th=0.555
Score: 0.8844 Best threshold:: 0.66
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect t

Epoch: [1][0/1906] Elapsed 0m 0s (remain 12m 39s) Loss: 1.0063(1.0063) Grad: inf  LR: 0.00001500  
Epoch: [1][100/1906] Elapsed 0m 25s (remain 7m 34s) Loss: 0.0116(0.1205) Grad: 1643.7045  LR: 0.00001500  
Epoch: [1][200/1906] Elapsed 0m 50s (remain 7m 7s) Loss: 0.0095(0.0748) Grad: 7135.2642  LR: 0.00001499  
Epoch: [1][300/1906] Elapsed 1m 15s (remain 6m 43s) Loss: 0.0103(0.0573) Grad: 1382.4327  LR: 0.00001497  
Epoch: [1][400/1906] Elapsed 1m 40s (remain 6m 17s) Loss: 0.0027(0.0486) Grad: 749.0032  LR: 0.00001495  
Epoch: [1][500/1906] Elapsed 2m 5s (remain 5m 53s) Loss: 0.0045(0.0431) Grad: 1437.2698  LR: 0.00001493  
Epoch: [1][600/1906] Elapsed 2m 31s (remain 5m 28s) Loss: 0.0481(0.0395) Grad: 3265.0762  LR: 0.00001490  
Epoch: [1][700/1906] Elapsed 2m 56s (remain 5m 2s) Loss: 0.0297(0.0367) Grad: 1902.6162  LR: 0.00001486  
Epoch: [1][800/1906] Elapsed 3m 21s (remain 4m 37s) Loss: 0.0088(0.0345) Grad: 1295.3027  LR: 0.00001482  
Epoch: [1][900/1906] Elapsed 3m 46s (remain 4m 12

Epoch 1 - avg_train_loss: 0.0239  avg_val_loss: 0.0131  time: 535s
Epoch 1 - Score: 0.8612 for th=0.465
Epoch 1 - Save Score: 0.8612 Model


Epoch: [2][0/1906] Elapsed 0m 0s (remain 14m 5s) Loss: 0.0034(0.0034) Grad: 4577.4028  LR: 0.00001399  
Epoch: [2][100/1906] Elapsed 0m 25s (remain 7m 32s) Loss: 0.0095(0.0110) Grad: 20239.7715  LR: 0.00001389  
Epoch: [2][200/1906] Elapsed 0m 50s (remain 7m 6s) Loss: 0.0126(0.0118) Grad: 52507.3672  LR: 0.00001378  
Epoch: [2][300/1906] Elapsed 1m 15s (remain 6m 40s) Loss: 0.0028(0.0110) Grad: 7710.0879  LR: 0.00001366  
Epoch: [2][400/1906] Elapsed 1m 40s (remain 6m 15s) Loss: 0.0107(0.0110) Grad: 35143.1484  LR: 0.00001354  
Epoch: [2][500/1906] Elapsed 2m 5s (remain 5m 51s) Loss: 0.0010(0.0109) Grad: 3991.4290  LR: 0.00001342  
Epoch: [2][600/1906] Elapsed 2m 30s (remain 5m 26s) Loss: 0.0012(0.0104) Grad: 4149.2979  LR: 0.00001329  
Epoch: [2][700/1906] Elapsed 2m 55s (remain 5m 1s) Loss: 0.0108(0.0105) Grad: 16762.3340  LR: 0.00001316  
Epoch: [2][800/1906] Elapsed 3m 20s (remain 4m 36s) Loss: 0.0005(0.0101) Grad: 1454.6656  LR: 0.00001302  
Epoch: [2][900/1906] Elapsed 3m 45s (re

Epoch 2 - avg_train_loss: 0.0100  avg_val_loss: 0.0129  time: 534s
Epoch 2 - Score: 0.8824 for th=0.325
Epoch 2 - Save Score: 0.8824 Model


Epoch: [3][0/1906] Elapsed 0m 0s (remain 15m 12s) Loss: 0.0194(0.0194) Grad: 14930.6270  LR: 0.00001125  
Epoch: [3][100/1906] Elapsed 0m 25s (remain 7m 39s) Loss: 0.0015(0.0089) Grad: 6639.5098  LR: 0.00001107  
Epoch: [3][200/1906] Elapsed 0m 50s (remain 7m 11s) Loss: 0.0099(0.0093) Grad: 19350.7676  LR: 0.00001089  
Epoch: [3][300/1906] Elapsed 1m 15s (remain 6m 44s) Loss: 0.0003(0.0087) Grad: 2777.9202  LR: 0.00001070  
Epoch: [3][400/1906] Elapsed 1m 41s (remain 6m 19s) Loss: 0.0079(0.0085) Grad: 17446.9648  LR: 0.00001052  
Epoch: [3][500/1906] Elapsed 2m 6s (remain 5m 53s) Loss: 0.0180(0.0086) Grad: 22617.0449  LR: 0.00001033  
Epoch: [3][600/1906] Elapsed 2m 31s (remain 5m 27s) Loss: 0.0029(0.0086) Grad: 23861.1738  LR: 0.00001013  
Epoch: [3][700/1906] Elapsed 2m 55s (remain 5m 2s) Loss: 0.0040(0.0085) Grad: 15521.0332  LR: 0.00000994  
Epoch: [3][800/1906] Elapsed 3m 20s (remain 4m 37s) Loss: 0.0003(0.0084) Grad: 2214.5125  LR: 0.00000975  
Epoch: [3][900/1906] Elapsed 3m 45s

Epoch 3 - avg_train_loss: 0.0085  avg_val_loss: 0.0137  time: 536s
Epoch 3 - Score: 0.8805 for th=0.32


Epoch: [4][0/1906] Elapsed 0m 0s (remain 14m 46s) Loss: 0.0091(0.0091) Grad: 19609.5977  LR: 0.00000750  
Epoch: [4][100/1906] Elapsed 0m 25s (remain 7m 39s) Loss: 0.0003(0.0070) Grad: 1663.5278  LR: 0.00000730  
Epoch: [4][200/1906] Elapsed 0m 50s (remain 7m 11s) Loss: 0.0077(0.0067) Grad: 27464.3047  LR: 0.00000709  
Epoch: [4][300/1906] Elapsed 1m 15s (remain 6m 44s) Loss: 0.0016(0.0074) Grad: 10533.5781  LR: 0.00000688  
Epoch: [4][400/1906] Elapsed 1m 40s (remain 6m 18s) Loss: 0.0001(0.0074) Grad: 496.6645  LR: 0.00000668  
Epoch: [4][500/1906] Elapsed 2m 5s (remain 5m 52s) Loss: 0.0001(0.0074) Grad: 1041.9352  LR: 0.00000648  
Epoch: [4][600/1906] Elapsed 2m 30s (remain 5m 27s) Loss: 0.0001(0.0077) Grad: 429.6658  LR: 0.00000627  
Epoch: [4][700/1906] Elapsed 2m 55s (remain 5m 2s) Loss: 0.0340(0.0079) Grad: 45131.1523  LR: 0.00000607  
Epoch: [4][800/1906] Elapsed 3m 20s (remain 4m 37s) Loss: 0.0022(0.0078) Grad: 11251.0293  LR: 0.00000587  
Epoch: [4][900/1906] Elapsed 3m 45s (r

Epoch 4 - avg_train_loss: 0.0073  avg_val_loss: 0.0150  time: 535s
Epoch 4 - Score: 0.8848 for th=0.35
Epoch 4 - Save Score: 0.8848 Model


Epoch: [5][0/1906] Elapsed 0m 0s (remain 14m 59s) Loss: 0.0051(0.0051) Grad: 26165.1230  LR: 0.00000375  
Epoch: [5][100/1906] Elapsed 0m 25s (remain 7m 40s) Loss: 0.0014(0.0065) Grad: 7427.2344  LR: 0.00000358  
Epoch: [5][200/1906] Elapsed 0m 50s (remain 7m 12s) Loss: 0.0177(0.0057) Grad: 32120.1953  LR: 0.00000340  
Epoch: [5][300/1906] Elapsed 1m 16s (remain 6m 45s) Loss: 0.0013(0.0056) Grad: 81047.9453  LR: 0.00000323  
Epoch: [5][400/1906] Elapsed 1m 41s (remain 6m 22s) Loss: 0.0011(0.0061) Grad: 39624.2461  LR: 0.00000306  
Epoch: [5][500/1906] Elapsed 2m 6s (remain 5m 56s) Loss: 0.0009(0.0062) Grad: 5237.9912  LR: 0.00000290  
Epoch: [5][600/1906] Elapsed 2m 31s (remain 5m 29s) Loss: 0.0064(0.0061) Grad: 15729.2109  LR: 0.00000274  
Epoch: [5][700/1906] Elapsed 2m 56s (remain 5m 3s) Loss: 0.0012(0.0062) Grad: 10521.5469  LR: 0.00000258  
Epoch: [5][800/1906] Elapsed 3m 21s (remain 4m 38s) Loss: 0.0282(0.0061) Grad: 153940.0312  LR: 0.00000243  
Epoch: [5][900/1906] Elapsed 3m 4

Epoch 5 - avg_train_loss: 0.0062  avg_val_loss: 0.0161  time: 536s
Epoch 5 - Score: 0.8812 for th=0.345


Epoch: [6][0/1906] Elapsed 0m 0s (remain 14m 38s) Loss: 0.0041(0.0041) Grad: 15353.1055  LR: 0.00000101  
Epoch: [6][100/1906] Elapsed 0m 25s (remain 7m 31s) Loss: 0.0001(0.0051) Grad: 436.2000  LR: 0.00000091  
Epoch: [6][200/1906] Elapsed 0m 50s (remain 7m 6s) Loss: 0.0083(0.0051) Grad: 13693.9434  LR: 0.00000081  
Epoch: [6][300/1906] Elapsed 1m 15s (remain 6m 43s) Loss: 0.0142(0.0049) Grad: 71121.6094  LR: 0.00000072  
Epoch: [6][400/1906] Elapsed 1m 41s (remain 6m 19s) Loss: 0.0021(0.0048) Grad: 9484.0889  LR: 0.00000063  
Epoch: [6][500/1906] Elapsed 2m 6s (remain 5m 53s) Loss: 0.0007(0.0049) Grad: 5058.2051  LR: 0.00000055  
Epoch: [6][600/1906] Elapsed 2m 31s (remain 5m 28s) Loss: 0.0006(0.0052) Grad: 7032.0693  LR: 0.00000048  
Epoch: [6][700/1906] Elapsed 2m 56s (remain 5m 2s) Loss: 0.0000(0.0051) Grad: 130.0101  LR: 0.00000041  
Epoch: [6][800/1906] Elapsed 3m 21s (remain 4m 37s) Loss: 0.0006(0.0052) Grad: 4389.5215  LR: 0.00000035  
Epoch: [6][900/1906] Elapsed 3m 45s (rema

Epoch 6 - avg_train_loss: 0.0056  avg_val_loss: 0.0167  time: 535s
Epoch 6 - Score: 0.8808 for th=0.3
Score: 0.8848 Best threshold:: 0.35
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to 

Epoch: [1][0/1906] Elapsed 0m 0s (remain 13m 3s) Loss: 0.9086(0.9086) Grad: inf  LR: 0.00001500  
Epoch: [1][100/1906] Elapsed 0m 25s (remain 7m 36s) Loss: 0.0112(0.1079) Grad: 2182.5891  LR: 0.00001500  
Epoch: [1][200/1906] Elapsed 0m 50s (remain 7m 12s) Loss: 0.0366(0.0702) Grad: 6752.6948  LR: 0.00001499  
Epoch: [1][300/1906] Elapsed 1m 15s (remain 6m 43s) Loss: 0.0163(0.0543) Grad: 5261.9121  LR: 0.00001497  
Epoch: [1][400/1906] Elapsed 1m 40s (remain 6m 18s) Loss: 0.0132(0.0473) Grad: 3405.0090  LR: 0.00001495  
Epoch: [1][500/1906] Elapsed 2m 5s (remain 5m 53s) Loss: 0.0082(0.0423) Grad: 2977.6626  LR: 0.00001493  
Epoch: [1][600/1906] Elapsed 2m 30s (remain 5m 27s) Loss: 0.0191(0.0386) Grad: 1954.3705  LR: 0.00001490  
Epoch: [1][700/1906] Elapsed 2m 56s (remain 5m 2s) Loss: 0.0520(0.0358) Grad: 5556.7935  LR: 0.00001486  
Epoch: [1][800/1906] Elapsed 3m 21s (remain 4m 37s) Loss: 0.0032(0.0338) Grad: 323.7139  LR: 0.00001482  
Epoch: [1][900/1906] Elapsed 3m 46s (remain 4m 12

Epoch 1 - avg_train_loss: 0.0236  avg_val_loss: 0.0122  time: 537s
Epoch 1 - Score: 0.8635 for th=0.585
Epoch 1 - Save Score: 0.8635 Model


Epoch: [2][0/1906] Elapsed 0m 0s (remain 15m 31s) Loss: 0.0074(0.0074) Grad: 21075.2891  LR: 0.00001399  
Epoch: [2][100/1906] Elapsed 0m 25s (remain 7m 33s) Loss: 0.0025(0.0116) Grad: 8725.1270  LR: 0.00001389  
Epoch: [2][200/1906] Elapsed 0m 50s (remain 7m 6s) Loss: 0.0054(0.0103) Grad: 10504.4072  LR: 0.00001378  
Epoch: [2][300/1906] Elapsed 1m 15s (remain 6m 41s) Loss: 0.0082(0.0104) Grad: 77086.1406  LR: 0.00001366  
Epoch: [2][400/1906] Elapsed 1m 40s (remain 6m 15s) Loss: 0.0068(0.0110) Grad: 10695.7246  LR: 0.00001354  
Epoch: [2][500/1906] Elapsed 2m 4s (remain 5m 50s) Loss: 0.0074(0.0108) Grad: 28287.5586  LR: 0.00001342  
Epoch: [2][600/1906] Elapsed 2m 29s (remain 5m 25s) Loss: 0.0125(0.0107) Grad: 32904.2852  LR: 0.00001329  
Epoch: [2][700/1906] Elapsed 2m 55s (remain 5m 1s) Loss: 0.0128(0.0107) Grad: 22923.4531  LR: 0.00001316  
Epoch: [2][800/1906] Elapsed 3m 20s (remain 4m 36s) Loss: 0.0040(0.0106) Grad: 32252.1250  LR: 0.00001302  
Epoch: [2][900/1906] Elapsed 3m 45

Epoch 2 - avg_train_loss: 0.0101  avg_val_loss: 0.0132  time: 534s
Epoch 2 - Score: 0.8772 for th=0.515
Epoch 2 - Save Score: 0.8772 Model


Epoch: [3][0/1906] Elapsed 0m 0s (remain 15m 21s) Loss: 0.0048(0.0048) Grad: 15509.4648  LR: 0.00001125  
Epoch: [3][100/1906] Elapsed 0m 25s (remain 7m 37s) Loss: 0.0039(0.0084) Grad: 20311.5918  LR: 0.00001107  
Epoch: [3][200/1906] Elapsed 0m 50s (remain 7m 7s) Loss: 0.0170(0.0082) Grad: 18751.7246  LR: 0.00001089  
Epoch: [3][300/1906] Elapsed 1m 15s (remain 6m 41s) Loss: 0.0742(0.0086) Grad: 79153.2500  LR: 0.00001070  
Epoch: [3][400/1906] Elapsed 1m 40s (remain 6m 15s) Loss: 0.0064(0.0088) Grad: 23775.1094  LR: 0.00001052  
Epoch: [3][500/1906] Elapsed 2m 4s (remain 5m 50s) Loss: 0.0002(0.0087) Grad: 724.7931  LR: 0.00001033  
Epoch: [3][600/1906] Elapsed 2m 30s (remain 5m 26s) Loss: 0.0002(0.0088) Grad: 1205.2024  LR: 0.00001013  
Epoch: [3][700/1906] Elapsed 2m 55s (remain 5m 1s) Loss: 0.0001(0.0088) Grad: 193.7339  LR: 0.00000994  
Epoch: [3][800/1906] Elapsed 3m 19s (remain 4m 35s) Loss: 0.0094(0.0088) Grad: 8384.6885  LR: 0.00000975  
Epoch: [3][900/1906] Elapsed 3m 44s (re

Epoch 3 - avg_train_loss: 0.0087  avg_val_loss: 0.0138  time: 532s
Epoch 3 - Score: 0.8811 for th=0.515
Epoch 3 - Save Score: 0.8811 Model


Epoch: [4][0/1906] Elapsed 0m 0s (remain 14m 55s) Loss: 0.0017(0.0017) Grad: 2764.8450  LR: 0.00000750  
Epoch: [4][100/1906] Elapsed 0m 25s (remain 7m 39s) Loss: 0.0181(0.0078) Grad: 32354.4766  LR: 0.00000730  
Epoch: [4][200/1906] Elapsed 0m 50s (remain 7m 10s) Loss: 0.0289(0.0076) Grad: 55762.8320  LR: 0.00000709  
Epoch: [4][300/1906] Elapsed 1m 15s (remain 6m 42s) Loss: 0.0082(0.0084) Grad: 11249.9131  LR: 0.00000688  
Epoch: [4][400/1906] Elapsed 1m 40s (remain 6m 16s) Loss: 0.0005(0.0084) Grad: 2473.7346  LR: 0.00000668  
Epoch: [4][500/1906] Elapsed 2m 5s (remain 5m 51s) Loss: 0.0064(0.0080) Grad: 26522.1855  LR: 0.00000648  
Epoch: [4][600/1906] Elapsed 2m 30s (remain 5m 26s) Loss: 0.0064(0.0080) Grad: 23239.7031  LR: 0.00000627  
Epoch: [4][700/1906] Elapsed 2m 55s (remain 5m 1s) Loss: 0.0055(0.0082) Grad: 12067.0195  LR: 0.00000607  
Epoch: [4][800/1906] Elapsed 3m 20s (remain 4m 36s) Loss: 0.0024(0.0079) Grad: 10282.7354  LR: 0.00000587  
Epoch: [4][900/1906] Elapsed 3m 45

Epoch 4 - avg_train_loss: 0.0077  avg_val_loss: 0.0145  time: 535s
Epoch 4 - Score: 0.8794 for th=0.575


Epoch: [5][0/1906] Elapsed 0m 0s (remain 14m 48s) Loss: 0.0028(0.0028) Grad: 9631.1094  LR: 0.00000375  
Epoch: [5][100/1906] Elapsed 0m 25s (remain 7m 32s) Loss: 0.0012(0.0053) Grad: 3206.0491  LR: 0.00000358  
Epoch: [5][200/1906] Elapsed 0m 50s (remain 7m 7s) Loss: 0.0062(0.0058) Grad: 20451.3145  LR: 0.00000340  
Epoch: [5][300/1906] Elapsed 1m 15s (remain 6m 40s) Loss: 0.0001(0.0060) Grad: 140.0103  LR: 0.00000323  
Epoch: [5][400/1906] Elapsed 1m 39s (remain 6m 15s) Loss: 0.0105(0.0067) Grad: 23160.0410  LR: 0.00000306  
Epoch: [5][500/1906] Elapsed 2m 5s (remain 5m 50s) Loss: 0.0004(0.0065) Grad: 2003.4817  LR: 0.00000290  
Epoch: [5][600/1906] Elapsed 2m 29s (remain 5m 25s) Loss: 0.0012(0.0064) Grad: 3200.5037  LR: 0.00000274  
Epoch: [5][700/1906] Elapsed 2m 55s (remain 5m 0s) Loss: 0.0036(0.0066) Grad: 11178.0752  LR: 0.00000258  
Epoch: [5][800/1906] Elapsed 3m 20s (remain 4m 35s) Loss: 0.0034(0.0065) Grad: 4886.8359  LR: 0.00000243  
Epoch: [5][900/1906] Elapsed 3m 44s (rem

Epoch 5 - avg_train_loss: 0.0064  avg_val_loss: 0.0150  time: 533s
Epoch 5 - Score: 0.8829 for th=0.45
Epoch 5 - Save Score: 0.8829 Model


Epoch: [6][0/1906] Elapsed 0m 0s (remain 15m 6s) Loss: 0.0004(0.0004) Grad: 2786.8425  LR: 0.00000101  
Epoch: [6][100/1906] Elapsed 0m 25s (remain 7m 32s) Loss: 0.0053(0.0044) Grad: 22625.3672  LR: 0.00000091  
Epoch: [6][200/1906] Elapsed 0m 50s (remain 7m 9s) Loss: 0.0024(0.0055) Grad: 5754.3857  LR: 0.00000081  
Epoch: [6][300/1906] Elapsed 1m 15s (remain 6m 41s) Loss: 0.0030(0.0061) Grad: 20505.8340  LR: 0.00000072  
Epoch: [6][400/1906] Elapsed 1m 40s (remain 6m 16s) Loss: 0.0404(0.0061) Grad: 219187.7656  LR: 0.00000063  
Epoch: [6][500/1906] Elapsed 2m 4s (remain 5m 50s) Loss: 0.0021(0.0059) Grad: 19726.6152  LR: 0.00000055  
Epoch: [6][600/1906] Elapsed 2m 29s (remain 5m 24s) Loss: 0.0002(0.0057) Grad: 1905.2695  LR: 0.00000048  
Epoch: [6][700/1906] Elapsed 2m 54s (remain 4m 59s) Loss: 0.0035(0.0057) Grad: 21252.2852  LR: 0.00000041  
Epoch: [6][800/1906] Elapsed 3m 19s (remain 4m 34s) Loss: 0.0001(0.0056) Grad: 314.1557  LR: 0.00000035  
Epoch: [6][900/1906] Elapsed 3m 44s (

Epoch 6 - avg_train_loss: 0.0056  avg_val_loss: 0.0168  time: 533s
Epoch 6 - Score: 0.8822 for th=0.515
Score: 0.8829 Best threshold:: 0.45
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect t

Epoch: [1][0/1906] Elapsed 0m 0s (remain 14m 36s) Loss: 0.6470(0.6470) Grad: inf  LR: 0.00001500  
Epoch: [1][100/1906] Elapsed 0m 25s (remain 7m 36s) Loss: 0.0345(0.0865) Grad: 2999.7876  LR: 0.00001500  
Epoch: [1][200/1906] Elapsed 0m 50s (remain 7m 10s) Loss: 0.0105(0.0566) Grad: 1454.8508  LR: 0.00001499  
Epoch: [1][300/1906] Elapsed 1m 15s (remain 6m 43s) Loss: 0.0129(0.0471) Grad: 1188.7983  LR: 0.00001497  
Epoch: [1][400/1906] Elapsed 1m 40s (remain 6m 17s) Loss: 0.0071(0.0409) Grad: 1196.0090  LR: 0.00001495  
Epoch: [1][500/1906] Elapsed 2m 5s (remain 5m 51s) Loss: 0.0117(0.0370) Grad: 1414.2686  LR: 0.00001493  
Epoch: [1][600/1906] Elapsed 2m 30s (remain 5m 25s) Loss: 0.0411(0.0340) Grad: 4515.7695  LR: 0.00001490  
Epoch: [1][700/1906] Elapsed 2m 54s (remain 5m 0s) Loss: 0.0256(0.0318) Grad: 2132.2915  LR: 0.00001486  
Epoch: [1][800/1906] Elapsed 3m 19s (remain 4m 35s) Loss: 0.0007(0.0299) Grad: 242.1227  LR: 0.00001482  
Epoch: [1][900/1906] Elapsed 3m 44s (remain 4m 1

Epoch 1 - avg_train_loss: 0.0218  avg_val_loss: 0.0132  time: 533s
Epoch 1 - Score: 0.8618 for th=0.45
Epoch 1 - Save Score: 0.8618 Model


Epoch: [2][0/1906] Elapsed 0m 0s (remain 15m 40s) Loss: 0.0184(0.0184) Grad: 237774.0000  LR: 0.00001399  
Epoch: [2][100/1906] Elapsed 0m 25s (remain 7m 42s) Loss: 0.0029(0.0110) Grad: 5480.0410  LR: 0.00001389  
Epoch: [2][200/1906] Elapsed 0m 50s (remain 7m 11s) Loss: 0.0065(0.0111) Grad: 14557.2207  LR: 0.00001378  
Epoch: [2][300/1906] Elapsed 1m 15s (remain 6m 44s) Loss: 0.0003(0.0115) Grad: 776.2796  LR: 0.00001366  
Epoch: [2][400/1906] Elapsed 1m 40s (remain 6m 17s) Loss: 0.0152(0.0115) Grad: 19904.1094  LR: 0.00001354  
Epoch: [2][500/1906] Elapsed 2m 5s (remain 5m 52s) Loss: 0.0072(0.0111) Grad: 11900.3164  LR: 0.00001342  
Epoch: [2][600/1906] Elapsed 2m 30s (remain 5m 26s) Loss: 0.0036(0.0108) Grad: 16726.2070  LR: 0.00001329  
Epoch: [2][700/1906] Elapsed 2m 55s (remain 5m 1s) Loss: 0.0085(0.0107) Grad: 12649.2676  LR: 0.00001316  
Epoch: [2][800/1906] Elapsed 3m 20s (remain 4m 36s) Loss: 0.0014(0.0108) Grad: 8082.4058  LR: 0.00001302  
Epoch: [2][900/1906] Elapsed 3m 44s

Epoch 2 - avg_train_loss: 0.0102  avg_val_loss: 0.0134  time: 533s
Epoch 2 - Score: 0.8749 for th=0.575
Epoch 2 - Save Score: 0.8749 Model


Epoch: [3][0/1906] Elapsed 0m 0s (remain 15m 36s) Loss: 0.0031(0.0031) Grad: 10984.2012  LR: 0.00001125  
Epoch: [3][100/1906] Elapsed 0m 25s (remain 7m 34s) Loss: 0.0010(0.0078) Grad: 10035.7568  LR: 0.00001107  
Epoch: [3][200/1906] Elapsed 0m 50s (remain 7m 11s) Loss: 0.0033(0.0070) Grad: 10098.4521  LR: 0.00001089  
Epoch: [3][300/1906] Elapsed 1m 15s (remain 6m 44s) Loss: 0.0012(0.0071) Grad: 6495.3931  LR: 0.00001070  
Epoch: [3][400/1906] Elapsed 1m 40s (remain 6m 17s) Loss: 0.0020(0.0079) Grad: 24313.0312  LR: 0.00001052  
Epoch: [3][500/1906] Elapsed 2m 5s (remain 5m 52s) Loss: 0.0001(0.0081) Grad: 490.7624  LR: 0.00001033  
Epoch: [3][600/1906] Elapsed 2m 30s (remain 5m 27s) Loss: 0.0007(0.0081) Grad: 3481.6147  LR: 0.00001013  
Epoch: [3][700/1906] Elapsed 2m 55s (remain 5m 2s) Loss: 0.0005(0.0082) Grad: 3729.0339  LR: 0.00000994  
Epoch: [3][800/1906] Elapsed 3m 20s (remain 4m 37s) Loss: 0.0001(0.0084) Grad: 417.8151  LR: 0.00000975  
Epoch: [3][900/1906] Elapsed 3m 45s (re

Epoch 3 - avg_train_loss: 0.0089  avg_val_loss: 0.0143  time: 534s
Epoch 3 - Score: 0.8771 for th=0.38
Epoch 3 - Save Score: 0.8771 Model


Epoch: [4][0/1906] Elapsed 0m 0s (remain 15m 41s) Loss: 0.0191(0.0191) Grad: 23869.9395  LR: 0.00000750  
Epoch: [4][100/1906] Elapsed 0m 25s (remain 7m 38s) Loss: 0.0035(0.0085) Grad: 8921.7686  LR: 0.00000730  
Epoch: [4][200/1906] Elapsed 0m 50s (remain 7m 8s) Loss: 0.0112(0.0086) Grad: 32252.3359  LR: 0.00000709  
Epoch: [4][300/1906] Elapsed 1m 15s (remain 6m 41s) Loss: 0.0014(0.0080) Grad: 10974.6611  LR: 0.00000688  
Epoch: [4][400/1906] Elapsed 1m 40s (remain 6m 15s) Loss: 0.0007(0.0074) Grad: 12279.8096  LR: 0.00000668  
Epoch: [4][500/1906] Elapsed 2m 4s (remain 5m 50s) Loss: 0.0063(0.0074) Grad: 15526.6035  LR: 0.00000648  
Epoch: [4][600/1906] Elapsed 2m 29s (remain 5m 25s) Loss: 0.0017(0.0074) Grad: 65611.6406  LR: 0.00000627  
Epoch: [4][700/1906] Elapsed 2m 54s (remain 5m 0s) Loss: 0.0008(0.0075) Grad: 4200.7769  LR: 0.00000607  
Epoch: [4][800/1906] Elapsed 3m 19s (remain 4m 35s) Loss: 0.0055(0.0075) Grad: 25301.4688  LR: 0.00000587  
Epoch: [4][900/1906] Elapsed 3m 44s

Epoch 4 - avg_train_loss: 0.0078  avg_val_loss: 0.0157  time: 532s
Epoch 4 - Score: 0.8785 for th=0.545
Epoch 4 - Save Score: 0.8785 Model


Epoch: [5][0/1906] Elapsed 0m 0s (remain 15m 28s) Loss: 0.0182(0.0182) Grad: 48758.8477  LR: 0.00000375  
Epoch: [5][100/1906] Elapsed 0m 25s (remain 7m 37s) Loss: 0.0002(0.0068) Grad: 2365.0435  LR: 0.00000358  
Epoch: [5][200/1906] Elapsed 0m 51s (remain 7m 13s) Loss: 0.0021(0.0073) Grad: 53219.0469  LR: 0.00000340  
Epoch: [5][300/1906] Elapsed 1m 16s (remain 6m 47s) Loss: 0.0001(0.0067) Grad: 984.3024  LR: 0.00000323  
Epoch: [5][400/1906] Elapsed 1m 41s (remain 6m 19s) Loss: 0.0076(0.0069) Grad: 19090.9004  LR: 0.00000306  
Epoch: [5][500/1906] Elapsed 2m 6s (remain 5m 54s) Loss: 0.0058(0.0068) Grad: 12801.9014  LR: 0.00000290  
Epoch: [5][600/1906] Elapsed 2m 31s (remain 5m 28s) Loss: 0.0021(0.0071) Grad: 24225.2793  LR: 0.00000274  
Epoch: [5][700/1906] Elapsed 2m 56s (remain 5m 3s) Loss: 0.0048(0.0070) Grad: 38018.3672  LR: 0.00000258  
Epoch: [5][800/1906] Elapsed 3m 21s (remain 4m 38s) Loss: 0.0066(0.0070) Grad: 10757.9902  LR: 0.00000243  
Epoch: [5][900/1906] Elapsed 3m 46s

Epoch 5 - avg_train_loss: 0.0066  avg_val_loss: 0.0172  time: 534s
Epoch 5 - Score: 0.8812 for th=0.64
Epoch 5 - Save Score: 0.8812 Model


Epoch: [6][0/1906] Elapsed 0m 0s (remain 15m 25s) Loss: 0.0025(0.0025) Grad: 7586.3970  LR: 0.00000101  
Epoch: [6][100/1906] Elapsed 0m 25s (remain 7m 34s) Loss: 0.0195(0.0040) Grad: 20433.4473  LR: 0.00000091  
Epoch: [6][200/1906] Elapsed 0m 50s (remain 7m 5s) Loss: 0.0003(0.0057) Grad: 3011.3818  LR: 0.00000081  
Epoch: [6][300/1906] Elapsed 1m 15s (remain 6m 41s) Loss: 0.0000(0.0056) Grad: 65.9716  LR: 0.00000072  
Epoch: [6][400/1906] Elapsed 1m 40s (remain 6m 15s) Loss: 0.0001(0.0054) Grad: 293.2040  LR: 0.00000063  
Epoch: [6][500/1906] Elapsed 2m 4s (remain 5m 49s) Loss: 0.0021(0.0058) Grad: 9889.3809  LR: 0.00000055  
Epoch: [6][600/1906] Elapsed 2m 29s (remain 5m 25s) Loss: 0.0027(0.0056) Grad: 56802.1406  LR: 0.00000048  
Epoch: [6][700/1906] Elapsed 2m 54s (remain 5m 0s) Loss: 0.0056(0.0058) Grad: 40789.7852  LR: 0.00000041  
Epoch: [6][800/1906] Elapsed 3m 19s (remain 4m 35s) Loss: 0.0000(0.0057) Grad: 117.5270  LR: 0.00000035  
Epoch: [6][900/1906] Elapsed 3m 44s (remain

Epoch 6 - avg_train_loss: 0.0060  avg_val_loss: 0.0176  time: 531s
Epoch 6 - Score: 0.8797 for th=0.695
Score: 0.8812 Best threshold:: 0.64
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect t

Epoch: [1][0/1906] Elapsed 0m 0s (remain 13m 23s) Loss: 0.8638(0.8638) Grad: inf  LR: 0.00001500  
Epoch: [1][100/1906] Elapsed 0m 25s (remain 7m 29s) Loss: 0.0597(0.1136) Grad: 3800.8384  LR: 0.00001500  
Epoch: [1][200/1906] Elapsed 0m 49s (remain 7m 3s) Loss: 0.0150(0.0727) Grad: 1526.3492  LR: 0.00001499  
Epoch: [1][300/1906] Elapsed 1m 14s (remain 6m 38s) Loss: 0.0066(0.0574) Grad: 857.4569  LR: 0.00001497  
Epoch: [1][400/1906] Elapsed 1m 39s (remain 6m 13s) Loss: 0.0301(0.0488) Grad: 3079.7739  LR: 0.00001495  
Epoch: [1][500/1906] Elapsed 2m 4s (remain 5m 49s) Loss: 0.0188(0.0431) Grad: 1997.1191  LR: 0.00001493  
Epoch: [1][600/1906] Elapsed 2m 29s (remain 5m 24s) Loss: 0.0178(0.0396) Grad: 1036.5685  LR: 0.00001490  
Epoch: [1][700/1906] Elapsed 2m 54s (remain 4m 59s) Loss: 0.0147(0.0365) Grad: 948.0223  LR: 0.00001486  
Epoch: [1][800/1906] Elapsed 3m 19s (remain 4m 34s) Loss: 0.0049(0.0345) Grad: 1107.9357  LR: 0.00001482  
Epoch: [1][900/1906] Elapsed 3m 44s (remain 4m 9s

Epoch 1 - avg_train_loss: 0.0236  avg_val_loss: 0.0129  time: 532s
Epoch 1 - Score: 0.8626 for th=0.345
Epoch 1 - Save Score: 0.8626 Model


Epoch: [2][0/1906] Elapsed 0m 0s (remain 16m 5s) Loss: 0.0009(0.0009) Grad: 2358.7688  LR: 0.00001399  
Epoch: [2][100/1906] Elapsed 0m 26s (remain 7m 45s) Loss: 0.0138(0.0123) Grad: 7631.7856  LR: 0.00001389  
Epoch: [2][200/1906] Elapsed 0m 51s (remain 7m 13s) Loss: 0.0030(0.0120) Grad: 17083.3398  LR: 0.00001378  
Epoch: [2][300/1906] Elapsed 1m 16s (remain 6m 46s) Loss: 0.0143(0.0115) Grad: 20091.6367  LR: 0.00001366  
Epoch: [2][400/1906] Elapsed 1m 41s (remain 6m 19s) Loss: 0.0025(0.0114) Grad: 5341.1694  LR: 0.00001354  
Epoch: [2][500/1906] Elapsed 2m 6s (remain 5m 53s) Loss: 0.0090(0.0108) Grad: 15104.7803  LR: 0.00001342  
Epoch: [2][600/1906] Elapsed 2m 31s (remain 5m 27s) Loss: 0.0043(0.0107) Grad: 10666.6943  LR: 0.00001329  
Epoch: [2][700/1906] Elapsed 2m 56s (remain 5m 2s) Loss: 0.0201(0.0106) Grad: 53249.2734  LR: 0.00001316  
Epoch: [2][800/1906] Elapsed 3m 20s (remain 4m 37s) Loss: 0.0034(0.0106) Grad: 14959.7617  LR: 0.00001302  
Epoch: [2][900/1906] Elapsed 3m 45s 

Epoch 2 - avg_train_loss: 0.0100  avg_val_loss: 0.0131  time: 533s
Epoch 2 - Score: 0.8762 for th=0.44
Epoch 2 - Save Score: 0.8762 Model


Epoch: [3][0/1906] Elapsed 0m 0s (remain 15m 3s) Loss: 0.0051(0.0051) Grad: 11060.0039  LR: 0.00001125  
Epoch: [3][100/1906] Elapsed 0m 25s (remain 7m 42s) Loss: 0.0186(0.0075) Grad: 33847.4688  LR: 0.00001107  
Epoch: [3][200/1906] Elapsed 0m 50s (remain 7m 12s) Loss: 0.0035(0.0076) Grad: 5707.6216  LR: 0.00001089  
Epoch: [3][300/1906] Elapsed 1m 15s (remain 6m 43s) Loss: 0.0164(0.0075) Grad: 41192.2812  LR: 0.00001070  
Epoch: [3][400/1906] Elapsed 1m 40s (remain 6m 17s) Loss: 0.0002(0.0079) Grad: 1594.2341  LR: 0.00001052  
Epoch: [3][500/1906] Elapsed 2m 5s (remain 5m 51s) Loss: 0.0111(0.0080) Grad: 20182.3359  LR: 0.00001033  
Epoch: [3][600/1906] Elapsed 2m 30s (remain 5m 25s) Loss: 0.0002(0.0082) Grad: 624.0928  LR: 0.00001013  
Epoch: [3][700/1906] Elapsed 2m 54s (remain 5m 0s) Loss: 0.0033(0.0083) Grad: 25394.6562  LR: 0.00000994  
Epoch: [3][800/1906] Elapsed 3m 20s (remain 4m 35s) Loss: 0.0007(0.0084) Grad: 6474.2778  LR: 0.00000975  
Epoch: [3][900/1906] Elapsed 3m 45s (r

Epoch 3 - avg_train_loss: 0.0088  avg_val_loss: 0.0138  time: 551s
Epoch 3 - Score: 0.8757 for th=0.45


Epoch: [4][0/1906] Elapsed 0m 0s (remain 17m 25s) Loss: 0.0006(0.0006) Grad: 2530.5293  LR: 0.00000750  
Epoch: [4][100/1906] Elapsed 0m 27s (remain 8m 19s) Loss: 0.0031(0.0071) Grad: 12343.9131  LR: 0.00000730  
Epoch: [4][200/1906] Elapsed 0m 55s (remain 7m 46s) Loss: 0.0035(0.0077) Grad: 15362.0596  LR: 0.00000709  
Epoch: [4][300/1906] Elapsed 1m 22s (remain 7m 17s) Loss: 0.0004(0.0074) Grad: 1447.2618  LR: 0.00000688  
Epoch: [4][400/1906] Elapsed 1m 49s (remain 6m 50s) Loss: 0.0008(0.0078) Grad: 14182.1934  LR: 0.00000668  
Epoch: [4][500/1906] Elapsed 2m 16s (remain 6m 23s) Loss: 0.0177(0.0078) Grad: 48160.5742  LR: 0.00000648  
Epoch: [4][600/1906] Elapsed 2m 43s (remain 5m 55s) Loss: 0.0017(0.0076) Grad: 12716.2900  LR: 0.00000627  
Epoch: [4][700/1906] Elapsed 3m 10s (remain 5m 28s) Loss: 0.0012(0.0078) Grad: 6550.3320  LR: 0.00000607  
Epoch: [4][800/1906] Elapsed 3m 38s (remain 5m 0s) Loss: 0.0026(0.0076) Grad: 18074.2031  LR: 0.00000587  
Epoch: [4][900/1906] Elapsed 4m 5s

Epoch 4 - avg_train_loss: 0.0077  avg_val_loss: 0.0135  time: 580s
Epoch 4 - Score: 0.8758 for th=0.57


Epoch: [5][0/1906] Elapsed 0m 0s (remain 17m 19s) Loss: 0.0080(0.0080) Grad: 9182.7021  LR: 0.00000375  
Epoch: [5][100/1906] Elapsed 0m 27s (remain 8m 7s) Loss: 0.0001(0.0053) Grad: 591.7947  LR: 0.00000358  
Epoch: [5][200/1906] Elapsed 0m 54s (remain 7m 38s) Loss: 0.0116(0.0056) Grad: 15810.3994  LR: 0.00000340  
Epoch: [5][300/1906] Elapsed 1m 20s (remain 7m 10s) Loss: 0.0001(0.0056) Grad: 155.7479  LR: 0.00000323  
Epoch: [5][400/1906] Elapsed 1m 47s (remain 6m 42s) Loss: 0.0717(0.0059) Grad: 126774.9375  LR: 0.00000306  
Epoch: [5][500/1906] Elapsed 2m 13s (remain 6m 15s) Loss: 0.0010(0.0061) Grad: 4982.3071  LR: 0.00000290  
Epoch: [5][600/1906] Elapsed 2m 40s (remain 5m 48s) Loss: 0.0005(0.0058) Grad: 1688.1675  LR: 0.00000274  
Epoch: [5][700/1906] Elapsed 3m 7s (remain 5m 21s) Loss: 0.0018(0.0064) Grad: 2974.3447  LR: 0.00000258  
Epoch: [5][800/1906] Elapsed 3m 34s (remain 4m 55s) Loss: 0.0074(0.0064) Grad: 24904.9102  LR: 0.00000243  
Epoch: [5][900/1906] Elapsed 4m 1s (rem

Epoch 5 - avg_train_loss: 0.0065  avg_val_loss: 0.0154  time: 575s
Epoch 5 - Score: 0.8787 for th=0.375
Epoch 5 - Save Score: 0.8787 Model


Epoch: [6][0/1906] Elapsed 0m 0s (remain 17m 2s) Loss: 0.0010(0.0010) Grad: 5342.1636  LR: 0.00000101  
Epoch: [6][100/1906] Elapsed 0m 27s (remain 8m 9s) Loss: 0.0007(0.0040) Grad: 9262.1924  LR: 0.00000091  
Epoch: [6][200/1906] Elapsed 0m 54s (remain 7m 41s) Loss: 0.0082(0.0042) Grad: 11424.8857  LR: 0.00000081  
Epoch: [6][300/1906] Elapsed 1m 21s (remain 7m 15s) Loss: 0.0098(0.0044) Grad: 13509.7051  LR: 0.00000072  
Epoch: [6][400/1906] Elapsed 1m 48s (remain 6m 47s) Loss: 0.0014(0.0047) Grad: 20124.8828  LR: 0.00000063  
Epoch: [6][500/1906] Elapsed 2m 15s (remain 6m 20s) Loss: 0.0001(0.0050) Grad: 1228.5563  LR: 0.00000055  
Epoch: [6][600/1906] Elapsed 2m 42s (remain 5m 53s) Loss: 0.0264(0.0050) Grad: 34054.3867  LR: 0.00000048  
Epoch: [6][700/1906] Elapsed 3m 9s (remain 5m 26s) Loss: 0.0008(0.0051) Grad: 7187.6431  LR: 0.00000041  
Epoch: [6][800/1906] Elapsed 3m 36s (remain 4m 59s) Loss: 0.0002(0.0051) Grad: 2505.5427  LR: 0.00000035  
Epoch: [6][900/1906] Elapsed 4m 4s (re

Epoch 6 - avg_train_loss: 0.0057  avg_val_loss: 0.0165  time: 575s
Epoch 6 - Score: 0.8781 for th=0.51
Score: 0.8787 Best threshold:: 0.375
Score: 0.8809 Best threshold:: 0.36


In [None]:
commit_msg = '"run_name: ' + wandb.run.name[:wandb.run.name.rfind('-')] + '"'
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,▁
[fold0] avg_val_loss,▁
[fold0] best_th,▁
[fold0] epoch,▁
[fold0] loss,████▆▄▂▁▂▁▂▁▂▁▁▂▁▁▃▂▁▂▂▂▂▁▁▂▂▂▂▂▁▁▂▁▂▁▂▁
[fold0] lr,███████▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,▁
[fold1] avg_train_loss,▁
[fold1] avg_val_loss,▁
[fold1] best_th,▁

0,1
[fold0] avg_train_loss,0.13472
[fold0] avg_val_loss,0.06083
[fold0] best_th,0.3
[fold0] epoch,1.0
[fold0] loss,0.02692
[fold0] lr,0.0
[fold0] score,0.0
[fold1] avg_train_loss,0.09764
[fold1] avg_val_loss,0.06451
[fold1] best_th,0.3


In [None]:
!cp -r ../output/* ../drive/MyDrive/exp067_output/
!cp exp067_train.ipynb ../drive/MyDrive/exp067_output/