# About this notebook
- Deberta-base starter code
- pip wheels is [here](https://www.kaggle.com/yasufuminakama/nbme-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

In [1]:
!nvidia-smi

Sat Apr 23 14:46:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    73W / 350W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   33C    P0    60W / 350W |  15882MiB / 40536MiB |      0%      Default |
|       

# Directory settings

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# %cd "/content/drive/MyDrive/Colab Notebooks/nbme/code/"
# !pwd

In [4]:
# ====================================================
# Directory settings
# ====================================================
import os
from pathlib import Path

OUTPUT_DIR = Path('../output/exp070')
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='NBME'
    _wandb_kernel='MPEG'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-large"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=15
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16 #6
    fc_dropout=0.2
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    freeze=False
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [6]:
!pip install wandb > /dev/null



In [7]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    from getpass import getpass
    try:
        #from kaggle_secrets import UserSecretsClient
        #user_secrets = UserSecretsClient()
        secret_value_0 = getpass()
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='NBME-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

········


[34m[1mwandb[0m: Currently logged in as: [33mmpeg[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Library

In [8]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

!pip install transformers[sentencepiece]
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


# Helper functions for scoring

In [9]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [10]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


# def get_results(char_probs, th=0.5):
#     results = []
#     for char_prob in char_probs:
#         result = np.where(char_prob >= th)[0] + 1
#         # 上記の処理だと先頭1文字に必ずanotationがつけられないので帳尻合わせ
#         if char_prob[0]>=th:
#             result=np.hstack([[0],result])
#         result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
#         result = [f"{min(r)} {max(r)}" for r in result]
#         result = ";".join(result)
#         results.append(result)
#     return results

def get_raw_location_annotation(char_probs:"list[np.ndarray]",
                                pn_histories:"list[str]",
                                th:float=0.5
                               )->"tuple[list[list[tuple[int,int]]], list[list[str]]]":
    '''前処理なしのlocationと抜き出し'''
    locations = []
    for char_prob in char_probs:
        location = np.where(char_prob >= th)[0]
        location = [list(g) for _, g in itertools.groupby(location, key=lambda n, c=itertools.count(): n - next(c))]
        location = [(min(r), max(r)+1) for r in location]
        locations.append(location)
    
    annotations=[]
    for text, location in zip(pn_histories, locations):
        annotation=[]
        for i, j in location:
            annotation.append(text[i:j])
        annotations.append(annotation)
    return locations, annotations

def remove_white_space_from_head(locations:"list[list[tuple[int,int]]]",
                                 annotations:"list[list[str]]"
                                )->"tuple[list[list[tuple[int,int]]], list[list[str]]]":
    '''先頭の後処理。空白や改行等を抜く'''
    # pp
    to_delete={' ', '\n', '\r'}
    annotations2=[]
    locations2=[]
    for annotation, location in zip(annotations, locations):
        new_annotation=[]
        new_location=[]
        if len(annotation)==0:
            annotations2.append([])
            locations2.append([])
            continue
        for anno_seg, (i, j) in zip(annotation, location):
            while anno_seg and anno_seg[0] in to_delete: #先頭から変なのを抜いていきます。
                anno_seg=anno_seg[1:]
                i+=1
            new_annotation.append(anno_seg)
            new_location.append((i,j))
        annotations2.append(new_annotation)
        locations2.append(new_location)
    return locations2, annotations2

def get_results(char_probs:"list[np.ndarray]",
                pn_histories:"list[str]",
                th=0.5):
    '''文字ごとの出力確率と文章→後処理→提出用結果を生成'''
    locations, annotations=get_raw_location_annotation(char_probs,pn_histories,th=th)
    locations, annotations=remove_white_space_from_head(locations, annotations) #後処理1
    results=[]
    for loc in locations:
        result = [f"{i} {j}" for i, j in loc]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

# Utils

In [11]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR / 'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [12]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/train.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv('../input/features.csv')
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv('../input/patient_notes.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [13]:
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [14]:
# incorrect annotation
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [15]:
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

1    8185
0    4399
2    1292
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

# CV split

In [16]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

In [17]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [18]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
import shutil
from pathlib import Path

transformers_path = Path(transformers.__file__[:-12])

input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py', "deberta__init__.py"]:
    if str(filename).startswith("deberta"):
        filepath = deberta_v2_path/str(filename).replace("deberta", "")
    else:
        filepath = deberta_v2_path/filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)

In [19]:
# ====================================================
# tokenizer
# ====================================================
from transformers.models.deberta_v2 import DebertaV2TokenizerFast
from transformers import AutoTokenizer

tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR / 'tokenizer/')
CFG.tokenizer = tokenizer


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [20]:
# ====================================================
# Define max_len
# ====================================================
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 323


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 28
max_len: 354


In [21]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v[:CFG.max_len], dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=CFG.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return torch.tensor(label[:CFG.max_len], dtype=torch.float)


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               self.feature_texts[item])
        label = create_label(self.cfg, 
                             self.pn_historys[item], 
                             self.annotation_lengths[item], 
                             self.locations[item])
        return inputs, label

# Model

In [22]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# Helpler functions

In [23]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.shape)
        sigmoid = nn.Sigmoid()
        toukei = y_preds.view(-1,1)#sigmoid(y_preds).view(-1, 1)
        loss = criterion(sigmoid(y_preds).view(-1, 1), labels.view(-1, 1)).to(device)
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    softmax = nn.Softmax(dim=1)
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        sigmoid = nn.Sigmoid()
        loss = criterion(sigmoid(y_preds).view(-1, 1), labels.view(-1, 1)).to(device)
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(sigmoid(y_preds).to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [24]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    if CFG.freeze:
        name_set = set()
        for i in range(20):
            name_set.add('model.encoder.layer.'+str(i)+'.')
        for name, param in model.named_parameters():
            for elem in name_set:
                if name.startswith(elem): # choose whatever you like here
                    print(name + ' set to be freezed.')
                    param.requires_grad = False
    torch.save(model.config, OUTPUT_DIR / 'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    #criterion = nn.BCEWithLogitsLoss(reduction="none")
    def f1_loss(y_pred, y_true, epsilon=1e-7):
        #print('y_pred:', torch.min(y_pred), torch.max(y_pred), torch.mean(y_pred), torch.std(y_pred))
        #print('y_true:', torch.min(y_true), torch.max(y_true), torch.mean(y_true), torch.std(y_true))
        mask = (y_true != -1)
        y_pred = y_pred[mask]
        y_true = y_true[mask]
        
        #print(mask)
        #print(len(y_true[mask]))
        
        assert torch.all(0.0 -1e-7 <= y_pred)
        assert torch.all(1.0 +1e-7 >= y_pred)
        
        tp = torch.sum((y_true*y_pred).type(torch.float64), axis=0)
        tn = torch.sum(((1-y_true)*(1-y_pred)).type(torch.float64), axis=0)
        fp = torch.sum(((1-y_true)*y_pred).type(torch.float64), axis=0)
        fn = torch.sum((y_true*(1-y_pred)).type(torch.float64), axis=0)
        assert torch.all(0 <= tp <= len(y_pred))
        assert torch.all(0 <= fp <= len(y_pred))
        assert torch.all(0 <= tn <= len(y_pred))
        assert torch.all(0 <= fn <= len(y_pred))
        
        #print('tptnfpfn:', tp, tn, fp, fn)
        p = tp / (tp + fp + epsilon)
        r = tp / (tp + fn + epsilon)
        #print(torch.min(p), torch.max(p), torch.mean(p), torch.std(p))
        assert torch.all(0. -1e-7 < p < 1. + 1e-7)
        #print(torch.min(r), torch.max(r), torch.mean(r), torch.std(r))
        assert torch.all(0. -1e-7 < r < 1. + 1e-7)
        f1 = 2*p*r / (p+r+epsilon)
        f1 = torch.where(torch.isnan(f1), torch.zeros_like(f1), f1)
        assert 0. - 1e-7 <= torch.mean(f1) <= 1 + 1e-7
        return -torch.log(torch.mean(f1)+1e-45)
    criterion = f1_loss
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))
        
        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        # ここをしきい値で探索した最適な値にする
        score=-100
        for th in np.arange(0.3,0.7,0.005):
            th = np.round(th,4)
            results = get_results(char_probs, valid_folds['pn_history'].tolist(), th=th)
            preds = get_predictions(results)
            tmp_score = get_score(valid_labels, preds)
            if tmp_score > score:
                best_th=th
                score=tmp_score

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f} for th={best_th}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score,
                       f"[fold{fold}] threshold": best_th,
                       })
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR / f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR / f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [25]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
if __name__ == '__main__':
    gc.collect()
    torch.cuda.empty_cache()
    def get_result(oof_df):
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, oof_df['pn_history'].tolist(), th=0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
                oof_df.to_pickle(OUTPUT_DIR / 'oof_df_fold'+str(fold)+'.pkl')
                wandb.alert(
                   title="fold "+str(fold)+ " Finished", 
                   text=CFG.model + ' has finished its fold ' +str(fold) + ' running.'
                )
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR / 'oof_df.pkl')
        

    if CFG.wandb:
        wandb.finish()

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/715] Elapsed 0m 1s (remain 22m 22s) Loss: 3.6096(3.6096) Grad: nan  LR: 0.00002000  
Epoch: [1][100/715] Elapsed 0m 50s (remain 5m 5s) Loss: 1.0925(1.0667) Grad: 329608.2188  LR: 0.00002000  
Epoch: [1][200/715] Elapsed 1m 38s (remain 4m 11s) Loss: 0.3135(0.7168) Grad: 36211.3438  LR: 0.00001998  
Epoch: [1][300/715] Elapsed 2m 26s (remain 3m 21s) Loss: 0.2309(0.5972) Grad: 260083.0938  LR: 0.00001996  
Epoch: [1][400/715] Elapsed 3m 14s (remain 2m 32s) Loss: 0.1514(0.5357) Grad: 16226.7148  LR: 0.00001993  
Epoch: [1][500/715] Elapsed 4m 3s (remain 1m 43s) Loss: 0.1911(0.4880) Grad: 338130.9375  LR: 0.00001989  
Epoch: [1][600/715] Elapsed 4m 51s (remain 0m 55s) Loss: 0.4272(0.4605) Grad: 3556.9260  LR: 0.00001985  
Epoch: [1][700/715] Elapsed 5m 39s (remain 0m 6s) Loss: 0.1108(0.4402) Grad: 105200.7266  LR: 0.00001979  
Epoch: [1][714/715] Elapsed 5m 46s (remain 0m 0s) Loss: 0.3827(0.4364) Grad: 63846.6328  LR: 0.00001978  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 0s) Loss

Epoch 1 - avg_train_loss: 0.4364  avg_val_loss: 0.2734  time: 406s
Epoch 1 - Score: 0.7755 for th=0.47
Epoch 1 - Save Best Score: 0.7755 Model


Epoch: [2][0/715] Elapsed 0m 0s (remain 8m 57s) Loss: 0.3431(0.3431) Grad: 53972.4219  LR: 0.00001978  
Epoch: [2][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.4150(0.2677) Grad: 672791.4375  LR: 0.00001972  
Epoch: [2][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.3864(0.2574) Grad: 48635.3906  LR: 0.00001964  
Epoch: [2][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.1540(0.2539) Grad: 5698.1948  LR: 0.00001956  
Epoch: [2][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.2952(0.2507) Grad: 56163.7188  LR: 0.00001947  
Epoch: [2][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.8685(0.2986) Grad: 38228.1875  LR: 0.00001937  
Epoch: [2][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.1759(0.3247) Grad: 4109.3550  LR: 0.00001927  
Epoch: [2][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.4886(0.3351) Grad: 10239.4043  LR: 0.00001915  
Epoch: [2][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.2449(0.3357) Grad: 11504.6523  LR: 0.00001914  
EVAL: [0/179] Elapsed 0m 0s (remain 0m 59s) L

Epoch 2 - avg_train_loss: 0.3357  avg_val_loss: 0.3696  time: 404s
Epoch 2 - Score: 0.7083 for th=0.315


Epoch: [3][0/715] Elapsed 0m 0s (remain 8m 47s) Loss: 0.5051(0.5051) Grad: 62165.9805  LR: 0.00001913  
Epoch: [3][100/715] Elapsed 0m 48s (remain 4m 57s) Loss: 0.6403(0.4148) Grad: 36949.0391  LR: 0.00001901  
Epoch: [3][200/715] Elapsed 1m 37s (remain 4m 8s) Loss: 0.5473(0.3944) Grad: 257737.3594  LR: 0.00001888  
Epoch: [3][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.1905(0.3582) Grad: 22950.3086  LR: 0.00001874  
Epoch: [3][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.1625(0.3523) Grad: 1.2030  LR: 0.00001860  
Epoch: [3][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.1660(0.3339) Grad: 77096.2656  LR: 0.00001844  
Epoch: [3][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.1823(0.3216) Grad: 1.0456  LR: 0.00001828  
Epoch: [3][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.3656(0.3095) Grad: 1433.9517  LR: 0.00001811  
Epoch: [3][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.1982(0.3087) Grad: 27197.5527  LR: 0.00001809  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 0s) Loss: 0.0

Epoch 3 - avg_train_loss: 0.3087  avg_val_loss: 0.2587  time: 404s
Epoch 3 - Score: 0.7880 for th=0.495
Epoch 3 - Save Best Score: 0.7880 Model


Epoch: [4][0/715] Elapsed 0m 0s (remain 8m 7s) Loss: 0.1585(0.1585) Grad: nan  LR: 0.00001809  
Epoch: [4][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.3965(0.2205) Grad: 684719.4375  LR: 0.00001791  
Epoch: [4][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.4573(0.2232) Grad: 326157.1250  LR: 0.00001773  
Epoch: [4][300/715] Elapsed 2m 24s (remain 3m 19s) Loss: 0.1299(0.2262) Grad: 7515.6118  LR: 0.00001754  
Epoch: [4][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.0000(0.2225) Grad: 120.7471  LR: 0.00001735  
Epoch: [4][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.2534(0.2225) Grad: 722.2413  LR: 0.00001714  
Epoch: [4][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.1759(0.2242) Grad: 36.6832  LR: 0.00001694  
Epoch: [4][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.2449(0.2246) Grad: 18015.3359  LR: 0.00001672  
Epoch: [4][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.3839(0.2250) Grad: 221732.5781  LR: 0.00001669  
EVAL: [0/179] Elapsed 0m 0s (remain 0m 59s) Loss: 0.0435(

Epoch 4 - avg_train_loss: 0.2250  avg_val_loss: 0.2269  time: 403s
Epoch 4 - Score: 0.8094 for th=0.665
Epoch 4 - Save Best Score: 0.8094 Model


Epoch: [5][0/715] Elapsed 0m 0s (remain 8m 59s) Loss: 0.1690(0.1690) Grad: 226920.4688  LR: 0.00001669  
Epoch: [5][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.2437(0.2040) Grad: 262571.2188  LR: 0.00001647  
Epoch: [5][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.4595(0.2045) Grad: 5135.5986  LR: 0.00001624  
Epoch: [5][300/715] Elapsed 2m 24s (remain 3m 19s) Loss: 0.1189(0.2019) Grad: 22379.9824  LR: 0.00001601  
Epoch: [5][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.2627(0.2011) Grad: 10139.7627  LR: 0.00001577  
Epoch: [5][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.0507(0.2051) Grad: 127553.3516  LR: 0.00001553  
Epoch: [5][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.2088(0.2027) Grad: 0.5196  LR: 0.00001529  
Epoch: [5][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.1717(0.2028) Grad: 3810.5400  LR: 0.00001504  
Epoch: [5][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.0362(0.2025) Grad: 7912.1948  LR: 0.00001500  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 0s) Loss:

Epoch 5 - avg_train_loss: 0.2025  avg_val_loss: 0.2169  time: 403s
Epoch 5 - Score: 0.8147 for th=0.59
Epoch 5 - Save Best Score: 0.8147 Model


Epoch: [6][0/715] Elapsed 0m 0s (remain 8m 56s) Loss: 0.3075(0.3075) Grad: 1105509.6250  LR: 0.00001500  
Epoch: [6][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.2174(0.2194) Grad: 0.7256  LR: 0.00001474  
Epoch: [6][200/715] Elapsed 1m 37s (remain 4m 8s) Loss: 0.1054(0.2022) Grad: 13.2084  LR: 0.00001448  
Epoch: [6][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.1310(0.1985) Grad: 0.7734  LR: 0.00001422  
Epoch: [6][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.3040(0.2016) Grad: 764.9652  LR: 0.00001395  
Epoch: [6][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.1649(0.2018) Grad: 2848.9573  LR: 0.00001368  
Epoch: [6][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.0490(0.1973) Grad: 1966.5713  LR: 0.00001341  
Epoch: [6][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.1263(0.1983) Grad: 116720.9453  LR: 0.00001313  
Epoch: [6][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.2624(0.1986) Grad: 213.8911  LR: 0.00001309  
EVAL: [0/179] Elapsed 0m 0s (remain 0m 58s) Loss: 0.0744(0

Epoch 6 - avg_train_loss: 0.1986  avg_val_loss: 0.2228  time: 402s
Epoch 6 - Score: 0.8133 for th=0.315


Epoch: [7][0/715] Elapsed 0m 0s (remain 9m 2s) Loss: 0.1481(0.1481) Grad: 815012.1250  LR: 0.00001309  
Epoch: [7][100/715] Elapsed 0m 48s (remain 4m 57s) Loss: 0.0800(0.1965) Grad: 3.0657  LR: 0.00001281  
Epoch: [7][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.1526(0.1950) Grad: 44774.6016  LR: 0.00001253  
Epoch: [7][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.3013(0.1888) Grad: 54689.6094  LR: 0.00001224  
Epoch: [7][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.1737(0.1865) Grad: 56.4833  LR: 0.00001195  
Epoch: [7][500/715] Elapsed 4m 1s (remain 1m 42s) Loss: 0.2778(0.1816) Grad: 373738.7500  LR: 0.00001167  
Epoch: [7][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.2297(0.1827) Grad: 42675.5977  LR: 0.00001138  
Epoch: [7][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.1500(0.1831) Grad: 240.6778  LR: 0.00001109  
Epoch: [7][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.0408(0.1825) Grad: 0.9730  LR: 0.00001105  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 1s) Loss: 0.0749(

Epoch 7 - avg_train_loss: 0.1825  avg_val_loss: 0.1946  time: 403s
Epoch 7 - Score: 0.8340 for th=0.5
Epoch 7 - Save Best Score: 0.8340 Model


Epoch: [8][0/715] Elapsed 0m 0s (remain 8m 59s) Loss: 0.1691(0.1691) Grad: 98.3615  LR: 0.00001104  
Epoch: [8][100/715] Elapsed 0m 48s (remain 4m 57s) Loss: 0.1573(0.1682) Grad: 96026.9844  LR: 0.00001075  
Epoch: [8][200/715] Elapsed 1m 37s (remain 4m 8s) Loss: 0.1362(0.1783) Grad: 10077.0215  LR: 0.00001046  
Epoch: [8][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.1919(0.1711) Grad: 0.5561  LR: 0.00001017  
Epoch: [8][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.0851(0.1722) Grad: 1366.8219  LR: 0.00000987  
Epoch: [8][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.2877(0.1730) Grad: 577.6472  LR: 0.00000958  
Epoch: [8][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.1709(0.1700) Grad: 205179.3281  LR: 0.00000929  
Epoch: [8][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.0918(0.1695) Grad: 2.5790  LR: 0.00000900  
Epoch: [8][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.1382(0.1696) Grad: 0.4153  LR: 0.00000895  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 0s) Loss: 0.0749(0.074

Epoch 8 - avg_train_loss: 0.1696  avg_val_loss: 0.1877  time: 403s
Epoch 8 - Score: 0.8390 for th=0.33
Epoch 8 - Save Best Score: 0.8390 Model


Epoch: [9][0/715] Elapsed 0m 0s (remain 8m 43s) Loss: 0.1259(0.1259) Grad: 5.1914  LR: 0.00000895  
Epoch: [9][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.1178(0.1573) Grad: 0.4007  LR: 0.00000866  
Epoch: [9][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.0938(0.1542) Grad: 184158.0938  LR: 0.00000837  
Epoch: [9][300/715] Elapsed 2m 24s (remain 3m 19s) Loss: 0.1691(0.1520) Grad: 0.6990  LR: 0.00000808  
Epoch: [9][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.0368(0.1555) Grad: 39.2035  LR: 0.00000780  
Epoch: [9][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.1759(0.1566) Grad: 0.1466  LR: 0.00000751  
Epoch: [9][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.3041(0.1577) Grad: 90768.3984  LR: 0.00000723  
Epoch: [9][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.1274(0.1598) Grad: 25861.4297  LR: 0.00000695  
Epoch: [9][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.1398(0.1597) Grad: 75.9475  LR: 0.00000691  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 0s) Loss: 0.0995(0.0995) 


Epoch 9 - avg_train_loss: 0.1597  avg_val_loss: 0.1913  time: 403s
Epoch 9 - Score: 0.8367 for th=0.3


Epoch: [10][0/715] Elapsed 0m 0s (remain 8m 44s) Loss: 0.0704(0.0704) Grad: 7.6548  LR: 0.00000691  
Epoch: [10][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.1793(0.1600) Grad: 0.6667  LR: 0.00000663  
Epoch: [10][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.1892(0.1571) Grad: 0.2971  LR: 0.00000636  
Epoch: [10][300/715] Elapsed 2m 24s (remain 3m 19s) Loss: 0.0541(0.1541) Grad: 0.1975  LR: 0.00000608  
Epoch: [10][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.1278(0.1520) Grad: 0.1741  LR: 0.00000582  
Epoch: [10][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.0606(0.1511) Grad: 16.4041  LR: 0.00000555  
Epoch: [10][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.0748(0.1527) Grad: 1841108.2500  LR: 0.00000529  
Epoch: [10][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.2401(0.1515) Grad: 0.1599  LR: 0.00000504  
Epoch: [10][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.1013(0.1513) Grad: 2.4232  LR: 0.00000500  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 1s) Loss: 0.1398(0.1398) 

Epoch 10 - avg_train_loss: 0.1513  avg_val_loss: 0.1844  time: 403s
Epoch 10 - Score: 0.8428 for th=0.3
Epoch 10 - Save Best Score: 0.8428 Model


Epoch: [11][0/715] Elapsed 0m 0s (remain 8m 55s) Loss: 0.0853(0.0853) Grad: 63248.5000  LR: 0.00000500  
Epoch: [11][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.1197(0.1571) Grad: inf  LR: 0.00000475  
Epoch: [11][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.0688(0.1576) Grad: 56011.9023  LR: 0.00000450  
Epoch: [11][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.0516(0.1520) Grad: 140496.1875  LR: 0.00000426  
Epoch: [11][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.1435(0.1475) Grad: 220143.1562  LR: 0.00000402  
Epoch: [11][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.3950(0.1487) Grad: 2165213.7500  LR: 0.00000379  
Epoch: [11][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.1695(0.1468) Grad: 99655.3672  LR: 0.00000356  
Epoch: [11][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.1398(0.1466) Grad: 2.0699  LR: 0.00000334  
Epoch: [11][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.0906(0.1457) Grad: 27117.6230  LR: 0.00000331  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 2s)

Epoch 11 - avg_train_loss: 0.1457  avg_val_loss: 0.1845  time: 403s
Epoch 11 - Score: 0.8431 for th=0.3
Epoch 11 - Save Best Score: 0.8431 Model


Epoch: [12][0/715] Elapsed 0m 0s (remain 8m 13s) Loss: 0.2214(0.2214) Grad: nan  LR: 0.00000331  
Epoch: [12][100/715] Elapsed 0m 49s (remain 4m 57s) Loss: 0.2809(0.1495) Grad: 336.4131  LR: 0.00000309  
Epoch: [12][200/715] Elapsed 1m 37s (remain 4m 8s) Loss: 0.1727(0.1409) Grad: 722.1407  LR: 0.00000288  
Epoch: [12][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.1041(0.1389) Grad: 8165.6992  LR: 0.00000268  
Epoch: [12][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.1140(0.1426) Grad: 433.6477  LR: 0.00000248  
Epoch: [12][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.0660(0.1414) Grad: 2.7650  LR: 0.00000229  
Epoch: [12][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.0500(0.1454) Grad: 0.0284  LR: 0.00000211  
Epoch: [12][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.0541(0.1430) Grad: 0.0465  LR: 0.00000193  
Epoch: [12][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.1719(0.1432) Grad: 0.0585  LR: 0.00000191  
EVAL: [0/179] Elapsed 0m 0s (remain 0m 59s) Loss: 0.1439(0.1439) 

Epoch 12 - avg_train_loss: 0.1432  avg_val_loss: 0.1862  time: 403s
Epoch 12 - Score: 0.8412 for th=0.695


Epoch: [13][0/715] Elapsed 0m 0s (remain 8m 49s) Loss: 0.0279(0.0279) Grad: 0.3883  LR: 0.00000191  
Epoch: [13][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.1278(0.1418) Grad: 0.2201  LR: 0.00000174  
Epoch: [13][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.0387(0.1437) Grad: 0.4689  LR: 0.00000158  
Epoch: [13][300/715] Elapsed 2m 24s (remain 3m 19s) Loss: 0.1967(0.1429) Grad: 0.7469  LR: 0.00000142  
Epoch: [13][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.1379(0.1420) Grad: 10498.3936  LR: 0.00000128  
Epoch: [13][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.2116(0.1435) Grad: 28776.5371  LR: 0.00000114  
Epoch: [13][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.2542(0.1421) Grad: 296.5618  LR: 0.00000101  
Epoch: [13][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.2007(0.1401) Grad: 0.2837  LR: 0.00000088  
Epoch: [13][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.1295(0.1405) Grad: 81.6745  LR: 0.00000086  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 1s) Loss: 0.1306(0.13

Epoch 13 - avg_train_loss: 0.1405  avg_val_loss: 0.1826  time: 403s
Epoch 13 - Score: 0.8426 for th=0.44


Epoch: [14][0/715] Elapsed 0m 0s (remain 8m 48s) Loss: 0.2029(0.2029) Grad: 1.2148  LR: 0.00000086  
Epoch: [14][100/715] Elapsed 0m 48s (remain 4m 57s) Loss: 0.0518(0.1314) Grad: 0.2755  LR: 0.00000075  
Epoch: [14][200/715] Elapsed 1m 37s (remain 4m 8s) Loss: 0.1216(0.1276) Grad: 0.1391  LR: 0.00000064  
Epoch: [14][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.0328(0.1298) Grad: 0.2108  LR: 0.00000054  
Epoch: [14][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.0260(0.1300) Grad: 0.1582  LR: 0.00000045  
Epoch: [14][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.1422(0.1295) Grad: 0.0779  LR: 0.00000037  
Epoch: [14][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.0853(0.1315) Grad: 226006.5156  LR: 0.00000029  
Epoch: [14][700/715] Elapsed 5m 38s (remain 0m 6s) Loss: 0.2376(0.1337) Grad: 1372.9000  LR: 0.00000023  
Epoch: [14][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.1178(0.1336) Grad: 0.6642  LR: 0.00000022  
EVAL: [0/179] Elapsed 0m 0s (remain 1m 1s) Loss: 0.1306(0.1306)

Epoch 14 - avg_train_loss: 0.1336  avg_val_loss: 0.1789  time: 403s
Epoch 14 - Score: 0.8449 for th=0.3
Epoch 14 - Save Best Score: 0.8449 Model


Epoch: [15][0/715] Elapsed 0m 0s (remain 8m 50s) Loss: 0.0104(0.0104) Grad: 4.9355  LR: 0.00000022  
Epoch: [15][100/715] Elapsed 0m 48s (remain 4m 56s) Loss: 0.1092(0.1216) Grad: 0.2931  LR: 0.00000016  
Epoch: [15][200/715] Elapsed 1m 36s (remain 4m 7s) Loss: 0.1335(0.1204) Grad: 0.1574  LR: 0.00000011  
Epoch: [15][300/715] Elapsed 2m 25s (remain 3m 19s) Loss: 0.0211(0.1248) Grad: 282339.5312  LR: 0.00000007  
Epoch: [15][400/715] Elapsed 3m 13s (remain 2m 31s) Loss: 0.1904(0.1321) Grad: 0.1491  LR: 0.00000004  
Epoch: [15][500/715] Elapsed 4m 1s (remain 1m 43s) Loss: 0.1076(0.1327) Grad: 0.1373  LR: 0.00000002  
Epoch: [15][600/715] Elapsed 4m 49s (remain 0m 54s) Loss: 0.0749(0.1344) Grad: 170.3974  LR: 0.00000001  
Epoch: [15][700/715] Elapsed 5m 37s (remain 0m 6s) Loss: 0.0420(0.1368) Grad: 0.0459  LR: 0.00000000  
Epoch: [15][714/715] Elapsed 5m 44s (remain 0m 0s) Loss: 0.1005(0.1364) Grad: 0.0357  LR: 0.00000000  
EVAL: [0/179] Elapsed 0m 0s (remain 0m 59s) Loss: 0.1306(0.1306)

Epoch 15 - avg_train_loss: 0.1364  avg_val_loss: 0.1792  time: 403s
Epoch 15 - Score: 0.8446 for th=0.51
Score: 0.8445


TypeError: unsupported operand type(s) for +: 'PosixPath' and 'str'

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd "/content/drive/MyDrive/Colab Notebooks/nbme/code/"
# !pwd

In [None]:
# '''
# import os
# #%env KAGGLE_KEY="b991cf1aac00917801b6a8e9ea9e59ce"
# #!pip install kaggle
# #%mkdir /root/.kaggle        # successful
# !echo "{\"username\":\"taromasuda\",\"key\":\b991cf1aac00917801b6a8e9ea9e59ce\"}" > /root/.kaggle/kaggle.json
# !chmod 600 /root/.kaggle/kaggle.json

# os.environ['KAGGLE_USERNAME'] = 'taromasuda'
# os.environ['KAGGLE_KEY'] = 'b991cf1aac00917801b6a8e9ea9e59ce'

# !kaggle datasets init -p "../output/exp021-roberta-base"
# !echo "{\"title\": \"exp021-roberta-base\", \"id\": \"taromasuda/exp021-roberta-base\", \"licenses\": [    {\"name\": \"CC0-1.0\"    }]}" > "../output/exp021-roberta-base/dataset-metadata.json"
# !kaggle datasets create -p "../output/exp021-roberta-base" --dir-mode "zip" 
# '''