In [8]:
# !mkdir -p /root/.kaggle
# !cp kaggle.json /root/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json

In [9]:
#!kaggle competitions download -c learning-equality-curriculum-recommendations

Downloading learning-equality-curriculum-recommendations.zip to /media/will/data/LECR
 99%|███████████████████████████████████████▋| 252M/254M [00:19<00:00, 18.8MB/s]
100%|████████████████████████████████████████| 254M/254M [00:19<00:00, 13.9MB/s]


In [10]:
#!unzip learning-equality-curriculum-recommendations.zip

Archive:  learning-equality-curriculum-recommendations.zip
  inflating: content.csv             
  inflating: correlations.csv        
  inflating: sample_submission.csv   
  inflating: topics.csv              


In [2]:
# !pip install transformers -q
# !pip install multiprocesspandas -q
# !pip install sentencepiece

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting sentencepiece
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/0e/7e/a69d054029c7c0470e490b3265bbd1497df9492599b1820b9d5be2c60444/sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 2.8 MB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


## CV Split

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from multiprocesspandas import applyparallel
from tqdm import tqdm

In [12]:
N_SPLITS = 5

In [13]:
topic_df = pd.read_csv('topics.csv')
content_df = pd.read_csv('content.csv')
corr_df = pd.read_csv('correlations.csv')
# topic_df = topic_df.rename(columns={'id': 'topic_id'}).merge(corr_df)
topic_df_non_source = topic_df[topic_df['category']!='source'].reset_index(drop=True)
topic_df_non_source['stratify'] = topic_df_non_source['category'] + \
topic_df_non_source['language'] + topic_df_non_source['description'].apply(lambda x: str(isinstance(x, str))) + \
topic_df_non_source['has_content'].apply(str)

In [14]:
kf = StratifiedGroupKFold(n_splits=N_SPLITS)
folds = list(kf.split(topic_df_non_source, y=topic_df_non_source["stratify"], groups=topic_df_non_source["channel"]))
topic_df_non_source['fold'] = -1

for fold, (train_idx, val_idx) in enumerate(folds):
    topic_df_non_source.loc[val_idx, "fold"] = fold




In [15]:
fold_df =  topic_df.merge(topic_df_non_source[['id', 'fold']], on='id', how='left').reset_index(drop=True)[['id', 'fold']].fillna(-1).rename(columns={'id': 'topic_id'})
fold_df['fold'] = fold_df['fold'].astype(int)

In [16]:
corr_df['content_ids'] = corr_df['content_ids'].apply(lambda x:x.split())
corr_df = corr_df.explode('content_ids').reset_index(drop=True)

In [17]:
topic_df = topic_df.fillna('')
topic_df['topic_full_text'] =  topic_df['title'] + ' [SEP] ' + topic_df['description']
topic_df = topic_df[['id', 'topic_full_text', 'language']]
df = corr_df.merge(topic_df, left_on='topic_id', right_on='id', how='left')
df = df[['topic_id','content_ids','topic_full_text','language']]
df = df.rename(columns={'language':'topic_language'})

In [18]:
content_df = content_df.fillna('')
content_df['content_full_text'] =  content_df['title'] + ' [SEP] ' + content_df['description'] + ' [SEP] ' + content_df['text']
content_df = content_df[['id', 'content_full_text', 'language']]
df = df.merge(content_df, left_on='content_ids', right_on='id', how='left')
df = df.rename(columns={'language':'content_language'})
df['label'] = 1

## random sample according to language

In [19]:
neg_df = []
sample_n = 5

def negative_smaple(x, candidates):
    topic_language = x['topic_language'][0]
    candidates = candidates[candidates['content_language'] == topic_language]

    return candidates[['topic_full_text', 'content_full_text']].sample(n=sample_n)

for topic_id in tqdm(df['topic_id'].unique()):
    sub_df = df[df['topic_id'] == topic_id]
    topic_language = sub_df['topic_language'].unique()[0]
    candidates = df[df['content_language'] == topic_language]
    sample_neg = candidates[['topic_full_text', 'content_full_text']].sample(n=sample_n)
    sample_neg['topic_id'] = topic_id
    sample_neg['label'] = 0
    neg_df.append(sample_neg)
neg_df = pd.concat(neg_df)
neg_df

100%|██████████| 61517/61517 [37:58<00:00, 27.00it/s]


Unnamed: 0,topic_full_text,content_full_text,topic_id,label
259538,Закръгляване на десетични дроби [SEP] Научи ка...,Закръгляване на десетични дроби - предизвикате...,t_00004da3a1b2,0
46887,Формула за смяна на основата при логаритми [SE...,Използване на правилото за смяна на основата п...,t_00004da3a1b2,0
106289,Разпознаване на линейната функция [SEP] В този...,Разпознаване на линейни функции [SEP] Научи се...,t_00004da3a1b2,0
2689,Ъгли [SEP] Преговори знанията си за ъгли.,Упражнения с уравнения с допълващи се ъгли [SE...,t_00004da3a1b2,0
17812,Построяване на ъглополовящи прави и ъгли [SEP]...,Геометрични построения: перпендикулярна права ...,t_00004da3a1b2,0
...,...,...,...,...
74105,مِفْتاحُ القُلوبِ [SEP],المفردات والتراكيب [SEP] [SEP] صحّح الخطأ في ...,t_fffe811a6da9,0
209127,في وَداعِ تِلْميذ [SEP],في وَداعِ تِلْميذ [SEP] رسالة إلكترونيّة تجمع ...,t_fffe811a6da9,0
37181,"""زها الحَديديَّةُ"" [SEP]",المفردات والتراكيب [SEP] [SEP] ما ضدّ كلمة (ق...,t_fffe811a6da9,0
60160,النظريات الاقتصادية [SEP] شرح النظريات الاقتصا...,النظرية الكينزية - جون مينارد كينز | النظريات ...,t_fffe811a6da9,0


In [20]:
df = df[['topic_id', 'topic_full_text', 'content_full_text', 'label']]
df = pd.concat([df, neg_df])
df = df.drop_duplicates()

In [21]:
df = df.merge(fold_df, left_on='topic_id', right_on='topic_id', how='left')
df = df[['topic_full_text', 'content_full_text', 'label' ,'fold']]
df = df[df['fold'].isin([0, 1, 2, 3, 4])]

In [22]:
df.to_csv('train_folds.csv', index=None)

In [2]:
df = pd.read_csv('train_folds.csv')
df = df[df['fold'].isin([0, 1, 2, 3, 4])]

In [3]:
df

Unnamed: 0,topic_full_text,content_full_text,label,fold
0,100 સુધીનો સરવાળો [SEP] 37 અને 49 જેવી બે-અંકન...,સમૂહ બનાવ્યા વિના 2-અંકની સંખ્યા ઉમેરવી 2 [SEP...,1,2
1,100 સુધીનો સરવાળો [SEP] 37 અને 49 જેવી બે-અંકન...,સમૂહ બનાવીને ઉમેરવું [SEP] સલ સ્થાન કિંમત વિશ...,1,2
2,100 સુધીનો સરવાળો [SEP] 37 અને 49 જેવી બે-અંકન...,સ્થાનકિંમતના બ્લોકનો ઉપયોગ કરી 100 સુધીની સંખ્...,1,2
3,12. 20: Bird Reproduction [SEP],12. 20: Bird Reproduction [SEP] [SEP] Is this...,1,3
4,12. 20: Bird Reproduction [SEP],Astounding Mating Dance Birds of Paradise -- H...,1,3
...,...,...,...,...
227620,Movimiento Armónico Simple (MAS) [SEP] Estudio...,Péndulos [SEP] Explicamos cómo podemos tratar ...,0,0
227621,"Módulo 1 [SEP] En este módulo, los estudiantes...",9.1.1_st._lucy's_home_for_girls_raised_by_wolv...,0,0
227622,Solución de problemas con distancia en el plan...,"Puntos dentro, fuera o sobre un círculo [SEP] ...",0,0
227623,Middle School [SEP],Fuerzas y Movimiento: Fundamentos [SEP] Explor...,0,0


## create CFG

In [4]:
import numpy as np
import pandas as pd
import time
import math
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import copy
import torch.nn as nn
import os
import json
import gc
import random
from torch.cuda.amp import autocast, GradScaler

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class CFG:
    input_path = '/media/will/data/LECR'
    model_path = 'microsoft/mdeberta-v3-base' 
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    max_input_length = 124
    epochs = 5  # 5
    encoder_lr = 20e-6
    decoder_lr = 1e-3
    min_lr = 0.5e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0
    num_fold = 5
    batch_size = 32
    seed = 1006
    OUTPUT_DIR = '/media/will/data/LECR'
    num_workers = 2
    device='cuda'
    print_freq = 100
    apex=False
    start_awp_epoch = 2 # 开始AWP epoch
    adv_lr = 1e-5 # AWP学习率
    adv_eps = 1e-3 # AWP epsilon
    adv_step = 1 # AWP step

In [6]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(CFG.seed)

In [7]:
class TrainDataset(Dataset):
    def __init__(self,df,tokenizer):
        self.topic = df['topic_full_text'].values
        self.content = df['content_full_text'].values
        self.label = df['label'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
    def __len__(self):
        return len(self.topic)
    def __getitem__(self, item):
        topic = self.topic[item].replace('[SEP]', self.sep_token)
        content = self.content[item].replace('[SEP]', self.sep_token)
        label = int(self.label[item])

        
        inputs_topic = self.tokenizer(topic, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        inputs_content = self.tokenizer(content, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        return torch.as_tensor(inputs_topic['input_ids'], dtype=torch.long), \
            torch.as_tensor(inputs_topic['attention_mask'], dtype=torch.long), \
            torch.as_tensor(inputs_content['input_ids'], dtype=torch.long), \
            torch.as_tensor(inputs_content['attention_mask'], dtype=torch.long), \
            torch.as_tensor(label, dtype=torch.float)

## build model

In [8]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput

class Custom_Bert_Simple(nn.Module):
    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.linear = nn.Linear(self.config.hidden_size*3, 1)

    def forward(self,
        topic_input_ids,
        content_input_ids,
        topic_attention_mask=None,
        content_attention_mask=None, 
        labels=None):
        topic_output = self.base(input_ids=topic_input_ids,attention_mask=topic_attention_mask)
        topic_output = topic_output.last_hidden_state
        topic_output = torch.mean(topic_output, dim=1)

        content_output = self.base(input_ids=content_input_ids,attention_mask=content_attention_mask)
        content_output = content_output.last_hidden_state
        content_output = torch.mean(content_output, dim=1)

        diff = torch.abs(topic_output-content_output)

        sentence_embedding = torch.cat([topic_output, content_output, diff], 1)

        output = self.linear(sentence_embedding)
        
        loss = None
        if labels is not None:
            loss = F.binary_cross_entropy_with_logits(output.view(-1), labels.view(-1))
        
        return loss

## build logger

In [9]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [10]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))



## build pipeline

In [11]:
def train_fn(train_loader, model, optimizer, epoch, scheduler, device):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, batch in enumerate(train_loader):
        batch = [i.to(device) for i in batch]
        topic_input_ids, topic_attention_mask, content_input_ids, content_attention_mask, label = batch
        batch_size = label.size(0)
        loss = model(topic_input_ids, content_input_ids, topic_attention_mask, content_attention_mask, label)
        losses.update(loss.item(), batch_size)
        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 500)
        optimizer.step()
        global_step += 1
        scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    labels = []
    start = end = time.time()
    for step, batch in enumerate(valid_loader):
        label = batch[2].to(device)
        mask = batch[1].to(device)
        input_ids = batch[0].to(device)
        batch_size = label.size(0)
        with torch.no_grad():
            output = model(input_ids, mask, labels=label)
        loss = output.loss
        y_preds = output.logits.argmax(dim=-1)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        labels.append(label.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader) - 1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step + 1) / len(valid_loader))))
    predictions = np.concatenate(preds)
    labels = np.concatenate(labels)
    #print(predictions)
    return losses.avg, predictions, labels

def train_loop(fold, model, train_dataset, valid_dataset):
    LOGGER.info(f"========== training ==========")

    # ====================================================
    # loader
    # ====================================================

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    #model = Custom_Bert_Simple()
    #model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    model.to(CFG.device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
        ]
        return optimizer_parameters

    def get_optimizer(model):

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': CFG.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': 0.0}
            
        ]
        optimizer = AdamW(optimizer_parameters, lr = CFG.encoder_lr, eps = CFG.eps, betas = CFG.betas)
        return optimizer

    
    optimizer = get_optimizer(model)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        cfg.num_warmup_steps = cfg.num_warmup_steps * num_train_steps
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # criterion = torch.nn.CrossEntropyLoss(ignore_index=- 1)

    # criterion = LabelSmoothingLoss()
    best_score = float('inf')

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        #avg_loss = train_fn_awp(train_loader, model, optimizer, epoch, scheduler, CFG.device)
        
        avg_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device)
        # eval
        #avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, model, CFG.device)

        # scoring
        #score = get_score(predictions, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch + 1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')
        #LOGGER.info(f'Epoch {epoch + 1} - Score: {score:.4f}')


        if best_score > avg_loss:
            best_score = avg_loss
            #best_predictions = predictions
            LOGGER.info(f'Epoch {epoch + 1} - Save Best Score: {best_score:.4f} Model')
            torch.save(model.state_dict(),
                       CFG.OUTPUT_DIR + "{}_best{}.pth".format(CFG.model_path.replace('/', '_'),fold))



    torch.cuda.empty_cache()
    gc.collect()
    del scheduler, optimizer, model
    return 

In [12]:
model = Custom_Bert_Simple()
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
fold = 0
tr_data = df[df['fold']!=fold].reset_index(drop=True)
va_data = df[df['fold']==fold].reset_index(drop=True)
tr_dataset = TrainDataset(tr_data,tokenizer)
va_dataset = TrainDataset(va_data,tokenizer)
val_result = train_loop(fold, model,tr_dataset, va_dataset)

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have 

Epoch: [1][0/5462] Elapsed 0m 1s (remain 110m 34s) Loss: 0.7121(0.7121) Grad: 1.5052  LR: 0.00002000  
Epoch: [1][100/5462] Elapsed 0m 43s (remain 38m 14s) Loss: 0.6459(0.6877) Grad: 3.1602  LR: 0.00002000  
Epoch: [1][200/5462] Elapsed 1m 25s (remain 37m 16s) Loss: 0.6889(0.6837) Grad: 1.9917  LR: 0.00002000  
Epoch: [1][300/5462] Elapsed 2m 7s (remain 36m 30s) Loss: 0.6667(0.6811) Grad: 1.6751  LR: 0.00001999  
Epoch: [1][400/5462] Elapsed 2m 50s (remain 35m 46s) Loss: 0.6724(0.6783) Grad: 3.2433  LR: 0.00001999  
Epoch: [1][500/5462] Elapsed 3m 32s (remain 35m 3s) Loss: 0.7144(0.6741) Grad: 4.5851  LR: 0.00001998  
Epoch: [1][600/5462] Elapsed 4m 14s (remain 34m 21s) Loss: 0.6096(0.6719) Grad: 3.5531  LR: 0.00001998  
Epoch: [1][700/5462] Elapsed 4m 57s (remain 33m 38s) Loss: 0.6007(0.6672) Grad: 2.0655  LR: 0.00001997  
Epoch: [1][800/5462] Elapsed 5m 39s (remain 32m 56s) Loss: 0.6407(0.6631) Grad: 2.3286  LR: 0.00001996  
Epoch: [1][900/5462] Elapsed 6m 21s (remain 32m 13s) Loss: 

Epoch 1 - avg_train_loss: 0.6030  time: 2319s
Epoch 1 - Save Best Score: 0.6030 Model


Epoch: [1][5461/5462] Elapsed 38m 39s (remain 0m 0s) Loss: 0.5641(0.6030) Grad: 3.6003  LR: 0.00001809  
Epoch: [2][0/5462] Elapsed 0m 0s (remain 55m 21s) Loss: 0.4985(0.4985) Grad: 2.0655  LR: 0.00001809  
Epoch: [2][100/5462] Elapsed 0m 43s (remain 38m 7s) Loss: 0.6030(0.5532) Grad: 2.9004  LR: 0.00001802  
Epoch: [2][200/5462] Elapsed 1m 25s (remain 37m 19s) Loss: 0.5940(0.5610) Grad: 2.6174  LR: 0.00001795  
Epoch: [2][300/5462] Elapsed 2m 8s (remain 36m 37s) Loss: 0.5607(0.5586) Grad: 4.3679  LR: 0.00001788  
Epoch: [2][400/5462] Elapsed 2m 50s (remain 35m 53s) Loss: 0.6172(0.5604) Grad: 3.9766  LR: 0.00001781  
Epoch: [2][500/5462] Elapsed 3m 33s (remain 35m 10s) Loss: 0.4567(0.5599) Grad: 2.4254  LR: 0.00001774  
Epoch: [2][600/5462] Elapsed 4m 15s (remain 34m 28s) Loss: 0.5883(0.5597) Grad: 2.5582  LR: 0.00001767  
Epoch: [2][700/5462] Elapsed 4m 58s (remain 33m 45s) Loss: 0.5977(0.5610) Grad: 3.1096  LR: 0.00001759  
Epoch: [2][800/5462] Elapsed 5m 40s (remain 33m 2s) Loss: 0.

Epoch 2 - avg_train_loss: 0.5541  time: 2323s
Epoch 2 - Save Best Score: 0.5541 Model


Epoch: [2][5461/5462] Elapsed 38m 42s (remain 0m 0s) Loss: 0.4471(0.5541) Grad: 3.4686  LR: 0.00001309  
Epoch: [3][0/5462] Elapsed 0m 0s (remain 60m 8s) Loss: 0.5879(0.5879) Grad: 1.6557  LR: 0.00001309  
Epoch: [3][100/5462] Elapsed 0m 43s (remain 38m 11s) Loss: 0.4373(0.5270) Grad: 2.9245  LR: 0.00001298  
Epoch: [3][200/5462] Elapsed 1m 25s (remain 37m 22s) Loss: 0.4134(0.5348) Grad: 4.8092  LR: 0.00001287  
Epoch: [3][300/5462] Elapsed 2m 8s (remain 36m 38s) Loss: 0.5188(0.5328) Grad: 1.9432  LR: 0.00001276  
Epoch: [3][400/5462] Elapsed 2m 50s (remain 35m 54s) Loss: 0.5821(0.5321) Grad: 5.2909  LR: 0.00001265  
Epoch: [3][500/5462] Elapsed 3m 33s (remain 35m 11s) Loss: 0.5218(0.5332) Grad: 2.2399  LR: 0.00001254  
Epoch: [3][600/5462] Elapsed 4m 15s (remain 34m 29s) Loss: 0.4573(0.5335) Grad: 2.3221  LR: 0.00001243  
Epoch: [3][700/5462] Elapsed 4m 58s (remain 33m 47s) Loss: 0.4638(0.5334) Grad: 3.2822  LR: 0.00001232  
Epoch: [3][800/5462] Elapsed 5m 41s (remain 33m 4s) Loss: 0.

Epoch 3 - avg_train_loss: 0.5320  time: 2325s
Epoch 3 - Save Best Score: 0.5320 Model


Epoch: [3][5461/5462] Elapsed 38m 44s (remain 0m 0s) Loss: 0.5388(0.5320) Grad: 3.8074  LR: 0.00000691  
Epoch: [4][0/5462] Elapsed 0m 0s (remain 77m 5s) Loss: 0.4664(0.4664) Grad: 2.6003  LR: 0.00000691  
Epoch: [4][100/5462] Elapsed 0m 43s (remain 38m 22s) Loss: 0.4231(0.5169) Grad: 1.8408  LR: 0.00000680  
Epoch: [4][200/5462] Elapsed 1m 25s (remain 37m 28s) Loss: 0.5118(0.5163) Grad: 3.6148  LR: 0.00000669  
Epoch: [4][300/5462] Elapsed 2m 8s (remain 36m 42s) Loss: 0.5304(0.5172) Grad: 5.5566  LR: 0.00000659  
Epoch: [4][400/5462] Elapsed 2m 51s (remain 35m 58s) Loss: 0.5024(0.5169) Grad: 2.0228  LR: 0.00000648  
Epoch: [4][500/5462] Elapsed 3m 33s (remain 35m 15s) Loss: 0.4276(0.5187) Grad: 2.5384  LR: 0.00000637  
Epoch: [4][600/5462] Elapsed 4m 16s (remain 34m 31s) Loss: 0.4724(0.5184) Grad: 2.1307  LR: 0.00000626  
Epoch: [4][700/5462] Elapsed 4m 58s (remain 33m 48s) Loss: 0.6063(0.5198) Grad: 2.9034  LR: 0.00000616  
Epoch: [4][800/5462] Elapsed 5m 41s (remain 33m 5s) Loss: 0.

Epoch 4 - avg_train_loss: 0.5159  time: 2326s
Epoch 4 - Save Best Score: 0.5159 Model


Epoch: [4][5461/5462] Elapsed 38m 46s (remain 0m 0s) Loss: 0.5176(0.5159) Grad: 5.6069  LR: 0.00000191  
Epoch: [5][0/5462] Elapsed 0m 0s (remain 56m 31s) Loss: 0.5139(0.5139) Grad: 2.8062  LR: 0.00000191  
Epoch: [5][100/5462] Elapsed 0m 43s (remain 38m 11s) Loss: 0.5017(0.4953) Grad: 1.9788  LR: 0.00000184  
Epoch: [5][200/5462] Elapsed 1m 25s (remain 37m 24s) Loss: 0.6768(0.4991) Grad: 2.8250  LR: 0.00000178  
Epoch: [5][300/5462] Elapsed 2m 8s (remain 36m 40s) Loss: 0.4986(0.4993) Grad: 2.3152  LR: 0.00000171  
Epoch: [5][400/5462] Elapsed 2m 50s (remain 35m 58s) Loss: 0.5037(0.4978) Grad: 2.5982  LR: 0.00000165  
Epoch: [5][500/5462] Elapsed 3m 33s (remain 35m 14s) Loss: 0.4984(0.4989) Grad: 2.5655  LR: 0.00000159  
Epoch: [5][600/5462] Elapsed 4m 16s (remain 34m 31s) Loss: 0.4699(0.4989) Grad: 3.7783  LR: 0.00000153  
Epoch: [5][700/5462] Elapsed 4m 58s (remain 33m 48s) Loss: 0.4690(0.4989) Grad: 3.3792  LR: 0.00000146  
Epoch: [5][800/5462] Elapsed 5m 41s (remain 33m 5s) Loss: 0

Epoch 5 - avg_train_loss: 0.5062  time: 2331s
Epoch 5 - Save Best Score: 0.5062 Model


Epoch: [5][5461/5462] Elapsed 38m 50s (remain 0m 0s) Loss: 0.4820(0.5062) Grad: 3.7955  LR: 0.00000000  


## Inference