In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!pip install -q transformers

!nvidia-smi

Mounted at /content/drive
Tue Jul 26 23:32:18 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------------------------------

# EarlyStopping

In [2]:
import copy
import torch
import numpy as np

class EarlyStopping:
    def __init__(self, patience=6, mode="max", max_epoch=1e6, min_epoch=0, at_last_score=None):
        self.patience = patience
        self.mode = mode
        self.max_epoch = max_epoch
        self.min_epoch = min_epoch
        self.at_last_score = at_last_score if at_last_score is not None else -np.Inf 
        self.epoch = 0
        self.early_stop = False
        self.best_model = None
        self.best_epoch = 0
        self.model_path = None
        self.best_score = -np.Inf if self.mode == "max" else np.Inf

    def __call__(self, epoch_score, model=None, model_path=None):
        self.model_path = model_path
        self.epoch += 1

        score = -epoch_score if self.mode == "min" else epoch_score
        
        if score <= self.best_score: 
            counter = self.epoch - self.best_epoch
            print('EarlyStopping counter: {} out of {}'.format(counter, self.patience))
            if (counter >= self.patience) and (self.best_score > self.at_last_score) and (self.epoch >= self.min_epoch):
                self.early_stop = True 
                self._save_checkpoint()
        else:                    
            self.best_score = score 
            self.best_epoch = self.epoch
            self.best_model = copy.deepcopy(model).cpu()
        
        if self.max_epoch <= self.epoch:
            self.early_stop = True 
            self._save_checkpoint()

    def _save_checkpoint(self):
        if self.model_path is not None and self.best_model is not None:
            torch.save(self.best_model.state_dict(), self.model_path.replace('_score','_'+str(self.best_score)))
            print('model saved at: ',self.model_path.replace('_score','_'+str(self.best_score)))

# CosineAnnealingWarmupRestarts

In [3]:
import math
import torch
from torch.optim.lr_scheduler import _LRScheduler

class CosineAnnealingWarmupRestarts(_LRScheduler):
    """
        optimizer (Optimizer): Wrapped optimizer.
        first_cycle_steps (int): First cycle step size.
        cycle_mult(float): Cycle steps magnification. Default: -1.
        max_lr(float): First cycle's max learning rate. Default: 0.1.
        min_lr(float): Min learning rate. Default: 0.001.
        warmup_steps(int): Linear warmup step size. Default: 0.
        gamma(float): Decrease rate of max learning rate by cycle. Default: 1.
        last_epoch (int): The index of last epoch. Default: -1.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 first_cycle_steps: int,
                 cycle_mult: float = 1.,
                 max_lr: float = 0.1,
                 min_lr: float = 0.001,
                 warmup_steps: int = 0,
                 gamma: float = 1.,
                 last_epoch: int = -1):
        assert warmup_steps < first_cycle_steps

        self.first_cycle_steps = first_cycle_steps  # first cycle step size
        self.cycle_mult = cycle_mult  # cycle steps magnification
        self.base_max_lr = max_lr  # first max learning rate
        self.max_lr = max_lr  # max learning rate in the current cycle
        self.min_lr = min_lr  # min learning rate
        self.warmup_steps = warmup_steps  # warmup step size
        self.gamma = gamma  # decrease rate of max learning rate by cycle

        self.cur_cycle_steps = first_cycle_steps  # first cycle step size
        self.cycle = 0  # cycle count
        self.step_in_cycle = last_epoch  # step size of the current cycle

        super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)

        # set learning rate min_lr
        self.init_lr()

    def init_lr(self):
        self.base_lrs = []
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.min_lr
            self.base_lrs.append(self.min_lr)

    def get_lr(self):
        if self.step_in_cycle == -1:
            return self.base_lrs
        elif self.step_in_cycle < self.warmup_steps:
            return [(self.max_lr - base_lr) * self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs]
        else:
            return [base_lr + (self.max_lr - base_lr)
                    * (1 + math.cos(math.pi * (self.step_in_cycle - self.warmup_steps)
                                    / (self.cur_cycle_steps - self.warmup_steps))) / 2
                    for base_lr in self.base_lrs]

    def step(self, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
            self.step_in_cycle = self.step_in_cycle + 1
            if self.step_in_cycle >= self.cur_cycle_steps:
                self.cycle += 1
                self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
                self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
        else:
            if epoch >= self.first_cycle_steps:
                if self.cycle_mult == 1.:
                    self.step_in_cycle = epoch % self.first_cycle_steps
                    self.cycle = epoch // self.first_cycle_steps
                else:
                    n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
                    self.cycle = n
                    self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1))
                    self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n)
            else:
                self.cur_cycle_steps = self.first_cycle_steps
                self.step_in_cycle = epoch

        self.max_lr = self.base_max_lr * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr


# train setup

In [4]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

from model import _20SampleModel

from transformers import AdamW

def train_setup(args):
    model = _20SampleModel(model_path=args.model_name_or_path)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_train_optimization_steps = args.num_train_steps
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=3e-5, correct_bias=False)
    scheduler = (
        CosineAnnealingWarmupRestarts(
            optimizer=optimizer,
            first_cycle_steps=num_train_optimization_steps,
            cycle_mult=1,
            max_lr=args.max_lr,
            min_lr=args.min_lr,
            warmup_steps=num_train_optimization_steps * 0.2,
            gamma=1.,
            last_epoch=-1
        ))  # Pytorch scheduler

    scaler = torch.cuda.amp.GradScaler()

    return model, optimizer, scheduler, scaler

/content/drive/MyDrive/NLP/ENG/ai4code/src2


# train

In [5]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

from train import validate, read_data

import torch, sys, os
from tqdm import tqdm
import numpy as np
from metrics import kendall_tau


def train(model, train_loader, val_loader, optimizer, scheduler, scaler, val_df, df_orders, args):
    criterion = torch.nn.L1Loss()
    es = EarlyStopping(patience=4, max_epoch=args.epochs)

    for e in range(args.epoch, 100):
        model.train()

        tbar = tqdm(train_loader, file=sys.stdout, position=0, leave=True)

        loss_list = []
        preds = []
        labels = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)
                loss = criterion(pred, target)
            scaler.scale(loss).backward()

            if idx % args.accumulation_steps == 0 or idx == len(tbar) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(
                f'Epoch {e+1} Loss: {avg_loss} lr: {scheduler.get_lr()}')

        y_val, y_pred = validate(model, val_loader)
        val_df['pred'] = val_df.groupby(['id', 'cell_type'])['rank'].rank(pct=True)
        val_df.loc[val_df['cell_type'] == 'markdown', 'pred'] = y_pred
        y_dummy = val_df.sort_values('pred').groupby('id')['cell_id'].apply(list)
        preds_score = kendall_tau(df_orders.loc[y_dummy.index], y_dummy)
        print("Preds score", preds_score)
        
        if not os.path.exists(args.output_path):
            os.mkdir(args.output_path)
            
        es(preds_score, model, model_path=args.output_path + f'/model_epoch_{e}_score.bin')
        if es.early_stop:
            break
        # torch.save(model.state_dict(), args.output_path + f'/model_epoch_{e}.bin')

/content/drive/MyDrive/NLP/ENG/ai4code/src2


# Main

In [6]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

import easydict
import gc
import pandas as pd

from preprocessor import PairwisePreprocessor, _20CodeCellPreprocessor
from dataset import _20SampleDataset, PairwiseDataset, _20sample_data_setup, pairwise_data_setup
from util import pairwise_debug_setup, _20sample_debug_setup


args = {
    'model_name_or_path': 'microsoft/graphcodebert-base',

    'input_path': '../input/',

    'train_path': './data/train.csv',
    'train_mark_path': './data/train_mark.csv',
    'train_features_path': './data/train_fts.json',

    'val_path': "./data/val.csv",
    'val_mark_path': './data/val_mark.csv',
    'val_features_path': './data/val_fts.json',

    'output_path': './output-graphcodebert-20sample-debug',

    'md_max_len': 64,
    'total_max_len': 512,
    'batch_size': 16,
    'accumulation_steps': 2,
    'epoch': 0,
    'epochs': 5,
    'n_workers': 8,
    'debug': True,
    'load_train': False,
    'max_lr': 3e-5,
    'min_lr': .3e-6,
    'kfold': True
}

args = easydict.EasyDict(args)

preprocessor = _20CodeCellPreprocessor(**vars(args))
train_df, val_df, train_df_mark, val_df_mark, train_fts, val_fts = preprocessor.run()

print('before debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

kfolds = []
if args.debug:
    for i in range(5):
        fold = _20sample_debug_setup(train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts)
        kfolds.append(fold)
    
train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[0]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[1]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[2]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[3]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[4]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

df_orders = pd.read_csv(args.input_path + 'train_orders.csv',
                        index_col='id',
                        squeeze=True).str.split()

for i in range(5):
    train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[i]

    train_loader, val_loader = _20sample_data_setup(train_df_mark, val_df_mark, train_fts, val_fts, args)

    del train_df, train_df_mark, train_fts
    gc.collect()

    args.num_train_steps = args.epochs * len(train_loader) / args.accumulation_steps

    model, optimizer, scheduler, scaler = train_setup(args)
    model.cuda()

    train(model, train_loader, val_loader, optimizer, scheduler, scaler, val_df, df_orders, args)

    del model, optimizer, scheduler, scaler, val_fts, train_loader, val_loader
    gc.collect()

/content/drive/MyDrive/NLP/ENG/ai4code/src2
train_df, val_df are already exits
train_fts, val_fts are already exists
before debug (5740832, 8) (629814, 8) (1950118, 8) (629814, 8) 125292 13964
after debug (574687, 8) (62011, 8) (194333, 8) (62011, 8) 12529 1396
after debug (570842, 8) (62595, 8) (193892, 8) (62595, 8) 12529 1396
after debug (576775, 8) (63316, 8) (194854, 8) (63316, 8) 12529 1396
after debug (578808, 8) (63920, 8) (196854, 8) (63920, 8) 12529 1396
after debug (570488, 8) (60371, 8) (195296, 8) (60371, 8) 12529 1396


Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.1959 lr: [2.99999999689486e-05, 2.99999999689486e-05]: 100%|██████████| 12145/12145 [38:26<00:00,  5.27it/s]
100%|██████████| 1299/1299 [01:24<00:00, 15.33it/s]
Preds score 0.7948407506998547
Epoch 2 Loss: 0.1392 lr: [2.5649177506341414e-05, 2.5649177506341414e-05]: 100%|██████████| 12145/12145 [38:25<00:00,  5.27it/s]
100%|██████████| 1299/1299 [01:22<00:00, 15.72it/s]
Preds score 0.8090225161631049
Epoch 3 Loss: 0.1187 lr: [1.514711902116428e-05, 1.514711902116428e-05]: 100%|██████████| 12145/12145 [38:20<00:00,  5.28it/s]
100%|██████████| 1299/1299 [01:24<00:00, 15.44it/s]
Preds score 0.8146040945381995
Epoch 4 Loss: 0.1024 lr: [4.646748437804579e-06, 4.646748437804579e-06]: 100%|██████████| 12145/12145 [38:21<00:00,  5.28it/s]
100%|██████████| 1299/1299 [01:22<00:00, 15.68it/s]
Preds score 0.8207595364908199
Epoch 5 Loss: 0.092 lr: [3.122272540139975e-07, 3.122272540139975e-07]: 100%|██████████| 12145/12145 [38:22<00:00,  5.28it/s]
100%|██████████| 1299/1299 [01:24<

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.1934 lr: [2.99999998752403e-05, 2.99999998752403e-05]: 100%|██████████| 12118/12118 [38:17<00:00,  5.28it/s]
100%|██████████| 1357/1357 [01:27<00:00, 15.46it/s]
Preds score 0.7951092423226598
Epoch 2 Loss: 0.1369 lr: [2.5647813082897015e-05, 2.5647813082897015e-05]: 100%|██████████| 12118/12118 [38:13<00:00,  5.28it/s]
100%|██████████| 1357/1357 [01:26<00:00, 15.66it/s]
Preds score 0.8043450290642826
Epoch 3 Loss: 0.1154 lr: [1.5144225204275051e-05, 1.5144225204275051e-05]: 100%|██████████| 12118/12118 [38:15<00:00,  5.28it/s]
100%|██████████| 1357/1357 [01:27<00:00, 15.45it/s]
Preds score 0.8078201644173928
Epoch 4 Loss: 0.099 lr: [4.644021181358554e-06, 4.644021181358554e-06]: 100%|██████████| 12118/12118 [38:13<00:00,  5.28it/s]
100%|██████████| 1357/1357 [01:26<00:00, 15.72it/s]
Preds score 0.8122243044899291
Epoch 5 Loss: 0.0892 lr: [3.2450899488364414e-07, 3.2450899488364414e-07]: 100%|██████████| 12118/12118 [38:11<00:00,  5.29it/s]
100%|██████████| 1357/1357 [01

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.2204 lr: [2.9999999876466635e-05, 2.9999999876466635e-05]: 100%|██████████| 12178/12178 [38:22<00:00,  5.29it/s]
100%|██████████| 1378/1378 [01:28<00:00, 15.61it/s]
Preds score 0.6106892932609259
Epoch 2 Loss: 0.2585 lr: [2.564782649873934e-05, 2.564782649873934e-05]: 100%|██████████| 12178/12178 [38:17<00:00,  5.30it/s]
100%|██████████| 1378/1378 [01:27<00:00, 15.73it/s]
Preds score 0.6112665620450166
Epoch 3 Loss: 0.2573 lr: [1.5144253656215115e-05, 1.5144253656215115e-05]: 100%|██████████| 12178/12178 [38:14<00:00,  5.31it/s]
100%|██████████| 1378/1378 [01:28<00:00, 15.60it/s]
Preds score 0.6115388285116485
Epoch 4 Loss: 0.2567 lr: [4.644047992228955e-06, 4.644047992228955e-06]: 100%|██████████| 12178/12178 [38:14<00:00,  5.31it/s]
100%|██████████| 1378/1378 [01:27<00:00, 15.79it/s]
Preds score 0.6117911340936302
Epoch 5 Loss: 0.2563 lr: [3.2438824109049105e-07, 3.2438824109049105e-07]: 100%|██████████| 12178/12178 [38:12<00:00,  5.31it/s]
100%|██████████| 1378/1378 

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.204 lr: [2.999999996974103e-05, 2.999999996974103e-05]: 100%|██████████| 12303/12303 [38:47<00:00,  5.29it/s]
100%|██████████| 1382/1382 [01:29<00:00, 15.46it/s]
Preds score 0.7824658389213179
Epoch 2 Loss: 0.1402 lr: [2.5649194949923735e-05, 2.5649194949923735e-05]: 100%|██████████| 12303/12303 [38:47<00:00,  5.28it/s]
100%|██████████| 1382/1382 [01:28<00:00, 15.62it/s]
Preds score 0.7992569286989242
Epoch 3 Loss: 0.1188 lr: [1.5147156019835366e-05, 1.5147156019835366e-05]: 100%|██████████| 12303/12303 [38:46<00:00,  5.29it/s]
100%|██████████| 1382/1382 [01:28<00:00, 15.59it/s]
Preds score 0.8032511819190506
Epoch 4 Loss: 0.1027 lr: [4.646783311520415e-06, 4.646783311520415e-06]: 100%|██████████| 12303/12303 [38:42<00:00,  5.30it/s]
100%|██████████| 1382/1382 [01:27<00:00, 15.73it/s]
Preds score 0.8045099860099377
Epoch 5 Loss: 0.0925 lr: [3.1207022677395755e-07, 3.1207022677395755e-07]: 100%|██████████| 12303/12303 [38:38<00:00,  5.31it/s]
100%|██████████| 1382/1382 [

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.2005 lr: [2.9999999877032745e-05, 2.9999999877032745e-05]: 100%|██████████| 12206/12206 [38:19<00:00,  5.31it/s]
100%|██████████| 1282/1282 [01:22<00:00, 15.54it/s]
Preds score 0.8125917407859551
Epoch 2 Loss: 0.1394 lr: [2.5647832714322816e-05, 2.5647832714322816e-05]: 100%|██████████| 12206/12206 [38:21<00:00,  5.30it/s]
100%|██████████| 1282/1282 [01:21<00:00, 15.68it/s]
Preds score 0.8265890230787117
Epoch 3 Loss: 0.119 lr: [1.5144266838061588e-05, 1.5144266838061588e-05]: 100%|██████████| 12206/12206 [38:21<00:00,  5.30it/s]
100%|██████████| 1282/1282 [01:22<00:00, 15.49it/s]
Preds score 0.8261575688873387
EarlyStopping counter: 1 out of 4
Epoch 4 Loss: 0.1026 lr: [4.644060413787472e-06, 4.644060413787472e-06]: 100%|██████████| 12206/12206 [38:19<00:00,  5.31it/s]
100%|██████████| 1282/1282 [01:22<00:00, 15.53it/s]
Preds score 0.8356577360190676
Epoch 5 Loss: 0.0924 lr: [3.2433229559233163e-07, 3.2433229559233163e-07]: 100%|██████████| 12206/12206 [38:21<00:00,  5.