In [1]:
import os
import gc
import copy
import time
import random
import string

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

from apex import amp

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AdamW

# ANSI colour helpers for the log lines printed during training:
#   b_  -> blue prefix used for the "Validation Loss Improved" message
#   y_  -> yellow prefix used for the per-fold banner
#   sr_ -> resets the terminal style after a coloured message
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from torch/transformers.
import warnings
warnings.filterwarnings("ignore")

# Debugging aid intended to give more descriptive CUDA error messages.
# NOTE(review): presumably this makes kernel launches synchronous and therefore
# slows training -- confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [2]:
# Intended on/off switch for automatic mixed precision (AMP).
# NOTE(review): nothing visible in this notebook reads this flag --
# train_one_epoch enables autocast and GradScaler unconditionally.
use_amp = True

In [3]:
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Mon_May__3_19:15:13_PDT_2021
Cuda compilation tools, release 11.3, V11.3.109
Build cuda_11.3.r11.3/compiler.29920130_0


In [4]:
# Sanity check that PyTorch can see a CUDA device; CONFIG["device"] falls back
# to the CPU when this is False.
torch.cuda.is_available()

True

In [5]:
# Start from a clean memory state before any model is built.
# Collect unreachable Python objects first so that CUDA tensors they own can be
# handed back by the caching allocator.
gc.collect()

# Guarded so the notebook also runs on a CPU-only machine (CONFIG["device"]
# already has a CPU fallback).
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # reset_max_memory_allocated() is deprecated; reset_peak_memory_stats() is
    # the supported call that resets the peak-memory counters.
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

In [6]:
# Legacy configuration, kept only as an inert string literal: no `Config` class
# is ever defined, and the rest of the notebook reads its settings from the
# CONFIG dict instead.
# The `#` notes inside the literal are part of the string, not comments. In
# English they read:
#   scheduler     -> learning-rate decay strategy
#   weight_decay  -> weight decay / L2 regularisation, to reduce overfitting
#   max_grad_norm -> limits gradient blow-up: if the L2 norm of the gradient
#                    vector exceeds max_grad_norm it is scaled down proportionally
'''class Config:
    
    model_name = '../input/roberta-base'
        
    learning_rate = 1e-4
    epochs = 1
    train_bs =32
    valid_bs = 64
    test_bs = 128
        
    seed = 2021
    max_length = 128
    min_lr = 1e-7
    scheduler = 'CosineAnnealingLR' # 学习率衰减策略
    T_max  = 500
    weight_decay = 1e-6 # 权重衰减 L2正则化 减少过拟合
    max_grad_norm = 1.0 # 用于控制梯度膨胀，如果梯度向量的L2模超过max_grad_norm，则等比例缩小
    num_classes = 1
    margin = 0.5
    n_fold = 5
    n_accululate = 1
    device= torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    hidden_size =768
    num_hidden_layers = 24
    
    dropout = 0.2'''

'class Config:\n    \n    model_name = \'../input/roberta-base\'\n        \n    learning_rate = 1e-4\n    epochs = 1\n    train_bs =32\n    valid_bs = 64\n    test_bs = 128\n        \n    seed = 2021\n    max_length = 128\n    min_lr = 1e-7\n    scheduler = \'CosineAnnealingLR\' # 学习率衰减策略\n    T_max  = 500\n    weight_decay = 1e-6 # 权重衰减 L2正则化 减少过拟合\n    max_grad_norm = 1.0 # 用于控制梯度膨胀，如果梯度向量的L2模超过max_grad_norm，则等比例缩小\n    num_classes = 1\n    margin = 0.5\n    n_fold = 5\n    n_accululate = 1\n    device= torch.device("cuda:0" if torch.cuda.is_available() else "cpu")\n    \n    hidden_size =768\n    num_hidden_layers = 24\n    \n    dropout = 0.2'

In [7]:
# Central hyper-parameter table; every later cell reads its settings from here.
CONFIG = dict(
    seed=2021,                      # passed to set_seed() and to the fold splitter
    epochs=3,                       # training epochs per fold
    model_name="GroNLP/hateBERT",   # checkpoint used for tokenizer and backbone
    train_batch_size=28,
    valid_batch_size=56,
    max_length=100,                 # token length every comment is padded/truncated to
    learning_rate=5e-5,
    scheduler='CosineAnnealingLR',  # name dispatched on by fetch_scheduler()
    min_lr=1e-6,                    # eta_min of the cosine schedules
    T_max=500,                      # T_max of CosineAnnealingLR
    weight_decay=1e-6,
    n_fold=5,
    n_accumulate=1,                 # batches accumulated per optimizer step
    num_classes=1,                  # output width of the regression head
    margin=0.5,                     # margin of the MarginRankingLoss
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# Load the tokenizer once and share it between all datasets.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

In [8]:
def set_seed(seed=42):
    '''Seed every random number generator the notebook relies on.

    Covers Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to its deterministic mode so repeated runs
    give the same results.

    Args:
        seed: Integer seed applied to all generators.
    '''
    # The stdlib generator is imported by this notebook but was never seeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed.
    # NOTE(review): the running interpreter has already fixed its hash seed, so
    # this presumably only affects processes started afterwards -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

In [9]:
# Load the pairwise annotation file. As the preview below shows, each row holds
# a `worker` id plus a `less_toxic` / `more_toxic` pair of comment texts.
df = pd.read_csv("input/jigsaw-toxic-severity-rating/validation_data.csv")
df.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


In [10]:
# Tag every row with the index of the fold in which it is used for validation.
skf = StratifiedKFold(
    n_splits=CONFIG['n_fold'],
    shuffle=True,
    random_state=CONFIG['seed'],
)

# Only the held-out indices of each split are needed; the splits are
# stratified on the annotator (`worker`) column.
for fold, (_, val_) in enumerate(skf.split(X=df, y=df["worker"])):
    df.loc[val_, "kfold"] = int(fold)

# Store the fold labels as integers and preview the result.
df["kfold"] = df["kfold"].astype(int)
df.head()

Unnamed: 0,worker,less_toxic,more_toxic,kfold
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,4
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,0
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",0
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,2
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",0


In [11]:
class JigsawDataset(Dataset):
    """Pairwise ranking dataset built from `more_toxic` / `less_toxic` columns.

    Each item is a dict with the token ids and attention mask of both comments
    of a pair, plus a constant target of 1 (the first comment of the pair is
    the more toxic one).
    """

    def __init__(self, df, tokenizer, max_length):
        """Keep the frame, the tokenizer and the raw text columns.

        Args:
            df: DataFrame with `more_toxic` and `less_toxic` text columns.
            tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
            max_length: Length every encoded comment is padded/truncated to.
        """
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values

    def __len__(self):
        """Number of comment pairs."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize one comment with the settings shared by both sides of a pair."""
        return self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

    def __getitem__(self, index):
        """Return the encoded pair at ``index`` as a dict of int tensors."""
        # Same order as the two columns are consumed downstream: more toxic first.
        more = self._encode(self.more_toxic[index])
        less = self._encode(self.less_toxic[index])

        def as_int(values):
            return torch.tensor(values, dtype=torch.int)

        return {
            'more_toxic_ids': as_int(more['input_ids']),
            'more_toxic_mask': as_int(more['attention_mask']),
            'less_toxic_ids': as_int(less['input_ids']),
            'less_toxic_mask': as_int(less['attention_mask']),
            # The ranking target is always 1 for these pairs.
            'target': as_int(1),
        }


In [12]:
class JigsawModel(nn.Module):
    """Transformer backbone followed by dropout and a linear scoring head.

    The head maps the backbone's second output to ``CONFIG['num_classes']``
    values per input sequence.
    """

    def __init__(self, model_name):
        """Load the pretrained backbone and build the head.

        Args:
            model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        """
        super(JigsawModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # Size the head from the backbone's own configuration instead of
        # hard-coding 768, so non-base-sized checkpoints work too. Falls back
        # to the previous value when the config exposes no `hidden_size`.
        hidden_size = getattr(self.model.config, 'hidden_size', 768)
        self.fc = nn.Linear(hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Score a batch of tokenized comments.

        Args:
            ids: Token id tensor for the batch.
            mask: Attention mask tensor matching ``ids``.

        Returns:
            Output of the linear head applied to the (dropout-regularised)
            second element of the backbone output.
        """
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        # NOTE(review): index 1 is presumably the pooled [CLS] representation
        # for BERT-style backbones -- confirm when swapping model families.
        out = self.drop(out[1])
        outputs = self.fc(out)
        return outputs

In [13]:
def criterion(outputs1, outputs2, targets, margin=None):
    """Margin ranking loss between two batches of scores.

    Args:
        outputs1: Scores expected to rank higher where ``targets`` is 1.
        outputs2: Scores expected to rank lower where ``targets`` is 1.
        targets: Tensor of +1 / -1 labels, one per pair.
        margin: Ranking margin; defaults to ``CONFIG['margin']`` when omitted.

    Returns:
        Scalar loss tensor (mean over the pairs).
    """
    if margin is None:
        margin = CONFIG['margin']

    # The model emits scores of shape (batch, 1) while targets are (batch,).
    # Left as-is, that pairing is either rejected by PyTorch or broadcast to a
    # (batch, batch) matrix that mixes every target with every pair, so flatten
    # all three to 1-D first.
    scores1 = outputs1.reshape(-1)
    scores2 = outputs2.reshape(-1)
    # Targets arrive as integers; match the score dtype for the arithmetic.
    labels = targets.reshape(-1).to(scores1.dtype)
    return nn.MarginRankingLoss(margin=margin)(scores1, scores2, labels)

In [14]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Run one mixed-precision training pass over ``dataloader``.

    Both comments of every pair are scored by ``model`` and the pair is
    optimised with the notebook-level ``criterion``. Gradients are accumulated
    over ``CONFIG['n_accumulate']`` batches per optimizer step.

    Args:
        model: Network called as ``model(ids, mask)``.
        optimizer: Optimizer updating ``model``.
        scheduler: LR scheduler stepped after every optimizer step, or None.
        dataloader: Iterable of batches shaped like ``JigsawDataset`` items.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only shown in the progress bar.

    Returns:
        Sample-weighted mean training loss of the epoch (0.0 if the loader
        yields no batches).
    """
    model.train()

    accumulation_steps = CONFIG['n_accumulate']
    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader does not raise UnboundLocalError.
    epoch_loss = 0.0

    # Automatic Mixed Precision: scales the loss so fp16 gradients do not underflow.
    scaler = torch.cuda.amp.GradScaler()

    # Drop gradients left over from an incomplete accumulation window of a
    # previous epoch; they were scaled by a different GradScaler instance.
    optimizer.zero_grad()

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        # Token ids and masks are moved as int64, the dtype documented for
        # transformer inputs and accepted by every embedding implementation.
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype=torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype=torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype=torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.int)

        batch_size = more_toxic_ids.size(0)

        # Forward passes and loss run under autocast.
        with torch.cuda.amp.autocast():
            more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
            less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
            loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)

        # Only the back-propagated value is divided by the accumulation factor,
        # so the loss reported below stays the true per-batch loss.
        scaler.scale(loss / accumulation_steps).backward()

        if (step + 1) % accumulation_steps == 0:
            # scaler.step() unscales the gradients and skips the update if
            # they contain inf/NaN; update() then adapts the scale factor.
            scaler.step(optimizer)
            scaler.update()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()
    return epoch_loss


In [15]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Network called as ``model(ids, mask)``.
        dataloader: Iterable of batches shaped like ``JigsawDataset`` items.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only shown in the progress bar.
        optimizer: Optional optimizer whose current learning rate is shown in
            the progress bar. When omitted, a notebook-level ``optimizer``
            variable is used if one exists; otherwise the LR is not shown.

    Returns:
        Sample-weighted mean validation loss (0.0 if the loader yields no batches).
    """
    model.eval()

    # The original body read a global `optimizer` directly, which raised
    # NameError whenever that notebook variable did not exist. Keep the same
    # display when it is present, but never depend on it.
    if optimizer is None:
        optimizer = globals().get('optimizer')

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader does not raise UnboundLocalError.
    epoch_loss = 0.0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        # Token ids and masks are moved as int64, the dtype documented for
        # transformer inputs.
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype=torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype=torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype=torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.int)

        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)

        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = dict(Epoch=epoch, Valid_Loss=epoch_loss)
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()
    return epoch_loss


In [16]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    """Train and validate ``model`` for ``num_epochs`` and keep the best weights.

    After every epoch the validation loss is compared with the best one seen so
    far; on improvement (or a tie) the weights are copied in memory and written
    to ``Loss-Fold-{fold}.bin`` in the current directory.

    NOTE(review): the data comes from the notebook-level ``train_loader`` and
    ``valid_loader`` variables, not from arguments -- they must be set (see
    ``prepare_loaders``) before this function is called.

    Args:
        model: Network to train.
        optimizer: Optimizer updating ``model``.
        scheduler: LR scheduler or None.
        device: Device the batches are moved to during training and validation.
        num_epochs: Number of epochs to run.
        fold: Fold index, used only in the checkpoint file name.

    Returns:
        Tuple ``(model, history)``: the model with its best weights loaded, and
        a dict with the per-epoch 'Train Loss' and 'Valid Loss' lists.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Honour the `device` argument; it used to be ignored in favour of
        # CONFIG['device'].
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # deep copy the model
        if val_epoch_loss <= best_epoch_loss:
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            #run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f"Loss-Fold-{fold}.bin"
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [17]:
def prepare_loaders(fold):
    """Build the training and validation DataLoaders for one fold.

    Rows of the notebook-level ``df`` whose ``kfold`` equals ``fold`` become
    the validation set; all remaining rows form the training set.

    Args:
        fold: Index of the fold to hold out.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    held_out = df.kfold == fold
    df_train = df[~held_out].reset_index(drop=True)
    df_valid = df[held_out].reset_index(drop=True)

    def make_dataset(frame):
        # Both splits share the tokenizer and sequence length from CONFIG.
        return JigsawDataset(frame,
                             tokenizer=CONFIG['tokenizer'],
                             max_length=CONFIG['max_length'])

    # Options common to both loaders.
    common = dict(num_workers=2, pin_memory=True)

    # Training batches are shuffled and the last incomplete batch is dropped.
    train_loader = DataLoader(make_dataset(df_train),
                              batch_size=CONFIG['train_batch_size'],
                              shuffle=True, drop_last=True, **common)
    # Validation keeps the original order and every sample.
    valid_loader = DataLoader(make_dataset(df_valid),
                              batch_size=CONFIG['valid_batch_size'],
                              shuffle=False, **common)

    return train_loader, valid_loader

In [18]:
def fetch_scheduler(optimizer, config=None):
    """Create the learning-rate scheduler selected in the configuration.

    Args:
        optimizer: Optimizer the scheduler will drive.
        config: Mapping with the keys ``scheduler``, ``min_lr`` and ``T_max``
            (CosineAnnealingLR) or ``T_0`` (CosineAnnealingWarmRestarts).
            Defaults to the notebook-level ``CONFIG``.

    Returns:
        The scheduler instance, or None when ``config['scheduler']`` is None.

    Raises:
        KeyError: If a key required by the selected scheduler is missing.
        ValueError: If the scheduler name is not recognised.
    """
    cfg = CONFIG if config is None else config
    name = cfg['scheduler']

    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg['T_max'],
                                              eta_min=cfg['min_lr'])

    if name == 'CosineAnnealingWarmRestarts':
        if 'T_0' not in cfg:
            # Same exception type as the bare lookup, with an actionable message.
            raise KeyError("'T_0' must be set in the config to use "
                           "CosineAnnealingWarmRestarts")
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=cfg['T_0'],
                                                        eta_min=cfg['min_lr'])

    # An unrecognised name used to fall through to `return scheduler` and fail
    # with an UnboundLocalError; report the real problem instead.
    raise ValueError(f"Unknown scheduler {name!r}; expected 'CosineAnnealingLR', "
                     "'CosineAnnealingWarmRestarts' or None")

In [19]:
# Cross-validation driver: train one fresh model per fold. The best weights of
# each fold are written to disk by run_training().
for fold in range(0, CONFIG['n_fold']):
    print(f"{y_}====== Fold: {fold} ======{sr_}")

    # Create Dataloaders (run_training reads them as notebook-level variables)
    train_loader, valid_loader = prepare_loaders(fold=fold)

    model = JigsawModel(CONFIG['model_name'])
    model.to(CONFIG['device'])

    # Define Optimizer and Scheduler
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)

    model, history = run_training(model, optimizer, scheduler,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['epochs'],
                                  fold=fold)

    # The optimizer (and the scheduler through it) holds references to the
    # model parameters and its own per-parameter state. Deleting only `model`
    # would keep all of that alive on the GPU while the next fold's model is
    # loaded, so release everything together.
    del model, history, train_loader, valid_loader, optimizer, scheduler
    _ = gc.collect()
    if torch.cuda.is_available():
        # Return the freed blocks to the driver before the next fold starts.
        torch.cuda.empty_cache()
    print()
    



Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 2080



100%|██| 860/860 [04:09<00:00,  3.45it/s, Epoch=1, LR=4.11e-5, Train_Loss=0.358]
100%|██| 108/108 [00:36<00:00,  2.97it/s, Epoch=1, LR=4.11e-5, Valid_Loss=0.349]


[34mValidation Loss Improved (inf ---> 0.3493074211754382)
Model Saved[0m



100%|██| 860/860 [04:12<00:00,  3.41it/s, Epoch=2, LR=2.09e-5, Train_Loss=0.334]
100%|██| 108/108 [00:36<00:00,  3.00it/s, Epoch=2, LR=2.09e-5, Valid_Loss=0.337]


[34mValidation Loss Improved (0.3493074211754382 ---> 0.3369933101613036)
Model Saved[0m



100%|██| 860/860 [04:13<00:00,  3.39it/s, Epoch=3, LR=4.03e-6, Train_Loss=0.307]
100%|██| 108/108 [00:36<00:00,  2.97it/s, Epoch=3, LR=4.03e-6, Valid_Loss=0.346]



Training complete in 0h 14m 26s
Best Loss: 0.3370



Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 2080



100%|██| 860/860 [04:14<00:00,  3.37it/s, Epoch=1, LR=4.11e-5, Train_Loss=0.357]
100%|██| 108/108 [00:36<00:00,  2.98it/s, Epoch=1, LR=4.11e-5, Valid_Loss=0.357]


[34mValidation Loss Improved (inf ---> 0.3569630403943887)
Model Saved[0m



100%|██| 860/860 [04:11<00:00,  3.42it/s, Epoch=2, LR=2.09e-5, Train_Loss=0.324]
100%|██| 108/108 [00:36<00:00,  2.99it/s, Epoch=2, LR=2.09e-5, Valid_Loss=0.341]


[34mValidation Loss Improved (0.3569630403943887 ---> 0.341101918900302)
Model Saved[0m



100%|██| 860/860 [04:11<00:00,  3.42it/s, Epoch=3, LR=4.03e-6, Train_Loss=0.304]
100%|██| 108/108 [00:39<00:00,  2.72it/s, Epoch=3, LR=4.03e-6, Valid_Loss=0.342]



Training complete in 0h 14m 32s
Best Loss: 0.3411



Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 2080



100%|██| 860/860 [04:10<00:00,  3.44it/s, Epoch=1, LR=4.11e-5, Train_Loss=0.356]
100%|██| 108/108 [00:36<00:00,  2.99it/s, Epoch=1, LR=4.11e-5, Valid_Loss=0.348]


[34mValidation Loss Improved (inf ---> 0.34800329427313703)
Model Saved[0m



100%|██| 860/860 [04:07<00:00,  3.48it/s, Epoch=2, LR=2.09e-5, Train_Loss=0.327]
100%|███| 108/108 [00:35<00:00,  3.02it/s, Epoch=2, LR=2.09e-5, Valid_Loss=0.34]


[34mValidation Loss Improved (0.34800329427313703 ---> 0.3397659766776979)
Model Saved[0m



100%|████| 860/860 [04:06<00:00,  3.50it/s, Epoch=3, LR=4.03e-6, Train_Loss=0.3]
100%|██| 108/108 [00:35<00:00,  3.02it/s, Epoch=3, LR=4.03e-6, Valid_Loss=0.347]



Training complete in 0h 14m 13s
Best Loss: 0.3398



Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 2080



100%|██| 860/860 [04:09<00:00,  3.45it/s, Epoch=1, LR=4.11e-5, Train_Loss=0.357]
100%|██| 108/108 [00:36<00:00,  2.99it/s, Epoch=1, LR=4.11e-5, Valid_Loss=0.343]


[34mValidation Loss Improved (inf ---> 0.34261022992475426)
Model Saved[0m



100%|██| 860/860 [04:09<00:00,  3.44it/s, Epoch=2, LR=2.09e-5, Train_Loss=0.342]
100%|██| 108/108 [00:36<00:00,  2.99it/s, Epoch=2, LR=2.09e-5, Valid_Loss=0.336]


[34mValidation Loss Improved (0.34261022992475426 ---> 0.33641061873812944)
Model Saved[0m



100%|███| 860/860 [04:08<00:00,  3.45it/s, Epoch=3, LR=4.03e-6, Train_Loss=0.32]
100%|██| 108/108 [00:36<00:00,  2.99it/s, Epoch=3, LR=4.03e-6, Valid_Loss=0.335]


[34mValidation Loss Improved (0.33641061873812944 ---> 0.33464388907935766)
Model Saved[0m

Training complete in 0h 14m 19s
Best Loss: 0.3346



Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 2080



100%|██| 860/860 [04:11<00:00,  3.42it/s, Epoch=1, LR=4.11e-5, Train_Loss=0.357]
100%|██| 108/108 [00:36<00:00,  2.96it/s, Epoch=1, LR=4.11e-5, Valid_Loss=0.346]


[34mValidation Loss Improved (inf ---> 0.34616987957369943)
Model Saved[0m



100%|███| 860/860 [04:10<00:00,  3.43it/s, Epoch=2, LR=2.09e-5, Train_Loss=0.33]
100%|██| 108/108 [00:36<00:00,  2.96it/s, Epoch=2, LR=2.09e-5, Valid_Loss=0.339]


[34mValidation Loss Improved (0.34616987957369943 ---> 0.3387380821565084)
Model Saved[0m



100%|███| 860/860 [04:10<00:00,  3.44it/s, Epoch=3, LR=4.03e-6, Train_Loss=0.31]
100%|██| 108/108 [00:36<00:00,  2.97it/s, Epoch=3, LR=4.03e-6, Valid_Loss=0.342]


Training complete in 0h 14m 24s
Best Loss: 0.3387




