In [1]:
!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

Collecting torch-xla==1.9
  Downloading https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl (149.9 MB)
[K     |████████████████████████████████| 149.9 MB 26 kB/s s eta 0:00:01
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 1.1 kB/s  eta 0:00:017     |█████████████████████           | 548.2 MB 93.9 MB/s eta 0:00:04
Installing collected packages: torch-xla, torch
  Attempting uninstall: torch
    Found existing installation: torch 1.7.1+cpu
    Uninstalling torch-1.7.1+cpu:
      Successfully uninstalled torch-1.7.1+cpu
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.8.2+cpu requires torch==1.7.1, but you have torch 1.9.0 which is incompatible.
torchtext 0.8.1 requires torch==1.7.1, but you have torch 1.9.0

In [2]:
import gc
import copy
import time
import random
import string
import os

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold

#Text Cleaning
from bs4 import BeautifulSoup
import re 

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AdamW

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")



2022-02-07 08:13:50.356688: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-02-07 08:13:50.357482: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
# imports the torch_xla package
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.utils.serialization as xser

'''import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils'''

'import torch_xla\nimport torch_xla.debug.metrics as met\nimport torch_xla.distributed.parallel_loader as pl\nimport torch_xla.utils.utils as xu\nimport torch_xla.core.xla_model as xm\nimport torch_xla.distributed.xla_multiprocessing as xmp\nimport torch_xla.test.test_utils as test_utils'

In [4]:
os.environ['XLA_USE_BF16']="1"
os.environ['XLA_TENSOR_ALLOCATOR_MAXSIZE'] = '100000000'

In [5]:
CONFIG = {"seed": 2021,
          "epochs": 10,
          "model_name": "unitary/unbiased-toxic-roberta",
          "train_batch_size": 8,
          "valid_batch_size": 16,
          "max_length": 512,
          "learning_rate": 1e-5,
          "epsilon" : 1e-6,
          "scheduler": 'CosineAnnealingLR',
          "min_lr": 1e-6,
          "T_max": 500,
          "weight_decay": 1e-5,
          "n_fold": 1,
          "n_accumulate": 1,
          "num_classes": 1,
          "margin": 0.5,
          "patience": 4
          }

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/997 [00:00<?, ?B/s]

In [6]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

In [7]:
df_train = pd.read_csv('../input/train-jigsaw-rate/train.csv')
df_train.head()

Unnamed: 0,more_toxic,less_toxic
0,Banning guns and explosives doesn t make a sta...,Banning guns and explosives doesn t make a sta...
1,I m saying it hasn t been researched and yet t...,I m saying it hasn t been researched and yet t...
2,The fuck did I just read,what what the fuck did i just read
3,It s almost as if these leave voting money gru...,Leading Britain off a cliff and cheating their...
4,Bulgaria Croatia Czech Republic Denmark Hungar...,Bulgaria Croatia Czech Republic Denmark Hungar...


In [8]:
df_valid = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
df_valid.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


In [9]:
def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations include the following:-
    1. Remove special charecters like &, #, etc
    2. Removes extra spaces
    3. Removes embedded URL links
    4. Removes HTML tags
    5. Removes emojis
    
    text - Text piece to be cleaned.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    return text

In [10]:
'''df_train.less_toxic = df_train.less_toxic.apply(text_cleaning)
df_train.more_toxic = df_train.more_toxic.apply(text_cleaning)
df_valid.less_toxic = df_valid.less_toxic.apply(text_cleaning)
df_valid.more_toxic = df_valid.more_toxic.apply(text_cleaning)'''

'df_train.less_toxic = df_train.less_toxic.apply(text_cleaning)\ndf_train.more_toxic = df_train.more_toxic.apply(text_cleaning)\ndf_valid.less_toxic = df_valid.less_toxic.apply(text_cleaning)\ndf_valid.more_toxic = df_valid.more_toxic.apply(text_cleaning)'

In [11]:
df_train = df_train.sample(frac=1, random_state=CONFIG['seed'])
df_valid = df_valid.sample(frac=1, random_state=CONFIG['seed'])

In [12]:
class JigsawDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        more_toxic = self.more_toxic[index]
        less_toxic = self.less_toxic[index]
        inputs_more_toxic = self.tokenizer.encode_plus(
                                more_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
        inputs_less_toxic = self.tokenizer.encode_plus(
                                less_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
        target = 1
        
        more_toxic_ids = inputs_more_toxic['input_ids']
        more_toxic_mask = inputs_more_toxic['attention_mask']
        
        less_toxic_ids = inputs_less_toxic['input_ids']
        less_toxic_mask = inputs_less_toxic['attention_mask']
        
        
        return {
            'more_toxic_ids': torch.tensor(more_toxic_ids, dtype=torch.long),
            'more_toxic_mask': torch.tensor(more_toxic_mask, dtype=torch.long),
            'less_toxic_ids': torch.tensor(less_toxic_ids, dtype=torch.long),
            'less_toxic_mask': torch.tensor(less_toxic_mask, dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.long)
        }


In [13]:
class JigsawModel(nn.Module):
    def __init__(self, model_name):
        super(JigsawModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])
        
    def forward(self, ids, mask):
        out = self.model(input_ids=ids,attention_mask=mask,
                             output_hidden_states=False)
        out = self.drop(out[1])
        out = out[1]
        outputs = self.fc(out)
        
        return outputs

In [14]:
JigsawM = xmp.MpModelWrapper(JigsawModel(CONFIG['model_name']))

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at unitary/unbiased-toxic-roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def criterion(outputs1, outputs2, targets):
    return nn.MarginRankingLoss(margin=CONFIG['margin'])(outputs1, outputs2, targets)

In [16]:
def accuracy(y_more_toxic, y_less_toxic): 
    correct = torch.sum(torch.where(y_more_toxic>y_less_toxic, 1, 0))
    wrong = torch.sum(torch.where(y_more_toxic<=y_less_toxic, 1, 0))
    acc = correct / (correct + wrong)
    return acc.view(-1).cpu().detach().numpy()

In [17]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()
    
    dataset_size = 0
    running_loss = 0.0
    
    score_batch = []

    tk0 = tqdm(dataloader, total=len(dataloader), desc="Training", disable=not xm.is_master_ordinal())
    start_time = time.time()
    
    for bi, d in enumerate(tk0):
            
        
        more_toxic_ids = d['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = d['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = d['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = d['less_toxic_mask'].to(device, dtype = torch.long)
        targets = d['target'].to(device, dtype=torch.long)

        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)
        loss = loss / CONFIG['n_accumulate']
            
        score_batch.append(accuracy(more_toxic_outputs, less_toxic_outputs))
        
        loss.backward()

        if (bi + 1) % CONFIG['n_accumulate'] == 0:

            xm.optimizer_step(optimizer)    
            #optimizer.step()
            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()
        # since the loss is on all 8 cores, reduce the loss values and print the average
        loss_reduced = xm.mesh_reduce('loss_reduce',loss, lambda x: sum(x) / len(x)) 

        if bi % 100 == 0:
            xm.master_print(
                f"bi={bi}, {time.time()-start_time:<2.2f} - loss:{loss_reduced}"
            )
 
        running_loss += (loss_reduced.detach().item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size
        
        epoch_score = np.mean(score_batch)

        tk0.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                            LR=optimizer.param_groups[0]['lr'])

    del loss
    del running_loss
    del loss_reduced
    gc.collect()
    return epoch_loss, epoch_score


In [18]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    score_batch = []

    vk0 = tqdm(dataloader, total=len(dataloader), desc="Validation", disable=not xm.is_master_ordinal())
    
    for bi, d in enumerate(vk0):        
        more_toxic_ids = d['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = d['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = d['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = d['less_toxic_mask'].to(device, dtype = torch.long)
        targets = d['target'].to(device, dtype=torch.long)
        
        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
        
        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)
        score_batch.append(accuracy(more_toxic_outputs, less_toxic_outputs))

        # since the loss is on all 8 cores, reduce the loss values and print the average
        loss_reduced = xm.mesh_reduce('loss_reduce',loss, lambda x: sum(x) / len(x)) 
        # master_print will only print once (not from all 8 cores)
        #xm.master_print(f'val. loss={loss_reduced}')
        
        running_loss += (loss_reduced.detach().item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        epoch_score = np.mean(score_batch)
        
        vk0.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss)   
    
    gc.collect()
    return epoch_loss, epoch_score


In [19]:
def fetch_scheduler(optimizer):
    if CONFIG['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'], 
                                                   eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == 'CosineAnnealingWarmRestarts':
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=CONFIG['T_0'], 
                                                             eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == None:
        return None
        
    return scheduler

In [20]:
def run(rank, flags):
    torch.set_default_tensor_type('torch.FloatTensor')
    
    device = xm.xla_device() # our device (single TPU core)
    
    xm.set_rng_state(CONFIG['seed'], device)
    fold = FLAGS["fold"]

   
    
    
    train_dataset = JigsawDataset(df_train, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
    valid_dataset = JigsawDataset(df_valid, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])

    
    # special sampler needed for distributed/multi-core (divides dataset among the replicas/cores/devices)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(), #divide dataset among this many replicas
        rank=xm.get_ordinal(), #which replica/device/core
        shuffle=True)
    
    # define DataLoader with the defined sampler
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=CONFIG['train_batch_size'],
        sampler=train_sampler,
        num_workers=0,
        drop_last=True)
    
    # same as train but with valid data
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=CONFIG['valid_batch_size'],
        sampler=valid_sampler,
        num_workers=0,
        drop_last=False)
    
    
    train_loader = pl.MpDeviceLoader(train_loader, device) # puts the train data onto the current TPU core
    valid_loader = pl.MpDeviceLoader(valid_loader, device) # puts the valid data onto the current TPU core
    

    model = JigsawM.to(device) # put model onto the current TPU core

    lr = CONFIG['learning_rate'] * xm.xrt_world_size()
    optimizer = AdamW(model.parameters(), lr=lr, eps=CONFIG['epsilon'], weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)

    gc.collect()

    best_epoch_loss = np.inf
    history = defaultdict(list)
    epochs_no_improve = 0

    best_model_wts = copy.deepcopy(model.state_dict())
    
    xm.master_print(f'========== training fold {FLAGS["fold"]} for {CONFIG["epochs"]} epochs ==========')
    for epoch in range(CONFIG["epochs"]):
        xm.master_print(f'EPOCH {i}:')
        # train one epoch
        train_epoch_loss, train_epoch_score = train_one_epoch(model, optimizer, scheduler,
                                                              train_loader, device, epoch)
                
        # validation one epoch
        val_epoch_loss, val_epoch_score = valid_one_epoch(model, valid_loader, device, epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Train Score'].append(train_epoch_score)
        history['Valid Score'].append(val_epoch_score)
        
        xm.master_print(f"Train score {train_epoch_score}")
        xm.master_print(f"Valid score {val_epoch_score}")

        if val_epoch_loss <= best_epoch_loss:
            xm.master_print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            history["Best Loss"].append(best_epoch_loss)
            best_model_wts = copy.deepcopy(model.state_dict())
            xm.rendezvous('save_model')
    
            xm.master_print('save model')

            PATH =f'Loss-Fold-{FLAGS["fold"]}.bin'
    
            xm.save(model.state_dict(), PATH)
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= CONFIG["patience"]:
                xm.master_print('Early stopping!' )
                xm.master_print()
                break

        model.load_state_dict(best_model_wts)

        gc.collect()
    
    

In [21]:
for i in range(CONFIG['n_fold']):
    FLAGS={}
    FLAGS["fold"] = i
    start_time = time.time()
    xmp.spawn(run, args=(FLAGS,), nprocs=8, start_method='fork')
    print('time taken: ', time.time()-start_time)
    print('==============================================================================')



EPOCH 0:


Training:   0%|          | 0/939 [00:00<?, ?it/s]

bi=0, 22.42 - loss:0.38427734375


Training:  11%|█         | 100/939 [03:49<11:18,  1.24it/s, Epoch=0, LR=7.25e-5, Train_Loss=0.429]

bi=100, 230.27 - loss:0.233154296875


Training:  21%|██▏       | 200/939 [05:10<10:06,  1.22it/s, Epoch=0, LR=5.27e-5, Train_Loss=0.416]

bi=200, 311.00 - loss:0.233642578125


Training:  32%|███▏      | 300/939 [06:31<08:31,  1.25it/s, Epoch=0, LR=2.83e-5, Train_Loss=0.412]

bi=300, 391.95 - loss:0.5029296875


Training:  43%|████▎     | 400/939 [07:56<08:30,  1.06it/s, Epoch=0, LR=8.54e-6, Train_Loss=0.409]

bi=400, 477.27 - loss:0.2841796875


Training:  53%|█████▎    | 500/939 [09:17<05:45,  1.27it/s, Epoch=0, LR=1e-6, Train_Loss=0.405]   

bi=500, 558.05 - loss:0.3701171875


Training:  64%|██████▍   | 600/939 [10:38<04:40,  1.21it/s, Epoch=0, LR=8.54e-6, Train_Loss=0.4]  

bi=600, 639.22 - loss:0.70166015625


Training:  75%|███████▍  | 700/939 [11:59<03:15,  1.22it/s, Epoch=0, LR=2.83e-5, Train_Loss=0.396]

bi=700, 720.73 - loss:0.37744140625


Training:  85%|████████▌ | 800/939 [13:21<01:54,  1.21it/s, Epoch=0, LR=5.27e-5, Train_Loss=0.394]

bi=800, 802.05 - loss:0.191162109375


Training:  96%|█████████▌| 900/939 [14:42<00:31,  1.23it/s, Epoch=0, LR=7.25e-5, Train_Loss=0.394]

bi=900, 883.04 - loss:0.548095703125


Training: 100%|██████████| 939/939 [15:13<00:00,  1.03it/s, Epoch=0, LR=7.71e-5, Train_Loss=0.394]
Validation: 100%|██████████| 236/236 [02:21<00:00,  1.66it/s, Epoch=0, Valid_Loss=0.368]


Train score 0.6485623121261597
Valid score 0.7118644118309021
Validation Loss Improved (inf ---> 0.3682491614638682)
save model
EPOCH 0:


Training:   0%|          | 0/939 [00:00<?, ?it/s]

bi=0, 0.95 - loss:0.3720703125


Training:  11%|█         | 100/939 [01:19<10:59,  1.27it/s, Epoch=1, LR=7.88e-5, Train_Loss=0.292]

bi=100, 80.18 - loss:0.102783203125


Training:  21%|██▏       | 200/939 [02:38<09:45,  1.26it/s, Epoch=1, LR=6.59e-5, Train_Loss=0.316]

bi=200, 159.57 - loss:0.15234375


Training:  32%|███▏      | 300/939 [03:57<08:14,  1.29it/s, Epoch=1, LR=4.32e-5, Train_Loss=0.329]

bi=300, 238.65 - loss:0.8603515625


Training:  43%|████▎     | 400/939 [05:17<07:03,  1.27it/s, Epoch=1, LR=1.95e-5, Train_Loss=0.336]

bi=400, 318.27 - loss:0.507080078125


Training:  53%|█████▎    | 500/939 [06:36<05:47,  1.26it/s, Epoch=1, LR=3.87e-6, Train_Loss=0.343]

bi=500, 397.32 - loss:0.501708984375


Training:  64%|██████▍   | 600/939 [07:56<04:32,  1.24it/s, Epoch=1, LR=2.18e-6, Train_Loss=0.347]

bi=600, 477.15 - loss:0.465576171875


Training:  75%|███████▍  | 700/939 [09:16<03:11,  1.25it/s, Epoch=1, LR=1.51e-5, Train_Loss=0.35] 

bi=700, 556.96 - loss:0.2275390625


Training:  85%|████████▌ | 800/939 [10:35<01:48,  1.28it/s, Epoch=1, LR=3.78e-5, Train_Loss=0.349]

bi=800, 636.66 - loss:0.20947265625


Training:  96%|█████████▌| 900/939 [11:56<00:31,  1.25it/s, Epoch=1, LR=6.15e-5, Train_Loss=0.34] 

bi=900, 717.11 - loss:0.17041015625


Training: 100%|██████████| 939/939 [12:27<00:00,  1.26it/s, Epoch=1, LR=6.9e-5, Train_Loss=0.337] 
Validation: 100%|██████████| 236/236 [01:33<00:00,  2.53it/s, Epoch=1, Valid_Loss=0.505]


Train score 0.7060703039169312
Valid score 0.694915235042572
EPOCH 0:


Training:   0%|          | 0/939 [00:00<?, ?it/s]

bi=0, 0.71 - loss:0.193359375


Training:  11%|█         | 100/939 [01:18<10:54,  1.28it/s, Epoch=2, LR=7.96e-5, Train_Loss=0.317]

bi=100, 79.02 - loss:0.302734375


Training:  21%|██▏       | 200/939 [02:37<09:42,  1.27it/s, Epoch=2, LR=7.54e-5, Train_Loss=0.33] 

bi=200, 158.41 - loss:0.413330078125


Training:  32%|███▏      | 300/939 [03:57<08:15,  1.29it/s, Epoch=2, LR=5.78e-5, Train_Loss=0.338]

bi=300, 238.00 - loss:0.382568359375


Training:  43%|████▎     | 400/939 [05:16<07:01,  1.28it/s, Epoch=2, LR=3.36e-5, Train_Loss=0.349]

bi=400, 316.86 - loss:0.510009765625


Training:  53%|█████▎    | 500/939 [06:35<05:38,  1.30it/s, Epoch=2, LR=1.2e-5, Train_Loss=0.353] 

bi=500, 396.00 - loss:0.23095703125


Training:  64%|██████▍   | 600/939 [07:54<04:28,  1.26it/s, Epoch=2, LR=1.38e-6, Train_Loss=0.351]

bi=600, 475.26 - loss:0.906982421875


Training:  75%|███████▍  | 700/939 [09:13<03:11,  1.25it/s, Epoch=2, LR=5.65e-6, Train_Loss=0.349]

bi=700, 554.42 - loss:0.35107421875


Training:  85%|████████▌ | 800/939 [10:33<01:51,  1.24it/s, Epoch=2, LR=2.32e-5, Train_Loss=0.349]

bi=800, 633.92 - loss:0.142822265625


Training:  96%|█████████▌| 900/939 [11:52<00:31,  1.25it/s, Epoch=2, LR=4.74e-5, Train_Loss=0.342]

bi=900, 712.91 - loss:0.29296875


Training: 100%|██████████| 939/939 [12:23<00:00,  1.26it/s, Epoch=2, LR=5.66e-5, Train_Loss=0.339]
Validation: 100%|██████████| 236/236 [01:33<00:00,  2.53it/s, Epoch=2, Valid_Loss=0.444]


Train score 0.7113950848579407
Valid score 0.6991525292396545
EPOCH 0:


Training:   0%|          | 0/939 [00:00<?, ?it/s]

bi=0, 0.71 - loss:0.268798828125


Training:  11%|█         | 100/939 [01:18<11:10,  1.25it/s, Epoch=3, LR=7.47e-5, Train_Loss=0.291]

bi=100, 79.54 - loss:0.299560546875


Training:  21%|██▏       | 200/939 [02:37<09:34,  1.29it/s, Epoch=3, LR=7.98e-5, Train_Loss=0.296]

bi=200, 157.94 - loss:0.28955078125


Training:  32%|███▏      | 300/939 [03:56<08:32,  1.25it/s, Epoch=3, LR=6.98e-5, Train_Loss=0.31] 

bi=300, 237.36 - loss:0.7275390625


Training:  43%|████▎     | 400/939 [05:15<06:57,  1.29it/s, Epoch=3, LR=4.86e-5, Train_Loss=0.333]

bi=400, 316.64 - loss:0.355224609375


Training:  53%|█████▎    | 500/939 [06:35<05:50,  1.25it/s, Epoch=3, LR=2.44e-5, Train_Loss=0.341]

bi=500, 395.79 - loss:0.255859375


Training:  64%|██████▍   | 600/939 [07:54<04:30,  1.25it/s, Epoch=3, LR=6.25e-6, Train_Loss=0.344]

bi=600, 475.39 - loss:0.524658203125


Training:  75%|███████▍  | 700/939 [09:13<03:11,  1.25it/s, Epoch=3, LR=1.23e-6, Train_Loss=0.347]

bi=700, 554.64 - loss:0.27001953125


Training:  85%|████████▌ | 800/939 [10:33<01:49,  1.27it/s, Epoch=3, LR=1.12e-5, Train_Loss=0.343]

bi=800, 634.07 - loss:0.030029296875


Training:  96%|█████████▌| 900/939 [11:52<00:30,  1.27it/s, Epoch=3, LR=3.24e-5, Train_Loss=0.335]

bi=900, 713.59 - loss:0.17333984375


Training: 100%|██████████| 939/939 [12:24<00:00,  1.26it/s, Epoch=3, LR=4.2e-5, Train_Loss=0.334] 
Validation: 100%|██████████| 236/236 [01:33<00:00,  2.51it/s, Epoch=3, Valid_Loss=0.449]


Train score 0.7188498377799988
Valid score 0.7203390002250671
EPOCH 0:


Training:   0%|          | 0/939 [00:00<?, ?it/s]

bi=0, 0.74 - loss:0.380126953125


Training:  11%|█         | 100/939 [01:18<11:17,  1.24it/s, Epoch=4, LR=6.49e-5, Train_Loss=0.288]

bi=100, 79.73 - loss:0.34814453125


Training:  21%|██▏       | 200/939 [02:38<09:42,  1.27it/s, Epoch=4, LR=7.85e-5, Train_Loss=0.272]

bi=200, 158.79 - loss:0.251220703125


Training:  32%|███▏      | 300/939 [03:57<08:23,  1.27it/s, Epoch=4, LR=7.76e-5, Train_Loss=0.293]

bi=300, 237.87 - loss:0.92626953125


Training:  43%|████▎     | 400/939 [05:16<07:04,  1.27it/s, Epoch=4, LR=6.25e-5, Train_Loss=0.317]

bi=400, 316.90 - loss:0.275390625


Training:  53%|█████▎    | 500/939 [06:35<05:46,  1.27it/s, Epoch=4, LR=3.9e-5, Train_Loss=0.323] 

bi=500, 396.69 - loss:0.693359375


Training:  64%|██████▍   | 600/939 [07:55<04:31,  1.25it/s, Epoch=4, LR=1.61e-5, Train_Loss=0.331]

bi=600, 476.17 - loss:0.617431640625


Training:  75%|███████▍  | 700/939 [09:14<03:13,  1.23it/s, Epoch=4, LR=2.5e-6, Train_Loss=0.331] 

bi=700, 555.71 - loss:0.2724609375


Training:  85%|████████▌ | 800/939 [10:35<01:51,  1.24it/s, Epoch=4, LR=3.42e-6, Train_Loss=0.329]

bi=800, 635.91 - loss:0.182373046875


Training:  96%|█████████▌| 900/939 [11:54<00:31,  1.23it/s, Epoch=4, LR=1.85e-5, Train_Loss=0.322]

bi=900, 715.20 - loss:0.209228515625


Training: 100%|██████████| 939/939 [12:25<00:00,  1.26it/s, Epoch=4, LR=2.71e-5, Train_Loss=0.32] 
Validation: 100%|██████████| 236/236 [01:32<00:00,  2.54it/s, Epoch=4, Valid_Loss=0.408]


Train score 0.7358892560005188
Valid score 0.7033898234367371
Early stopping!

time taken:  4436.113703012466
