In [10]:
import os
import gc
import copy
import time
import random
import string

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
import transformers
from transformers import AutoTokenizer, AutoModel, AdamW

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [11]:
from urllib3.util import Retry
import wandb
wandb.login(key="ccd9ac345498698c7334b84a05cd115af151690f")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/dilab/.netrc


True

In [12]:
def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
    return ''.join(random.SystemRandom().choice(chars) for _ in range(size))

HASH_NAME = id_generator(size=12)
print(HASH_NAME)

h8mkk0bhw454


In [13]:
CONFIG = {"seed": 42,
          "epochs": 5,
          "model_name": "roberta-base",
          "train_batch_size": 32,
          "valid_batch_size": 64,
          "max_length": 128,
          "learning_rate": 1e-4,
          "scheduler": 'get_linear_schedule_with_warmup', #'get_linear_schedule_with_warmup',
          "min_lr": 1e-6,
          "T_max": 500,
          "weight_decay": 1e-6,
          "n_fold": 4,
          "n_accumulate": 1,
          "num_classes": 1,
          "margin": 0.5,
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
          "hash_name": HASH_NAME
          }

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
CONFIG['group'] = f'{HASH_NAME}-Baseline'
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

In [14]:
val_df = pd.read_csv("/home/dilab/dataset/jigsaw/validation_data.csv")
ruddit_df = pd.read_csv("/home/dilab/kook/kaggle/jigsaw/dataset/final_ruddit_less_more.csv")
ruddit_df = ruddit_df.drop(["Unnamed: 0"], axis=1)
cls_df = pd.read_csv("/home/dilab/movie/kaggle/dataset/cls_toxic0.csv")
clean_df = pd.read_csv("/home/dilab/movie/kaggle/dataset/clean_cls.csv")
df = pd.concat([ruddit_df, val_df, cls_df, clean_df], axis=0).reset_index(drop=True)
df = df.sample(frac=1, random_state=CONFIG['seed']).reset_index(drop=True)

skf = StratifiedKFold(n_splits=CONFIG['n_fold'], shuffle=True, random_state=CONFIG['seed'])

for fold, ( _, val_) in enumerate(skf.split(X=df, y=df.less_toxic)):
    df.loc[val_ , "kfold"] = int(fold)
    
df["kfold"] = df["kfold"].astype(int)
df.head()

class JigsawDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        more_toxic = self.more_toxic[index]
        less_toxic = self.less_toxic[index]
        inputs_more_toxic = self.tokenizer.encode_plus(
                                more_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
        inputs_less_toxic = self.tokenizer.encode_plus(
                                less_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
        target = 1
        
        more_toxic_ids = inputs_more_toxic['input_ids']
        more_toxic_mask = inputs_more_toxic['attention_mask']
        
        less_toxic_ids = inputs_less_toxic['input_ids']
        less_toxic_mask = inputs_less_toxic['attention_mask']
        
        
        return {
            'more_toxic_ids': torch.tensor(more_toxic_ids, dtype=torch.long),
            'more_toxic_mask': torch.tensor(more_toxic_mask, dtype=torch.long),
            'less_toxic_ids': torch.tensor(less_toxic_ids, dtype=torch.long),
            'less_toxic_mask': torch.tensor(less_toxic_mask, dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.long)
        }
class JigsawModel(nn.Module):
    def __init__(self, model_name):
        super(JigsawModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])
        
    def forward(self, ids, mask):       
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out = self.drop(out[1])
        outputs = self.fc(out)

        return outputs

In [15]:
def criterion(outputs1, outputs2, targets):
    return nn.MarginRankingLoss(margin=CONFIG['margin'])(outputs1, outputs2, targets)



def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()
    
    dataset_size = 0
    running_loss = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:


        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)

        targets = data['target'].to(device, dtype=torch.long)


        
        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)


        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)


        
        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)


        loss = loss / CONFIG['n_accumulate']
        loss.backward()
    
        if (step + 1) % CONFIG['n_accumulate'] == 0:
            optimizer.step()
    
    

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()
                
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    return epoch_loss



@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:

        
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        
        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
        
        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)


        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])   
    
    
    return epoch_loss





def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    # To automatically log gradients
    wandb.watch(model, log_freq=100)
    
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
    
    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)
    
    
    for epoch in range(1, num_epochs + 1): 



        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, 
                                           dataloader=train_loader, 
                                           device=CONFIG['device'], epoch=epoch)


        val_epoch_loss = valid_one_epoch(model, valid_loader, device=CONFIG['device'], 
                                         epoch=epoch)
        


    
        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)


        
        # Log the metrics
        wandb.log({"Train Loss": train_epoch_loss})
        wandb.log({"Valid Loss": val_epoch_loss})
        
        # deep copy the model
        if val_epoch_loss <= best_epoch_loss:
    
    
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            run.summary["Best Loss"] = best_epoch_loss
    
    
            best_model_wts = copy.deepcopy(model.state_dict())
    
    
            PATH = f"/home/dilab/movie/kaggle/model16/Loss-Fold-{fold}.bin"
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved{sr_}")
            
        print()
    
    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, history


def prepare_loaders(fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    train_dataset = JigsawDataset(df_train, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
    valid_dataset = JigsawDataset(df_valid, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], 
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], 
                              num_workers=2, shuffle=False, pin_memory=True)
    
    return train_loader, valid_loader



def fetch_scheduler(optimizer):
    if CONFIG['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'], 
                                                   eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == 'get_linear_schedule_with_warmup':
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=500,
            num_training_steps=len(train_loader) * CONFIG["epochs"],
            last_epoch=-1,
        )
    return scheduler

In [17]:
for fold in range(0, CONFIG['n_fold']):
    print(f"{y_}====== Fold: {fold} ======{sr_}")
    run = wandb.init(project='Jigsaw', 
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=['roberta-base', f'{HASH_NAME}', 'margin-loss'],
                     name=f'{HASH_NAME}-fold-{fold}',
                     anonymous='must')
    
    # Create Dataloaders
    train_loader, valid_loader = prepare_loaders(fold=fold)
    
    model = JigsawModel(CONFIG['model_name'])

    model.to(CONFIG['device'])
    
    # Define Optimizer and Scheduler
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])


    scheduler = fetch_scheduler(optimizer)
    
    model, history = run_training(model, optimizer, scheduler,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['epochs'],
                                  fold=fold)
    run.finish()
    del model, history, train_loader, valid_loader
    print()



[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 1233/1233 [14:56<00:00,  1.38it/s, Epoch=1, LR=8.71e-5, Train_Loss=0.305
100%|██| 206/206 [01:55<00:00,  1.78it/s, Epoch=1, LR=8.71e-5, Valid_Loss=0.264]


[34mValidation Loss Improved (inf ---> 0.263842803523309)
Model Saved[0m



100%|█| 1233/1233 [14:49<00:00,  1.39it/s, Epoch=2, LR=6.53e-5, Train_Loss=0.26]
100%|██| 206/206 [01:54<00:00,  1.79it/s, Epoch=2, LR=6.53e-5, Valid_Loss=0.254]


[34mValidation Loss Improved (0.263842803523309 ---> 0.25401582010628115)
Model Saved[0m



100%|█| 1233/1233 [14:56<00:00,  1.38it/s, Epoch=3, LR=4.35e-5, Train_Loss=0.224
100%|██| 206/206 [01:54<00:00,  1.79it/s, Epoch=3, LR=4.35e-5, Valid_Loss=0.254]


[34mValidation Loss Improved (0.25401582010628115 ---> 0.25395633209179175)
Model Saved[0m



100%|█| 1233/1233 [14:49<00:00,  1.39it/s, Epoch=4, LR=2.18e-5, Train_Loss=0.194
100%|██| 206/206 [01:54<00:00,  1.79it/s, Epoch=4, LR=2.18e-5, Valid_Loss=0.235]


[34mValidation Loss Improved (0.25395633209179175 ---> 0.23508463615641878)
Model Saved[0m



100%|██████| 1233/1233 [14:56<00:00,  1.38it/s, Epoch=5, LR=0, Train_Loss=0.167]
100%|████████| 206/206 [01:54<00:00,  1.79it/s, Epoch=5, LR=0, Valid_Loss=0.242]



Training complete in 1h 24m 11s
Best Loss: 0.2351


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train Loss,█▆▄▂▁
Valid Loss,█▆▆▁▃

0,1
Best Loss,0.23508
Train Loss,0.16697
Valid Loss,0.2416





[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 1233/1233 [15:06<00:00,  1.36it/s, Epoch=1, LR=8.71e-5, Train_Loss=0.308
100%|██| 206/206 [01:57<00:00,  1.75it/s, Epoch=1, LR=8.71e-5, Valid_Loss=0.273]


[34mValidation Loss Improved (inf ---> 0.2725234802661198)
Model Saved[0m



100%|█| 1233/1233 [14:54<00:00,  1.38it/s, Epoch=2, LR=6.53e-5, Train_Loss=0.26]
100%|██| 206/206 [01:56<00:00,  1.77it/s, Epoch=2, LR=6.53e-5, Valid_Loss=0.256]


[34mValidation Loss Improved (0.2725234802661198 ---> 0.25580588853516817)
Model Saved[0m



100%|█| 1233/1233 [15:04<00:00,  1.36it/s, Epoch=3, LR=4.35e-5, Train_Loss=0.223
100%|██| 206/206 [01:56<00:00,  1.77it/s, Epoch=3, LR=4.35e-5, Valid_Loss=0.242]


[34mValidation Loss Improved (0.25580588853516817 ---> 0.2421796413073694)
Model Saved[0m



100%|█| 1233/1233 [14:54<00:00,  1.38it/s, Epoch=4, LR=2.18e-5, Train_Loss=0.191
100%|██| 206/206 [01:56<00:00,  1.77it/s, Epoch=4, LR=2.18e-5, Valid_Loss=0.235]


[34mValidation Loss Improved (0.2421796413073694 ---> 0.2349181018635017)
Model Saved[0m



100%|██████| 1233/1233 [15:04<00:00,  1.36it/s, Epoch=5, LR=0, Train_Loss=0.165]
100%|█████████| 206/206 [01:56<00:00,  1.77it/s, Epoch=5, LR=0, Valid_Loss=0.24]



Training complete in 1h 24m 55s
Best Loss: 0.2349


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train Loss,█▆▄▂▁
Valid Loss,█▅▂▁▂

0,1
Best Loss,0.23492
Train Loss,0.16456
Valid Loss,0.24018





[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 1233/1233 [15:01<00:00,  1.37it/s, Epoch=1, LR=8.71e-5, Train_Loss=0.344
100%|██| 206/206 [01:57<00:00,  1.75it/s, Epoch=1, LR=8.71e-5, Valid_Loss=0.497]


[34mValidation Loss Improved (inf ---> 0.4965871800370645)
Model Saved[0m



100%|█| 1233/1233 [14:57<00:00,  1.37it/s, Epoch=2, LR=6.53e-5, Train_Loss=0.494
100%|████| 206/206 [01:57<00:00,  1.75it/s, Epoch=2, LR=6.53e-5, Valid_Loss=0.5]





100%|█| 1233/1233 [14:58<00:00,  1.37it/s, Epoch=3, LR=4.35e-5, Train_Loss=0.502
100%|████| 206/206 [01:58<00:00,  1.74it/s, Epoch=3, LR=4.35e-5, Valid_Loss=0.5]





100%|█| 1233/1233 [14:59<00:00,  1.37it/s, Epoch=4, LR=2.18e-5, Train_Loss=0.501
100%|████| 206/206 [01:58<00:00,  1.74it/s, Epoch=4, LR=2.18e-5, Valid_Loss=0.5]





100%|██████| 1233/1233 [14:59<00:00,  1.37it/s, Epoch=5, LR=0, Train_Loss=0.499]
100%|██████████| 206/206 [01:58<00:00,  1.74it/s, Epoch=5, LR=0, Valid_Loss=0.5]



Training complete in 1h 24m 50s
Best Loss: 0.4966


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train Loss,▁████
Valid Loss,▁████

0,1
Best Loss,0.49659
Train Loss,0.49929
Valid Loss,0.49999





[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 1233/1233 [15:13<00:00,  1.35it/s, Epoch=1, LR=8.71e-5, Train_Loss=0.309
100%|██| 206/206 [01:59<00:00,  1.73it/s, Epoch=1, LR=8.71e-5, Valid_Loss=0.267]


[34mValidation Loss Improved (inf ---> 0.26723049471338656)
Model Saved[0m



100%|█| 1233/1233 [15:00<00:00,  1.37it/s, Epoch=2, LR=6.53e-5, Train_Loss=0.258
100%|██| 206/206 [01:59<00:00,  1.73it/s, Epoch=2, LR=6.53e-5, Valid_Loss=0.245]


[34mValidation Loss Improved (0.26723049471338656 ---> 0.24496472175564596)
Model Saved[0m



100%|█| 1233/1233 [15:24<00:00,  1.33it/s, Epoch=3, LR=4.35e-5, Train_Loss=0.221
100%|██| 206/206 [02:00<00:00,  1.71it/s, Epoch=3, LR=4.35e-5, Valid_Loss=0.246]





100%|█| 1233/1233 [15:18<00:00,  1.34it/s, Epoch=4, LR=2.18e-5, Train_Loss=0.189
100%|██| 206/206 [02:00<00:00,  1.70it/s, Epoch=4, LR=2.18e-5, Valid_Loss=0.226]


[34mValidation Loss Improved (0.24496472175564596 ---> 0.22570508470081302)
Model Saved[0m



100%|██████| 1233/1233 [15:01<00:00,  1.37it/s, Epoch=5, LR=0, Train_Loss=0.161]
100%|████████| 206/206 [02:00<00:00,  1.72it/s, Epoch=5, LR=0, Valid_Loss=0.232]



Training complete in 1h 26m 4s
Best Loss: 0.2257


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train Loss,█▆▄▂▁
Valid Loss,█▄▄▁▂

0,1
Best Loss,0.22571
Train Loss,0.16098
Valid Loss,0.23216



