# NLP Experiments



In [1]:
# Set DEBUG to False to run the full training pipeline, or True to run on a small sample
DEBUG = True

# Install and log into W&B
!pip install wandb -qq
import wandb
wandb.login()

[K     |████████████████████████████████| 1.8 MB 5.3 MB/s 
[K     |████████████████████████████████| 181 kB 70.5 MB/s 
[K     |████████████████████████████████| 162 kB 70.7 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[K     |████████████████████████████████| 158 kB 70.0 MB/s 
[K     |████████████████████████████████| 157 kB 76.2 MB/s 
[K     |████████████████████████████████| 157 kB 76.1 MB/s 
[K     |████████████████████████████████| 157 kB 69.9 MB/s 
[K     |████████████████████████████████| 157 kB 76.4 MB/s 
[K     |████████████████████████████████| 157 kB 72.4 MB/s 
[K     |████████████████████████████████| 157 kB 79.3 MB/s 
[K     |████████████████████████████████| 157 kB 75.6 MB/s 
[K     |████████████████████████████████| 156 kB 79.5 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Installs, imports, settings

In [2]:
%%capture
!pip install ml_collections transformers[sentencepiece] pytorch_lightning

In [3]:
%env TOKENIZERS_PARALLELISM=false
%env WANDB_SILENT=false
%env WANDB_PROJECT=readability

env: TOKENIZERS_PARALLELISM=false
env: WANDB_SILENT=false
env: WANDB_PROJECT=readability


In [4]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import numpy as np
import random
import gc
from pathlib import Path

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
from transformers import AutoModelForSequenceClassification
from torch.optim import Adam, SGD, AdamW

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from ml_collections import config_dict
from functools import partial
import warnings

warnings.filterwarnings("ignore")

## Configuration

In [5]:
# Central experiment configuration. `train_fold` passes it to the W&B logger as the
# run config; the module, dataset and collate classes also read some fields directly.
cfg = config_dict.ConfigDict()
# NOTE(review): random cropping is enabled only when DEBUG is True and disabled for
# full runs — confirm this is intended and not inverted.
cfg.random_crop_p = 0.5 if DEBUG else 0.0 # probability of random text crop
cfg.min_len = 126 # minimum length (in tokens) of random text crop
cfg.problem_type = 'regression' # added to the W&B run tags by train_fold
cfg.exp_name = '004_adv_lr' # experiment name: W&B run name/group and output directory
cfg.token_dropout = 0.1 # probability of replacing a token with the mask token (train stage only)
cfg.adv_lr = 0.0002 # AWP - adversarial lr (needs awp to be True)
cfg.adv_eps = 0.001 # AWP - eps (needs awp to be True)
cfg.awp = False # set True to use awp
cfg.max_len = None # no max length at the moment (not referenced by the cells shown here)
cfg.wd = 0.01 # weight decay
cfg.num_warmup_steps = 100 # number of warmup steps
cfg.dropout = 0. # dropout probability (hidden and attention dropout of the backbone)
cfg.bs = 8 # batch size 
cfg.val_bs = 1 # batch size for validation
cfg.seed = 42 # random seed 
cfg.epochs = 2 if DEBUG else 10 # number of epochs
cfg.lr = 2e-5 # learning rate
cfg.grad_acc = 1 # gradient accumulation 
cfg.num_classes = 1 # number of model outputs (passed as num_labels)
cfg.model_checkpoint = 'microsoft/deberta-v3-small' # Hugging Face checkpoint for model and tokenizer
cfg.PROJECT_NAME = 'readability' # W&B project used by the WandbLogger

In [6]:
# Seed python, numpy and torch RNGs (and dataloader workers) from the configured seed
# so the value lives in exactly one place.
pl.seed_everything(cfg.seed, workers=True)

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


42

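## Training code

The cells below define the AWP adversarial-training helper, the random-crop augmentation, the Lightning module, the dataset and collate classes, and the per-fold training routine.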

In [7]:
# Source: https://www.kaggle.com/code/wht1996/feedback-nn-train

class AWP:
    """Adversarial Weight Perturbation helper.

    After the regular backward pass has populated ``param.grad``, call
    :meth:`attack_backward` to (1) nudge the selected parameters along their
    gradient direction, (2) recompute the loss on the perturbed weights and
    backpropagate it, and (3) restore the original weights. The gradients left
    on the parameters afterwards are those of the adversarial loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            the returned object must expose a ``loss`` tensor.
        optimizer: Optimizer whose ``zero_grad()`` is called before the adversarial backward.
        adv_param: Substring a parameter name must contain to be perturbed.
        adv_lr: Step size of the perturbation; ``0`` disables the attack.
        adv_eps: Relative bound; each weight stays within ``±adv_eps * |w|`` of its original value.
        start_epoch: The attack is skipped while ``epoch < start_epoch``.
        adv_step: Number of perturb/forward/backward iterations per call.
        scaler: Optional gradient scaler exposing ``scale(loss)``. When ``None`` the
            adversarial loss is backpropagated unscaled.
    """

    def __init__(
        self,
        model,
        optimizer,
        adv_param="weight",
        adv_lr=0.0001,
        adv_eps=0.001,
        start_epoch=0,
        adv_step=1,
        scaler=None
    ):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}      # name -> copy of the unperturbed weights
        self.backup_eps = {}  # name -> (lower bound, upper bound) for the perturbed weights
        self.scaler = scaler

    def attack_backward(self, x, y, attention_mask,epoch):
        """Run the adversarial forward/backward pass and restore the weights.

        Args:
            x: Input ids passed to the model as ``input_ids``.
            y: Targets passed to the model as ``labels``.
            attention_mask: Attention mask passed to the model.
            epoch: Current epoch, compared against ``start_epoch``.

        Returns:
            None. The effect is on the parameters' ``.grad``.
        """
        if (self.adv_lr == 0) or (epoch < self.start_epoch):
            return None

        self._save()
        try:
            for _ in range(self.adv_step):
                self._attack_step()
                with torch.cuda.amp.autocast():
                    out = self.model(input_ids=x, attention_mask=attention_mask, labels=y)
                    adv_loss = out.loss.mean()
                self.optimizer.zero_grad()
                if self.scaler is not None:
                    self.scaler.scale(adv_loss).backward()
                else:
                    # No scaler was supplied (the constructor default): backpropagate the raw loss.
                    adv_loss.backward()
        finally:
            # Always put the original weights back, even if the adversarial pass raised.
            self._restore()

    def _attack_step(self):
        """Move each selected parameter along its gradient, clamped to the saved bounds."""
        e = 1e-6  # guards the norm ratio against division by zero
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # Step size is relative to the weight norm, direction is the normalised gradient.
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    lower, upper = self.backup_eps[name]
                    param.data = torch.min(torch.max(param.data, lower), upper)

    def _save(self):
        """Snapshot the selected parameters and compute their per-element clamp bounds."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self):
        """Write the snapshotted weights back and clear the snapshot."""
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [8]:
def random_crop(enc, min_len, BOS, EOS):
    """Randomly crop a tokenised text and wrap it in BOS/EOS ids.

    Args:
        enc: Mapping with an ``'input_ids'`` list tokenised without special tokens.
            It is modified in place and returned.
        min_len: Minimum crop length; inputs no longer than this are kept whole.
        BOS: Token id prepended to the result.
        EOS: Token id appended to the result.

    Returns:
        ``enc`` with ``'input_ids'`` replaced by ``[BOS] + crop + [EOS]`` and
        ``'attention_mask'`` set to all ones of matching length.
    """
    tokens = enc['input_ids']
    n_tokens = len(tokens)
    if n_tokens > min_len:
        # Draw the crop length first, then a start offset that keeps the crop in bounds.
        span = random.randint(min_len, n_tokens)
        offset = random.randint(0, n_tokens - span)
        tokens = tokens[offset:offset + span]
    wrapped = [BOS] + tokens + [EOS]
    enc['input_ids'] = wrapped
    enc['attention_mask'] = [1] * len(wrapped)
    return enc

In [9]:
class MyModule(pl.LightningModule):
    """Lightning module wrapping a Hugging Face sequence-classification model as a regressor.

    Batches are dicts with ``"input_ids"``, ``"attention_mask"`` and ``"target"``.
    Training minimises the square root of the loss returned by the model (configured
    with ``problem_type='regression'``); validation reports RMSE over the whole epoch
    as ``val_rmse``.

    Args:
        lr: Learning rate for AdamW.
        model_checkpoint: Hugging Face checkpoint name or path.
        num_classes: Number of model outputs (``num_labels``).

    Dropout, weight decay, warmup steps and the AWP settings are read from the
    module-level ``cfg``.
    """

    def __init__(self, lr, model_checkpoint, num_classes=cfg.num_classes):
        super().__init__()
        self.lr = lr
        self.num_classes = num_classes
        config = AutoConfig.from_pretrained(model_checkpoint, output_hidden_states=True)
        config.update(
            {
                "hidden_dropout_prob": cfg.dropout,
                "attention_probs_dropout_prob": cfg.dropout,
                # Honour the constructor argument instead of silently re-reading the global cfg.
                "num_labels": num_classes,
                "problem_type": 'regression',
            }
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, config=config)
        self.model.gradient_checkpointing_enable()
        transformers.logging.set_verbosity_error()

    def load_model(self, path):
        """Load a state dict from ``path`` into this module (non-strict key matching)."""
        # Deserialise on CPU: load_state_dict copies the values into the existing
        # parameters wherever they live, so this also works without a 'cuda:0' device.
        self.load_state_dict(torch.load(path, map_location='cpu'), strict=False)
        print('Model Loaded!')

    def configure_optimizers(self):
        """Build AdamW (no weight decay on biases/LayerNorm) with linear warmup/decay per step."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': self.lr, 'weight_decay': cfg.wd},
            {'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': self.lr, 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_parameters, lr=self.lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

    def on_train_start(self):
        """Create the AWP helper (active from epoch 1) when ``cfg.awp`` is set."""
        if cfg.awp:
            self.awp = AWP(
              self.model,
              self.optimizers().optimizer,
              adv_lr=cfg.adv_lr,
              adv_eps=cfg.adv_eps,
              start_epoch=1,
              scaler=self.trainer.scaler
            )

    def on_train_end(self):
        """Drop the AWP helper and release cached memory."""
        if cfg.awp:
            self.awp = None
            gc.collect()
            torch.cuda.empty_cache()
            gc.collect()

    def training_step(self, train_batch, batch_idx):
        """Compute and log the training loss; returns the RMSE used for backprop."""
        input_ids, attention_mask, target = \
            train_batch["input_ids"], train_batch["attention_mask"], train_batch["target"]

        # Kept so on_after_backward can replay the same batch for the AWP pass.
        self.batch = train_batch

        out = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=target,
        )

        train_mse = out['loss']
        train_rmse = torch.sqrt(train_mse)

        self.log('train_mse', train_mse)
        self.log('train_rmse', train_rmse)

        return train_rmse

    def on_after_backward(self):
        """Run the AWP adversarial pass on the batch saved by ``training_step``."""
        if cfg.awp:
            self.awp.attack_backward(self.batch["input_ids"], self.batch['target'],
                                     self.batch["attention_mask"], self.current_epoch)

    def validation_step(self, val_batch, batch_idx):
        """Return the logits and targets of one validation batch."""
        input_ids, attention_mask, target = \
            val_batch["input_ids"], val_batch["attention_mask"], val_batch["target"]

        out = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=target,
        )

        preds = out['logits']

        return {
                "preds": preds,
                "scores": target,
                }

    def validation_epoch_end(self, validation_step_outputs):
        """Compute RMSE over all validation batches, print it and log it as ``val_rmse``."""
        # squeeze(-1) drops only the trailing size-1 label dimension; a bare squeeze()
        # would also collapse the batch dimension when a single sample is validated.
        outputs = torch.cat([x["preds"] for x in validation_step_outputs], dim=0).squeeze(-1)
        targets = torch.cat([x["scores"] for x in validation_step_outputs], dim=0)
        metric = torch.sqrt(F.mse_loss(outputs, targets)).item()
        print(f'Epoch {self.current_epoch} RMSE', metric)
        self.log('val_rmse', metric)

    def predict_step(self, val_batch, batch_idx):
        """Return the logits and targets of one prediction batch (the batch must contain ``"target"``)."""
        input_ids, attention_mask, target = \
            val_batch["input_ids"], val_batch["attention_mask"], val_batch["target"]

        out = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=target,
        )

        preds = out['logits']

        return {
                "preds": preds,
                "scores": target,
                }
        

In [10]:
class MyDataset(Dataset):
    """Dataset yielding tokenised excerpts and their targets.

    Args:
        df: DataFrame with ``'id'``, ``'excerpt'`` and ``'target'`` columns.
        tokenizer: Tokenizer called on each excerpt; its ``bos_token``/``eos_token``
            ids wrap randomly cropped samples.
        stage: Stage label; stored but not read by this class.
        rand_prob: Token-dropout probability; stored but not read by this class.
        random_p: Probability of applying ``random_crop`` to a sample.
        min_len: Minimum crop length forwarded to ``random_crop``.

    Each item is a dict with ``"input_ids"`` and ``"attention_mask"`` (un-padded
    lists) and ``"target"``.
    """

    def __init__(self, df, tokenizer, stage, rand_prob, random_p, min_len=cfg.min_len):
        self.df = df
        self.stage = stage
        self.random_p = random_p # random crop
        self.rand_prob = rand_prob # token dropout
        # Stored so the constructor argument is honoured instead of the global cfg.
        self.min_len = min_len
        self.tokenizer = tokenizer
        self.text_id = df['id'].values
        self.full_text = df['excerpt'].values
        self.target = df['target'].values
        self.bos = tokenizer.convert_tokens_to_ids(tokenizer.bos_token)
        self.eos = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

    def __getitem__(self, idx):
        """Tokenise sample ``idx``, randomly cropping it with probability ``random_p``."""
        full_text = self.full_text[idx]
        if random.random() < self.random_p:
            # Tokenise without special tokens; random_crop adds BOS/EOS around the crop.
            enc = self.tokenizer(full_text, add_special_tokens=False)
            enc = random_crop(enc, self.min_len, self.bos, self.eos)
        else:
            enc = self.tokenizer(full_text, add_special_tokens=True)
        return {
            "input_ids": enc['input_ids'],
            "attention_mask": enc['attention_mask'],
            "target": self.target[idx]
        }

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

In [11]:
class Collate:
    """Collate function: pads a list of samples to a common length and tensorises them.

    Args:
        tokenizer: Provides ``padding_side`` and ``pad_token_id`` (and, for the
            ``'train'`` stage, ``mask_token`` / ``convert_tokens_to_ids``).
        stage: ``'train'`` enables random token masking; any other value disables it.
        max_len: Upper bound on the padded length; longer sequences are truncated
            to their first ``max_len`` tokens. ``None`` disables the bound.
    """

    def __init__(self, tokenizer, stage, max_len=2048):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.stage = stage

    def _mask(self, batch, tokenizer, mlm_probability, special_tokens=[0,1,2]):
        """Replace each non-special token with the mask token with probability ``mlm_probability``.

        ``batch['input_ids']`` is modified in place; ids listed in ``special_tokens``
        are never masked.
        """
        probability_matrix = torch.full(batch['input_ids'].shape, mlm_probability)
        special_tokens_mask = [[
            1 if x in special_tokens else 0 for x in row.tolist()
        ] for row in batch['input_ids']]
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        batch['input_ids'][masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
        return batch

    def __call__(self, batch):
        """Turn a list of sample dicts into a dict of batched tensors.

        ``input_ids`` and ``attention_mask`` become ``long`` tensors padded on the
        tokenizer's padding side; every other key becomes a ``float`` tensor.
        """
        output = dict()
        keys = batch[0].keys()
        for key in keys:
            output[key] = [sample[key] for sample in batch]

        # Padded length: longest sequence in the batch, capped at max_len when one is set.
        batch_max = max(len(ids) for ids in output["input_ids"])
        if self.max_len is not None and batch_max > self.max_len:
            batch_max = self.max_len

        # Truncate over-long sequences so every row ends up exactly batch_max long;
        # without this the rows are ragged and cannot be converted to a tensor.
        output["input_ids"] = [s[:batch_max] for s in output["input_ids"]]
        output["attention_mask"] = [s[:batch_max] for s in output["attention_mask"]]

        # add padding
        pad_id = self.tokenizer.pad_token_id
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [pad_id] for s in output["input_ids"]]
            output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
        else:
            output["input_ids"] = [(batch_max - len(s)) * [pad_id] + s for s in output["input_ids"]]
            output["attention_mask"] = [(batch_max - len(s)) * [0] + s for s in output["attention_mask"]]

        # convert to tensors
        for key in keys:
            if key not in ['input_ids', 'attention_mask']:
                output[key] = torch.tensor(output[key], dtype=torch.float)
            else:
                output[key] = torch.tensor(output[key], dtype=torch.long)

        if self.stage == 'train':
            output = self._mask(output, self.tokenizer, cfg.token_dropout)

        return output


In [12]:
def train_fold(fold, cfg):
    """Train and validate the model on one cross-validation fold, logging to W&B.

    Rows with ``kfold == fold`` form the validation set; all others are used for
    training. The run config is read back from W&B after the run is created, so
    values overridden by a sweep replace those in ``cfg``.

    Relies on the module-level ``DEBUG``, ``tokenizer``, ``collate_train`` and
    ``collate_valid``.

    Args:
        fold: Index of the validation fold.
        cfg: Experiment configuration (a ``ConfigDict``).
    """
    print()
    print('*' * 100)
    print(f'Training fold {fold}')
    print('*' * 100)
    tags = ['debug'] if DEBUG else ['train']
    tags += [cfg.problem_type]

    wandb_logger = WandbLogger(project=cfg.PROJECT_NAME,
                     name=f'{cfg.exp_name}_fold_{fold}',
                     group=f'{cfg.exp_name}',
                     tags=tags,
                     job_type="training",
                     config=cfg.to_dict()
                    )
    try:
        cfg = config_dict.ConfigDict(wandb_logger.experiment.config) # we may need this for sweeps
        wandb.define_metric('val_rmse', summary="min")
        wandb_logger.experiment.log_code()

        artifact = wandb_logger.experiment.use_artifact('darek/readability/readability_dataset:v0', type='dataset')
        artifact_dir = Path(artifact.download())
        train = pd.read_csv(artifact_dir / 'train_folds.csv')

        # In DEBUG mode only a small random sample of the data is used.
        frac = 0.02 if DEBUG else 1.
        train = train.sample(frac=frac, random_state=cfg.seed)

        df_train = train[train.kfold != fold].reset_index(drop=True)
        df_valid = train[train.kfold == fold].reset_index(drop=True)

        train_dataset = MyDataset(
            df_train,
            tokenizer,
            stage='train',
            rand_prob=cfg.token_dropout,
            random_p=cfg.random_crop_p,
            min_len=cfg.min_len,
        )

        valid_dataset = MyDataset(
            df_valid,
            tokenizer,
            stage='eval',
            rand_prob=0.,
            random_p=0.,
            min_len=cfg.min_len,
        )

        train_loader = DataLoader(train_dataset,
                                  batch_size=cfg.bs,
                                  shuffle=True,
                                  collate_fn=collate_train,
                                  num_workers=4, pin_memory=True, drop_last=True)

        val_loader = DataLoader(valid_dataset,
                                  batch_size=cfg.val_bs,
                                  shuffle=False,
                                  collate_fn=collate_valid,
                                  num_workers=4, pin_memory=True, drop_last=False)

        model = MyModule(lr=cfg.lr,
                         model_checkpoint=cfg.model_checkpoint,
                         num_classes=cfg.num_classes
                        )

        checkpoint_callback = ModelCheckpoint(
            save_top_k=1,
            save_weights_only=True,
            monitor="val_rmse",
            mode="min",
            dirpath=f"../output/{cfg.exp_name}/{fold}/",
            # Use the metric that is actually logged; '{metric}' is never logged by the module.
            filename="readability-{epoch:02d}-{val_rmse:.2f}",
        )

        trainer = pl.Trainer(precision=16,
                             accelerator="gpu", devices=1, max_epochs=cfg.epochs,
                             # Roughly every 200 samples, but never less than once per step.
                             log_every_n_steps=max(1, int(200/(cfg.bs * cfg.grad_acc))),
                             logger=wandb_logger,
                             default_root_dir=f"../output/{cfg.exp_name}",
                             callbacks=[checkpoint_callback],
                             accumulate_grad_batches=cfg.grad_acc,
                             enable_progress_bar=False,
                             )

        trainer.fit(model, train_loader, val_loader)
    finally:
        # Close the W&B run even if setup or training fails, so the next fold or
        # sweep trial starts a fresh run.
        wandb.finish()

In [13]:
# Module-level objects that train_fold picks up as globals.
# The tokenizer is loaded for the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_checkpoint)
# stage='train' turns on random token masking in Collate; any other stage leaves inputs untouched.
collate_train = Collate(tokenizer, stage='train')
collate_valid = Collate(tokenizer, stage='valid')

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# Zero-argument callable that trains fold 0 with `cfg`; wandb.agent needs a function it can call without arguments.
train_fold_zero = partial(train_fold, 0, cfg)

## Run Training

In [15]:
# Train fold 0 with the settings in `cfg` (comment this call out to skip the run).
train_fold_zero()


****************************************************************************************************
Training fold 0
****************************************************************************************************


[34m[1mwandb[0m: Currently logged in as: [33mmssong[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

Epoch 0 RMSE 1.5114234685897827
Epoch 0 RMSE 1.737749695777893
Epoch 1 RMSE 1.7259596586227417


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁█
trainer/global_step,▁█
val_rmse,█▁

0,1
epoch,1
trainer/global_step,11


In [16]:
# UNCOMMENT BELOW TO RUN ALL FOLDS
# rng = 1 if DEBUG else 5
# for fold in range(rng):
#     train_fold(fold, cfg)

In [17]:
# Run a W&B grid sweep over the learning rate (comment this cell out to skip the sweep).
# Each trial calls train_fold_zero; train_fold reads the run config back from W&B,
# so the swept `lr` replaces cfg.lr for that trial.
sweep_config = {
  "name" : "lr-sweep",
  "method" : "grid",
  # train_fold registers val_rmse with summary="min"; the sweep minimises that summary value.
  "metric" : {
      "name" : "val_rmse.min",
      "goal" : "minimize",
  },
  "parameters" : {
    "lr" : {
      "values" : [1e-3, 1e-4, 3e-4]
    },
  }
}
sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id, function=train_fold_zero)

Create sweep with ID: zwmjovcg
Sweep URL: https://wandb.ai/mssong/readability/sweeps/zwmjovcg


[34m[1mwandb[0m: Agent Starting Run: m8eniqma with config:
[34m[1mwandb[0m: 	lr: 0.001



****************************************************************************************************
Training fold 0
****************************************************************************************************


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


INFO:pytorch_lightning.utilities.rank_zero:Using 16bit native Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                               | Params
-------------------------------------------------------------
0 | model | DebertaV2ForSequenceClassification | 141 M 
-------------------------------------------------------------
141 M     Trainable params
0         Non-trainable params
141 M     Total params
283.791   Total estimate

Epoch 0 RMSE 1.457093596458435
Epoch 0 RMSE 1.4568780660629272
Epoch 1 RMSE 1.2585923671722412


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁█
trainer/global_step,▁█
val_rmse,█▁

0,1
epoch,1
trainer/global_step,11


[34m[1mwandb[0m: Agent Starting Run: o80u36kd with config:
[34m[1mwandb[0m: 	lr: 0.0001



****************************************************************************************************
Training fold 0
****************************************************************************************************


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


INFO:pytorch_lightning.utilities.rank_zero:Using 16bit native Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                               | Params
-------------------------------------------------------------
0 | model | DebertaV2ForSequenceClassification | 141 M 
-------------------------------------------------------------
141 M     Trainable params
0         Non-trainable params
141 M     Total params
283.791   Total estimate

Epoch 0 RMSE 1.5322753190994263
Epoch 0 RMSE 1.7771776914596558
Epoch 1 RMSE 1.7012962102890015


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁█
trainer/global_step,▁█
val_rmse,█▁

0,1
epoch,1
trainer/global_step,11


[34m[1mwandb[0m: Agent Starting Run: r02zwojn with config:
[34m[1mwandb[0m: 	lr: 0.0003



****************************************************************************************************
Training fold 0
****************************************************************************************************


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


INFO:pytorch_lightning.utilities.rank_zero:Using 16bit native Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                               | Params
-------------------------------------------------------------
0 | model | DebertaV2ForSequenceClassification | 141 M 
-------------------------------------------------------------
141 M     Trainable params
0         Non-trainable params
141 M     Total params
283.791   Total estimate

Epoch 0 RMSE 1.5370827913284302
Epoch 0 RMSE 1.7260345220565796
Epoch 1 RMSE 1.5229774713516235


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁█
trainer/global_step,▁█
val_rmse,█▁

0,1
epoch,1
trainer/global_step,11


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


## Document Results!

Use [W&B Reports](https://docs.wandb.ai/guides/reports) to document your experiment results and post them in [W&B Discord](https://wandb.me/discord)