In [None]:
# Install required packages
# IMPORTANT:
# torch version here is for CUDA 11.5 (cu115).
# If your CUDA version is different, please visit https://pytorch.org/get-started/locally/ and replace this line with the appropriate command.

!pip install torch==1.10.2+cu115 torchvision==0.11.3+cu115 torchaudio==0.10.2+cu115 -f https://download.pytorch.org/whl/cu115/torch_stable.html
!pip install transformers==4.38.2 datasets==3.5.0 neptune-scale tqdm numpy<2.0 pandas matplotlib scikit-learn notebook


In [None]:
%env NEPTUNE_API_TOKEN="YOUR_API_TOKEN" # Replace before running
%env NEPTUNE_PROJECT="YOUR_WORKSPACE/YOUR_PROJECT" # Replace before running

In [None]:
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader
from random import random
from neptune_scale import Run
from transformers import AdamW
import torch.optim as optim

In [None]:
# Run on the GPU when PyTorch reports an available CUDA device, otherwise on the CPU.
device = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else torch.device("cpu")
)

In [None]:
# Hyperparameters and dataset/model identifiers used throughout the notebook.
# The whole mapping is also logged to Neptune so each run records its settings.
configs = dict(
    # Optimisation settings
    learning_rate=1e-3,
    batch_size=8,
    optimizer="Adam",
    num_epochs=10,
    # Seed used when shuffling the training split
    seed=42,
    # Dataset (Hugging Face hub name + configuration) and model identifier
    dataset_name="glue",
    dataset_config_name="mrpc",
    model_name="bert-base-uncased",
)


# Step 1. Load and Preprocess Dataset
We will use the MRPC task from the GLUE benchmark, loaded with the Hugging Face `datasets` library, and tokenize it with a BERT tokenizer from `transformers`.

In [None]:
# Tokenizer matching the configured model name.
tokenizer = BertTokenizer.from_pretrained(configs["model_name"])

# Dataset identified by its hub name and configuration (both taken from `configs`).
dataset = load_dataset(
    path=configs["dataset_name"],
    name=configs["dataset_config_name"],
)



def tokenize_function(examples):
    """
    Tokenize a batch of sentence pairs.

    Padding is intentionally NOT applied here. Padding inside a batched
    `map` call would pad every example to the longest sequence of the whole
    map batch, which wastes compute; the `DataCollatorWithPadding` used by
    the DataLoader pads each training batch dynamically instead. Tensors are
    not requested either, because the dataset's output format is configured
    separately via `set_format`.

    Args:
        examples (dict): A batch with "sentence1" and "sentence2" entries.

    Returns:
        The tokenizer output for the sentence pairs, truncated to the
        tokenizer's maximum length.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
    )


# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# Return only the listed columns, as torch tensors, when examples are accessed.
tokenized_datasets.set_format(
    "torch",
    columns=[
        "input_ids",
        "attention_mask",
        "token_type_ids",
        "label",
    ],
)

# Step 2. Prepare the DataLoader and Initialize the BERT Model for Sequence Classification


In [None]:
# Collator used by the DataLoader to assemble (and pad) each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Shuffle the training split with the configured seed, then keep the first
# 1000 examples as a small sample for demonstration.
train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=configs["seed"])
    .select(range(1000))
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=configs["batch_size"],
    shuffle=True,
    collate_fn=data_collator,
)


from transformers import BertConfig, BertForSequenceClassification

# Seed PyTorch before building the model so that the random weight
# initialisation (and later DataLoader shuffling) is reproducible.
torch.manual_seed(configs["seed"])

# BERT-base sized configuration with dropout disabled.
# NOTE: LayerNorm is still part of the model; `layer_norm_eps` only sets its epsilon.
config = BertConfig(
    hidden_dropout_prob=0.0,
    attention_probs_dropout_prob=0.0,
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_size=768,
    intermediate_size=3072,
    layer_norm_eps=1e-12,
    output_attentions=False,
    output_hidden_states=False,
    num_labels=2,
)

# Built from the config only, i.e. randomly initialised (not pretrained).
# Assigning the result avoids printing the full model architecture as cell output.
model = BertForSequenceClassification(config)
model = model.to(device)

# Step 3. Initialize Neptune for Logging


In [None]:
# Create the Neptune run; the experiment name embeds the configured learning
# rate so that runs with different learning rates are easy to tell apart.
# NOTE(review): presumably the API token and project are picked up from the
# NEPTUNE_API_TOKEN / NEPTUNE_PROJECT environment variables — confirm against
# the neptune_scale documentation.
run = Run(
    experiment_name=f"gradient_tracking_lr={configs['learning_rate']}",
)

# Log configs to Neptune
run.log_configs(configs)

# Tag the run so it can be found/filtered later.
run.add_tags(["gradient_tracking", "pytorch", "transformers"])

# Step 4. Define the Gradient Norm Logging Function



In [None]:
def log_gradient_norms(model, step, log_every_n_steps=1):
    """
    Send the gradient norm of each model parameter to the Neptune run.

    One metric named ``gradients/<parameter name>`` is logged per parameter
    that currently holds a gradient; parameters without a gradient are
    skipped. Nothing happens on steps that are not a multiple of
    ``log_every_n_steps``.

    Args:
        model (torch.nn.Module): Model whose parameter gradients are inspected.
        step (int): Training step the metrics are recorded against.
        log_every_n_steps (int): Logging interval; 1 logs on every step.
    """
    should_log = step % log_every_n_steps == 0
    if should_log:
        # Norms are computed without autograd tracking.
        with torch.no_grad():
            norms_by_metric = {
                f"gradients/{param_name}": tensor.grad.norm().item()
                for param_name, tensor in model.named_parameters()
                # To restrict logging to some layers, add a name filter here,
                # e.g. `and param_name.startswith("encoder.layer.")`.
                if tensor.grad is not None
            }

        run.log_metrics(norms_by_metric, step=step)


# Step 5. Train the Model and Track Gradients


In [None]:
optimizer = optim.Adam(model.parameters(), lr=configs["learning_rate"])
steps_per_epoch = len(train_dataloader)

model.train()
try:
    # Use the configured number of epochs so the value logged to Neptune
    # matches what is actually run.
    for epoch in range(configs["num_epochs"]):
        for step, batch in enumerate(train_dataloader):
            # Monotonically increasing step shared by all metrics of this run.
            global_step = epoch * steps_per_epoch + step

            # Keep only the tensors the model's forward pass expects.
            inputs = {
                k: v.to(device)
                for k, v in batch.items()
                if k in tokenizer.model_input_names
            }
            # DataCollatorWithPadding renames "label" to "labels"; accept either
            # key so the loop also works with collators that keep "label".
            label_key = "labels" if "labels" in batch else "label"
            labels = batch[label_key].to(device)

            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()

            # Log gradient norms (after backward, before the optimizer update)
            log_gradient_norms(model, global_step)

            optimizer.step()

            # Log Loss to Neptune Scale
            run.log_metrics({"loss": loss.item()}, step=global_step)
finally:
    # Close the run even if training fails or is interrupted
    run.close()