In [10]:
from random import random

import torch
import torch.optim as optim
from datasets import load_dataset
from neptune_scale import Run
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import BertConfig
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DataCollatorWithPadding


In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Step 1. Load and Preprocess Dataset
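We will use the MRPC task from the GLUE benchmark, loaded with Hugging Face `datasets`, and tokenize its sentence pairs with the BERT tokenizer from Hugging Face `transformers`.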

In [12]:
# Tokenizer for bert-base-uncased, plus the MRPC task of the GLUE benchmark
# (each example carries "sentence1", "sentence2" and a "label").
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset("glue", name="mrpc")


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs with the module-level ``tokenizer``.

    Intended to be passed to ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch of rows providing the ``"sentence1"`` and
            ``"sentence2"`` columns.

    Returns:
        The tokenizer's encoding of the sentence pairs, with truncation
        enabled and no padding applied.
    """
    # Padding is intentionally NOT done here. With padding="longest" every
    # example would be padded to the longest sequence of its whole map batch,
    # which makes the later per-batch padding by DataCollatorWithPadding
    # pointless and inflates every training batch. Tensor conversion is also
    # left out: the dataset's set_format(type="torch") handles it on access.
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
    )

# Tokenize every split in batches, then expose only the model inputs and the
# label column, returned as torch tensors.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "token_type_ids", "label"],
)


# Step 2. Prepare the DataLoader and initialize a BERT model for sequence classification


In [13]:
# Fixed-seed 1,000-example sample of the training split, to keep the demo fast.
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))

# The collator pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)

# NOTE: the dataset is already tokenized and formatted in Step 1; the second,
# identical dataset.map(...) / set_format(...) pass that used to follow here
# was redundant (its result was never used) and has been removed.

# The model is built from a config, so its weights are randomly initialized
# (not pretrained). To fine-tune pretrained weights instead, use
# BertForSequenceClassification.from_pretrained("bert-base-uncased").
#
# The config below is BERT-base-sized with both dropout probabilities set to 0.
# LayerNorm is NOT removed: layer_norm_eps only sets the epsilon used by the
# LayerNorm layers. Raise hidden_dropout_prob (e.g. 0.5) to experiment with
# more dropout.
config = BertConfig(
    hidden_dropout_prob=0.0,
    attention_probs_dropout_prob=0.0,
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_size=768,
    intermediate_size=3072,
    layer_norm_eps=1e-12,
    output_attentions=False,
    output_hidden_states=False,
    num_labels=2,
)

model = BertForSequenceClassification(config)  # Not pretrained
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Step 3. Initialize Neptune for Logging


In [14]:
# Start a Neptune Scale run. The two placeholder strings must be replaced with
# real values; avoid committing a real API token to version control.
run = Run(
    api_token="YOUR_API_TOKEN",  # replace with your Neptune API token
    project="YOUR_WORKSPACE/YOUR_PROJECT",  # replace with your workspace and project name
    experiment_name="gradient_tracking_lr=1",
)

# Log the hyperparameters actually used for training. The batch size is read
# from the DataLoader so the logged value cannot drift from the real one
# (it was previously hard-coded to 1 while the DataLoader used 8).
run.log_configs(
    {
        "learning_rate": 1,
        "batch_size": train_dataloader.batch_size,
        "optimizer": "Adam",
    }
)
run.add_tags(["gradient_tracking", "pytorch", "transformers"])

2025-06-04 12:19:25,126 neptune:INFO: Data synchronization started


# Step 4. Define the Gradient Norm Logging Function



In [15]:
def log_gradient_norms(model, step, log_every_n_steps=1, neptune_run=None):
    """
    Log the norm of each parameter's gradient as a ``gradients/<name>`` metric.

    All norms for one step are sent in a single ``log_metrics`` call rather
    than one call per parameter.

    Args:
        model (torch.nn.Module): The neural network model.
        step (int): The current training step or epoch, for tracking.
        log_every_n_steps (int): Log only every n steps to reduce overhead.
        neptune_run: Object exposing ``log_metrics(data, step=...)``. When
            omitted, the module-level ``run`` is used.
    """
    if step % log_every_n_steps != 0:
        return  # Skip logging for this step

    # Look up the global run only when no explicit run was supplied.
    target_run = run if neptune_run is None else neptune_run

    with torch.no_grad():  # Prevent building a computation graph during norm computation
        grad_norms = {
            f"gradients/{name}": param.grad.norm().item()
            for name, param in model.named_parameters()
            # Parameters without a gradient (e.g. unused in this pass) are skipped.
            if param.grad is not None
        }

    # Nothing to send if no parameter has a gradient yet.
    if grad_norms:
        target_run.log_metrics(grad_norms, step=step)


# Step 5. Train the Model and Track Gradients


In [None]:
# NOTE(review): lr=1 is unusually large for Adam; it matches the logged config
# and the experiment name, so it is presumably deliberate — confirm.
optimizer = optim.Adam(model.parameters(), lr=1)

model.train()
steps_per_epoch = len(train_dataloader)

try:
    for epoch in range(10):
        for step, batch in enumerate(train_dataloader):
            # Single step counter shared by the gradient and loss metrics.
            global_step = step + epoch * steps_per_epoch

            # Forward only the keys the tokenizer declares as model inputs;
            # the labels are passed separately.
            inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()

            # Log gradient norms after backward() and before the optimizer update.
            log_gradient_norms(model, global_step)

            optimizer.step()

            # Log Loss to Neptune Scale
            run.log_metrics({"loss": loss.item()}, step=global_step)
finally:
    # Close the run even if training raises or is interrupted.
    run.close()