In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader
from random import random
from neptune_scale import Run
from transformers import AdamW



# Step 2. Load and Preprocess Dataset
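We will use the MRPC task from the GLUE benchmark, loaded with the Hugging Face `datasets` library, and tokenize it with a BERT tokenizer from the Hugging Face `transformers` library.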

In [2]:
# Fetch the MRPC configuration of the GLUE benchmark.
dataset = load_dataset(path='glue', name='mrpc')

# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

def tokenize_function(examples):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; the
            ``'sentence1'`` and ``'sentence2'`` entries are read and encoded
            together as sentence pairs.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pairs, with
        truncation enabled and no padding applied.
    """
    # Padding is intentionally left to the DataLoader's collate function:
    # padding here would stretch every example to the longest sequence of the
    # whole map batch instead of the longest sequence of its mini-batch.
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True)

# Run the tokenizer over every split, one batch of examples at a time.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# Expose only these columns, returned as torch tensors.
model_columns = ["input_ids", "attention_mask", "token_type_ids", "label"]
tokenized_datasets.set_format(type="torch", columns=model_columns)





# Step 3. Prepare the DataLoaders and Load the BERT Model for Sequence Classification


In [3]:
# Shuffle each split with a fixed seed, then keep a fixed-size subset.
shuffled_train = tokenized_datasets['train'].shuffle(seed=42)
train_dataset = shuffled_train.select(range(1000))  # Sample for demonstration
shuffled_validation = tokenized_datasets['validation'].shuffle(seed=42)
eval_dataset = shuffled_validation.select(range(408))

# Collate function that pads the examples of a batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Both loaders share the batch size and the collate function; only the
# training loader reshuffles its examples.
loader_options = {"batch_size": 8, "collate_fn": data_collator}
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
eval_dataloader = DataLoader(eval_dataset, **loader_options)

# `tokenized_datasets` is already built and formatted in the preprocessing
# cell, and the train/eval subsets above were derived from it, so it is not
# recomputed here.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Prefer the GPU, but fall back to the CPU so this cell also runs on machines
# without CUDA instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Step 4. Initialize Neptune Scale for Logging


In [None]:
from random import random
from neptune_scale import Run

# Random float used as the suffix of the run ID.
custom_id = random()

# Connection and identification settings for the Neptune run.
run_settings = dict(
    api_token="YOUR_API_TOKEN",  # replace with your Neptune API token
    project="your_workspace/your_project",  # replace with your workspace and project name
    experiment_name="gradient_tracking",
    run_id=f"gradient-{custom_id}",
)
run = Run(**run_settings)

# Hyperparameters recorded as the run's configuration.
hyperparameters = {
    "learning_rate": 5e-5,
    "batch_size": 8,
    "optimizer": "AdamW",
}
run.log_configs(hyperparameters)



# Step 5. Define the Gradient Norm Logging Function



In [None]:
def log_gradient_norms(model, step):
    """Log the gradient norm of every parameter of ``model`` to the Neptune run.

    For each named parameter that currently has a gradient, the norm returned
    by ``param.grad.norm()`` is recorded under the metric name
    ``"gradients/<parameter name>"``. All norms of one step are sent in a
    single ``run.log_metrics`` call rather than one call per parameter.

    Args:
        model: Object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).
        step: Step value passed through to ``run.log_metrics``.

    Parameters whose ``grad`` is ``None`` are skipped; if no parameter has a
    gradient, nothing is logged.
    """
    grad_norms = {
        "gradients/" + name: param.grad.norm().item()
        for name, param in model.named_parameters()
        if param.grad is not None
    }
    # Avoid an empty logging call, e.g. before the first backward pass.
    if grad_norms:
        run.log_metrics(grad_norms, step=step)


# Step 6. Train the Model and Track Gradients


In [None]:
from transformers import AdamW
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the defaults of the deprecated
# class so the optimizer hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

NUM_EPOCHS = 10
# Send batches to whichever device the model's parameters live on, rather
# than assuming CUDA.
device = next(model.parameters()).device
steps_per_epoch = len(train_dataloader)
model_input_names = set(tokenizer.model_input_names)

model.train()
for epoch in range(NUM_EPOCHS):
    for step, batch in enumerate(train_dataloader):
        # Step index that keeps increasing across epochs; shared by the
        # gradient-norm and loss metrics so they line up.
        global_step = epoch * steps_per_epoch + step

        # Forward only the tensors the model accepts as inputs; labels are
        # passed separately so the model computes the loss.
        inputs = {k: v.to(device) for k, v in batch.items() if k in model_input_names}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Log gradient norms after backward() and before the optimizer update
        log_gradient_norms(model, global_step)

        optimizer.step()

        # Log Loss to Neptune Scale
        run.log_metrics({"loss": loss.item()}, step=global_step)

# Add tags and close the run
run.add_tags(["gradient_tracking", "pytorch", "transformers"])
run.close()
