In [1]:
!pip install -qq comet-ml

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/682.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m675.8/682.3 kB[0m [31m25.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m682.3/682.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/979.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m979.1/979.1 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.6/303.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import random
import time
from datetime import timedelta

import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from sklearn.model_selection import train_test_split
from transformers import (
    BertForSequenceClassification,
    AdamW,
    BertTokenizer,
    get_linear_schedule_with_warmup
)

from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

In [3]:
from google.colab import userdata


# Initialize Comet experiment
exp = Experiment(
    api_key=userdata.get('COMET_API_KEY'),
    project_name="fine-tuning-bert",
    workspace=userdata.get("COMET_WORKSPACE")
)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manoo/fine-tuning-bert/37d5ea0db41d454d9cc688b37883b672



In [4]:
# Constants
MAX_LEN = 512
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
EPOCHS = 4
RANDOM_SEED = 42

def set_seed(seed_value):
    """Set seed for reproducibility."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

In [5]:
def load_data(file_path):
    """Load and preprocess the data."""
    df = pd.read_csv(file_path)
    df['label'] = df['label'].map({'safe': 0, 'malicious': 1})
    return df['text'].values, df['label'].values

def tokenize_data(texts, tokenizer):
    """Tokenize the input texts."""
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


In [6]:
def create_dataloaders(input_ids, attention_masks, labels):
    """Create train and validation DataLoaders."""
    dataset = TensorDataset(input_ids, attention_masks, labels)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=BATCH_SIZE
    )
    validation_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=BATCH_SIZE
    )

    return train_dataloader, validation_dataloader


In [7]:
def flat_accuracy(preds, labels):
    """Calculate the accuracy of predictions vs labels."""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    """Format elapsed time as hh:mm:ss."""
    return str(timedelta(seconds=int(round(elapsed))))

In [8]:
def train_model(model, train_dataloader, validation_dataloader, optimizer, scheduler, device, exp):
    """Train the model and perform validation."""
    training_stats = []
    total_t0 = time.time()

    accumulation_steps = 4

    for epoch in range(EPOCHS):
        print(f"\n======== Epoch {epoch + 1} / {EPOCHS} ========")
        print("Training...")
        t0 = time.time()
        total_train_loss = 0
        model.train()

        # Reset gradients
        model.zero_grad()

        for step, batch in enumerate(train_dataloader):
            b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss / accumulation_steps
            total_train_loss += loss.item() * accumulation_steps
            loss.backward()

            # Gradient accumulation
            if (step + 1) % accumulation_steps == 0:
                # Clip the norm of the gradients to 1.0
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                # Update parameters
                optimizer.step()

                # Update the learning rate
                scheduler.step()

                # Reset gradients
                model.zero_grad()

        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print(f"  Average training loss: {avg_train_loss:.2f}")
        print(f"  Training epoch took: {training_time}")

        print("\nRunning Validation...")
        t0 = time.time()
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0

        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits
            total_eval_loss += loss.item()
            total_eval_accuracy += flat_accuracy(logits.detach().cpu().numpy(), b_labels.cpu().numpy())

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)

        print(f"  Accuracy: {avg_val_accuracy:.2f}")

        training_stats.append({
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

        metrics = {
            "Training Loss": round(avg_train_loss, 3),
            "Valid. Loss": round(avg_val_loss, 3),
            'Valid. Accur': round(avg_val_accuracy, 3)
        }
        exp.log_metrics(metrics, epoch=epoch)

    print(f"\nTraining complete! Total training took {format_time(time.time() - total_t0)}")
    return model, training_stats


In [9]:
# Set seed for reproducibility
set_seed(RANDOM_SEED)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Using device: cuda


In [10]:
# Load and preprocess data
texts, labels = load_data("/content/combined_data.csv")
labels = torch.tensor(labels)

In [11]:
# Tokenize data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
input_ids, attention_masks = tokenize_data(texts, tokenizer)

# Create dataloaders
train_dataloader, validation_dataloader = create_dataloaders(input_ids, attention_masks, labels)

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).to(device)

# Prepare optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Log hyperparameters
exp.log_parameters({
    "learning_rate": LEARNING_RATE,
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "max_length": MAX_LEN
})


In [13]:
# Train and validate the model
model, training_stats = train_model(model, train_dataloader, validation_dataloader, optimizer, scheduler, device, exp)



Training...
  Average training loss: 0.73
  Training epoch took: 0:00:18

Running Validation...
  Accuracy: 0.50

Training...
  Average training loss: 0.68
  Training epoch took: 0:00:14

Running Validation...
  Accuracy: 0.51

Training...
  Average training loss: 0.64
  Training epoch took: 0:00:15

Running Validation...
  Accuracy: 0.53

Training...
  Average training loss: 0.61
  Training epoch took: 0:00:15

Running Validation...
  Accuracy: 0.59

Training complete! Total training took 0:01:13


Remember to also adjust your learning rate if you change the effective batch size, as this can impact the training dynamics. You might need to experiment with different values of accumulation_steps to find the optimal setting for your specific case.

You typically only need to do gradient accumulation on the training data, not on the validation set. Here's a brief explanation:

Training data:

Gradient accumulation is used during training to simulate larger batch sizes when hardware memory constraints limit the actual batch size.
It involves accumulating gradients over multiple smaller batches before updating the model parameters.
This technique helps stabilize training and potentially improve model performance, especially for larger models or when working with limited GPU memory.


Validation set:

The validation set is used to evaluate the model's performance on unseen data during training.
You don't perform any parameter updates or gradient calculations on the validation set.
The model is only run in inference mode (typically using torch.no_grad() in PyTorch) to calculate metrics and assess generalization.



Since gradient accumulation is a technique used for parameter updates during training, it's not applicable to the validation process where no updates occur.

In [14]:
# Log the model
log_model(exp, model=model, model_name="BERT-uncased")
exp.end()

# Print final training stats
print("\nTraining stats:")
for stat in training_stats:
    print(stat)

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : bronze_projection_3456
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manoo/fine-tuning-bert/37d5ea0db41d454d9cc688b37883b672
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     Training Loss [4] : (0.611, 0.73)
[1;38;5;39mCOMET INFO:[0m     Valid. Accur [4]  : (0.499, 0.593)
[1;38;5;39mCOMET INFO:[0m     Valid. Loss [4]   : (0.664, 0.704)
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     batch_size    : 32
[1;38;5;39mCOMET INFO:[0m     epochs        : 4
[1;


Training stats:
{'epoch': 1, 'Training Loss': 0.7302526235580444, 'Valid. Loss': 0.7044115960597992, 'Valid. Accur.': 0.49895833333333334, 'Training Time': '0:00:18', 'Validation Time': '0:00:03'}
{'epoch': 2, 'Training Loss': 0.6789565980434418, 'Valid. Loss': 0.6871802806854248, 'Valid. Accur.': 0.5145833333333334, 'Training Time': '0:00:14', 'Validation Time': '0:00:03'}
{'epoch': 3, 'Training Loss': 0.6419486502806345, 'Valid. Loss': 0.6737086176872253, 'Valid. Accur.': 0.528125, 'Training Time': '0:00:15', 'Validation Time': '0:00:03'}
{'epoch': 4, 'Training Loss': 0.6105738480885824, 'Valid. Loss': 0.6643588244915009, 'Valid. Accur.': 0.5927083333333334, 'Training Time': '0:00:15', 'Validation Time': '0:00:03'}
