In [None]:
%pip install textattack transformers

In [3]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from textattack.augmentation import EmbeddingAugmenter

# Load dataset
dataset = load_dataset("ag_news", split='train[:10%]')  # Using a subset for quick demonstration

# Load model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Function to encode the dataset
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

dataset_encoded = dataset.map(encode, batched=True)
dataset_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Standard Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded,
    eval_dataset=dataset_encoded,
)

# Train the standard model
trainer.train()

# Augmenter for adversarial training
augmenter = EmbeddingAugmenter()

# Augment training data
def augment_data(examples):
    augmented_texts = augmenter.augment(examples['text'])
    return {'text': augmented_texts}

dataset_augmented = dataset.map(augment_data, batched=True)
dataset_augmented = dataset_augmented.map(encode, batched=True)
dataset_augmented.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Train model with augmented data
trainer_with_augmentation = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_augmented,
    eval_dataset=dataset_encoded,
)

trainer_with_augmentation.train()

# Evaluate models
eval_standard = trainer.evaluate()
eval_augmented = trainer_with_augmentation.evaluate()

print("Standard Training Evaluation:", eval_standard)
print("Adversarial Training Evaluation:", eval_augmented)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

  0%|          | 0/4500 [00:00<?, ?it/s]

{'loss': 1.4005, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 1.3806, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 1.3916, 'learning_rate': 3e-06, 'epoch': 0.02}
{'loss': 1.3777, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.03}
{'loss': 1.3973, 'learning_rate': 5e-06, 'epoch': 0.03}
{'loss': 1.3314, 'learning_rate': 6e-06, 'epoch': 0.04}
{'loss': 1.3093, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.05}
{'loss': 1.1905, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.05}
{'loss': 1.1729, 'learning_rate': 9e-06, 'epoch': 0.06}
{'loss': 1.1228, 'learning_rate': 1e-05, 'epoch': 0.07}
{'loss': 1.0261, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.07}
{'loss': 0.9813, 'learning_rate': 1.2e-05, 'epoch': 0.08}
{'loss': 0.8481, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.09}
{'loss': 0.758, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.09}
{'loss': 0.6811, 'learning_rate': 1.5e-05, 'epoch': 0.1}
{'loss': 0.6599, 'learnin

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

TypeError: Invalid text_input type <class 'list'> (required str or OrderedDict)