In [1]:
# 1. Import libraries
from datasets import load_dataset
from transformers import BertTokenizerFast

# 2. Load the AG News dataset and show its splits
dataset = load_dataset("ag_news")
print("Dataset loaded!", dataset, sep="\n")

# 3. Tokenizer matching the checkpoint that is fine-tuned later on
tokenizer = BertTokenizerFast.from_pretrained(
    "bert-base-uncased",
)

# 4. Batch tokenization helper (applied through dataset.map)
def tokenize(batch):
    """Encode the ``text`` entries of a batch with the notebook-level tokenizer.

    Args:
        batch: Mapping that holds the raw strings under the ``"text"`` key.

    Returns:
        The tokenizer output for the batch. Each sequence is truncated and
        padded to a fixed length of 64 tokens.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=64,
    )

# Apply the tokenizer to every split, one batch of rows at a time
tokenized_dataset = dataset.map(function=tokenize, batched=True)
print("Tokenization complete!", tokenized_dataset, sep="\n")

Dataset loaded!
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
Tokenization complete!
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7600
    })
})


In [2]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
import torch

# 5. Set format for PyTorch
# The target column is renamed from "label" to "labels". The rename is guarded
# so that re-running this cell does not fail: once the column has been renamed,
# a second rename_column("label", ...) would raise because "label" is gone.
if "label" in tokenized_dataset["train"].column_names:
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Expose only the tensors consumed during training/evaluation as torch tensors
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 6. Split train and test sets
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

# 7. Load BERT for sequence classification
# The 4-class classifier head is not part of the checkpoint and is randomly
# initialised on load. Seed torch *before* creating the model; otherwise the
# head differs between kernel restarts, because the seed given to
# TrainingArguments below is only defined after the model already exists.
SEED = 42
torch.manual_seed(SEED)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# 8. Training arguments for older transformers versions
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=50,
    seed=SEED,  # same value as the seed used for the head initialisation above
)

# 9. Bundle model, arguments and both splits into a Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer = Trainer(**trainer_kwargs)

# 10. Run training; kept as the cell's final expression so its result is displayed
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.8036
100,0.4121
150,0.4261
200,0.4734
250,0.5639
300,0.4732
350,0.4291
400,0.4279
450,0.4785
500,0.5647


TrainOutput(global_step=15000, training_loss=0.2870666958173116, metrics={'train_runtime': 32067.774, 'train_samples_per_second': 3.742, 'train_steps_per_second': 0.468, 'total_flos': 3946736701440000.0, 'train_loss': 0.2870666958173116, 'epoch': 1.0})

In [3]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Run the trained model over the test split
predictions = trainer.predict(test_dataset)

# Reference labels, and the highest-scoring class per example
labels = predictions.label_ids
preds = predictions.predictions.argmax(axis=1)

# Aggregate scores (F1 is weighted by class support)
acc = accuracy_score(labels, preds)
f1 = f1_score(labels, preds, average='weighted')

# Summary lines followed by the per-class breakdown
print(f"Test Accuracy: {acc:.4f}", f"Test F1-score: {f1:.4f}", sep="\n")
print("\nClassification Report:", classification_report(labels, preds), sep="\n")



Test Accuracy: 0.9433
Test F1-score: 0.9433

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1900
           1       0.99      0.99      0.99      1900
           2       0.92      0.90      0.91      1900
           3       0.91      0.93      0.92      1900

    accuracy                           0.94      7600
   macro avg       0.94      0.94      0.94      7600
weighted avg       0.94      0.94      0.94      7600



In [4]:
# Write the model and its tokenizer to the same directory.
SAVE_DIR = "./saved_model"
model.save_pretrained(SAVE_DIR)
# Last expression of the cell: the tuple of written tokenizer files is displayed
tokenizer.save_pretrained(SAVE_DIR)

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')