# Task 1 – News Topic Classification using BERT

In [1]:
# Installing required libraries for transformers and datasets
!pip install -q transformers datasets scikit-learn accelerate


In [2]:
# PyTorch for deep learning
import torch

# Numerical and data handling
import numpy as np
import pandas as pd

# Hugging Face datasets and transformers
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Evaluation metrics
from sklearn.metrics import accuracy_score, f1_score


In [3]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [4]:
# Load AG News dataset from Hugging Face
dataset = load_dataset("ag_news")

# Display dataset structure
dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [5]:
# Rename 'label' column to 'labels' (required by Trainer API)
dataset = dataset.rename_column("label", "labels")

# Set format for PyTorch
dataset.set_format("torch")


In [6]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Tokenization function
def tokenize_data(batch):
    """
    Tokenizes text using BERT tokenizer.
    Padding and truncation ensure fixed input size.
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

# Apply tokenization to entire dataset
dataset = dataset.map(tokenize_data, batched=True)


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [8]:
# Load BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4   # 4 news categories
)

# Move model to GPU
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Custom evaluation metrics
def compute_metrics(pred):
    """
    Computes accuracy and weighted F1-score
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")

    return {
        "accuracy": acc,
        "f1_score": f1
    }


In [12]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./bert_results",        # Directory to save outputs
    eval_strategy="epoch",
    save_strategy="epoch",               # Save after each epoch
    learning_rate=2e-5,                  # Standard BERT learning rate
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,                  # Safe for Colab GPU
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    report_to="none"
)


In [13]:
# Initialize Hugging Face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [14]:
# Start fine-tuning BERT
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.1878,0.176853,0.945,0.94497


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.1878,0.176853,0.945,0.94497
2,0.107,0.189238,0.947105,0.947127


TrainOutput(global_step=15000, training_loss=0.1770511740366618, metrics={'train_runtime': 4930.893, 'train_samples_per_second': 48.673, 'train_steps_per_second': 3.042, 'total_flos': 1.578694680576e+16, 'train_loss': 0.1770511740366618, 'epoch': 2.0})

In [15]:
# Evaluate trained model on test set
results = trainer.evaluate()
results


{'eval_loss': 0.17685341835021973,
 'eval_accuracy': 0.945,
 'eval_f1_score': 0.9449698055293028,
 'eval_runtime': 51.0105,
 'eval_samples_per_second': 148.989,
 'eval_steps_per_second': 9.312,
 'epoch': 2.0}

In [16]:
# Save trained model and tokenizer
model.save_pretrained("bert-news-classifier")
tokenizer.save_pretrained("bert-news-classifier")


('bert-news-classifier/tokenizer_config.json',
 'bert-news-classifier/special_tokens_map.json',
 'bert-news-classifier/vocab.txt',
 'bert-news-classifier/added_tokens.json')

In [17]:
# Zip the model folder
!zip -r bert-news-classifier.zip bert-news-classifier


  adding: bert-news-classifier/ (stored 0%)
  adding: bert-news-classifier/special_tokens_map.json (deflated 42%)
  adding: bert-news-classifier/tokenizer_config.json (deflated 75%)
  adding: bert-news-classifier/config.json (deflated 52%)
  adding: bert-news-classifier/vocab.txt (deflated 53%)
  adding: bert-news-classifier/model.safetensors (deflated 7%)
