In [None]:
# Install required libraries
!pip install transformers torch pandas scikit-learn numpy tqdm

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Load the dataset
df = pd.read_csv('bbc-text.csv')

# Ensure the dataset has the required columns.
# Raised explicitly rather than asserted: `assert` statements are removed when
# Python runs with -O, which would silently skip this validation.
if 'text' not in df.columns or 'category' not in df.columns:
    raise ValueError("Dataset must have 'text' and 'category' columns")

# Verify that all labels are valid *before* encoding them, so an unexpected
# category is rejected instead of being quietly assigned a class index.
valid_labels = ['business', 'entertainment', 'politics', 'sport', 'tech']
unexpected_labels = [label for label in df['category'].unique() if label not in valid_labels]
if unexpected_labels:
    raise ValueError("Labels must be one of: " + ", ".join(valid_labels))

# Encode labels (string category -> integer class index)
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['category'])

# Save label encoder classes for later use
label_classes = label_encoder.classes_
num_labels = len(label_classes)  # Dynamically set number of labels

# Split dataset into train and test.
# The evaluation cell re-creates this split with the same test_size and
# random_state, so keep the two calls in sync.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print('Training data shape:', train_df.shape)
print('Test data shape:', test_df.shape)

# Custom Dataset Class
class NewsDataset(Dataset):
    """Dataset that tokenizes one article per lookup.

    Reads the 'text' and 'label_encoded' columns of ``dataframe``. Each item is
    a dict with 'input_ids', 'attention_mask' (both flattened tokenizer
    outputs) and 'labels' (a long tensor holding the encoded label).
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = dataframe['text'].values
        self.labels = dataframe['label_encoded'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Every article is padded/truncated to the same fixed length.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(str(self.texts[idx]), **tokenizer_options)

        # The tokenizer is asked for tensors; flatten them so each item is 1-D.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create datasets (one per split, sharing the tokenizer)
train_dataset, test_dataset = (
    NewsDataset(split_df, tokenizer) for split_df in (train_df, test_df)
)

# Create data loaders: only the training batches are shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

# Initialize model with dynamic number of labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

# Move model to GPU if available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

print('Model loaded to:', device)

# Training parameters
epochs = 3
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# Training loop: one optimizer step per batch, mean batch loss reported per epoch
model.train()
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    total_loss = 0
    for batch in tqdm(train_loader):
        # Move the three tensors the model consumes onto the target device.
        inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Average training loss: {avg_loss:.4f}')

# Evaluation: fraction of test samples whose highest-scoring class matches the label
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in tqdm(test_loader):
        labels = batch['labels'].to(device)
        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        predictions = torch.argmax(logits, dim=1)
        matches = predictions == labels
        correct += matches.sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.4f}')

# Inference function
def classify_news(text):
    """Return the predicted category name for ``text``.

    Uses the module-level ``model``, ``tokenizer``, ``device`` and
    ``label_classes``; the model is switched to eval mode on every call.
    """
    model.eval()

    tokenizer_options = {
        'add_special_tokens': True,
        'max_length': 128,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer(text, **tokenizer_options)

    with torch.no_grad():
        result = model(
            encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        class_index = torch.argmax(result.logits, dim=1).item()

    # Map the winning class index back to its category string.
    return label_classes[class_index]

# Example inference on a single headline
sample_text = 'Tesla shares surge after strong quarterly earnings'
predicted_category = classify_news(sample_text)
print(f'Input: {sample_text}')
print(f'Predicted Category: {predicted_category}')

# Save model and tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('newsbert_model')

# Save label encoder so class indices can be mapped back to names later
import pickle
with open('label_encoder.pkl', mode='wb') as f:
    pickle.dump(label_encoder, f)

print('Model and tokenizer saved.')

Training data shape: (1780, 3)
Test data shape: (445, 3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to: cuda
Epoch 1/3


100%|██████████| 223/223 [00:43<00:00,  5.12it/s]


Average training loss: 0.4582
Epoch 2/3


100%|██████████| 223/223 [00:45<00:00,  4.91it/s]


Average training loss: 0.0614
Epoch 3/3


100%|██████████| 223/223 [00:47<00:00,  4.69it/s]


Average training loss: 0.0339


100%|██████████| 56/56 [00:05<00:00,  9.45it/s]


Test Accuracy: 0.9685
Input: Tesla shares surge after strong quarterly earnings
Predicted Category: business
Model and tokenizer saved.


In [None]:
# Testing code for the trained BERT model (to be placed in the next cell)

# Import everything this cell uses so it also runs in a fresh kernel, without
# relying on names left behind by the training cell.
import pickle

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification

# Load the saved model, tokenizer, and label encoder
model = BertForSequenceClassification.from_pretrained('newsbert_model')
tokenizer = BertTokenizer.from_pretrained('newsbert_model')
# NOTE: unpickling can execute arbitrary code. Only load a label_encoder.pkl
# that was written by the training cell of this notebook.
with open('label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Inference only from here on: switch off training-time behaviour such as dropout.
model.eval()

# Define the dataset class (same as in training)
class NewsDataset(Dataset):
    """Tokenizing dataset over the 'text' / 'label_encoded' columns of a frame.

    Indexing returns a dict with flattened 'input_ids' and 'attention_mask'
    tensors from the tokenizer plus 'labels' as a long tensor.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.texts, self.labels = (
            dataframe['text'].values,
            dataframe['label_encoded'].values,
        )
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        article = str(self.texts[idx])
        target = self.labels[idx]

        # Fixed-length encoding: pad short articles, truncate long ones.
        tokens = self.tokenizer(
            article,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        item = {}
        item['input_ids'] = tokens['input_ids'].flatten()
        item['attention_mask'] = tokens['attention_mask'].flatten()
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Load the dataset for batch testing
df = pd.read_csv('bbc-text.csv')

# Ensure the dataset has the required columns.
# Raised explicitly rather than asserted: `assert` statements are removed when
# Python runs with -O, which would silently skip this validation.
if 'text' not in df.columns or 'category' not in df.columns:
    raise ValueError("Dataset must have 'text' and 'category' columns")

# Encode labels (using the loaded label encoder to ensure consistency).
# LabelEncoder.transform raises on categories it did not see when fitted.
df['label_encoded'] = label_encoder.transform(df['category'])

# Split dataset into train and test (same split as training for consistency:
# test_size and random_state must match the training cell, otherwise the
# "test" rows here would overlap rows the model was trained on).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create test dataset and data loader
test_dataset = NewsDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Batch Testing: Evaluate model on test set
def evaluate_model():
    """Evaluate the module-level ``model`` on ``test_loader`` and report accuracy.

    Prints the accuracy (as before) and also returns it so callers can use the
    number programmatically.

    Returns:
        float: fraction of test samples whose arg-max prediction equals the label.

    Raises:
        ValueError: if ``test_loader`` yields no samples.
    """
    # Make sure dropout etc. are disabled even if the model was left in train mode.
    model.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Avoid an opaque ZeroDivisionError on an empty test set.
        raise ValueError('test_loader yielded no samples; cannot compute accuracy')

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')
    return accuracy

# Single Input Testing: Classify a new text input
def classify_news(text, max_length=128):
    """Classify a single text and return its category name.

    Uses the module-level ``model``, ``tokenizer``, ``device`` and
    ``label_encoder``.

    Args:
        text: the text to classify.
        max_length: length the text is padded/truncated to. The default of 128
            matches the default ``max_length`` of ``NewsDataset``.

    Returns:
        The category string obtained by inverse-transforming the predicted
        class index with ``label_encoder``.
    """
    # Same guard as the training cell's classify_news: never predict in train mode.
    model.eval()

    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1).item()

    # inverse_transform expects a sequence, hence the one-element list.
    return label_encoder.inverse_transform([prediction])[0]

# Run batch evaluation
print("Running batch evaluation on test set...")
evaluate_model()

# Test with example inputs (one headline per category)
test_texts = [
    "Tesla shares surge after strong quarterly earnings",
    "New blockbuster movie breaks box office records",
    "Government announces new policy on immigration",
    "Team wins championship in stunning upset",
    "New smartphone technology unveiled at trade show",
]

print("\nTesting single inputs:")
for text in test_texts:
    predicted_category = classify_news(text)
    print(f'Input: {text}')
    print(f'Predicted Category: {predicted_category}\n')

# Interactive testing loop
print("Interactive Testing: Enter your own text to classify (type 'exit' to quit)")
while True:
    try:
        user_input = input("Enter text: ").strip()
    except (EOFError, KeyboardInterrupt):
        # No stdin (e.g. non-interactive run) or the user interrupted: stop cleanly.
        break
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing to classify; ask again instead of predicting on empty text.
        continue
    predicted_category = classify_news(user_input)
    print(f'Predicted Category: {predicted_category}\n')

Running batch evaluation on test set...


Evaluating: 100%|██████████| 56/56 [00:15<00:00,  3.62it/s]


Test Accuracy: 0.9685

Testing single inputs:
Input: Tesla shares surge after strong quarterly earnings
Predicted Category: business

Input: New blockbuster movie breaks box office records
Predicted Category: entertainment

Input: Government announces new policy on immigration
Predicted Category: politics

Input: Team wins championship in stunning upset
Predicted Category: sport

Input: New smartphone technology unveiled at trade show
Predicted Category: tech

Interactive Testing: Enter your own text to classify (type 'exit' to quit)
Enter text: Tech Giant Unveils AI Tool Set to Revolutionize Workflows
Predicted Category: tech

Enter text: exit
