In [None]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Pull the AG News corpus from the Hugging Face Hub
dataset = load_dataset('ag_news')
full_train, full_test = dataset['train'], dataset['test']

print(f"  Dataset loaded successfully")
print(f"  Train samples: {len(full_train):,}")
print(f"  Test samples:  {len(full_test):,}")

# Number of rows kept from each split
TRAIN_SIZE = 10000
TEST_SIZE = 2000

print(f"\n Using subset to minimize disk space:")
print(f"  Training: {TRAIN_SIZE:,} samples")
print(f"  Testing:  {TEST_SIZE:,} samples")

# Shuffle each split with a fixed seed, then keep the first N rows
shuffled_train = full_train.shuffle(seed=42)
shuffled_test = full_test.shuffle(seed=42)
train_dataset = shuffled_train.select(range(TRAIN_SIZE))
test_dataset = shuffled_test.select(range(TEST_SIZE))



In [None]:
# Preview the first three training rows: truncated text plus label id and name
print("\nSample data:")
for idx in range(3):
    sample = train_dataset[idx]
    label_id = sample['label']
    label_name = ['World', 'Sports', 'Business', 'Sci/Tech'][label_id]
    snippet = sample['text'][:80]  # first 80 characters only
    print(f"\nExample {idx+1}:")
    print(f"  Text:  {snippet}...")
    print(f"  Label: {label_id} ({label_name})")

In [None]:

# Class distribution
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']

# Read the label column directly rather than building every row dict (text
# included) just to pick out one field.
train_labels = list(train_dataset['label'])

# Tally all classes in a single pass; minlength keeps a zero entry for any
# class that happens to be absent from the subset.
label_counts = np.bincount(train_labels, minlength=len(class_names))

print("\nClass Distribution in Training Set:")
for i, name in enumerate(class_names):
    count = int(label_counts[i])
    print(f"  {name:12s}: {count:5d} ({count/len(train_labels)*100:.1f}%)")


In [None]:
# Checkpoint shared by the tokenizer and the classifier
model_name = 'bert-base-uncased'

# Tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)
print(f" Tokenizer loaded: {model_name}")

# Sequence-classification model with a 4-way output
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    problem_type="single_label_classification"
)
print(f" Model loaded: {model_name}")

# Count all parameters and the subset that will receive gradients
total_params = 0
trainable_params = 0
for param in model.parameters():
    n_values = param.numel()
    total_params += n_values
    if param.requires_grad:
        trainable_params += n_values
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")

# Pick the GPU when one is visible, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f" Using device: {device}")
if device.type == 'cuda':
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {gpu_props.total_memory / 1e9:.2f} GB")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping that provides a ``'text'`` entry (a batch of rows
            when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The result of the tokenizer call for that text.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(examples['text'], **encode_options)

# Apply the tokenizer to both subsets, one batch of rows at a time
print("Tokenizing training data...")
tokenized_train = train_dataset.map(function=tokenize_function, batched=True)

print("Tokenizing test data...")
tokenized_test = test_dataset.map(function=tokenize_function, batched=True)


In [None]:
# Expose only the model inputs and the label, returned as torch tensors
for split in (tokenized_train, tokenized_test):
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

print(" Tokenization complete")
first_row = tokenized_train[0]
print(f"  Input shape: {first_row['input_ids'].shape}")

In [None]:
# Metrics callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation result.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1_score(labels, predicted, average='weighted'),
    }



In [None]:
# Hyperparameters and bookkeeping options, collected first and unpacked below
training_config = dict(
    # Where checkpoints and logs go
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    report_to='none',
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    # Reload the checkpoint with the best F1 once training finishes
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Mixed precision only when CUDA is available
    fp16=torch.cuda.is_available(),
)
training_args = TrainingArguments(**training_config)


In [None]:
# Echo the key training settings in one go
config_lines = [
    "Training configuration:",
    f"  Epochs: {training_args.num_train_epochs}",
    f"  Batch size: {training_args.per_device_train_batch_size}",
    f"  Learning rate: {training_args.learning_rate}",
    f"  Mixed precision (fp16): {training_args.fp16}",
]
print("\n".join(config_lines))

In [None]:

# Build the Trainer.
# NOTE(review): eval_dataset is the test subset, so early stopping is driven by
# the same data that is later reported as test results -- confirm this is intended.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning and report the final loss and wall-clock runtime
train_result = trainer.train()
final_loss = train_result.training_loss
runtime_seconds = train_result.metrics['train_runtime']

print("\n Training completed!")
print(f"  Training loss: {final_loss:.4f}")
print(f"  Training time: {runtime_seconds:.2f} seconds")


In [None]:
# Evaluate on the Trainer's eval dataset and print the headline metrics
eval_results = trainer.evaluate()

print("EVALUATION RESULTS: ")

for heading, metric_key in (
    ("Test Accuracy:  ", 'eval_accuracy'),
    ("Test F1-Score:  ", 'eval_f1'),
    ("Test Loss:      ", 'eval_loss'),
):
    print(f"{heading}{eval_results[metric_key]:.4f}")



In [None]:
# Run inference on the tokenized test subset and split out labels
predictions = trainer.predict(tokenized_test)
true_labels = predictions.label_ids
raw_scores = predictions.predictions
predicted_labels = np.argmax(raw_scores, axis=1)  # highest-scoring class per row

In [None]:
# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_true=true_labels, y_pred=predicted_labels)
print("\nConfusion Matrix:", cm, sep="\n")

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - BERT News Classifier', fontsize=14, fontweight='bold')
ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()


In [None]:
def predict_news_category(text, model, tokenizer, device):
    """Classify a single piece of text and report the class probabilities.

    Args:
        text: The text to classify.
        model: Callable model exposing ``eval()`` and returning an object with
            a ``logits`` attribute.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``.
        device: Device the encoded inputs are moved to.

    Returns:
        Tuple ``(predicted_class, confidence, probabilities)``: the index of
        the most probable class, its probability, and the probabilities of the
        first row as a NumPy array.
    """
    # Encode to a fixed length of 128 tokens and move the tensors to the device
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    inputs = encoded.to(device)

    # Inference only: evaluation mode, no gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
        class_probs = torch.softmax(logits, dim=-1)
        predicted_class = torch.argmax(class_probs, dim=-1).item()
        first_row = class_probs[0]
        confidence = first_row[predicted_class].item()

    return predicted_class, confidence, first_row.cpu().numpy()

# Make sure the model lives on the same device the inputs are sent to
model.to(device)

# Hand-written headlines used as a quick smoke test of the classifier
test_examples = [
    "Apple announces new iPhone with revolutionary camera technology",
    "Stock market reaches all-time high amid economic recovery",
    "Scientists discover new species in Amazon rainforest",
    "Champions League final: Real Madrid defeats Manchester City 2-1"
]


for i, text in enumerate(test_examples, 1):
    predicted_class, confidence, probs = predict_news_category(text, model, tokenizer, device)

    print(f"\nExample {i}:")
    print(f"Text: {text}")
    print(f"Predicted: {class_names[predicted_class]} (Confidence: {confidence:.2%})")
    print("All probabilities:")
    # Pair each class name with its probability; no index is needed here
    for name, prob in zip(class_names, probs):
        print(f"  {name:12s}: {prob:.2%}")

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side
save_directory = './bert_news_classifier'
model.save_pretrained(save_directory=save_directory)
tokenizer.save_pretrained(save_directory=save_directory)

In [None]:
# Total on-disk size of the saved model, in MB
import os

# Add up the regular files directly inside the save directory (no recursion)
total_bytes = 0
with os.scandir(save_directory) as entries:
    for entry in entries:
        if entry.is_file():
            total_bytes += entry.stat().st_size
model_size = total_bytes / (1024 * 1024)  # bytes -> MB

print(f" Model size: {model_size:.2f} MB")

In [None]:
# Source text of a standalone inference script. It is kept as a string so that
# a later statement can write it to disk; nothing inside it runs in this notebook.
# The script loads the model and tokenizer from './bert_news_classifier' and
# defines classify_news(text), which returns the predicted category name, its
# confidence, and the probability of every class.
deployment_script = '''

from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load model
model_path = './bert_news_classifier'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Class names
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']

def classify_news(text):
    """Classify news headline"""
    inputs = tokenizer(text, padding='max_length', truncation=True, 
                      max_length=128, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()
    
    return {
        'category': class_names[predicted_class],
        'confidence': confidence,
        'all_probabilities': {
            name: prob.item() 
            for name, prob in zip(class_names, predictions[0])
        }
    }

# Example usage
if __name__ == '__main__':
    example = "NASA launches new Mars rover mission"
    result = classify_news(example)
    print(f"Text: {example}")
    print(f"Category: {result['category']} ({result['confidence']:.2%})")
'''

# Write the generated inference script next to the notebook
with open('deploy_classifier.py', 'w') as script_file:
    script_file.write(deployment_script)

print(" Deployment script saved as 'deploy_classifier.py'")

In [None]:
# Source text of the Streamlit demo app. It is kept as a string so this cell
# only writes the file; the app itself is started separately with
# `streamlit run app_streamlit.py` and never executes inside the notebook.
streamlit_app = '''
import streamlit as st
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Page config: kept ahead of every other Streamlit call (including the cached
# model load below), since Streamlit expects it to be the first command.
st.set_page_config(page_title="News Classifier", page_icon="")

# Session-state key that backs the headline input box
HEADLINE_KEY = "headline_input"

@st.cache_resource
def load_model():
    """Load model (cached)"""
    model_path = './bert_news_classifier'
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)
    model.eval()
    return tokenizer, model

def use_example(text):
    """Copy an example headline into the input box."""
    st.session_state[HEADLINE_KEY] = text

# Load model
tokenizer, model = load_model()
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']

# Title
st.title(" BERT News Topic Classifier")
st.markdown("Classify news headlines into: **World, Sports, Business, or Sci/Tech**")

# Input
text_input = st.text_area(
    "Enter a news headline:",
    placeholder="e.g., Tesla announces new electric vehicle with 500-mile range",
    height=100,
    key=HEADLINE_KEY
)

if st.button("Classify", type="primary"):
    if text_input.strip():
        # Predict
        inputs = tokenizer(text_input, padding='max_length', truncation=True,
                         max_length=128, return_tensors='pt')

        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(predictions, dim=-1).item()
            confidence = predictions[0][predicted_class].item()

        # Display results
        st.success(f"**Predicted Category:** {class_names[predicted_class]}")
        st.metric("Confidence", f"{confidence:.2%}")

        # Show all probabilities
        st.subheader("All Predictions:")
        for name, prob in zip(class_names, predictions[0]):
            st.progress(prob.item(), text=f"{name}: {prob.item():.2%}")
    else:
        st.warning("Please enter some text!")

# Example buttons
st.markdown("---")
st.markdown("**Try these examples:**")
col1, col2 = st.columns(2)

examples = [
    "Stock market hits record high",
    "Olympic champion breaks world record",
    "New COVID vaccine shows promising results",
    "Tech giant announces quarterly earnings"
]

for i, example in enumerate(examples):
    col = col1 if i % 2 == 0 else col2
    # Clicking an example fills the input box through the callback; the user
    # then presses "Classify" as usual.
    col.button(example, key=f"ex_{i}", on_click=use_example, args=(example,))
'''

# Explicit encoding so the written file does not depend on the platform default
with open('app_streamlit.py', 'w', encoding='utf-8') as f:
    f.write(streamlit_app)

print(" Streamlit app saved as 'app_streamlit.py'")
print(" Run with: streamlit run app_streamlit.py")
