<a href="https://colab.research.google.com/github/malick08012/AI-ML-Internship-Task/blob/main/News_Topic_Classifier_Using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the necessary libraries, ensuring they are updated
!pip install -U transformers datasets evaluate accelerate streamlit fsspec
!pip install torch # Ensure torch is installed

Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc

In [2]:
from datasets import load_dataset

# Load the AG News dataset
dataset = load_dataset('ag_news')

# Inspect the dataset structure
print(dataset)
print(dataset['train'][0]) # Example of a training sample

# Define the label mapping for clarity (AG News labels are 0, 1, 2, 3)
label_mapping = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}
print(f"Example label (raw): {dataset['train'][0]['label']}")
print(f"Example label (mapped): {label_mapping[dataset['train'][0]['label']]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}
Example label (raw): 2
Example label (mapped): Business


In [3]:
from transformers import AutoTokenizer

# Load the tokenizer for bert-base-uncased
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    # This will tokenize the 'text' column of our dataset.
    # truncation=True ensures that sequences longer than BERT's max length (512) are truncated.
    # padding=True pads shorter sequences to the maximum length within each batch.
    return tokenizer(examples['text'], truncation=True, padding=True)

# Apply the tokenization to the entire dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Rename the 'label' column to 'labels' as expected by the Trainer API
tokenized_datasets = tokenized_datasets.rename_columns({"label": "labels"})

# Set the format to PyTorch tensors for efficient training
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Create small subsets for faster experimentation (optional, but highly recommended for initial runs)
# Using full dataset takes a long time in Colab, especially without paid tier.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

print("\nSample tokenized input:")
print(small_train_dataset[0])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]


Sample tokenized input:
{'labels': tensor(0), 'input_ids': tensor([  101,  7269, 11498,  2135,  6924,  2011,  9326,  4559, 10134,  2031,
         2716,  2116,  4865,  1998,  3655,  1999,  7269,  2000,  1037,  9190,
         1010,  1996,  2154,  2044,  2324,  2111,  2351,  1999, 18217,  2012,
         1037,  2576,  8320,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
    

In [5]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch

# Load the pre-trained BERT model for sequence classification
# num_labels is 4 for AG News (World, Sports, Business, Sci/Tech)
# The id2label and label2id arguments help map integer labels to human-readable names.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
    id2label=label_mapping,
    label2id={name: id for id, name in label_mapping.items()}
)

# Define evaluation metrics
f1_metric = evaluate.load("f1") # We'll compute F1-score
accuracy_metric = evaluate.load("accuracy") # And accuracy

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # F1-score: Use 'weighted' to account for label imbalance (if any)
    f1_score = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    # Accuracy
    accuracy_score = accuracy_metric.compute(predictions=predictions, references=labels)

    return {"f1_score": f1_score["f1"], "accuracy": accuracy_score["accuracy"]}

# Define training arguments - CORRECTED
training_args = TrainingArguments(
    output_dir="./results",                   # Output directory for model checkpoints
    num_train_epochs=3,                       # Total number of training epochs (adjust as needed)
    per_device_train_batch_size=16,           # Batch size per GPU/CPU for training
    per_device_eval_batch_size=16,            # Batch size per GPU/CPU for evaluation
    warmup_steps=500,                         # Number of steps for learning rate warmup
    weight_decay=0.01,                        # Strength of weight decay
    logging_dir="./logs",                     # Directory for storing logs
    logging_steps=100,                        # Log every 100 steps
    eval_strategy="epoch",              # <--- CORRECTED: Changed from evaluation_strategy
    save_strategy="epoch",              # <--- CORRECTED: Changed from save_strategy
    load_best_model_at_end=True,              # Load the best model based on evaluation metric
    metric_for_best_model="f1_score",         # Metric to monitor for best model
    push_to_hub=False,                        # Set to True if you want to push to Hugging Face Hub (requires login)
    report_to="none",                         # Disable integrations like Weights & Biases for simplicity
    seed=42,                                  # For reproducibility
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,        # Use small_train_dataset for faster training
    eval_dataset=small_eval_dataset,          # Use small_eval_dataset for faster evaluation
    tokenizer=tokenizer,                      # Pass tokenizer to the Trainer
    compute_metrics=compute_metrics,
)

# Train the model
print("\nStarting model training...")
trainer.train()
print("Model training complete!")

# Save the fine-tuned model and tokenizer
model_save_path = "./fine-tuned-bert-agnews"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Starting model training...


Epoch,Training Loss,Validation Loss,F1 Score,Accuracy
1,0.3463,0.339465,0.896548,0.897
2,0.2331,0.269709,0.918364,0.918
3,0.1149,0.311703,0.92316,0.923


Model training complete!
Model and tokenizer saved to ./fine-tuned-bert-agnews


In [6]:
# Evaluate the model on the full test set
print("\nEvaluating model on the full test set...")
full_test_results = trainer.evaluate(tokenized_datasets["test"])
print("Evaluation Results on full test set:")
print(full_test_results)


Evaluating model on the full test set...


Evaluation Results on full test set:
{'eval_loss': 0.30680882930755615, 'eval_f1_score': 0.9241646317949473, 'eval_accuracy': 0.9242105263157895, 'eval_runtime': 100.277, 'eval_samples_per_second': 75.79, 'eval_steps_per_second': 4.737, 'epoch': 3.0}


In [7]:
# Create a Python file for your Streamlit app
%%writefile app.py
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import os

# Define the label mapping for AG News (must match your training)
id2label = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

# Load the model and tokenizer
@st.cache_resource # Cache the model loading for better performance
def load_classification_pipeline():
    model_path = "./fine-tuned-bert-agnews"
    # Ensure the model directory exists
    if not os.path.exists(model_path):
        st.error(f"Model directory not found at {model_path}. Please make sure the model was saved correctly.")
        return None

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        return pipeline("text-classification", model=model, tokenizer=tokenizer)
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None

classifier = load_classification_pipeline()

st.title("News Topic Classifier")
st.write("Enter a news headline to classify its topic.")

user_input = st.text_area("News Headline:", "")

if classifier:
    if user_input:
        # Perform inference
        result = classifier(user_input)

        # The pipeline often returns labels like 'LABEL_0', 'LABEL_1', etc.
        # We need to parse this and map it back to our meaningful labels.
        predicted_label_raw = result[0]['label']

        # Extract the numeric part (e.g., '0' from 'LABEL_0')
        predicted_label_id = int(predicted_label_raw.split('_')[-1])

        predicted_label_name = id2label.get(predicted_label_id, "Unknown")
        confidence = result[0]['score']

        st.write(f"**Predicted Topic:** {predicted_label_name}")
        st.write(f"**Confidence:** {confidence:.2f}")
    else:
        st.info("Please enter a news headline to classify.")
else:
    st.warning("Model could not be loaded. Please check the Colab output for errors.")

Writing app.py


In [8]:
# Install localtunnel if not already installed
!npm install localtunnel

# Run Streamlit in the background and then expose it
# You might need to confirm the connection in your browser after clicking the link.
print("Starting Streamlit app... Look for a public URL (e.g., https://xxxxx.loca.lt) above this cell.")
!streamlit run app.py & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K
added 22 packages in 3s
[1G[0K⠸[1G[0K
[1G[0K⠸[1G[0K3 packages are looking for funding
[1G[0K⠸[1G[0K  run `npm fund` for details
[1G[0K⠸[1G[0KStarting Streamlit app... Look for a public URL (e.g., https://xxxxx.loca.lt) above this cell.
[1G[0K⠙[1G[0K⠹[1G[0Kyour url is: https://five-heads-like.loca.lt

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.101.199:8501[0m
[0m
[34m  Stopping...[0m
^C
