<a href="https://colab.research.google.com/github/mmubashirm381/-Classifier-Using-BERT/blob/main/finetune_distilBERT_for_news_headline_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Import necessary packages
import pprint
from pathlib import Path
import numpy as np
import torch
import datasets

!pip install evaluate
!pip install --upgrade datasets transformers

import evaluate
from transformers import pipeline
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

# 2. Setup variables for model training and saving pipeline
DATASET_NAME = "fancyzhx/ag_news"
MODEL_NAME = "distilbert/distilbert-base-uncased"
MODEL_SAVE_DIR_NAME = "models/news-classification-distilbert-base-uncased"

# Load every split of the dataset named above.
dataset = datasets.load_dataset(DATASET_NAME)

# Log and create the output directory together so the message describes the
# step that immediately follows it.
print(f"[INFO] Creating directory for saving models: {MODEL_SAVE_DIR_NAME}")
model_save_dir = Path(MODEL_SAVE_DIR_NAME)
model_save_dir.mkdir(parents=True, exist_ok=True)

# 3. Reduce train dataset to 40k samples, drawing an equal number from each class
print(f"[INFO] Reducing training data to 40k samples balanced across all classes...")
train_dataset = dataset["train"]
labels = train_dataset.unique("label")
samples_per_class = 40000 // len(labels)  # equal share of the 40k budget per class

balanced_samples = []
for label in labels:
    # Keep only the rows of the current class, then take a seeded random subset
    # so the selection is reproducible across runs.
    class_subset = train_dataset.filter(lambda example: example["label"] == label)
    class_subset = class_subset.shuffle(seed=42).select(range(samples_per_class))
    balanced_samples.append(class_subset)

# Merge the per-class subsets and shuffle again so classes are interleaved
# rather than stored as consecutive runs.
balanced_train_dataset = datasets.concatenate_datasets(balanced_samples).shuffle(seed=42)
dataset["train"] = balanced_train_dataset

# 4. Load and preprocess tokenizer
# Integer class id -> readable class name, plus the inverse lookup.
id2label = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))
label2id = {name: index for index, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
    use_fast=True,
)

def tokenize_text(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    The module-level ``tokenizer`` is called with padding and truncation
    enabled, and its output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding=True, truncation=True)


# Apply the tokenizer to every split, 1000 examples per call.
tokenized_dataset = dataset.map(
    function=tokenize_text,
    batched=True,
    batch_size=1000,
)

# 5. Set up evaluation metric
accuracy_metric = evaluate.load("accuracy")


def compute_accuracy(predictions_and_labels):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Predictions with two or more dimensions are reduced to class ids with an
    argmax over axis 1; lower-dimensional predictions are used as-is. Returns
    whatever ``accuracy_metric.compute`` returns.
    """
    raw_predictions, references = predictions_and_labels
    already_class_ids = len(raw_predictions.shape) < 2
    if already_class_ids:
        predicted_ids = raw_predictions
    else:
        predicted_ids = np.argmax(raw_predictions, axis=1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

# 6. Load model
# Sequence-classification model with four output labels and the label
# mappings stored in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
    label2id=label2id,
    id2label=id2label,
    num_labels=4,
)
print("[INFO] Model loading complete!")

# 7. Training arguments
training_args = TrainingArguments(
    output_dir=model_save_dir,
    learning_rate=0.0001,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    # Evaluate, checkpoint and log once per epoch; with matching eval/save
    # strategies the best checkpoint can be reloaded when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    use_cpu=False,
    seed=42,
    load_best_model_at_end=True,
    logging_strategy="epoch",
    report_to="none",
    # No automatic Hub pushes from the Trainer configuration itself.
    push_to_hub=False,
    hub_private_repo=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is the
    # supported way to hand the tokenizer over.
    processing_class=tokenizer,
    compute_metrics=compute_accuracy
)

# 8. Train the model
print("[INFO] Commencing model training...")
results = trainer.train()

# 9. Save the trained model
print(f"[INFO] Model training complete, saving model to local path: {model_save_dir}")
trainer.save_model(output_dir=model_save_dir)

# 10. Push to Hub (optional)
upload_commit_message = "Uploading news headline classifier model..."
model_upload_url = trainer.push_to_hub(commit_message=upload_commit_message)
print(f"[INFO] Model upload complete! Model available at: {model_upload_url}")

# 11. Evaluate
print("[INFO] Performing evaluation on test dataset...")
test_split = tokenized_dataset["test"]
predictions_all = trainer.predict(test_split)
prediction_metrics = predictions_all.metrics
prediction_values = predictions_all.predictions

print("[INFO] Prediction metrics on test data:")
pprint.pprint(prediction_metrics)


[INFO] Creating directory for saving models: models/news-classification-distilbert-base-uncased


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

[INFO] Reducing training data to 40k samples balanced across all classes...


Filter:   0%|          | 0/120000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/120000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/120000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/120000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[INFO] Model loading complete!
[INFO] Commencing model training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2829,0.207593,0.933816
2,0.1444,0.218549,0.931974
3,0.0736,0.255406,0.935526


[INFO] Model training complete, saving model to local path: models/news-classification-distilbert-base-uncased


training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Model upload complete! Model available at: https://huggingface.co/mercury99/news-classification-distilbert-base-uncased/tree/main/
[INFO] Performing evaluation on test dataset...


[INFO] Prediction metrics on test data:
{'test_accuracy': 0.9338157894736843,
 'test_loss': 0.20759274065494537,
 'test_runtime': 46.9704,
 'test_samples_per_second': 161.804,
 'test_steps_per_second': 5.067}


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# Load tokenizer and model from the Hugging Face Hub
MODEL_REPO = "mercury99/news-classification-distilbert-base-uncased"  # change this
model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO)
tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)

# Create a pipeline for inference
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)

Device set to use cuda:0


In [None]:
# Sample news item used to sanity-check the loaded classifier.
sample_headline = (
    "Dutch Retailer Beats Apple to Local Download Market AMSTERDAM (Reuters) - "
    "Free Record Shop, a Dutch music retail chain, beat Apple Computer Inc. to "
    "market on Tuesday with the launch of a new download service in Europe's "
    "latest battleground for digital song services."
)
classifier(sample_headline)

[{'label': 'Sci/Tech', 'score': 0.9675270915031433}]

In [None]:
import torch

In [None]:
from typing import Dict
# Build the inference pipeline a single time. Constructing it inside the
# function would set the model up again on every call, which is needlessly
# slow when the function backs an interactive demo.
news_classification_pipeline = pipeline(task="text-classification",
                                        model=MODEL_REPO,
                                        batch_size=32,
                                        device="cuda" if torch.cuda.is_available() else "cpu",
                                        top_k=None)


def news_classifier(text: str) -> Dict[str, float]:
    """Classify a piece of news text.

    Args:
        text: The headline or article text to classify.

    Returns:
        A dictionary mapping each label reported by the pipeline to its score.
    """
    # The first element of the pipeline result holds the per-label entries
    # for the single input text.
    outputs = news_classification_pipeline(text)[0]
    return {item["label"]: item["score"] for item in outputs}


news_classifier(text="Iran launched Missles on Israel")

Device set to use cuda


{'World': 0.9876114726066589,
 'Sci/Tech': 0.006084149237722158,
 'Business': 0.004901876673102379,
 'Sports': 0.0014024028787389398}

In [None]:
# import gradio
import gradio as gr


# Interactive demo: free-text input, label/score output showing up to four classes.
demo = gr.Interface(
    fn=news_classifier,
    inputs="text",
    outputs=gr.Label(num_top_classes=4),
    title="News Classifier",
    # Description reworded: the previous sentence was missing its subject.
    description="A text classifier to determine whether a news headline is World, Business, Science/Technology or Sports related.",
    examples=[
        ["Iran launched missiles towards Israel."],
        ["Apple launched its new flagship phone, iPhone 16 pro max for $1200"],
        ["Cristiano Ronaldo will join Real Madrid for £100 million fee."],
        ["China's Deepseek caused 1.2 trillion dollar crash in stock market."],
    ],
)
demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://72c53200cd3877bd1f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
## Making our demo publicly accessible
from pathlib import Path

# Make directory for demos
demos_dir = Path("../demos")
news_headline_classifier_dir = demos_dir / "news_headline_classifier"

# Create the parent first, then the demo folder; both calls tolerate an
# already-existing directory.
for directory in (demos_dir, news_headline_classifier_dir):
    directory.mkdir(exist_ok=True)

In [None]:
%%writefile ../demos/news_headline_classifier/app.py

# import packages
import torch
import gradio as gr
from typing import Dict
from transformers import pipeline

# initialize pipeline only once
device_index = 0 if torch.cuda.is_available() else -1  # 0 for CUDA, -1 for CPU
news_classifier_pipeline = pipeline(
    task="text-classification",
    model="mercury99/news-classification-distilbert-base-uncased",
    batch_size=32,
    device=device_index,
    top_k=None,
)

# define our function to use with our model
def news_classifier(text: str) -> Dict[str, float]:
    """Return a mapping from label to score for ``text``.

    The per-label entries come from the first element of the pipeline result.
    """
    label_scores: Dict[str, float] = {}
    for prediction in news_classifier_pipeline(text)[0]:
        label_scores[prediction["label"]] = prediction["score"]
    return label_scores

# Create a Gradio interface
description = """
A text classifier to determine whether a news headline is about science/tech, sports, world, or business.

Fine-tuned from [DistilBERT](https://huggingface.co/distilbert-base-uncased).
See [source code](https://github.com/HaileleulGirma/fine-tuned-distilBERT-for-news-classification)
"""

# Clickable sample inputs shown with the interface; one single-item row each.
example_headlines = [
    "Iran launched missiles towards Israel.",
    "Apple launched its new flagship phone, iPhone 16 pro max for $1200",
    "Cristiano Ronaldo will join Real Madrid for £100 million fee.",
    "China's Deepseek caused 1.2 trillion dollar crash in stock market.",
]

demo = gr.Interface(
    fn=news_classifier,
    inputs="text",
    outputs=gr.Label(num_top_classes=4),
    title="🌍news⚽️headline📈classifier⚛️",
    description=description,
    examples=[[headline] for headline in example_headlines],
)

# Launch the interface
if __name__ == "__main__":
    demo.launch()


Writing ../demos/news_headline_classifier/app.py


In [None]:
%%writefile ../demos/news_headline_classifier/README.md
---
title: news headline classifier
emoji: 🌍 ⚽️ 📈 ⚛️
colorFrom: blue
colorTo: yellow
sdk: gradio
app_file: app.py
pinned: true
license: apache-2.0
---

# 🌍 ⚽️ 📈 ⚛️ News Headline Classifier

A demo to showcase a text classifier to determine if a news headline is about sports, world, sci/tech or business related.

DistilBERT model fine-tuned on a dataset of news headlines, [AG News](https://huggingface.co/datasets/fancyzhx/ag_news).

[Source code notebook](https://github.com/mrdbourke/learn-huggingface/blob/main/notebooks/hugging_face_text_classification_tutorial.ipynb).

Writing ../demos/news_headline_classifier/README.md


In [None]:
%%writefile ../demos/news_headline_classifier/requirements.txt
gradio
torch
transformers

Writing ../demos/news_headline_classifier/requirements.txt


In [None]:
import huggingface_hub

In [None]:
from huggingface_hub import(
    create_repo,
    get_full_repo_name,
    upload_file,
    upload_file,
    upload_folder
)

# Where the demo files live locally and which Space they are published to.
LOCAL_DEMO_FOLDER_PATH_TO_UPLOAD = "../demos/news_headline_classifier"
HF_TARGET_SPACE_NAME = "news_headline_classifier"
HF_REPO_TYPE = "space"
HF_SPACE_SDK = "gradio"

# Create the public Space; `exist_ok=True` makes re-running the cell safe.
create_repo(
    repo_id=HF_TARGET_SPACE_NAME,
    repo_type=HF_REPO_TYPE,
    space_sdk=HF_SPACE_SDK,
    private=False,
    exist_ok=True,
)

# Resolve the full repo name for the Space before uploading.
full_hf_repo_name = get_full_repo_name(model_id=HF_TARGET_SPACE_NAME)

# Upload the whole demo folder to the root of the Space in a single commit.
folder_upload_url = upload_folder(
    repo_id=full_hf_repo_name,
    repo_type=HF_REPO_TYPE,
    folder_path=LOCAL_DEMO_FOLDER_PATH_TO_UPLOAD,
    path_in_repo=".",
    commit_message="Fix app.py",
)

print(f"[INFO] Demo folder successfully uploaded with commit URL: {folder_upload_url}")

[INFO] Demo folder successfully uploaded with commit URL: https://huggingface.co/spaces/mercury99/news_headline_classifier/tree/main/.
