# Project Overview

This project designs and implements a system that automatically annotates a dataset of news articles (the AG News dataset) with semantic tags, specifically topic labels. Four different approaches were implemented for this purpose:

1. **Cosine Similarity**  
2. **Classification with Transformers**  
3. **Zero-Shot Classification**  
4. **Clustering**

## AG News Dataset

The AG News dataset is a widely used benchmark in text classification tasks, particularly for news categorization. It contains news articles categorized into four classes: 

- **World**  
- **Sports**  
- **Business**  
- **Science/Technology**

Each article comprises a title and a description, providing a robust foundation for developing and evaluating classification models.


In [None]:
# !pip install transformers datasets scikit-learn sentence-transformers torch nltk colorama

## Imports

In [8]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import os
import torch
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
import logging
from transformers import logging as transformers_logging
from colorama import Fore, Style


# Quiet the libraries used below: only ERROR-level records get through
for _noisy_logger in ("sklearn", "transformers", "sentence_transformers"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)
transformers_logging.set_verbosity_error()

import datasets
datasets.utils.logging.set_verbosity_error()

# Hide every Python warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")


## 1. **Cosine Similarity**  

In [9]:
# Method 1: Cosine Similarity
def load_and_prepare_data(dataset_name="ag_news", split="train", sample_size=1000, seed=42):
    """Load one split of a dataset and return a seeded, shuffled sample of it.

    Args:
        dataset_name: Name handed to ``load_dataset``.
        split: Key of the split to sample from (e.g. "train" or "test").
        sample_size: Number of examples to keep after shuffling. ``None``
            keeps the whole shuffled split; a value larger than the split is
            clamped to the split size instead of failing.
        seed: Seed for the shuffle, so repeated calls pick the same examples.

    Returns:
        The shuffled split, truncated to at most ``sample_size`` examples.
    """
    dataset = load_dataset(dataset_name)
    shuffled = dataset[split].shuffle(seed=seed)

    if sample_size is None:
        return shuffled

    # Never request more rows than the split holds: select() rejects
    # out-of-range indices.
    keep = min(sample_size, len(shuffled))
    return shuffled.select(range(keep))


def encode_topics(model):
    """Embed the keyword description of every predefined topic.

    Args:
        model: Object exposing ``encode(text)``; called once per description.

    Returns:
        A ``(topic_embeddings, topic_names)`` pair: an array with one row per
        topic and the topic names in the same order.
    """
    described_topics = [
        ("World", "global news, international events, foreign affairs, geopolitics, diplomacy, international conflicts, world leaders, cultural diversity"),
        ("Sports", "sports updates, athletic events, player performance, championships, sports leagues, team news, sports statistics, Olympic games"),
        ("Business", "economic trends, corporate news, stock market updates, financial reports, trade policies, business strategies, entrepreneurship, global markets"),
        ("Science/Technology", "scientific research, groundbreaking discoveries, emerging technologies, innovation, space exploration, AI developments, medical advancements, environmental science"),
    ]

    topic_names = []
    vectors = []
    for name, description in described_topics:
        topic_names.append(name)
        vectors.append(model.encode(description))

    topic_embeddings = np.array(vectors)
    return topic_embeddings, topic_names


def assign_tags_with_cosine_similarity(model, data, topic_embeddings, topic_names, batch_size=512):
    """Tag every article with its most cosine-similar topic and report accuracy.

    Args:
        model: Encoder exposing ``encode(list_of_texts)`` that returns one
            embedding per text.
        data: Sliceable collection of examples with 'text' and 'label' fields.
            A slice may be a sequence of row dicts (e.g. a list) or a dict of
            columns (presumably what a Hugging Face ``Dataset`` slice yields;
            confirm against the caller).
        topic_embeddings: Array with one embedding row per topic.
        topic_names: Topic names, ordered so that ``topic_names[label]`` is
            the name for integer label ``label``.
        batch_size: Number of articles encoded per call to ``model.encode``.

    Returns:
        The accuracy computed by ``accuracy_score`` (also printed).
    """
    def unit_rows(matrix):
        # Scale each row to unit length so a dot product equals the cosine.
        # The floor on the norm keeps an all-zero row from dividing by zero.
        matrix = np.asarray(matrix)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.maximum(norms, np.finfo(float).eps)

    # The topics never change, so normalise them once outside the batch loop.
    # For an encoder that already emits unit vectors this is a no-op.
    unit_topics = unit_rows(topic_embeddings)

    def annotate_articles_in_batch(articles):
        similarities = unit_rows(model.encode(articles)) @ unit_topics.T
        return [topic_names[best] for best in np.argmax(similarities, axis=1)]

    true_labels = []
    predicted_labels = []

    for start in range(0, len(data), batch_size):
        batch = data[start: start + batch_size]
        if isinstance(batch, dict):
            # Column-oriented slice: iterating it would yield column names,
            # not rows, so read the columns directly.
            articles = list(batch['text'])
            labels = list(batch['label'])
        else:
            articles = [row['text'] for row in batch]
            labels = [row['label'] for row in batch]

        predicted_labels.extend(annotate_articles_in_batch(articles))
        true_labels.extend(topic_names[label] for label in labels)

    accuracy = accuracy_score(true_labels, predicted_labels)

    print(f"{Fore.RED + Style.BRIGHT}Cosine Similarity Method Accuracy: {accuracy:.2f}{Style.RESET_ALL}")
    return accuracy





In [10]:
if __name__ == "__main__":
    # Seeded samples: 10,000 training rows (as a list of row dicts) and 1,000 test rows
    train_data = load_and_prepare_data(sample_size=10000, split="train").to_list()
    test_data = load_and_prepare_data(sample_size=1000, split="test")

    # One sentence encoder embeds both the topic descriptions and the articles
    model = SentenceTransformer('all-mpnet-base-v2')
    topic_embeddings, topic_names = encode_topics(model)

    # Method 1: Cosine Similarity, scored on the training sample
    # (test_data is loaded above but not used in this cell)
    assign_tags_with_cosine_similarity(
        model=model,
        data=train_data,
        topic_embeddings=topic_embeddings,
        topic_names=topic_names,
    )


[31m[1mCosine Similarity Method Accuracy: 0.71[0m


## 2. **Classification with Transformers**  

In [11]:
# Method 2: Classification with Transformers
def train_and_evaluate_transformer_model(train_texts, train_labels, test_texts, test_labels):
    """Fine-tune a 4-label DistilBERT classifier and print its eval accuracy.

    Args:
        train_texts: Training article texts.
        train_labels: Integer labels aligned with ``train_texts``.
        test_texts: Evaluation article texts.
        test_labels: Integer labels aligned with ``test_texts``.

    Returns:
        ``(test_dataset, tokenizer, model)``: the tokenized evaluation
        dataset, the tokenizer, and the trained model.
    """
    # Set before the Trainer is built (presumably to turn off Weights & Biases
    # reporting; confirm against the installed transformers version).
    os.environ["WANDB_DISABLED"] = "true"

    checkpoint = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

    def tokenize_batch(batch):
        return tokenizer(batch['text'], truncation=True, padding=True, max_length=512)

    def build_split(texts, labels):
        # Wrap the raw lists in a Dataset and tokenize it batch by batch.
        raw_split = Dataset.from_dict({"text": texts, "label": labels})
        return raw_split.map(tokenize_batch, batched=True)

    train_dataset = build_split(train_texts, train_labels)
    test_dataset = build_split(test_texts, test_labels)

    def accuracy_metric(eval_pred):
        logits, labels = eval_pred
        return {"accuracy": accuracy_score(labels, logits.argmax(axis=1))}

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            evaluation_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            num_train_epochs=3,
            weight_decay=0.01,
            save_total_limit=2,
            logging_dir="./logs",
        ),
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=accuracy_metric,
    )

    trainer.train()
    eval_accuracy = trainer.evaluate()['eval_accuracy']
    print(f"Transformer Model Accuracy: {eval_accuracy:.2f}")

    return test_dataset, tokenizer, model


def predict_and_evaluate(test_dataset, tokenizer, model):
    """Run batched inference on the test set and print accuracy and a report.

    Args:
        test_dataset: Object whose ``['text']`` and ``['label']`` columns hold
            the article texts and their integer labels.
        tokenizer: Tokenizer callable matching ``model``.
        model: Sequence-classification model. It is moved to the selected
            device and switched to eval mode in place.

    Returns:
        The accuracy computed by ``accuracy_score`` (also printed).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Do not rely on the mode the caller left the model in: in train mode
    # dropout stays active and predictions become non-deterministic.
    model.eval()

    topics = ["World", "Sports", "Business", "Science/Technology"]

    def predict_in_batches(texts, batch_size=32):
        predictions = []
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start: start + batch_size],
                truncation=True, padding=True, max_length=512, return_tensors="pt"
            ).to(device)
            with torch.no_grad():
                logits = model(**inputs).logits
            predictions.extend(logits.argmax(dim=-1).cpu().tolist())
        return predictions

    # Read each column once; re-reading it inside the batch loop would
    # re-materialise the whole column on every iteration.
    texts = list(test_dataset['text'])
    true_labels = list(test_dataset['label'])

    test_predictions = predict_in_batches(texts)

    accuracy = accuracy_score(true_labels, test_predictions)
    print(f"Transformer Model Detailed Accuracy: {accuracy:.2f}")

    # Pinning `labels` keeps target_names aligned even when a class is absent
    # from both the true labels and the predictions.
    print(classification_report(
        true_labels,
        test_predictions,
        labels=list(range(len(topics))),
        target_names=topics,
    ))
    return accuracy






In [12]:
if __name__ == "__main__":
    # Seeded samples: 10,000 training rows (as a list of row dicts) and 1,000 test rows
    train_data = load_and_prepare_data(sample_size=10000, split="train").to_list()
    test_data = load_and_prepare_data(sample_size=1000, split="test")

    # Method 2: Transformer Classification
    # Split each sample into parallel text / label lists
    train_texts, train_labels = [], []
    for row in train_data:
        train_texts.append(row['text'])
        train_labels.append(row['label'])

    test_texts, test_labels = [], []
    for row in test_data:
        test_texts.append(row['text'])
        test_labels.append(row['label'])

    # Fine-tune, then score the returned model on the tokenized test set
    test_dataset, tokenizer, transformer_model = train_and_evaluate_transformer_model(
        train_texts, train_labels, test_texts, test_labels
    )
    predict_and_evaluate(test_dataset, tokenizer, transformer_model)




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'eval_loss': 0.29967784881591797, 'eval_accuracy': 0.898, 'eval_runtime': 3.3984, 'eval_samples_per_second': 294.252, 'eval_steps_per_second': 9.416, 'epoch': 1.0}
{'loss': 0.3602, 'grad_norm': 3.674365282058716, 'learning_rate': 9.350372736954207e-06, 'epoch': 1.5974440894568689}
{'eval_loss': 0.2852456569671631, 'eval_accuracy': 0.908, 'eval_runtime': 3.4007, 'eval_samples_per_second': 294.055, 'eval_steps_per_second': 9.41, 'epoch': 2.0}
{'eval_loss': 0.26768365502357483, 'eval_accuracy': 0.911, 'eval_runtime': 3.4152, 'eval_samples_per_second': 292.81, 'eval_steps_per_second': 9.37, 'epoch': 3.0}
{'train_runtime': 536.628, 'train_samples_per_second': 55.905, 'train_steps_per_second': 1.75, 'train_loss': 0.2820566195649461, 'epoch': 3.0}
{'eval_loss': 0.26768365502357483, 'eval_accuracy': 0.911, 'eval_runtime': 3.4082, 'eval_samples_per_second': 293.414, 'eval_steps_per_second': 9.389, 'epoch': 3.0}
Transformer Model Accuracy: 0.91
Transformer Model Detailed Accuracy: 0.91
        

## 3. **Zero-Shot Classification**  

In [13]:
# Method 3: Zero-Shot Classification

def zero_shot_classification(data, batch_size=32):
    """Classify articles with a zero-shot NLI pipeline and print the accuracy.

    Args:
        data: Object whose ``['text']`` and ``['label']`` columns hold the
            article texts and their integer labels (0-3).
        batch_size: Batch size forwarded to the pipeline call.

    Returns:
        The accuracy as a fraction in [0, 1]; the printed value is the same
        number expressed as a percentage.
    """
    texts = list(data['text'])
    true_labels = list(data['label'])

    label_to_topic = {
        0: "World",
        1: "Sports",
        2: "Business",
        3: "Science/Technology"
    }
    candidate_labels = list(label_to_topic.values())
    # Reverse lookup built once instead of a list.index() scan per prediction
    topic_to_label = {topic: label for label, topic in label_to_topic.items()}

    # Use the first GPU when one is available, otherwise run on CPU (-1)
    # rather than failing on a hard-coded device index.
    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

    # Hand the pipeline the whole list so it can actually batch the work
    if texts:
        results = classifier(texts, candidate_labels=candidate_labels, batch_size=batch_size)
    else:
        results = []
    if isinstance(results, dict):
        # A single input may come back as a bare dict rather than a list
        results = [results]

    # 'labels' is sorted by score, so its first entry is the most likely topic
    predicted_labels = [topic_to_label[result['labels'][0]] for result in results]

    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f"Zero-Shot Classification Accuracy: {accuracy * 100:.2f}%")
    return accuracy




In [14]:
if __name__ == "__main__":
    # Method 3: Zero-Shot Classification
    # Nothing is trained here, so a 1,000-row seeded sample of the train split is scored directly
    train_data = load_and_prepare_data(sample_size=1000, split="train")
    zero_shot_classification(data=train_data)



config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Zero-Shot Classification Accuracy: 69.50%


## 4. **Clustering**

In [15]:
# Method 4: Clustering
def clustering_with_kmeans(train_articles, train_labels, test_articles, test_labels):
    """Cluster article embeddings with K-Means and map clusters to topics.

    Clusters are fitted on the training embeddings only. Each cluster is then
    paired with exactly one topic by maximising the total cosine similarity
    between cluster centroids and topic-name embeddings. The labels are used
    only to score the result, never to fit it.

    Args:
        train_articles: Training article texts.
        train_labels: Integer labels (0-3) aligned with ``train_articles``.
        test_articles: Test article texts.
        test_labels: Integer labels (0-3) aligned with ``test_articles``.

    Returns:
        ``(train_accuracy, test_accuracy)`` as fractions; both are also printed.
    """
    # Local import so the block does not depend on a file-level import
    from scipy.optimize import linear_sum_assignment

    label_to_topic = {
        0: "World",
        1: "Sports",
        2: "Business",
        3: "Science/Technology"
    }
    topic_ids = list(label_to_topic.keys())

    embedder = SentenceTransformer('all-mpnet-base-v2')

    print("Generating embeddings for training articles...")
    train_embeddings = embedder.encode(train_articles, batch_size=32, show_progress_bar=True)

    print("Generating embeddings for test articles...")
    test_embeddings = embedder.encode(test_articles, batch_size=32, show_progress_bar=True)

    num_clusters = len(label_to_topic)

    print("Performing K-Means clustering...")
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(train_embeddings)

    centroids = kmeans.cluster_centers_
    topic_embeddings = embedder.encode(list(label_to_topic.values()))

    print("Assigning topics to clusters...")
    # Rows are clusters, columns are topics.
    similarities = cosine_similarity(centroids, topic_embeddings)
    # A per-cluster argmax can send several clusters to the same topic and
    # leave other topics unpredicted. Solve a one-to-one assignment instead;
    # the similarities are negated because the solver minimises cost.
    cluster_ids, topic_columns = linear_sum_assignment(-similarities)
    cluster_to_topic = {
        int(cluster_id): topic_ids[topic_column]
        for cluster_id, topic_column in zip(cluster_ids, topic_columns)
    }

    def accuracy_for(assigned_clusters, expected_labels):
        # Share of examples whose cluster's topic equals the true label
        predicted = [cluster_to_topic[int(cluster)] for cluster in assigned_clusters]
        return np.mean([guess == truth for guess, truth in zip(predicted, expected_labels)])

    train_accuracy = accuracy_for(cluster_labels, train_labels)
    print(f"Training Accuracy: {train_accuracy:.2%}")

    test_accuracy = accuracy_for(kmeans.predict(test_embeddings), test_labels)
    print(f"Test Accuracy: {test_accuracy:.2%}")

    return train_accuracy, test_accuracy



In [16]:
if __name__ == "__main__":
    # Seeded samples: 10,000 training rows (as a list of row dicts) and 1,000 test rows
    train_data = load_and_prepare_data(sample_size=10000, split="train").to_list()
    test_data = load_and_prepare_data(sample_size=1000, split="test")

    # Split each sample into parallel text / label lists
    train_texts, train_labels = [], []
    for row in train_data:
        train_texts.append(row['text'])
        train_labels.append(row['label'])

    test_texts, test_labels = [], []
    for row in test_data:
        test_texts.append(row['text'])
        test_labels.append(row['label'])

    # Method 4: Clustering
    clustering_with_kmeans(
        train_articles=train_texts,
        train_labels=train_labels,
        test_articles=test_texts,
        test_labels=test_labels,
    )



Generating embeddings for training articles...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Generating embeddings for test articles...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Performing K-Means clustering...
Assigning topics to clusters...
Training Accuracy: 69.10%
Test Accuracy: 69.10%
