# Assignment <span style="color:red">option Four</span> - News Categorization  using PyTorch 
Download the dataset from https://www.kaggle.com/uciml/news-aggregator-dataset and develop a news classification or categorization model. The dataset contain only titles of a news item and some metadata. The categories of the news items include one of: –<span  style="color:red"> b</span> : business – <span  style="color:red">t</span> : science and technology – <span  style="color:red">e</span> : entertainment and –<span  style="color:red">m</span> : health. 

1. Prepare training and test dataset: Split the data into training and test set (80% train and 20% test). Make sure they are balanced, otherwise if all <span  style="color:red">b</span> files are on training, your model fails to predict <span  style="color:red">t</span> files in test.
2. Binary classification: produce training data for each two categories, such as <span  style="color:red">b </span> and <span  style="color:red"> t</span>, <span  style="color:red">b</span> and <span  style="color:red"> m</span>, <span  style="color:red">e</span> and <span  style="color:red">t</span> and so on. Evaluate the performance and report which categories are easier for the models.
3. Adapt the Text Categorization PyTorch code (see above) and evaluate the performance of the system for these task
4. Use a pre-trained embeddings and compare your result. When you use pre-trained embeddings, you have to average the word embeddings of each tokens in each document to get the unique representation of the document. DOC_EMBEDDING = (TOKEN1_EMBEDDING + ... + TOKENn_EMBEDDING). You can also use some of the <span  style="color:red">spacy/FLAIR </span>document embedding methods
6. Report the recall, precision, and F1 scores for both binary and multi-class classification.
 

In [1]:
# %pip install torchtext

In [2]:
# Import all relevant packages
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import GloVe
import pandas as pd
from itertools import combinations
import numpy as np
from collections import Counter
import spacy


debug = True

# Exercise 1

In [3]:
# 1
# Assuming you've downloaded the dataset and stored it locally as 'uci-news-aggregator.csv'
df = pd.read_csv("uci-news-aggregator.csv")
# Keep only 'TITLE' and 'CATEGORY' columns
df = df[['TITLE', 'CATEGORY']]
# Drop rows with NaN values in the 'TITLE' column
df = df.dropna(subset=['TITLE'])

# Map category labels to numerical values
label_mapping = {label: i for i, label in enumerate(df['CATEGORY'].unique())}

# Split the data into training and test sets maintaining nearly the same percentage as inside the input file(80% train and 20% test)
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['CATEGORY'], random_state=42)


def preprocess_text(text):
    # Write your code here
    pass

# Excercise 2

In [4]:
def binary_classification(train_df, test_df, label1, label2, model=None):
    # Combine label mapping and filtering
    label_mapping = {label1: 0, label2: 1}
    train_df['CATEGORY_BINARY'] = train_df['CATEGORY'].map(label_mapping)
    test_df['CATEGORY_BINARY'] = test_df['CATEGORY'].map(label_mapping)

    # Handle missing values
    train_df['CATEGORY_BINARY'].fillna(0, inplace=True)
    test_df['CATEGORY_BINARY'].fillna(0, inplace=True)

    # Feature: news titles, Label: binary category
    X_train, y_train = train_df['TITLE'], train_df['CATEGORY_BINARY']
    X_test, y_test = test_df['TITLE'], test_df['CATEGORY_BINARY']

    # Use a pipeline for preprocessing
    if model is None:
        model = make_pipeline(
            TfidfVectorizer(),  # You can use other vectorizers based on your needs
            # Impute missing values with the mean
            SimpleImputer(strategy='mean'),
            LogisticRegression(max_iter=1000)
        )

    # Fit the model
    model.fit(X_train, y_train)

    # Test the model
    y_pred = model.predict(X_test)

    # Evaluate and report metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred)
    }

    print(f"Binary Classification ({label1} vs {label2}):")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.2f}")

    return metrics


# Get all unique pairs of labels
label_pairs = list(combinations(label_mapping.keys(), 2))

# Iterate over each pair
for label1, label2 in label_pairs:
    binary_classification(train_df, test_df, label1, label2)

Binary Classification (b vs t):
Accuracy: 0.96
Precision: 0.94
Recall: 0.89
F1 Score: 0.91
Binary Classification (b vs e):
Accuracy: 0.98
Precision: 0.98
Recall: 0.97
F1 Score: 0.97
Binary Classification (b vs m):
Accuracy: 0.98
Precision: 0.97
Recall: 0.85
F1 Score: 0.91
Binary Classification (t vs e):
Accuracy: 0.98
Precision: 0.98
Recall: 0.97
F1 Score: 0.97
Binary Classification (t vs m):
Accuracy: 0.98
Precision: 0.97
Recall: 0.85
F1 Score: 0.91
Binary Classification (e vs m):
Accuracy: 0.98
Precision: 0.97
Recall: 0.85
F1 Score: 0.91


# Exercise 3-5

In [4]:
# Define a function to perform multi-class classification
if debug:
    print('total texts in train:', len(train_df))
if debug:
    print('total texts in test:', len(test_df))

# Getting all the vocabularies and indexing to a unique position
vocab = Counter()
# Indexing words from the training data
for text in train_df['TITLE']:
    for word in text.split(' '):
        vocab[word.lower()] += 1

# Indexing words from the test data
for text in test_df['TITLE']:
    for word in text.split(' '):
        vocab[word.lower()] += 1

total_words = len(vocab)


def get_word_2_index(vocab):
    word2index = {}
    for i, word in enumerate(vocab):
        word2index[word.lower()] = i

    return word2index


word2index = get_word_2_index(vocab)

if debug:
    print(len(word2index))
if debug:
    print(total_words)


def get_batch(df, i, batch_size):
    batches = []
    results = []
    texts = df['TITLE'].iloc[i * batch_size:i * batch_size + batch_size]
    categories = df['CATEGORY'].iloc[i *
                                     batch_size:i * batch_size + batch_size]

    for text in texts:
        layer = np.zeros(total_words, dtype=float)
        for word in text.split(' '):
            layer[word2index[word.lower()]] += 1
        batches.append(layer)

    for category in categories:
        # Map category labels to numerical values
        results.append(label_mapping[category])

    return np.array(batches), np.array(results)


# Setup of the nn
# Parameters
learning_rate = 0.05
num_epochs = 10
batch_size = 300
display_step = 1

# Network Parameters
hidden_size = 100      # 1st layer and 2nd layer number of features
input_size = total_words  # Words in vocab
num_classes = len(label_mapping)   # Number of categories

# Set the device
device = "mps" if torch.backends.mps.is_available() else "cpu"
device = torch.device(device)
print(f"Using device: {device}")


# Define the network
class TextClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(TextClassifier, self).__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out


# Instantiate the model
news_net = TextClassifier(input_size, hidden_size, num_classes).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(news_net.parameters(), lr=learning_rate)

# Train the Model
for epoch in range(num_epochs):
    total_batch = int(len(train_df) / batch_size)
    for i in range(total_batch):
        batch_x, batch_y = get_batch(train_df, i, batch_size)
        articles = torch.FloatTensor(batch_x).to(device)
        labels = torch.LongTensor(batch_y).to(device)

        optimizer.zero_grad()
        outputs = news_net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 4 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                  (epoch + 1, num_epochs, i + 1, len(train_df) / batch_size, loss.data))

# Test the Model
all_predicted = []
all_labels = []

with torch.no_grad():
    # Calculate the total number of batches
    total_batches = int(np.ceil(len(test_df) / batch_size))

    for i in range(total_batches):
        # Calculate the start and end indices for the current batch
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(test_df))

        # Get the current batch
        batch_x_test, batch_y_test = get_batch(test_df, i, batch_size)

        articles = torch.FloatTensor(batch_x_test).to(device)
        labels = torch.LongTensor(batch_y_test).to(device)
        # Forward pass
        outputs = news_net(articles)

        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)

        # Collect predicted and true labels for all batches
        all_predicted.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert lists to numpy arrays
all_predicted = np.array(all_predicted)
all_labels = np.array(all_labels)

# Get class labels
class_labels = list(map(str, np.unique(all_labels)))

# Calculate and print classification report
print("Classification Report:")
print(classification_report(all_labels, all_predicted, target_names=label_mapping))

total texts in train: 337935
total texts in test: 84484
135402
135402
Using device: mps
Epoch [1/10], Step [4/1126], Loss: 1.1653
Epoch [1/10], Step [8/1126], Loss: 0.8322
Epoch [1/10], Step [12/1126], Loss: 0.7836
Epoch [1/10], Step [16/1126], Loss: 0.5668
Epoch [1/10], Step [20/1126], Loss: 0.4230
Epoch [1/10], Step [24/1126], Loss: 0.4636
Epoch [1/10], Step [28/1126], Loss: 0.6043
Epoch [1/10], Step [32/1126], Loss: 0.4303
Epoch [1/10], Step [36/1126], Loss: 0.5576
Epoch [1/10], Step [40/1126], Loss: 0.4463
Epoch [1/10], Step [44/1126], Loss: 0.4204
Epoch [1/10], Step [48/1126], Loss: 0.3331
Epoch [1/10], Step [52/1126], Loss: 0.4707
Epoch [1/10], Step [56/1126], Loss: 0.3779
Epoch [1/10], Step [60/1126], Loss: 0.3750
Epoch [1/10], Step [64/1126], Loss: 0.2290
Epoch [1/10], Step [68/1126], Loss: 0.3324
Epoch [1/10], Step [72/1126], Loss: 0.4068
Epoch [1/10], Step [76/1126], Loss: 0.3198
Epoch [1/10], Step [80/1126], Loss: 0.3980
Epoch [1/10], Step [84/1126], Loss: 0.3886
Epoch [1/10

# 4

In [6]:
# Define the model parameters
input_size = 300  # Assuming 300-dimensional GloVe embeddings
output_size = len(label_mapping)
hidden_size = 100      # 1st layer and 2nd layer number of features
num_epochs = 300
batch_size = 3000
learning_rate = 0.02

# Load GloVe embeddings
glove = GloVe(name='6B', dim=300)

# Tokenization and embeddings using spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])


def get_average_embedding(text):
    tokens = nlp(text)
    embeddings = [glove[token.text].numpy()
                  for token in tokens if token.text in glove.stoi]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(input_size)  # Return zeros if no embeddings are found


# Apply tokenization and embeddings to the dataset
train_df['EMBEDDING_GLOVE'] = train_df['TITLE'].apply(get_average_embedding)
test_df['EMBEDDING_GLOVE'] = test_df['TITLE'].apply(get_average_embedding)

# Convert embeddings to torch tensors
train_embeddings = torch.tensor(
    np.vstack(train_df['EMBEDDING_GLOVE'].to_numpy()))
test_embeddings = torch.tensor(
    np.vstack(test_df['EMBEDDING_GLOVE'].to_numpy()))

# Convert labels to torch tensors
train_labels = torch.tensor(train_df['CATEGORY'].map(label_mapping).to_numpy())
test_labels = torch.tensor(test_df['CATEGORY'].map(label_mapping).to_numpy())

# Instantiate the model
model = TextClassifier(input_size, hidden_size, output_size)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training the model
if debug:
    print("Training the model...")
for epoch in range(num_epochs):
    for i in range(0, len(train_embeddings), batch_size):
        inputs = train_embeddings[i:i+batch_size]
        labels = train_labels[i:i+batch_size]

        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if ((i/batch_size) + 1) % 4 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                  (epoch + 1, num_epochs, i/batch_size, len(train_embeddings) / batch_size, loss.data))


# Evaluate on the test set
with torch.no_grad():
    test_outputs = model(test_embeddings.float())
    _, test_predictions = torch.max(test_outputs, 1)

test_predictions = test_predictions.numpy()
test_labels = test_labels.numpy()

# Print evaluation metrics
print("Classification Report:")
print(classification_report(test_labels,
      test_predictions, target_names=label_mapping))



Training the model...
Epoch [1/300], Step [3/112], Loss: 1.2078
Epoch [1/300], Step [7/112], Loss: 1.1118
Epoch [1/300], Step [11/112], Loss: 1.0523
Epoch [1/300], Step [15/112], Loss: 0.9697
Epoch [1/300], Step [19/112], Loss: 0.9404
Epoch [1/300], Step [23/112], Loss: 0.9228
Epoch [1/300], Step [27/112], Loss: 0.8742
Epoch [1/300], Step [31/112], Loss: 0.9135
Epoch [1/300], Step [35/112], Loss: 0.8719
Epoch [1/300], Step [39/112], Loss: 0.8720
Epoch [1/300], Step [43/112], Loss: 0.8565
Epoch [1/300], Step [47/112], Loss: 0.8560
Epoch [1/300], Step [51/112], Loss: 0.8486
Epoch [1/300], Step [55/112], Loss: 0.8608
Epoch [1/300], Step [59/112], Loss: 0.8443
Epoch [1/300], Step [63/112], Loss: 0.8580
Epoch [1/300], Step [67/112], Loss: 0.8562
Epoch [1/300], Step [71/112], Loss: 0.8401
Epoch [1/300], Step [75/112], Loss: 0.8529
Epoch [1/300], Step [79/112], Loss: 0.8399
Epoch [1/300], Step [83/112], Loss: 0.8607
Epoch [1/300], Step [87/112], Loss: 0.8339
Epoch [1/300], Step [91/112], Loss

# 5

In [None]:
# See outputs of 2 and 3 above