<a href="https://colab.research.google.com/github/muajnstu/Multi-Class-Text-Classification-with-Transformers-and-LSTM/blob/main/aFull%20Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pip
!pip install tensorflow
!pip install torch
!pip install transformers
!pip install tqdm

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia

In [None]:
import numpy as np
import pandas as pd
import torch
import re
import string
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,  roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    AutoTokenizer, AutoModelForSequenceClassification,
    AlbertTokenizer, AlbertForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    Trainer, TrainingArguments
)

In [None]:
# DATA PREPROCESSING

def preprocess_data(csv_url, category_map):
    """Load the dataset, build a cleaned ``text`` column and split it 80/10/10.

    The CSV must provide ``Title``, ``Description`` and ``Category`` columns.
    Category names are lower-cased and stripped before being looked up in
    ``category_map``. Rows whose category is missing or not present in the map
    are dropped (with a warning) so that the returned labels never contain NaN.

    Args:
        csv_url: Anything ``pandas.read_csv`` accepts (URL, path, file object).
        category_map: Dict mapping normalised category names to label values.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)`` of pandas
        Series: cleaned texts and their mapped labels.
    """
    import warnings

    # Compiled once instead of once per row inside clean_text.
    url_re = re.compile(r'https?://\S+|www\.\S+')
    bracket_re = re.compile(r'\[.*?\]')
    punct_re = re.compile(r'[%s]' % re.escape(string.punctuation))
    digit_re = re.compile(r'\d')
    space_re = re.compile(r'\s+')

    def as_text(column):
        # Missing values become empty strings rather than the literal "nan".
        return column.astype(str).where(column.notna(), '')

    def extract_txt(text):
        # Keep only what follows the first " - " separator, up to the end of
        # that line ('.' does not match newlines); text without a separator is
        # returned unchanged.
        # NOTE(review): multi-line text is cut at the first newline after the
        # separator -- confirm that this is intended.
        match = re.search(r"(?<=\s\-\s).*", str(text))
        return match.group(0) if match else text

    def clean_text(text):
        text = str(text).lower()
        text = url_re.sub('', text)       # URLs
        text = bracket_re.sub('', text)   # [bracketed] fragments
        text = punct_re.sub('', text)     # ASCII punctuation
        text = digit_re.sub('', text)     # digits
        # Collapsing all whitespace also takes care of newlines.
        return space_re.sub(' ', text).strip()

    df = pd.read_csv(csv_url)

    # Map categories to labels, discarding rows that cannot be mapped: a NaN
    # label would turn the column into floats and break the integer label
    # handling further down the pipeline.
    normalized = df["Category"].str.lower().str.strip()
    known = normalized.isin(list(category_map))
    if not known.all():
        warnings.warn(
            f"preprocess_data: dropping {int((~known).sum())} row(s) whose "
            "Category is missing or not present in category_map.",
            stacklevel=2,
        )
    df = df.loc[known].copy()
    df["Category"] = normalized.loc[known].map(category_map)

    # Deterministic shuffle before splitting.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    combined = as_text(df['Title']) + " " + as_text(df['Description'])
    df['text'] = combined.apply(extract_txt).apply(clean_text)

    # 80% train, then the remaining 20% is halved into validation and test.
    X_train, X_temp, y_train, y_temp = train_test_split(df['text'], df['Category'], test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test

# KERAS TOKENIZER & SEQUENCES (LSTM)

def keras_prepare_sequences(X_train, X_val, X_test, max_words=10000, max_len=128):
    """Turn the three text splits into fixed-length integer sequences.

    A Keras ``Tokenizer`` (vocabulary capped at ``max_words``, unknown words
    mapped to ``"<OOV>"``) is fitted on the training texts only and then
    applied to every split. Sequences are padded and truncated at the end to
    ``max_len``.

    Returns:
        ``(X_train_pad, X_val_pad, X_test_pad, tokenizer)``.
    """
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)

    def encode(texts):
        # Text -> token ids -> padded/truncated matrix.
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    train_pad, val_pad, test_pad = (encode(split) for split in (X_train, X_val, X_test))
    return train_pad, val_pad, test_pad, tokenizer

# TRANSFORMERS TOKENIZER

def pytorch_transformer_tokenize(tokenizer, X, max_length=128):
    """Encode the texts in ``X`` with ``tokenizer`` and return its output.

    ``X`` is materialised as a list and passed to the tokenizer together with
    options requesting special tokens, truncation and padding to exactly
    ``max_length``, an attention mask, and PyTorch tensors (``'pt'``).
    """
    texts = list(X)
    options = {
        "add_special_tokens": True,
        "max_length": max_length,
        "padding": 'max_length',
        "truncation": True,
        "return_attention_mask": True,
        "return_tensors": 'pt',
    }
    return tokenizer(texts, **options)

# DATASET CLASS FOR PYTORCH TRANSFORMERS

class TorchTextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping whose values are indexable per sample;
    ``labels`` may be a pandas Series, a NumPy array or a list.
    """

    def __init__(self, encodings, labels):
        # Objects exposing `.values` (e.g. a pandas Series) are unwrapped
        # first; anything else is handed to torch.tensor unchanged.
        raw_labels = getattr(labels, 'values', labels)
        self.encodings = encodings
        self.labels = torch.tensor(raw_labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One entry per encoding field, plus the label under "labels".
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

# LSTM MODEL (TensorFlow)

def build_lstm_model(max_words, max_len, num_classes):
    """Build and compile the embedding + LSTM text classifier.

    Args:
        max_words: Size of the embedding vocabulary (``input_dim``).
        max_len: Length of the padded input sequences.
        num_classes: Number of output classes (softmax units).

    Returns:
        A compiled ``tf.keras.Sequential`` model trained with sparse
        categorical cross-entropy, the Adam optimizer and an accuracy metric.
    """
    model = tf.keras.Sequential([
        # Explicit input layer instead of the deprecated `input_length`
        # argument of Embedding.
        tf.keras.Input(shape=(max_len,), dtype="int32"),
        # mask_zero=True makes the LSTM skip the zero padding appended to each
        # sequence; without it the final state is computed after a run of
        # padding steps. This relies on index 0 being used for padding only.
        tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, mask_zero=True),
        # NOTE(review): according to the Keras LSTM documentation a non-zero
        # recurrent_dropout rules out the fused cuDNN kernel, which makes GPU
        # training slow -- consider setting it to 0 if speed matters.
        tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    # Sparse loss: targets are integer class ids, not one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_lstm(model, X_train_pad, y_train, X_val_pad, y_val, X_test_pad, y_test, category_names,
               epochs=15, batch_size=32, patience=3):
    """Train the LSTM model with early stopping and report test metrics.

    Prints the test accuracy, the classification report, the confusion matrix
    and the macro one-vs-rest ROC AUC.

    Args:
        model: Compiled Keras model whose ``predict`` returns class
            probabilities.
        X_train_pad, y_train: Padded training sequences and integer labels.
        X_val_pad, y_val: Validation data monitored by early stopping.
        X_test_pad, y_test: Held-out data used for the final report.
        category_names: Class names, ordered by label id.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        patience: Epochs without ``val_loss`` improvement before stopping.

    Returns:
        Tuple ``(test_accuracy, roc_auc)``.
    """
    # Stop when val_loss stalls and roll back to the best weights seen.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=patience, restore_best_weights=True
    )
    model.fit(
        X_train_pad, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val_pad, y_val),
        callbacks=[early_stopping],
        verbose=2
    )

    # A single prediction pass feeds every metric below, so the printed and
    # the returned accuracy are guaranteed to be the same number.
    y_true = np.asarray(y_test)
    y_pred_proba = model.predict(X_test_pad)
    y_pred = np.argmax(y_pred_proba, axis=1)
    test_acc = accuracy_score(y_true, y_pred)

    print("\nLSTM Test Accuracy:", test_acc)
    print("\nClassification Report (LSTM):")
    print(classification_report(y_true, y_pred, target_names=category_names))
    print("\nConfusion Matrix (LSTM):")
    print(confusion_matrix(y_true, y_pred))

    # ROC AUC on one-hot targets: row i of the identity matrix encodes class i.
    y_test_oh = np.eye(len(category_names))[y_true]
    roc_auc = roc_auc_score(y_test_oh, y_pred_proba, average="macro", multi_class="ovr")
    print("\nROC AUC (LSTM):", roc_auc)
    return test_acc, roc_auc

# TRAINING FUNCTION FOR PYTORCH TRANSFORMER MODELS

def train_pytorch_transformer(model, tokenizer, X_train, y_train, X_val, y_val, X_test, y_test, category_names, max_length=128, model_name="Model", num_train_epochs=5, batch_size=16):
    """Fine-tune a sequence-classification model with ``Trainer`` and report test metrics.

    The three text splits are tokenized, wrapped in ``TorchTextDataset`` and
    used to train ``model``. Evaluation and checkpointing happen once per
    epoch, and the checkpoint with the best validation accuracy is reloaded
    at the end of training. Validation results, test accuracy, classification
    report, confusion matrix and macro one-vs-rest ROC AUC are printed.

    Args:
        model: Model to fine-tune (moved to GPU when one is available).
        tokenizer: Tokenizer matching ``model``.
        X_train, y_train: Training texts and integer labels.
        X_val, y_val: Validation texts and labels.
        X_test, y_test: Test texts and labels.
        category_names: Class names, ordered by label id.
        max_length: Token length every text is padded/truncated to.
        model_name: Label used in printed output and in the output/log
            directory names.
        num_train_epochs: Number of fine-tuning epochs.
        batch_size: Per-device batch size for training and evaluation.

    Returns:
        Tuple ``(test_accuracy, roc_auc)``.
    """
    import inspect

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    train_encodings = pytorch_transformer_tokenize(tokenizer, X_train, max_length)
    val_encodings = pytorch_transformer_tokenize(tokenizer, X_val, max_length)
    test_encodings = pytorch_transformer_tokenize(tokenizer, X_test, max_length)
    # Pass Series directly, let TorchTextDataset handle .values
    train_dataset = TorchTextDataset(train_encodings, y_train)
    val_dataset = TorchTextDataset(val_encodings, y_val)
    test_dataset = TorchTextDataset(test_encodings, y_test)

    training_args = TrainingArguments(
        output_dir=f"./results_{model_name.lower()}",
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f"./logs_{model_name.lower()}",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        save_total_limit=1,
        logging_steps=50,
        report_to="none"
    )

    def compute_metrics(eval_pred):
        # Validation accuracy; this is the metric used to pick the best checkpoint.
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        acc = accuracy_score(labels, predictions)
        return {"accuracy": acc}

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate the `tokenizer` keyword; use whichever this version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer}
    )
    trainer.train()

    eval_results = trainer.evaluate()
    print(f"\n{model_name} Validation Results:", eval_results)

    y_true = np.asarray(y_test)
    test_preds = trainer.predict(test_dataset)
    test_preds_labels = np.argmax(test_preds.predictions, axis=1)
    test_acc = accuracy_score(y_true, test_preds_labels)
    print(f"\n{model_name} Test Accuracy:", test_acc)
    print(f"\nClassification Report ({model_name}):")
    print(classification_report(y_true, test_preds_labels, target_names=category_names))
    print(f"\nConfusion Matrix ({model_name}):")
    print(confusion_matrix(y_true, test_preds_labels))

    # ROC AUC: one-hot targets against softmax probabilities of the logits.
    test_labels_oh = np.eye(len(category_names))[y_true]
    test_pred_proba = torch.nn.functional.softmax(torch.tensor(test_preds.predictions), dim=1).numpy()
    roc_auc = roc_auc_score(test_labels_oh, test_pred_proba, average="macro", multi_class="ovr")
    print(f"\nROC AUC ({model_name}):", roc_auc)
    return test_acc, roc_auc

In [None]:

# Data Preparation

# Location of the raw CSV passed to preprocess_data.
csv_url = "https://raw.githubusercontent.com/muajnstu/ML-Datasets/refs/heads/main/Youtube%20Video%20Dataset.csv"

# Class names in label order; the label id of a category is its position here.
category_names = [
    "travel blog",
    "science&technology",
    "food",
    "art&music",
    "manufacturing",
    "history",
]
category_map = {name: label for label, name in enumerate(category_names)}

(X_train, X_val, X_test,
 y_train, y_val, y_test) = preprocess_data(csv_url, category_map)
print("Data loaded and preprocessed.")

# Per-model result stores, filled by the model cells.
# Note: re-running this cell resets both dicts and discards recorded results.
summary, summary_rocauc = {}, {}

Data loaded and preprocessed.


In [None]:
# LSTM Model Cell
# Tokenize with the Keras tokenizer, build the LSTM classifier, train it and
# record its test accuracy and ROC AUC.
print("\nTraining LSTM model...")
max_words, max_len = 10000, 128
X_train_pad, X_val_pad, X_test_pad, keras_tokenizer = keras_prepare_sequences(
    X_train, X_val, X_test, max_words=max_words, max_len=max_len
)
lstm_model = build_lstm_model(
    max_words=max_words, max_len=max_len, num_classes=len(category_map)
)
lstm_acc, lstm_rocauc = train_lstm(
    model=lstm_model,
    X_train_pad=X_train_pad, y_train=y_train,
    X_val_pad=X_val_pad, y_val=y_val,
    X_test_pad=X_test_pad, y_test=y_test,
    category_names=category_names,
)
summary.update({"LSTM": lstm_acc})
summary_rocauc.update({"LSTM": lstm_rocauc})


Training LSTM model...




Epoch 1/15
281/281 - 150s - 534ms/step - accuracy: 0.2189 - loss: 1.7788 - val_accuracy: 0.2293 - val_loss: 1.7525
Epoch 2/15
281/281 - 125s - 445ms/step - accuracy: 0.2488 - loss: 1.7143 - val_accuracy: 0.2444 - val_loss: 1.7070
Epoch 3/15
281/281 - 126s - 448ms/step - accuracy: 0.2910 - loss: 1.6413 - val_accuracy: 0.2640 - val_loss: 1.7184
Epoch 4/15
281/281 - 119s - 424ms/step - accuracy: 0.3262 - loss: 1.5520 - val_accuracy: 0.3140 - val_loss: 1.6331
Epoch 5/15
281/281 - 147s - 523ms/step - accuracy: 0.5230 - loss: 1.1816 - val_accuracy: 0.6735 - val_loss: 1.2021
Epoch 6/15
281/281 - 136s - 484ms/step - accuracy: 0.7612 - loss: 0.8182 - val_accuracy: 0.7921 - val_loss: 0.7610
Epoch 7/15
281/281 - 144s - 512ms/step - accuracy: 0.8353 - loss: 0.6104 - val_accuracy: 0.8162 - val_loss: 0.6859
Epoch 8/15
281/281 - 138s - 492ms/step - accuracy: 0.8646 - loss: 0.4956 - val_accuracy: 0.8162 - val_loss: 0.6391
Epoch 9/15
281/281 - 117s - 418ms/step - accuracy: 0.8847 - loss: 0.4060 - val_a

In [None]:
# BERT Model Cell
# Load bert-base-uncased with one output per category, fine-tune it and
# record its test accuracy and ROC AUC.
print("\nTraining BERT (PyTorch) model...")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(category_map),
)
bert_acc, bert_rocauc = train_pytorch_transformer(
    model=bert_model, tokenizer=bert_tokenizer,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    category_names=category_names,
    model_name="BERT",
)
summary.update({"BERT": bert_acc})
summary_rocauc.update({"BERT": bert_rocauc})


Training BERT (PyTorch) model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4506,0.441532,0.867975
2,0.3237,0.409378,0.882248
3,0.2315,0.524576,0.877788
4,0.1896,0.53625,0.874219
5,0.1694,0.542884,0.87868



BERT Validation Results: {'eval_loss': 0.40937814116477966, 'eval_accuracy': 0.8822479928635147, 'eval_runtime': 8.0088, 'eval_samples_per_second': 139.971, 'eval_steps_per_second': 8.865, 'epoch': 5.0}

BERT Test Accuracy: 0.8761140819964349

Classification Report (BERT):
                    precision    recall  f1-score   support

       travel blog       0.86      0.81      0.84       204
science&technology       0.77      0.95      0.85       220
              food       0.92      0.87      0.89       191
         art&music       0.92      0.86      0.89       173
     manufacturing       0.92      0.89      0.91       162
           history       0.93      0.87      0.90       172

          accuracy                           0.88      1122
         macro avg       0.89      0.87      0.88      1122
      weighted avg       0.88      0.88      0.88      1122


Confusion Matrix (BERT):
[[166  18   8   5   3   4]
 [  3 209   1   2   2   3]
 [  9  10 166   3   2   1]
 [  6   8   5 1

In [None]:
# DeBERTa Model Cell
# Load microsoft/deberta-v3-base with one output per category, fine-tune it
# and record its test accuracy and ROC AUC.
print("\nTraining DeBERTa (PyTorch) model...")
deberta_tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
deberta_model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-base',
    num_labels=len(category_map),
)
deberta_acc, deberta_rocauc = train_pytorch_transformer(
    model=deberta_model, tokenizer=deberta_tokenizer,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    category_names=category_names,
    model_name="DeBERTa",
)
summary.update({"DeBERTa": deberta_acc})
summary_rocauc.update({"DeBERTa": deberta_rocauc})


Training DeBERTa (PyTorch) model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5679,0.589816,0.837645
2,0.3991,0.45227,0.871543
3,0.3711,0.484868,0.87868
4,0.2945,0.466281,0.874219
5,0.2535,0.475465,0.879572



DeBERTa Validation Results: {'eval_loss': 0.47546467185020447, 'eval_accuracy': 0.8795718108831401, 'eval_runtime': 9.9862, 'eval_samples_per_second': 112.255, 'eval_steps_per_second': 7.11, 'epoch': 5.0}

DeBERTa Test Accuracy: 0.8725490196078431

Classification Report (DeBERTa):
                    precision    recall  f1-score   support

       travel blog       0.84      0.80      0.82       204
science&technology       0.78      0.95      0.86       220
              food       0.89      0.86      0.87       191
         art&music       0.91      0.88      0.89       173
     manufacturing       0.93      0.88      0.91       162
           history       0.95      0.86      0.91       172

          accuracy                           0.87      1122
         macro avg       0.88      0.87      0.88      1122
      weighted avg       0.88      0.87      0.87      1122


Confusion Matrix (DeBERTa):
[[163  19  10   9   1   2]
 [  3 208   2   3   1   3]
 [ 13   7 164   3   3   1]
 [  

In [None]:
# ALBERT v2 Model Cell
# Load albert-base-v2 with one output per category, fine-tune it and record
# its test accuracy and ROC AUC.
print("\nTraining ALBERT v2 (PyTorch) model...")
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(category_map),
)
albert_acc, albert_rocauc = train_pytorch_transformer(
    model=albert_model, tokenizer=albert_tokenizer,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    category_names=category_names,
    model_name="ALBERTv2",
)
summary.update({"ALBERTv2": albert_acc})
summary_rocauc.update({"ALBERTv2": albert_rocauc})


Training ALBERT v2 (PyTorch) model...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6779,0.672346,0.801963
2,0.5542,0.591085,0.82694
3,0.5059,0.540597,0.854594
4,0.3542,0.539497,0.849242
5,0.3096,0.545103,0.855486



ALBERTv2 Validation Results: {'eval_loss': 0.5451033711433411, 'eval_accuracy': 0.855486173059768, 'eval_runtime': 8.5236, 'eval_samples_per_second': 131.517, 'eval_steps_per_second': 8.33, 'epoch': 5.0}

ALBERTv2 Test Accuracy: 0.8556149732620321

Classification Report (ALBERTv2):
                    precision    recall  f1-score   support

       travel blog       0.82      0.77      0.80       204
science&technology       0.76      0.94      0.84       220
              food       0.87      0.84      0.85       191
         art&music       0.90      0.86      0.88       173
     manufacturing       0.93      0.85      0.89       162
           history       0.93      0.86      0.89       172

          accuracy                           0.86      1122
         macro avg       0.87      0.85      0.86      1122
      weighted avg       0.86      0.86      0.86      1122


Confusion Matrix (ALBERTv2):
[[158  18  15   7   4   2]
 [  4 207   3   3   1   2]
 [ 14  10 161   1   2   3]
 [

In [None]:
# DistilBERT Model Cell
# Load distilbert-base-uncased with one output per category, fine-tune it and
# record its test accuracy and ROC AUC.
print("\nTraining DistilBERT (PyTorch) model...")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(category_map),
)
distilbert_acc, distilbert_rocauc = train_pytorch_transformer(
    model=distilbert_model, tokenizer=distilbert_tokenizer,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    category_names=category_names,
    model_name="DistilBERT",
)
summary.update({"DistilBERT": distilbert_acc})
summary_rocauc.update({"DistilBERT": distilbert_rocauc})


Training DistilBERT (PyTorch) model...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4392,0.441818,0.851026
2,0.3377,0.407019,0.881356
3,0.2336,0.460603,0.876896
4,0.1938,0.498199,0.874219
5,0.1667,0.50753,0.874219



DistilBERT Validation Results: {'eval_loss': 0.4070194959640503, 'eval_accuracy': 0.8813559322033898, 'eval_runtime': 3.8486, 'eval_samples_per_second': 291.273, 'eval_steps_per_second': 18.448, 'epoch': 5.0}

DistilBERT Test Accuracy: 0.8725490196078431

Classification Report (DistilBERT):
                    precision    recall  f1-score   support

       travel blog       0.82      0.84      0.83       204
science&technology       0.77      0.93      0.85       220
              food       0.92      0.84      0.88       191
         art&music       0.96      0.87      0.91       173
     manufacturing       0.94      0.88      0.91       162
           history       0.90      0.88      0.89       172

          accuracy                           0.87      1122
         macro avg       0.89      0.87      0.88      1122
      weighted avg       0.88      0.87      0.87      1122


Confusion Matrix (DistilBERT):
[[171  17   6   3   4   3]
 [  5 205   3   1   0   6]
 [ 15   9 160   1 

In [None]:
# XLM-RoBERTa Model Cell
# Load xlm-roberta-base with one output per category, fine-tune it and record
# its test accuracy and ROC AUC.
print("\nTraining XLM-RoBERTa (PyTorch) model...")
roberta_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=len(category_map),
)
roberta_acc, roberta_rocauc = train_pytorch_transformer(
    model=roberta_model, tokenizer=roberta_tokenizer,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    category_names=category_names,
    model_name="XLM-RoBERTa",
)
summary.update({"XLM-RoBERTa": roberta_acc})
summary_rocauc.update({"XLM-RoBERTa": roberta_rocauc})


Training XLM-RoBERTa (PyTorch) model...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5443,0.511816,0.842997
2,0.4326,0.485645,0.847458
3,0.4032,0.524523,0.861731
4,0.3159,0.456897,0.874219
5,0.2811,0.492872,0.876896



XLM-RoBERTa Validation Results: {'eval_loss': 0.49287185072898865, 'eval_accuracy': 0.8768956289027654, 'eval_runtime': 7.212, 'eval_samples_per_second': 155.436, 'eval_steps_per_second': 9.845, 'epoch': 5.0}

XLM-RoBERTa Test Accuracy: 0.8698752228163993

Classification Report (XLM-RoBERTa):
                    precision    recall  f1-score   support

       travel blog       0.87      0.81      0.84       204
science&technology       0.76      0.94      0.84       220
              food       0.92      0.83      0.87       191
         art&music       0.92      0.89      0.91       173
     manufacturing       0.92      0.87      0.89       162
           history       0.89      0.87      0.88       172

          accuracy                           0.87      1122
         macro avg       0.88      0.87      0.87      1122
      weighted avg       0.88      0.87      0.87      1122


Confusion Matrix (XLM-RoBERTa):
[[165  20   8   5   3   3]
 [  2 207   1   4   1   5]
 [ 16   8 159  

In [None]:
# XLNet Model Cell
# Load xlnet-base-cased with one output per category, fine-tune it and record
# its test accuracy and ROC AUC.
print("\nTraining XLNet (PyTorch) model...")
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetForSequenceClassification.from_pretrained(
    'xlnet-base-cased',
    num_labels=len(category_map),
)
xlnet_acc, xlnet_rocauc = train_pytorch_transformer(
    model=xlnet_model, tokenizer=xlnet_tokenizer,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    category_names=category_names,
    model_name="XLNet",
)
summary.update({"XLNet": xlnet_acc})
summary_rocauc.update({"XLNet": xlnet_rocauc})


Training XLNet (PyTorch) model...


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5128,0.455221,0.853702
2,0.3842,0.426911,0.875112
3,0.3474,0.494114,0.869759
4,0.2763,0.483836,0.876004
5,0.2235,0.54238,0.876896



XLNet Validation Results: {'eval_loss': 0.5423797965049744, 'eval_accuracy': 0.8768956289027654, 'eval_runtime': 12.0553, 'eval_samples_per_second': 92.988, 'eval_steps_per_second': 5.89, 'epoch': 5.0}

XLNet Test Accuracy: 0.875222816399287

Classification Report (XLNet):
                    precision    recall  f1-score   support

       travel blog       0.90      0.79      0.84       204
science&technology       0.80      0.93      0.86       220
              food       0.91      0.86      0.88       191
         art&music       0.88      0.90      0.89       173
     manufacturing       0.89      0.90      0.89       162
           history       0.92      0.88      0.90       172

          accuracy                           0.88      1122
         macro avg       0.88      0.88      0.88      1122
      weighted avg       0.88      0.88      0.88      1122


Confusion Matrix (XLNet):
[[162  19   8   8   4   3]
 [  2 204   3   4   3   4]
 [  8   7 164   4   5   3]
 [  3   6   3 

In [None]:
# Summary Cell
# Print both result tables; only models whose cells ran after the result
# dicts were last created appear here.
for heading, scores in (
    ("\n\nSummary of Test Accuracies:", summary),
    ("\nSummary of ROC AUC Scores:", summary_rocauc),
):
    print(heading)
    for model_label, score in scores.items():
        print(f"{model_label:<12}: {score:.4f}")



Summary of Test Accuracies:
DeBERTa     : 0.8725
ALBERTv2    : 0.8556
DistilBERT  : 0.8725
XLM-RoBERTa : 0.8699
XLNet       : 0.8752

Summary of ROC AUC Scores:
DeBERTa     : 0.9744
ALBERTv2    : 0.9708
DistilBERT  : 0.9805
XLM-RoBERTa : 0.9769
XLNet       : 0.9790
