# **Installing the packages**

In [None]:
! pip install tqdm pandas numpy plotly scikit-learn matplotlib torch lightning transformers wandb datasets sentence-transformers
! python -m pip install git+https://github.com/osainz59/t5-encoder

Collecting git+https://github.com/osainz59/t5-encoder
  Cloning https://github.com/osainz59/t5-encoder to /tmp/pip-req-build-r72er44w
  Running command git clone --filter=blob:none --quiet https://github.com/osainz59/t5-encoder /tmp/pip-req-build-r72er44w
  Resolved https://github.com/osainz59/t5-encoder to commit f7443943fb65426a34948254a8e43bde6b700982
  Preparing metadata (setup.py) ... [?25l[?25hdone


# **Importing the libraries**

In [None]:
# Mount the drive if not mounted
from google.colab import drive
drive.mount("/content/drive/")

import os
import random
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, random_split, TensorDataset

from datasets import load_dataset

from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

import lightning as L
import lightning.pytorch as pl
from lightning.pytorch import Trainer, LightningModule, LightningDataModule, seed_everything
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import TQDMProgressBar
# from lightning.pytorch.strategies import DeepSpeedStrategy
from lightning.pytorch.plugins.precision import DeepSpeedPrecisionPlugin

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

# from deepspeed.ops.adam import DeepSpeedCPUAdam

import t5_encoder

import wandb
wandb.login(relogin=True)

import warnings
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# **Setting seed value for reproducibility**    


In [None]:
# Single seed shared by every random number generator used in this notebook.
seed = 111

# Seed Python, NumPy and PyTorch (CPU + every CUDA device).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Process-level environment switches.
# NOTE(review): PYTHONHASHSEED / CUDA_VISIBLE_DEVICES are set after the imports
# above have already run — confirm they still take effect where needed.
os.environ.update({
    'PYTHONHASHSEED': str(seed),
    "TOKENIZERS_PARALLELISM": "false",
    'CUDA_VISIBLE_DEVICES': "0",
})

# Lightning's own helper; kept as the last expression so the cell displays the seed.
seed_everything(seed)

INFO: Seed set to 111
INFO:lightning.fabric.utilities.seed:Seed set to 111


111

# **Creating DataModule**

In PyTorch, a Dataset is a handy tool that lets us organize our data in an easy-to-use format. When we create a Dataset, we basically tell PyTorch how to get a data sample and its corresponding label. This involves creating a class and defining two key methods: `__len__` and `__getitem__`. The `__len__` method tells PyTorch how many data samples we have, while the `__getitem__` method tells PyTorch how to get the n-th data sample.

The DataModule provides a high-level abstraction over the data pipeline and allows us to encapsulate all the complex data procedures from data preparation, splitting, and processing, to creating PyTorch Dataset and DataLoader objects into a single class that can be easily shared, reused, and tested. This ensures that our data pipeline is robust and that our machine learning model can focus on learning from the data, not managing it.

In [None]:
class contextualizedClassifierDataModule(pl.LightningDataModule):
    """Tokenized train/val/test splits of the Agora advertisement CSV for vendor classification.

    Each sample is the advertisement text ("<Item> [SEP] <Item Description>") and the
    label is an integer ID of the (lower-cased) vendor handle. Vendors with fewer than
    ``min_ads_per_vendor`` advertisements in the selected rows are merged into a single
    "others" class.

    Args:
        tokenizer_name_or_path: Name or path passed to ``AutoTokenizer.from_pretrained``.
        batch_size: Batch size used by all three dataloaders.
        data_path: Location of the Agora CSV file.
        max_samples: Number of leading CSV rows to keep; ``None`` keeps every row.
        min_ads_per_vendor: Vendors with fewer ads than this are relabelled "others".
        max_length: Sequences are padded/truncated to this many tokens.

    Attributes set by ``setup``:
        vendors_dict: Mapping from vendor handle to integer class ID (IDs start at 0).
        num_classes: Number of distinct class IDs (``len(vendors_dict)``).
        train_dataset / val_dataset / test_dataset: The three splits.
    """

    def __init__(self, tokenizer_name_or_path, batch_size=32,
                 data_path="/content/drive/MyDrive/AA-Tutorial/data/Agora.csv",
                 max_samples=5000, min_ads_per_vendor=5, max_length=512):
        super().__init__()

        self.tokenizer_name_or_path = tokenizer_name_or_path
        self.batch_size = batch_size
        self.data_path = data_path
        self.max_samples = max_samples
        self.min_ads_per_vendor = min_ads_per_vendor
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name_or_path, use_fast=True)
        # distilgpt2 has no padding token of its own, so the EOS token is reused for padding.
        if self.tokenizer_name_or_path == "distilgpt2":
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Populated by setup(); None means "not prepared yet".
        self.vendors_dict = None
        self.num_classes = None
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def setup(self, stage=None):
        """Read the CSV, build labels, tokenize the text and create the three splits.

        The method is idempotent: the Trainer calls ``setup`` again even when it was
        already called by hand, and re-reading and re-tokenizing the data each time
        is wasted work, so repeated calls return immediately.
        """
        if self.train_dataset is not None:
            return

        df = pd.read_csv(self.data_path, encoding='ISO-8859-1')
        # Column names in the file carry stray whitespace.
        df = df.rename(str.strip, axis='columns')

        # Training on the full file takes very long, so only the first `max_samples`
        # rows are used. `.copy()` avoids assigning into a view of the original frame.
        if self.max_samples is not None:
            df = df.iloc[:self.max_samples].copy()

        # Advertisement text: title and description joined by a literal " [SEP] " marker.
        separator = ' [SEP] '
        text = [f"{item}{separator}{description}" for item, description in zip(df['Item'], df['Item Description'])]

        # Vendor handles are compared case-insensitively (Amsterdam100 == amsterdam100).
        vendor = df['Vendor'].str.lower()

        # Merge every vendor with fewer than `min_ads_per_vendor` ads into the "others" class.
        ad_freq = vendor.value_counts()
        rare_vendors = ad_freq[ad_freq < self.min_ads_per_vendor].index
        vendor = vendor.where(~vendor.isin(rare_vendors), 'others')

        # Integer class IDs in order of first appearance, starting at 0.
        self.vendors_dict = {name: idx for idx, name in enumerate(vendor.unique())}
        self.num_classes = len(self.vendors_dict)
        labels = torch.tensor(vendor.map(self.vendors_dict).tolist())

        # Tokenizing the data with padding and truncation to a fixed length.
        encodings = self.tokenizer(text, add_special_tokens=True, max_length=self.max_length, padding='max_length',
                                   return_token_type_ids=False, truncation=True, return_attention_mask=True,
                                   return_tensors='pt')

        dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)

        # 80/20 train/test split, then 95/5 train/val split of the training part,
        # i.e. 0.76 / 0.04 / 0.20 of the data for train / val / test.
        train_dataset, test_dataset = random_split(dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(1111))
        train_dataset, val_dataset = random_split(train_dataset, [0.95, 0.05], generator=torch.Generator().manual_seed(1111))

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.val_dataset = val_dataset

    def train_dataloader(self):
        """Training DataLoader; samples are drawn in random order."""
        return DataLoader(self.train_dataset, sampler=RandomSampler(self.train_dataset), batch_size=self.batch_size)

    def val_dataloader(self):
        """Validation DataLoader (sequential order)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Test DataLoader (sequential order)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# **Initializing Lightning Model Module**

Once we’ve defined our datamodule, the next step is to set up the training process. This involves defining how the data is fed to the model, calculating the loss, updating the model’s weights, and then evaluating the model’s performance.

While all these steps can be done using raw PyTorch, doing so can lead to repetition or even introduce logical errors, especially when dealing with complex models and workflows. That is where PyTorch Lightning steps in, by providing the LightningModule class, which encapsulates all aspects of the training logic in a single class, similar to LightningDataModule. This class enables us to define the methods for the training step, validation step, test step, and optimizer configuration, while abstracting away repetitive steps such as backward propagation and weight updates, thus making our code cleaner and easier to understand. Furthermore, it makes it incredibly straightforward to experiment with different models and tune hyperparameters, thus enhancing the overall machine learning experimentation process.

In [None]:
class ClassifierModel(pl.LightningModule):
    """Sequence-classification LightningModule wrapping a Hugging Face model.

    Args:
        learning_rate: Peak learning rate of the AdamW optimizer.
        adam_epsilon: ``eps`` of the AdamW optimizer.
        weight_decay: Weight decay applied to every parameter except those whose
            name contains 'bias' or 'LayerNorm.weight'.
        model_name_or_path: Name or path passed to the ``Auto*`` loaders.
        num_classes: Size of the classification head.
        num_training_steps: Total number of optimizer steps (for the LR schedule).
        warmup_steps: Number of linear warm-up steps (for the LR schedule).
        nr_frozen_epochs: Number of initial epochs during which only the parameters
            with 'classifier' in their name are trained. The default 0 trains
            everything from the start.
    """

    def __init__(self, learning_rate, adam_epsilon, weight_decay, model_name_or_path, num_classes, num_training_steps, warmup_steps,
                 nr_frozen_epochs=0):
        super().__init__()

        # Stores every constructor argument under self.hparams.
        self.save_hyperparameters()
        # configure_optimizers reads the epsilon under the shorter name `eps`.
        self.hparams.eps = adam_epsilon

        # Tracks whether freeze() is currently in effect.
        self._frozen = False

        config = AutoConfig.from_pretrained(self.hparams.model_name_or_path, num_labels=self.hparams.num_classes, output_attentions=False, output_hidden_states=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.hparams.model_name_or_path, config=config)
        # distilgpt2 has no padding token, so the EOS token ID is used for padding.
        if self.hparams.model_name_or_path == "distilgpt2":
            self.model.config.pad_token_id = self.model.config.eos_token_id

    def forward(self, batch):
        """Run the wrapped model on a batch.

        Args:
            batch: Sequence of ``(input_ids, attention_mask)`` or
                ``(input_ids, attention_mask, labels)``.

        Returns:
            ``(loss, logits)``; ``loss`` is ``None`` when the batch carries no labels.
        """
        input_ids = batch[0]
        input_mask = batch[1]
        labels = batch[2] if len(batch) > 2 else None

        outputs = self.model(input_ids, attention_mask=input_mask, labels=labels)
        # Access by attribute: positional indexing shifts when the loss is absent.
        return outputs.loss, outputs.logits

    def _evaluate(self, batch):
        """Compute the loss and the classification metrics for one labelled batch.

        NOTE(review): these metrics are computed per batch and then averaged by the
        logger, which is not the same as computing them once over the whole split
        (most classes are missing from any single batch). Kept as is so that the
        logged metric names stay unchanged; consider aggregating predictions per
        epoch instead.

        Returns:
            ``(loss, metrics)`` where ``metrics`` maps metric name to value.
        """
        loss, logits = self(batch)
        y_true = batch[2].detach().cpu().numpy()
        y_pred = torch.argmax(logits, dim=1).detach().cpu().numpy()

        metrics = {
            'accuracy': balanced_accuracy_score(y_true, y_pred, adjusted=True),
            'macro-F1': f1_score(y_true, y_pred, average='macro'),
            'micro-F1': f1_score(y_true, y_pred, average='micro'),
            'weighted-F1': f1_score(y_true, y_pred, average='weighted'),
        }
        return loss, metrics

    def training_step(self, batch, batch_nb):
        """Return the training loss for one batch and log it as an epoch average."""
        train_loss, _ = self(batch)
        self.log_dict({"train_loss": train_loss}, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return train_loss

    def validation_step(self, batch, batch_nb):
        """Log the validation loss and metrics (epoch averages) and return the loss."""
        val_loss, metrics = self._evaluate(batch)
        self.log_dict({"val_loss": val_loss, **metrics}, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return val_loss

    def test_step(self, batch, batch_nb):
        """Log the test loss and metrics, both per step and as epoch averages."""
        test_loss, metrics = self._evaluate(batch)
        self.log_dict({"test_loss": test_loss, **metrics}, on_step=True, on_epoch=True, prog_bar=True, logger=True)

    def predict_step(self, batch, batch_nb):
        """Return the predicted class IDs for one batch as a NumPy array."""
        _, logits = self(batch)
        predictions = torch.argmax(logits, dim=1)
        return predictions.detach().cpu().numpy()

    def configure_optimizers(self):
        """Build the AdamW optimizer and a per-step linear warm-up/decay schedule.

        Parameters whose name contains 'bias' or 'LayerNorm.weight' are excluded
        from weight decay; every other parameter uses ``hparams.weight_decay``.
        """
        no_decay = ['bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [{'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': self.hparams.weight_decay},
                                        {'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
        # torch.optim.AdamW replaces the deprecated transformers.AdamW.
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.eps)

        # Linear warm-up followed by linear decay, stepped after every optimizer step.
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.hparams.num_training_steps)
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}

        return [optimizer], [scheduler]

    def freeze(self) -> None:
        """Disable gradients for every parameter whose name does not contain 'classifier'."""
        for name, param in self.model.named_parameters():
            if 'classifier' not in name:
                param.requires_grad = False

        self._frozen = True

    def unfreeze(self) -> None:
        """Re-enable gradients for the parameters disabled by ``freeze``."""
        if self._frozen:
            for name, param in self.model.named_parameters():
                if 'classifier' not in name:
                    param.requires_grad = True

        self._frozen = False

    def on_train_epoch_start(self):
        """Lightning hook: keep the encoder frozen for the first ``nr_frozen_epochs`` epochs."""
        if self.current_epoch < self.hparams.nr_frozen_epochs:
            self.freeze()
        else:
            self.unfreeze()

    def train_epoch_start(self):
        """Backward-compatible alias of ``on_train_epoch_start``.

        Lightning never calls a method with this name, so the schedule lives in
        ``on_train_epoch_start``; this wrapper only keeps the old name callable.
        """
        self.on_train_epoch_start()

# **Helper functions**

In [None]:
def train_model(tokenizer_name, model_name, num_classes=153, nb_epochs=10):
    """Fine-tune and test one sequence classifier on the Agora vendor data.

    Args:
        tokenizer_name: Hugging Face name/path of the tokenizer.
        model_name: Hugging Face name/path of the model; also used as the W&B run name.
        num_classes: Size of the classification head. Must cover every label the
            datamodule produces; increase it when more data (more vendors) is used.
        nb_epochs: Maximum number of training epochs.

    Returns:
        ``(trainer, model, dm)``: the fitted Trainer, the trained ``ClassifierModel``
        and the datamodule that was used.

    Raises:
        ValueError: If the datamodule reports more classes than ``num_classes``.
    """
    # Loading the datamodule
    dm = contextualizedClassifierDataModule(tokenizer_name_or_path=tokenizer_name)
    dm.setup()

    # A head smaller than the number of labels fails inside the loss computation,
    # so fail early with a readable message when the datamodule exposes the
    # class count.
    detected_classes = getattr(dm, "num_classes", None)
    if detected_classes is not None and detected_classes > num_classes:
        raise ValueError(f"The data contains {detected_classes} classes but num_classes={num_classes}")

    steps_per_epoch = len(dm.train_dataloader())
    num_training_steps = steps_per_epoch * nb_epochs
    # Warm up over 10% of the optimizer steps of a single epoch.
    warmup_steps = int(steps_per_epoch * 10 / 100)

    # Loading the model
    model = ClassifierModel(learning_rate=0.0001, adam_epsilon=float(1e-6), weight_decay=0.01, model_name_or_path=model_name,
                            num_classes=num_classes, num_training_steps=num_training_steps, warmup_steps=warmup_steps)

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.01, patience=5, verbose=False, mode="min")
    wandb_logger = WandbLogger(save_dir="logs", name=model_name, project="AA-Tutorials")

    # Fall back to CPU (and full precision) when no GPU is present instead of
    # requesting a GPU accelerator with `devices=None`.
    use_gpu = torch.cuda.is_available()

    trainer = L.Trainer(max_epochs=nb_epochs,
                        accelerator="gpu" if use_gpu else "cpu",
                        devices=1,
                        fast_dev_run=False,
                        accumulate_grad_batches=1,  # run the optimizer step after every batch
                        benchmark=False,  # cuDNN auto-tuning would undo `deterministic=True`
                        deterministic=True,  # ensures reproducibility
                        limit_train_batches=1.0,  # train on 100% of the training batches
                        check_val_every_n_epoch=1,  # run the validation loop after every training epoch
                        callbacks=[early_stop_callback],  # early stopping on val_loss
                        logger=wandb_logger,
                        precision='16-mixed' if use_gpu else '32-true')  # mixed precision on GPU

    try:
        # Training model
        trainer.fit(model, dm)
        # Evaluating model
        trainer.test(model=model, dataloaders=dm.test_dataloader())
    finally:
        # Close the W&B run; otherwise the next call in the same kernel would
        # log into this run instead of starting its own.
        wandb.finish()

    return trainer, model, dm

# **Loading the Results Dataframe**

In [None]:
# Load the table of previously recorded results (one row per metric, one column per model).
results_df = pd.read_csv("/content/drive/MyDrive/AA-Tutorial/data/results.csv")
# An earlier save wrote the row index into the file, where it comes back as an
# "Unnamed: 0" column; drop it so it is not carried along and re-saved.
results_df = results_df.loc[:, ~results_df.columns.str.startswith("Unnamed")]

[all-MiniLM-L6-v2](https://arxiv.org/abs/2002.10957)

In [None]:
# Fine-tune and test all-MiniLM-L6-v2 (same checkpoint for tokenizer and model).
# The returned trainer, model and datamodule are discarded.
_, _, _ = train_model("sentence-transformers/all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L6-v2")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: `Trainer(limit_train_batches=1.0)` was conf

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112484266666596, max=1.0…

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 22.8 M
--------------------------------------------------------
22.8 M    Trainable params
0         Non-trainable params
22.8 M    Total params
91.088    Total estimated model params size (MB)
INFO:lightning.pytorch.callbacks.model_summary:
  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 22.8 M
--------------------------------------------------------
22.8 M    Trainable params
0         Non-trainable params
22.8 M    Total params
91.088    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Scores in the row order of results_df: Accuracy, Weighted-F1, Micro-F1, Macro-F1.
# NOTE(review): presumably transcribed by hand from the test run above — confirm against the logged metrics.
results_df["All-miniLM"] = [0.6475543,0.6820589, 0.7030000, 0.552629]

[DistilBERT](https://arxiv.org/abs/1910.01108)

In [None]:
# Fine-tune and test DistilBERT (same checkpoint for tokenizer and model).
# The returned trainer, model and datamodule are discarded.
_, _, _ = train_model("distilbert/distilbert-base-uncased", "distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Scores in the row order of results_df: Accuracy, Weighted-F1, Micro-F1, Macro-F1.
# NOTE(review): presumably transcribed by hand from the test run above — confirm against the logged metrics.
results_df["DistilBERT"] = [0.77843189, 0.7948137, 0.8080000, 0.70647042]

[DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base)

In [None]:
# Fine-tune and test DistilRoBERTa (same checkpoint for tokenizer and model).
# The returned trainer, model and datamodule are discarded.
_, _, _ = train_model("distilbert/distilroberta-base", "distilbert/distilroberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False,

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Scores in the row order of results_df: Accuracy, Weighted-F1, Micro-F1, Macro-F1.
# NOTE(review): presumably transcribed by hand from the test run above — confirm against the logged metrics.
results_df["DistilRoBERTa"] = [0.7820527, 0.8015660, 0.81099999, 0.707505643]

In [None]:
# Display the comparison table: one row per metric, one column per model.
results_df

Unnamed: 0,Metrics,MultinomialNB,SVC,RandomForest,LogisticRegression,MLP,All-miniLM,DistilBERT,DistilRoBERTa
0,Accuracy,0.331297,0.682322,0.412122,0.775756,0.676682,0.647554,0.778432,0.782053
1,Weighted-F1,0.524405,0.708481,0.34429,0.662284,0.752288,0.682059,0.794814,0.801566
2,Micro-F1,0.596,0.7,0.344,0.668,0.756,0.703,0.808,0.811
3,Macro-F1,0.337436,0.671945,0.283832,0.590027,0.646615,0.552629,0.70647,0.707506


In [None]:
# Write the updated table back to the file it was loaded from; index=False keeps the row index out of the CSV.
results_df.to_csv('/content/drive/MyDrive/AA-Tutorial/data/results.csv', encoding='utf-8', index=False)