# **Installing the packages**

In [None]:
! pip install tqdm pandas numpy plotly scikit-learn matplotlib torch lightning transformers datasets faiss-cpu
# ! python -m pip install git+https://github.com/osainz59/t5-encoder

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting plotly
  Obtaining dependency information for plotly from https://files.pythonhosted.org/packages/00/4e/6258fc3b26f1f7abd1b2e75b1e9e4f12f13584136e2e1549f995ff4c6b7b/plotly-5.20.0-py3-none-any.whl.metadata
  Downloading plotly-5.20.0-py3-none-any.whl.metadata (7.0 kB)
Collecting lightning
  Obtaining dependency information for lightning from https://files.pythonhosted.org/packages/a0/4a/b7d4f62449d940ce43d4657322a14f5718815b648f9d2b0b23a195acb646/lightning-2.2.1-py3-none-any.whl.metadata
  Downloading lightning-2.2.1-py3-none-any.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
[?25hCollecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.wh

# **Importing the libraries**

In [None]:
# Mount Google Drive only when running on Colab; on any other machine the
# `google.colab` package is missing and the notebook falls back to local paths.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount("/content/drive/")

import os
import random
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, random_split, TensorDataset

from datasets import load_dataset

from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

import lightning as L
import lightning.pytorch as pl
from lightning.pytorch import Trainer, LightningModule, LightningDataModule, seed_everything
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import TQDMProgressBar
# from lightning.pytorch.strategies import DeepSpeedStrategy
from lightning.pytorch.plugins.precision import DeepSpeedPrecisionPlugin

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

# from deepspeed.ops.adam import DeepSpeedCPUAdam

# import t5_encoder

import faiss

# import wandb
# wandb.login(relogin=True)

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# **Setting seed value for reproducibility**    

In [None]:
# Single seed value shared by every random-number source configured in this cell.
seed = 111
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Request deterministic cuDNN kernels and disable cuDNN benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it here
# presumably only affects child processes, not this kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
# NOTE(review): presumably disables multi-threaded Hugging Face tokenization — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Lightning helper imported above; being the last expression, its return value (the seed) is echoed as the cell output.
seed_everything(seed)

Seed set to 111


111

# **Creating DataModule**

In [None]:
class contextualizedClassifierDataModule(pl.LightningDataModule):
    """Lightning data module that turns the Agora CSV into tokenized vendor-classification splits.

    Each example is the advertisement text ("Item [SEP] Item Description") and the label is an
    integer vendor ID. Vendors with fewer than ``min_ads_per_vendor`` advertisements (inside the
    selected subset) are merged into a single "others" class.

    Parameters
    ----------
    tokenizer_name_or_path : str
        Name or path passed to ``AutoTokenizer.from_pretrained``.
    batch_size : int, optional
        Batch size used by all three dataloaders.
    data_path : str, optional
        Location of the CSV file. Defaults to the path the notebook originally hard-coded.
    max_samples : int or None, optional
        Only the first ``max_samples`` rows of the CSV are used (``None`` keeps every row).
    min_ads_per_vendor : int, optional
        Vendors with fewer advertisements than this are relabelled as "others".
    max_length : int, optional
        Sequences are padded/truncated to this many tokens.

    Attributes set by ``setup``
    ---------------------------
    vendors_dict : dict mapping vendor handle -> integer label (IDs start at 0).
    num_classes : number of distinct labels, i.e. ``len(vendors_dict)``.
    train_dataset, val_dataset, test_dataset : the three splits.
    """

    def __init__(self, tokenizer_name_or_path, batch_size=32, data_path="data/Agora.csv", max_samples=5000,
                 min_ads_per_vendor=5, max_length=512):
        super().__init__()

        self.tokenizer_name_or_path = tokenizer_name_or_path
        self.batch_size = batch_size
        self.data_path = data_path
        self.max_samples = max_samples
        self.min_ads_per_vendor = min_ads_per_vendor
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name_or_path, use_fast=True)
        # distilgpt2 ships without a padding token, so the end-of-sequence token is reused for padding.
        if self.tokenizer_name_or_path == "distilgpt2":
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Populated by setup(); kept as None until then so setup() can detect repeated calls.
        self.vendors_dict = None
        self.num_classes = None
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def setup(self, stage=None):
        """Read the CSV, build labels, tokenize the text and create the train/val/test splits.

        The method is idempotent: it is called manually in ``train_model`` and may be called again
        by the trainer, and re-reading plus re-tokenizing the data on every call is pure overhead.
        """
        if self.train_dataset is not None:
            return

        df = pd.read_csv(self.data_path, encoding='ISO-8859-1')
        # df = pd.read_csv("/content/drive/MyDrive/AA-Tutorial/data/Agora.csv", encoding='ISO-8859-1')
        # Column names in the raw file carry stray whitespace.
        df = df.rename(str.strip, axis='columns')

        # Training on the full 100K+ rows takes too long for the tutorial, so only the first
        # `max_samples` rows are kept. Every later step is row-wise, so slicing first gives the same
        # result as slicing last while avoiding work on rows that are thrown away. `.copy()` makes the
        # subset an independent frame so the column assignments below are unambiguous.
        df = df.iloc[:self.max_samples].copy()
        # Rows without a vendor handle cannot be labelled.
        df = df.dropna(subset=['Vendor'])
        if df.empty:
            raise ValueError(f"No usable rows found in {self.data_path}")

        # Merge the item title and its description into one text field.
        # NOTE(review): the literal "[SEP]" is only a special token for BERT-style tokenizers;
        # other tokenizers presumably treat it as ordinary text — confirm this is intended.
        separator = ' [SEP] '
        text = (df['Item'].astype(str) + separator + df['Item Description'].astype(str)).tolist()

        # Handles that differ only by case (Amsterdam100 / amsterdam100) are assumed to be the same vendor.
        vendor_names = df['Vendor'].astype(str).str.lower()

        # Vendors with too few advertisements in the subset are pooled into one "others" class.
        ad_freq = vendor_names.value_counts()
        rare_vendors = ad_freq[ad_freq < self.min_ads_per_vendor].index
        vendor_names = vendor_names.where(~vendor_names.isin(rare_vendors), 'others')

        # Integer label per vendor, numbered from 0 in order of first appearance.
        self.vendors_dict = {vendor: idx for idx, vendor in enumerate(vendor_names.unique())}
        self.num_classes = len(self.vendors_dict)
        labels = torch.tensor(vendor_names.map(self.vendors_dict).tolist())

        # Tokenize with padding and truncation to a fixed length.
        encodings = self.tokenizer(text, add_special_tokens=True, max_length=self.max_length, padding='max_length',
                                   return_token_type_ids=False, truncation=True, return_attention_mask=True,
                                   return_tensors='pt')

        dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)

        # 80/20 train/test split, then 95/5 of the training part for train/val:
        # overall 0.76 / 0.04 / 0.20. Fixed generators keep the split reproducible.
        train_dataset, test_dataset = random_split(dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(1111))
        train_dataset, val_dataset = random_split(train_dataset, [0.95, 0.05], generator=torch.Generator().manual_seed(1111))

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.val_dataset = val_dataset

    def train_dataloader(self):
        """Return the training DataLoader (randomly sampled)."""
        return DataLoader(self.train_dataset, sampler=RandomSampler(self.train_dataset), batch_size=self.batch_size)

    def val_dataloader(self):
        """Return the validation DataLoader (sequential order)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the test DataLoader (sequential order)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# **Initializing Lightning Model Module**

In [None]:
class ClassifierModel(pl.LightningModule):
    """Sequence-classification LightningModule wrapping a Hugging Face ``AutoModelForSequenceClassification``.

    Parameters
    ----------
    learning_rate : float
        Peak learning rate of the AdamW optimizer.
    adam_epsilon : float
        Epsilon of the AdamW optimizer (also exposed as ``hparams.eps``).
    weight_decay : float
        Weight decay applied to every parameter except biases and LayerNorm weights.
    model_name_or_path : str
        Name or path passed to the Hugging Face ``from_pretrained`` loaders.
    num_classes : int
        Number of output labels.
    num_training_steps : int
        Total number of optimizer steps, used by the linear learning-rate schedule.
    warmup_steps : int
        Number of warm-up steps of the learning-rate schedule.
    nr_frozen_epochs : int, optional
        Number of initial epochs during which only the classifier head is trained.
        The default ``0`` trains the whole network from the first epoch.
    """

    def __init__(self, learning_rate, adam_epsilon, weight_decay, model_name_or_path, num_classes, num_training_steps, warmup_steps,
                 nr_frozen_epochs=0):
        super().__init__()

        # Stores all constructor arguments under self.hparams (and in checkpoints).
        self.save_hyperparameters()
        # Alias read by configure_optimizers; kept for backward compatibility.
        self.hparams.eps = adam_epsilon

        # Tracks whether the encoder is currently frozen (see freeze / unfreeze).
        self._frozen = False

        # Buffers that collect per-batch predictions/labels so that metrics are computed once per
        # epoch over the whole split. Averaging per-batch macro-F1 / balanced accuracy is not the
        # same as the epoch-level metric, because a batch rarely contains every class.
        self._val_preds, self._val_labels = [], []
        self._test_preds, self._test_labels = [], []

        config = AutoConfig.from_pretrained(self.hparams.model_name_or_path, num_labels=self.hparams.num_classes,
                                            output_attentions=True, output_hidden_states=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.hparams.model_name_or_path, config=config)
        # distilgpt2 has no padding token; reuse the end-of-sequence token id for padding.
        if self.hparams.model_name_or_path == "distilgpt2":
            self.model.config.pad_token_id = self.model.config.eos_token_id

    def forward(self, batch):
        """Run the wrapped model on a batch of ``(input_ids, attention_mask, labels)``.

        Returns
        -------
        tuple
            ``(loss, logits, hidden_states, attentions)`` as produced by the Hugging Face model.
        """
        input_ids = batch[0]
        input_mask = batch[1]
        labels = batch[2]
        outputs = self.model(input_ids, attention_mask=input_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        return loss, logits, outputs["hidden_states"], outputs["attentions"]

    def training_step(self, batch, batch_nb):
        """Compute and log the training loss for one batch; the loss is returned for backpropagation."""
        train_loss = self(batch)[0]
        self.log_dict({"train_loss": train_loss}, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return train_loss

    def _shared_eval_step(self, batch):
        """Return ``(loss, predictions, labels)`` for a batch, with predictions/labels moved to the CPU."""
        outputs = self(batch)
        loss = outputs[0]
        predictions = torch.argmax(outputs[1], dim=1)
        return loss, predictions.detach().cpu(), batch[2].detach().cpu()

    @staticmethod
    def _epoch_metrics(label_batches, prediction_batches):
        """Compute the classification metrics over all batches collected during one epoch.

        Returns an empty dict when nothing was collected.
        """
        if not label_batches:
            return {}
        y_true = torch.cat(label_batches).numpy()
        y_pred = torch.cat(prediction_batches).numpy()
        return {
            # Logged under the historical key 'accuracy'; the value is the chance-adjusted balanced accuracy.
            'accuracy': balanced_accuracy_score(y_true, y_pred, adjusted=True),
            'macro-F1': f1_score(y_true, y_pred, average='macro'),
            'micro-F1': f1_score(y_true, y_pred, average='micro'),
            'weighted-F1': f1_score(y_true, y_pred, average='weighted'),
        }

    def validation_step(self, batch, batch_nb):
        """Log the validation loss and buffer predictions for the epoch-level metrics."""
        val_loss, predictions, labels = self._shared_eval_step(batch)
        self._val_preds.append(predictions)
        self._val_labels.append(labels)
        self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return val_loss

    def on_validation_epoch_end(self):
        """Log accuracy / F1 computed over the entire validation split, then reset the buffers."""
        metrics = self._epoch_metrics(self._val_labels, self._val_preds)
        self._val_labels.clear()
        self._val_preds.clear()
        if metrics:
            self.log_dict(metrics, prog_bar=True, logger=True)

    def test_step(self, batch, batch_nb):
        """Log the test loss and buffer predictions for the epoch-level metrics."""
        test_loss, predictions, labels = self._shared_eval_step(batch)
        self._test_preds.append(predictions)
        self._test_labels.append(labels)
        self.log("test_loss", test_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)

    def on_test_epoch_end(self):
        """Log accuracy / F1 computed over the entire test split, then reset the buffers."""
        metrics = self._epoch_metrics(self._test_labels, self._test_preds)
        self._test_labels.clear()
        self._test_preds.clear()
        if metrics:
            self.log_dict(metrics, prog_bar=True, logger=True)

    def predict_step(self, batch, batch_nb):
        """Return the predicted class index of every example in the batch as a numpy array.

        The batch must still contain labels at position 2 because ``forward`` reads them.
        """
        logits = self(batch)[1]
        predictions = torch.argmax(logits, dim=1)
        return predictions.detach().cpu().numpy()

    def configure_optimizers(self):
        """Create the AdamW optimizer and a linear warm-up/decay schedule stepped every batch.

        Parameters whose name contains 'bias' or 'LayerNorm.weight' are excluded from weight decay.
        """
        no_decay = ['bias', 'LayerNorm.weight']

        decay_params, no_decay_params = [], []
        for name, param in self.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [{'params': decay_params, 'weight_decay': self.hparams.weight_decay},
                                        {'params': no_decay_params, 'weight_decay': 0.0}]
        # transformers.AdamW is deprecated in favour of the PyTorch implementation.
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.eps)
        # optimizer = DeepSpeedCPUAdam(optimizer_grouped_parameters, adamw_mode=True, lr=self.hparams.learning_rate, betas=(0.9, 0.999), eps=self.hparams.eps)

        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=self.hparams.num_training_steps)
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}

        return [optimizer], [scheduler]

    def freeze(self) -> None:
        """Disable gradients for every parameter whose name does not contain 'classifier'."""
        for name, param in self.model.named_parameters():
            if 'classifier' not in name:
                param.requires_grad = False

        self._frozen = True

    def unfreeze(self) -> None:
        """Re-enable gradients for the parameters frozen by ``freeze`` (no-op when not frozen)."""
        if self._frozen:
            for name, param in self.model.named_parameters():
                if 'classifier' not in name:
                    param.requires_grad = True

        self._frozen = False

    def on_train_epoch_start(self):
        """Lightning hook: keep the encoder frozen for the first ``nr_frozen_epochs`` epochs."""
        if self.current_epoch < self.hparams.nr_frozen_epochs:
            self.freeze()
        else:
            self.unfreeze()

    # The method used to be called `train_epoch_start`, a name Lightning never invokes.
    # The old name is kept as an alias for any code that calls it directly.
    train_epoch_start = on_train_epoch_start

# **Helper functions**

In [None]:
def train_model(tokenizer_name, model_name, num_classes=None, nb_epochs=10):
    """Fine-tune a sequence classifier on the vendor data and evaluate it on the test split.

    Parameters
    ----------
    tokenizer_name : str
        Hugging Face tokenizer name or path used by the data module.
    model_name : str
        Hugging Face model name or path used by the classifier.
    num_classes : int or None, optional
        Number of vendor classes. When ``None`` the value is taken from the data module's
        ``num_classes`` attribute if it provides one, otherwise 153 (the class count of the
        default 5K-sample subset) is used.
    nb_epochs : int, optional
        Maximum number of training epochs.

    Returns
    -------
    tuple
        ``(trainer, model, dm)`` — the Lightning trainer, the trained model and the data module.
    """
    # Loading the datamodule
    dm = contextualizedClassifierDataModule(tokenizer_name_or_path=tokenizer_name)
    dm.setup()

    if num_classes is None:
        num_classes = getattr(dm, "num_classes", None) or 153

    steps_per_epoch = len(dm.train_dataloader())
    num_training_steps = steps_per_epoch * nb_epochs
    # Warm up over the first 10% of one epoch's optimizer steps.
    warmup_steps = int(steps_per_epoch * 10/100)

    # Loading the model
    model = ClassifierModel(learning_rate=0.0001, adam_epsilon=float(1e-6), weight_decay=0.01, model_name_or_path=model_name, num_classes=num_classes,
                        num_training_steps=num_training_steps, warmup_steps=warmup_steps)

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.01, patience=5, verbose=False, mode="min")
    # wandb_logger = WandbLogger(save_dir="logs", name=model_name, project="AA-Tutorials")

    # Fall back to CPU (and full precision) when no GPU is present instead of failing at start-up.
    use_gpu = torch.cuda.is_available()

    trainer = L.Trainer(max_epochs=nb_epochs,
                  accelerator="gpu" if use_gpu else "cpu",
                  devices=1,
                  fast_dev_run=False,
                  accumulate_grad_batches=1,  # Run the backward step after n batches; raise to emulate a larger batch size
                  benchmark=False,  # cuDNN benchmark mode is left off so it does not undermine `deterministic=True`
                  deterministic=True,  # Ensures reproducibility
                  limit_train_batches=1.0,  # Train on 100% of the batches in every epoch
                  check_val_every_n_epoch=1,  # Validate after every epoch so early stopping can actually trigger
                  callbacks=[early_stop_callback],  # Early stopping on the validation loss
                  # logger = wandb_logger,
                  precision='16-mixed' if use_gpu else '32-true')  # Mixed precision on GPU only

    # Training model
    trainer.fit(model, dm)
    # Evaluating model
    trainer.test(model=model, dataloaders=dm.test_dataloader())

    return trainer, model, dm

[DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base)

In [None]:
# Fine-tune DistilRoBERTa (same checkpoint for tokenizer and model); the returned trainer is not needed afterwards.
_, model, dm = train_model("distilbert/distilroberta-base", "distilbert/distilroberta-base")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                             | Params
-----------------------------------------------------------
0 | model | RobertaForSequenceClassification | 82.2 M
-----------------------------------------------------------
82.2 M    Trainable params
0         Non-trainable params

Epoch 9: 100%|██████████| 119/119 [00:10<00:00, 11.13it/s, v_num=5, train_loss=0.049] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/7 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s][A
Validation DataLoader 0:  14%|█▍        | 1/7 [00:00<00:00, 24.90it/s][A
Validation DataLoader 0:  29%|██▊       | 2/7 [00:00<00:00, 25.50it/s][A
Validation DataLoader 0:  43%|████▎     | 3/7 [00:00<00:00, 25.69it/s][A
Validation DataLoader 0:  57%|█████▋    | 4/7 [00:00<00:00, 25.77it/s][A
Validation DataLoader 0:  71%|███████▏  | 5/7 [00:00<00:00, 25.86it/s][A
Validation DataLoader 0:  86%|████████▌ | 6/7 [00:00<00:00, 25.87it/s][A
Validation DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 28.05it/s][A
Epoch 9: 100%|██████████| 119/119 [00:12<00:00,  9.71it/s, v_num=5, train_loss=0.0445, val_loss=0.765, accuracy=0.818, macro-F1=0.770, micro-F1=0.830, weighted-F1=0.819]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 119/119 [00:12<00:00,  9.69it/s, v_num=5, train_loss=0.0445, val_loss=0.765, accuracy=0.818, macro-F1=0.770, micro-F1=0.830, weighted-F1=0.819]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 19.74it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        accuracy            0.7884917855262756
        macro-F1            0.7175610065460205
        micro-F1            0.8190000057220459
        test_loss            0.932634711265564
       weighted-F1          0.8095340728759766
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


# **Extracting sentence representations (through mean pooling)**

In [None]:
def _masked_mean_pool(last_hidden_state, mask):
    """Average the token vectors over the sequence axis, ignoring positions where ``mask`` is 0."""
    summed = torch.sum(last_hidden_state * mask, 1)
    # Clamp so that a sequence without any real token does not divide by zero.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts


def _masked_max_pool(last_hidden_state, mask):
    """Take the element-wise maximum over the sequence axis, ignoring positions where ``mask`` is 0."""
    # Out-of-place fill: the model's hidden states are left untouched.
    return torch.max(last_hidden_state.masked_fill(mask == 0, float("-inf")), 1)[0]


def extract_representations(model, test_dataloader, pooling_type="mean"):
    """Compute one pooled embedding per example from the model's last hidden layer.

    Parameters
    ----------
    model : callable module
        Called as ``model(batch)``; must return a 4-tuple whose third element is the sequence of
        per-layer hidden states (as ``ClassifierModel.forward`` does).
    test_dataloader : iterable of batches
        Each batch is ``(input_ids, attention_mask, labels)``.
    pooling_type : str, optional
        ``"mean"`` for masked mean pooling, ``"max"`` for masked max pooling; any other value
        yields the concatenation of mean- and max-pooled vectors.

    Returns
    -------
    pooled_outputs : torch.Tensor
        Pooled embeddings of all examples, on the CPU, in dataloader order.
    labels : torch.Tensor
        Labels in the same order as ``pooled_outputs``, on the CPU.
    """
    pooled_output_list, labels_list = [], []

    # Embeddings must be extracted in eval mode; otherwise dropout makes them noisy.
    # The previous mode is restored afterwards.
    was_training = getattr(model, "training", False)
    model.eval()

    # Run on whatever device the model lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    try:
        with torch.no_grad():
            for batch in tqdm(test_dataloader, total=len(test_dataloader)):
                if device is not None:
                    batch = [tensor.to(device) for tensor in batch]
                attention_mask = batch[1]
                labels = batch[2]

                _, _, hidden_states, _ = model(batch)
                # Only the final layer is needed; indexing avoids stacking every layer.
                last_hidden_state = hidden_states[-1]
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()

                if pooling_type == "mean":
                    pooled_output = _masked_mean_pool(last_hidden_state, input_mask_expanded)
                elif pooling_type == "max":
                    pooled_output = _masked_max_pool(last_hidden_state, input_mask_expanded)
                else:
                    # Mean-max pooling
                    pooled_output = torch.cat((_masked_mean_pool(last_hidden_state, input_mask_expanded),
                                               _masked_max_pool(last_hidden_state, input_mask_expanded)), 1)

                # Keep results on the CPU so they can be converted to numpy later.
                pooled_output_list.append(pooled_output.cpu())
                labels_list.append(labels.cpu())
    finally:
        if was_training:
            model.train()

    # Concatenate the pooled outputs and labels into tensors
    pooled_outputs = torch.cat(pooled_output_list)
    labels = torch.cat(labels_list)

    return pooled_outputs, labels

In [None]:
# Mean-pooled embeddings and labels of the training split; these are later passed as the train (indexed) side of the retrieval evaluation.
train_pooled_outputs, train_labels = extract_representations(model, dm.train_dataloader())

100%|██████████| 119/119 [04:28<00:00,  2.26s/it]


In [None]:
# Mean-pooled embeddings and labels of the test split; these are later passed as the test (query) side of the retrieval evaluation.
test_pooled_outputs, test_labels = extract_representations(model, dm.test_dataloader())

100%|██████████| 32/32 [01:10<00:00,  2.22s/it]


# **Helper functions to generate recall@k, precision@k, and mean average precision@k results**

In [None]:
def recall_at_k(actual, predicted, k=10):
    """
    Recall at k, computed separately for every sample.

    For each sample the score is the share of its distinct relevant items that show up among the
    first k predictions. Duplicates are ignored on both sides because the comparison is done on sets.

    Parameters
    ----------
    actual : list of array-like
        One collection of relevant items per sample (order irrelevant).

    predicted : list of array-like
        One ranked collection of predictions per sample, most confident first.

    k : int, optional
        How many of the leading predictions are taken into account. Defaults to 10.

    Returns
    -------
    recall_scores : list of float
        One value in [0, 1] per sample. A sample without any relevant item scores 0.0.
    """
    def _single_recall(relevant, ranked):
        relevant_items = set(relevant)
        if not relevant_items:
            # Recall is undefined without relevant items; report 0 by convention.
            return 0.0
        found = relevant_items & set(ranked[:k])
        return len(found) / float(len(relevant_items))

    return [_single_recall(relevant, ranked) for relevant, ranked in zip(actual, predicted)]


def precision_at_k(y_true, y_pred, k=10):
    """
    Computes Precision at k for a set of samples.

    Precision at k is the fraction of the top-k predictions that are relevant, i.e. the number of
    positions among the first k predictions whose item appears in the sample's relevant items,
    divided by k. Every position is counted, so a relevant value that is retrieved several times
    (for example neighbours sharing the same class label) contributes once per occurrence.

    Parameters
    ----------
    y_true: list of array-like
        A list where each element holds the relevant items of one sample. Order does not matter.

    y_pred: list of array-like
        A list where each element holds the ranked predictions of one sample, most confident first.
        Only the first k entries are evaluated.

    k: int, optional
        The number of top predictions to evaluate. Defaults to 10. The denominator is always k,
        even when fewer than k predictions are available.

    Returns
    -------
    precision_list: list
        One float in [0, 1] per sample: 0 means none of the top-k predictions is relevant,
        1 means all k of them are.
    """
    precision_list = []
    for relevant, ranked in zip(y_true, y_pred):
        top_k = np.asarray(ranked)[:k]
        # np.isin flags every position of top_k individually; a set intersection would collapse
        # repeated predictions into a single hit and underestimate precision.
        num_relevant_in_top_k = int(np.isin(top_k, np.asarray(relevant)).sum())
        precision_list.append(num_relevant_in_top_k / k)
    return precision_list


def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items. A predicted element only counts as a hit the first time it occurs
    in the ranking; later repetitions of the same element are skipped.
    Parameters
    ----------
    actual : list or array
             The elements that are to be predicted (order doesn't matter)
    predicted : list or array
                The predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists;
            0.0 when `actual` is None or empty
    """
    # Checked up front with len() so that numpy arrays (whose truth value is ambiguous) are
    # accepted and no ranking work is done when there is nothing to find.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count a hit only for relevant elements that were not already seen earlier in the ranking.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k.
    Pairs every entry of `actual` with the entry of `predicted` at the same
    position, scores each pair with `apk` and averages the scores.
    Parameters
    ----------
    actual : list
             One list of relevant elements per sample
             (order inside each list is irrelevant)
    predicted : list
                One ranked list of predictions per sample
                (order inside each list matters)
    k : int, optional
        The maximum number of predicted elements considered per sample
    Returns
    -------
    score : double
            The mean of the per-sample average precision at k
    """
    per_sample_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_sample_scores.append(apk(relevant, ranked, k))
    return np.mean(per_sample_scores)

# **Retrieval through [FAISS](https://www.pinecone.io/learn/series/faiss/faiss-tutorial/) similarity search**

Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning.

Faiss is written in C++ with complete wrappers for Python. Some of the most useful algorithms are implemented on the GPU. It is developed primarily at FAIR, the fundamental AI research team of Meta.

In [None]:
def generate_retrieval_results(train_embeddings, test_embeddings, train_labels, test_labels):
    """
    Evaluates nearest-neighbour retrieval of test items against the training set.

    Every test embedding is used as a query against a FAISS L2 index built from the training
    embeddings. The labels of the retrieved training items form the ranked prediction list and the
    query's own label is its single relevant item. Because there is exactly one relevant label per
    query, "Recall" is 1 when that label appears in the top k and 0 otherwise, and "MAP" equals the
    reciprocal rank of the first retrieved item carrying that label (0 if none is within the top k).

    Parameters
    ----------
    train_embeddings : torch.Tensor
        Embeddings of the training set items, one row per item.

    test_embeddings : torch.Tensor
        Embeddings of the test set items, one row per item.

    train_labels : torch.Tensor
        Labels corresponding to the training set embeddings.

    test_labels : torch.Tensor
        Labels corresponding to the test set embeddings.

    Returns
    -------
    results_df : pandas.DataFrame
        Indexed by K, with the mean and standard deviation of precision, recall and MAP over all
        test queries. K takes the values 1, 3, 5, 10, 20, 25, 50 and 100, limited to those that do
        not exceed the number of training items.

    Raises
    ------
    ValueError
        If there are no training embeddings to search.
    """
    # FAISS expects C-contiguous float32 arrays.
    train_vectors = np.ascontiguousarray(train_embeddings.detach().cpu().numpy(), dtype=np.float32)
    test_vectors = np.ascontiguousarray(test_embeddings.detach().cpu().numpy(), dtype=np.float32)
    train_label_array = train_labels.detach().cpu().numpy()
    test_label_list = test_labels.detach().cpu().numpy().tolist()

    if len(train_vectors) == 0:
        raise ValueError("train_embeddings is empty; there is nothing to retrieve from")

    # Exact (brute-force) L2 index over the training embeddings.
    faiss_index = faiss.IndexFlatL2(train_vectors.shape[1])
    faiss_index.add(train_vectors)

    # Retrieve once at the largest cutoff; smaller cutoffs reuse a prefix of the same ranking.
    # Clamped to the index size so every returned neighbour id is a real training item.
    max_neighbors = min(100, faiss_index.ntotal)
    _, neighbor_ids = faiss_index.search(test_vectors, max_neighbors)

    # These lists do not depend on the cutoff, so they are built a single time.
    predicted_label_list = [train_label_array[ids].tolist() for ids in neighbor_ids]
    true_label_list = [[label] for label in test_label_list]

    results = []
    for cutoff in [1, 3, 5, 10, 20, 25, 50, 100]:
        if cutoff > max_neighbors:
            continue

        precisions = precision_at_k(true_label_list, predicted_label_list, k=cutoff)
        recalls = recall_at_k(true_label_list, predicted_label_list, k=cutoff)
        average_precisions = [apk(relevant, ranked, k=cutoff)
                              for relevant, ranked in zip(true_label_list, predicted_label_list)]

        results.append({
            "K": cutoff,
            "Precision Mean": np.mean(precisions),
            "Precision Std": np.std(precisions),
            "Recall Mean": np.mean(recalls),
            "Recall Std": np.std(recalls),
            "MAP Mean": np.mean(average_precisions),
            "MAP Std": np.std(average_precisions)
        })

    # Convert results list to a DataFrame and return.
    results_df = pd.DataFrame(results)
    return results_df.set_index('K')


In [None]:
# Retrieval metrics with the training embeddings as the indexed set and the test embeddings as queries;
# being the last expression, the resulting DataFrame is displayed as the cell output.
generate_retrieval_results(train_pooled_outputs, test_pooled_outputs, train_labels, test_labels)

Unnamed: 0_level_0,Precision Mean,Precision Std,Recall Mean,Recall Std,MAP Mean,MAP Std
K,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.812,0.390712,0.812,0.390712,0.812,0.390712
3,0.280667,0.12158,0.842,0.364741,0.8255,0.369865
5,0.17,0.071414,0.85,0.357071,0.82735,0.366298
10,0.0861,0.034595,0.861,0.345947,0.828852,0.36318
20,0.0442,0.016011,0.884,0.320225,0.830388,0.359803
25,0.03552,0.012615,0.888,0.315366,0.830558,0.359419
50,0.01832,0.005548,0.916,0.277388,0.83136,0.357594
100,0.00937,0.00243,0.937,0.242963,0.831666,0.356889
