# Sentence Transformers and Multi-Task Learning

## Imports

In [1]:
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

## Sentence Transformer for Sentence Similarity

In [2]:
# This could be pretty much any language model from HuggingFace, but this is nice and small for demo purposes
# Realistically, sentence embedding similarity without task-specific training is a tough job and you'd probably
# want a decent-sized model.
model_name = 'distilbert-base-uncased'

# Load the tokenizer that matches the chosen checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare expression so the notebook displays the tokenizer's repr
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [3]:
# Load the pretrained model for the same checkpoint name used for the tokenizer
model = AutoModel.from_pretrained(model_name)
# Bare expression so the notebook displays the module structure
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [4]:
class SentenceTransformer:
    """
    Simple wrapper that runs a model and tokenizer on text inputs to calculate and compare embeddings.

    The tokenizer is called as ``tokenizer(text, return_tensors='pt')`` and its result is unpacked into
    the model call as keyword arguments. The model is called with ``output_hidden_states=True`` and its
    output must expose a ``hidden_states`` sequence indexable as ``hidden_states[layer][batch]``.
    """
    def __init__(self, model, tokenizer):
        """
        Store the model and tokenizer that every later call on this instance will use.
        """
        self.model = model
        self.tokenizer = tokenizer

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Given a text input, produce its feature embedding (vector). This implementation uses the last 4 hidden layers of the model.
        After performing average pooling over the tokens of each of those hidden states, the results are concatenated into a single
        vector, ordered from the last layer backwards.
        """
        # Use the instance's own tokenizer and model (not same-named globals), so the wrapper
        # honours whatever was passed to the constructor
        encoded_input = self.tokenizer(text, return_tensors='pt')
        # This is inference only, so skip building the autograd graph
        with torch.no_grad():
            outputs = self.model(**encoded_input, output_hidden_states=True)
        hidden_states = outputs.hidden_states
        USE_LAST_N_LAYERS = 4  # Some research suggests that there is complementary embedding information in different hidden layers, so we can use a few
        # There are many ways to go from n output tokens down to a fixed-length embedding
        # For certain pre-trained models there is even a recommended output token to use exclusively
        # But average pooling over all output tokens should be a fairly safe model-agnostic approach
        pooled_layer_embeddings = [
            # [0] selects the single input in the batch; axis 0 of what remains is the token axis
            np.mean(hidden_states[-i][0].detach().numpy(), axis=0)
            for i in range(1, USE_LAST_N_LAYERS + 1)
        ]

        return np.concatenate(pooled_layer_embeddings)

    def get_similarity(self, text1: str, text2: str) -> float:
        """
        Given two text inputs, produce their similarity score (based on cosine similarity of their embeddings)
        """
        embedding1 = self.get_embedding(text1)
        embedding2 = self.get_embedding(text2)
        return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

In [5]:
# Simple usage example
# Wrap the model and tokenizer loaded above, embed one sentence, and display the embedding's shape
sentence_transformer = SentenceTransformer(model=model, tokenizer=tokenizer)
embedding = sentence_transformer.get_embedding("Well, hello there")
embedding.shape

(3072,)

In [6]:
# These sentences should hopefully have a fairly high similarity
# (the score is the cosine similarity of the two sentence embeddings)
sentence_transformer.get_similarity("Well, hello there", "Hi!")

0.87508154

In [7]:
# These sentences should hopefully have lower similarity than the greeting pair in the previous cell.
# But, we aren't using a very powerful model so the difference is smaller than ideal
sentence_transformer.get_similarity("Well, hello there", "I love pancakes")

0.81927824

## Multi-Task Learning

In [8]:
from typing import Optional, Tuple

class MultiTaskTransformer(nn.Module):
    """
    A shared transformer backbone with two task-specific linear heads:

    - a sequence classification head applied to the first token's output, and
    - a named-entity-recognition (NER) head applied to every token's output, using IOB labels.
    """
    def __init__(self, backbone: nn.Module, num_sequence_classes: int, num_ner_classes: int):
        """
        backbone: module whose ``config`` exposes its hidden size as ``hidden_size`` or ``dim``, and whose
            output's first element is the last hidden state
        num_sequence_classes: number of output classes for the sequence classification head
        num_ner_classes: number of entity classes; the NER head emits ``2 * num_ner_classes + 1`` labels

        Raises AttributeError if the backbone config exposes neither ``hidden_size`` nor ``dim``.
        """
        super().__init__()
        self.backbone = backbone
        # Note: the attribute holding the final output size is unfortunately not standard across all
        # Hugging Face models. Try the common name first, then fall back to the name Distilbert uses.
        final_hidden_size = getattr(backbone.config, "hidden_size", None)
        if final_hidden_size is None:
            final_hidden_size = backbone.config.dim
        self.classification_head = nn.Linear(final_hidden_size, num_sequence_classes)
        num_ner_labels = 2 * num_ner_classes + 1 # With IOB labeling, we need "I"(nside) and "B"(egin) labels for each class, plus an "O"(utside) label
        self.ner_head = nn.Linear(final_hidden_size, num_ner_labels)

    def forward(self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run the backbone once and apply both heads to its last hidden state.

        Returns ``(sequence_classification_logits, ner_logits)``: one row of class logits per input
        sequence, and one row of IOB label logits per token of each sequence.
        """
        # I'm leaving out a lot of possible configurability that you would get in, like, a Hugging Face implementation
        # But input token ids and attention mask should be enough to do basic forward/backward passes with batching for training
        # I've also left out calculation of loss. You could put that inside or outside the model,
        # but either way we don't need it for forward() only

        # Get the last hidden state as input to the task layers
        backbone_output = self.backbone(input_ids=input_ids, attention_mask=attention_mask)[0]

        # Get the first token output to use for sequence classification
        first_token_output = backbone_output[:, 0]
        sequence_classification_logits = self.classification_head(first_token_output)

        # Get ner token predictions, per token
        # Note that to actually label a sentence with this model you'd need to convert IOB labels to annotations but this isn't hard
        ner_logits = self.ner_head(backbone_output)

        return sequence_classification_logits, ner_logits

In [9]:
# NOTE: the backbone is the same `model` object wrapped by sentence_transformer above,
# so any training of this module would also change those sentence embeddings
multi_task_transformer = MultiTaskTransformer(backbone=model, num_sequence_classes=3, num_ner_classes=5)
encoded_input = tokenizer("hello there", return_tensors='pt')
sequence_classification_logits, ner_logits = multi_task_transformer(**encoded_input)
# This should be [1, num_sequence_classes], [1, token_count, num_ner_labels]
# (num_ner_labels is 2 * num_ner_classes + 1 = 11 here; token_count presumably includes the special tokens the tokenizer adds)
sequence_classification_logits.shape, ner_logits.shape

(torch.Size([1, 3]), torch.Size([1, 4, 11]))

## Discussion Questions

*1. Consider the scenario of training the multi-task sentence transformer that you
implemented in Task 2. Specifically, discuss how you would decide which portions of the
network to train and which parts to keep frozen. For example,*
- *When would it make sense to freeze the transformer backbone and only train the
task specific layers?*
- *When would it make sense to freeze one head while training the other?*

My default plan, if I only got to try one thing, would be to leave all layers unfrozen. The main advantage of freezing any layers is to prevent overfitting and/or catastrophic forgetting. In my personal experience (especially with these newer, larger LLMs that are highly overparameterized and are fine-tuned with very low learning rates) you can prevent those problems by training for a small number of epochs, and perhaps adding some regularization like dropout if you really happen to need it. Freezing the layers can occasionally make the training pretty constrained and unable to fit the data very easily, depending on how much of it you have. Another possibility short of a full freeze is to do LoRA training, which will speed up big model training and constrain the magnitude of a model update without restricting it as much as fully freezing it.

If you had precious little data to train on, I could maybe see freezing the backbone as a good option, and I'm certainly not opposed to trying it.

You could freeze one head while training the other if you wanted. It would definitely make sense if you have disjoint training data in which case you can't update both objectives simultaneously even if you wanted to. Then you'd have to figure out a good schedule for training that interleaves samples of one task with samples of another.

*2. Discuss how you would decide when to implement a multi-task model like the one in this
assignment and when it would make more sense to use two completely separate models
for each task.*

Historically, I would have said model capacity is the main determining factor whether to use two models. That is, two highly disparate tasks might not be easily contained in one model (but two related tasks might benefit from the shared learnings!). Now, the models are so big and the baseline language understanding that you need for most tasks is so similar, that deciding whether to use two models or one tends to be dictated, in my opinion, by more classical engineering considerations.

It's easier to maintain two models (you can update one task without having to regression-test the other, which is a painful process in ML). But two models are more expensive to run because you are deploying them separately: you'll need about double the GPUs, since it is about double the memory and double the compute. Latency-wise, though, it will be about a wash.

*3. When training the multi-task model, assume that Task A has abundant data, while Task
B has limited data. Explain how you would handle this imbalance.*

Depending on how much the tasks have in common, Task B may benefit from the shared knowledge implicit in Task A training, which is an argument to try training them as a single multi-task model. Past that you will need to separately optimize the amount of training for each task, as it probably won't make sense to train with the same schedule (same epochs and learning rate) if the data doesn't line up. As a starting point, I'd probably optimize the number of training epochs for each task separately and then interleave the training so that the distributions are balanced across the whole training time. But there are other interesting things to try. Maybe you train Task A first and see if that fine-tuning benefits Task B being trained subsequently.