# Set and verify the number of labels 

Make sure the number of classes predicted by the model matches the number of labels specified to the model.


In [1]:
import os
import sys
import numpy as np
import transformers
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertModel,
    TFDistilBertForSequenceClassification,
)

# --------------------------------------------------------------------------------
# Logging: only surface ERROR-level messages from the Transformers library.
# Reference: https://huggingface.co/transformers/main_classes/logging.html
# NOTE(review): the environment variable is assigned after `import transformers`;
# presumably the explicit call below is what changes this kernel's verbosity - confirm.
# --------------------------------------------------------------------------------
os.environ.update({'TRANSFORMERS_VERBOSITY': "error"})
transformers.logging.set_verbosity_error()

2021-07-08 09:16:58.227350: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-08 09:16:58.227405: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Checkpoint name shared by the tokenizer and every model loaded in this notebook
MODEL_NAME: str = "distilbert-base-uncased"
# Initial label count; reassigned before the classification model is built
NUM_LABELS: int = 2

In [4]:
# Load the pretrained tokenizer for the MODEL_NAME checkpoint;
# the tokenize() and decode() helpers below use this module-level instance.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)


def tokenize(sentences, max_length=256, padding='max_length'):
    """Encode text with the module-level Hugging Face ``tokenizer``.

    Args:
        sentences: A string or a list of strings to tokenize
        max_length: Value forwarded as the tokenizer's ``max_length``
            (truncation is always enabled)
        padding: Padding method ['do_not_pad'|'longest'|'max_length']
    Returns:
        Whatever the tokenizer call returns, requested as TensorFlow
        tensors via ``return_tensors="tf"``
    """
    encode_options = {
        "truncation": True,
        "padding": padding,
        "max_length": max_length,
        "return_tensors": "tf",
    }
    return tokenizer(sentences, **encode_options)

def decode(tokens):
    """Turn token ids back into text with the module-level ``tokenizer``.

    Args:
        tokens: Token ids accepted by ``tokenizer.decode`` (presumably the ids
            of a single sequence - confirm against callers)
    Returns:
        The value returned by ``tokenizer.decode``
    """
    text = tokenizer.decode(tokens)
    return text

In [5]:
# Two example sentences, padded only up to the longer of the two
sample_sentences = [
    "i say hello",
    "you say good bye",
]
sample_tokens = tokenize(sample_sentences, padding='longest')

2021-07-08 09:18:43.897050: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-07-08 09:18:43.898110: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-07-08 09:18:43.898613: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ubuntu): /proc/driver/nvidia/version does not exist
2021-07-08 09:18:43.910730: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Base model

[TFDistilBertModel][2] is the bare base model with the name ```distilbert```.

```
Model: "tf_distil_bert_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
distilbert (TFDistilBertMain multiple                  66362880  
=================================================================
Total params: 66,362,880
Trainable params: 66,362,880
Non-trainable params: 0
```

[TFDistilBertModel][2] generates an instance of TFBaseModelOutput whose ```last_hidden_state``` attribute is the output of the model's last layer.
```
TFBaseModelOutput([(
    'last_hidden_state',
    <tf.Tensor: shape=(batch_size, sequence_length, 768), dtype=float32, numpy=array([[[...]]], dtype=float32)>
)])
```


[2]: https://huggingface.co/transformers/main_classes/output.html#tfbasemodeloutput

Note that the **num_labels** parameter has no effect because the base model does not perform any classification. Its outputs are simply hidden states (embeddings), not classification logits, to be further utilized for domain tasks. In other words, the base model is a flexible, half-finished foundation for general-purpose use.

In [6]:
# No num_labels argument here: the bare base model has no classification head,
# so passing num_labels=NUM_LABELS would have no effect.
base = TFDistilBertModel.from_pretrained(MODEL_NAME)

2021-07-08 09:18:53.357155: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


In [13]:
# Forward pass of the tokenized sample batch through the base model
base_output = base(sample_tokens)
# Printed in full so the last_hidden_state tensor and its shape are visible
print(base_output)

TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(2, 6, 768), dtype=float32, numpy=
array([[[-0.08046143, -0.05070581,  0.16511475, ...,  0.02624902,
          0.32072207,  0.29028514],
        [ 0.38065866,  0.14633194,  0.3267995 , ..., -0.03747689,
          0.6750763 ,  0.26754186],
        [-0.1375587 ,  0.23116127,  0.9399939 , ..., -0.4190007 ,
          0.03066482,  0.14690849],
        [-0.46870375, -0.15961105,  0.7064866 , ...,  0.09092102,
          0.00915718, -0.02297045],
        [ 0.9486712 ,  0.28354222, -0.34940282, ...,  0.22489336,
         -0.41922688, -0.44231772],
        [ 0.11706736,  0.02730646,  0.36414525, ...,  0.22984353,
          0.05139103,  0.08807824]],

       [[-0.13671048, -0.07976633,  0.13336904, ..., -0.09188621,
          0.18120913,  0.14311402],
        [-0.18604954,  0.00228937,  0.87482053, ...,  0.25758246,
          0.51748097,  0.07797477],
        [ 0.04087289,  0.11673979,  1.1603653 , ..., -0.4042866 ,
          0.04986721,  0.20


### [CLS] ###
For classification tasks, extract the ```[CLS]``` embedding from ```last_hidden_state``` where each sentence is encoded into a sequence of embedding vectors of shape ```(max_sequence_length, 768)``` with the structure of ```[CLS]...[SEP]```. 

In [16]:
# Position 0 of every sequence is the [CLS] token; keeping only that position
# turns (batch_size, sequence_length, 768) into (batch_size, 768).
last_hidden_state = base_output['last_hidden_state']
cls = last_hidden_state[:, 0, :]
print(cls)

tf.Tensor(
[[-0.08046143 -0.05070581  0.16511475 ...  0.02624902  0.32072207
   0.29028514]
 [-0.13671048 -0.07976633  0.13336904 ... -0.09188621  0.18120913
   0.14311402]], shape=(2, 768), dtype=float32)


---
# Model for Training

Hugging Face provides [TFDistilBertForSequenceClassification][1], which has classification head layers added on top of the base model.
```
Model: "tf_distil_bert_for_sequence_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  1538      
_________________________________________________________________
dropout_59 (Dropout)         multiple                  0         
=================================================================
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
```

[1]: https://huggingface.co/transformers/model_doc/distilbert.html#tfdistilbertforsequenceclassification
[3]: https://huggingface.co/transformers/model_doc/distilbert.html#tfdistilbertmodel

## Output classes

Verify that the number of classes the model outputs matches the NUM_LABELS specified at model creation.

In [10]:
# Number of classes for the classification head (replaces the value set earlier)
NUM_LABELS = 3

# num_labels tells the model how many classes it needs to classify
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
output = model(sample_tokens)
print(output)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[-0.02264021,  0.0062665 ,  0.00076615],
       [-0.0365167 ,  0.02276366,  0.01284292]], dtype=float32)>, hidden_states=None, attentions=None)


In [9]:
# Look the logits up once and reuse them for both the report and the check
logits = output['logits']
print(f"output['logits'] {logits}")
print(f"output['logits'].shape {logits.shape}")

# The last axis of the logits holds one score per class, so it must equal NUM_LABELS.
# The message makes a mismatch diagnosable from the failure alone.
assert logits.shape[-1] == NUM_LABELS, (
    f"model produced {logits.shape[-1]} classes but NUM_LABELS is {NUM_LABELS}"
)

output['logits'] [[-0.05688253 -0.05908322 -0.04925276]
 [-0.05387952 -0.07298677 -0.08901386]]
output['logits'].shape (2, 3)
