#### Load CamemBERT and its sub-word tokenizer

In [12]:
from transformers import CamembertModel, CamembertTokenizer

In [13]:

# Hugging Face Hub id of the checkpoint to load. Any other CamemBERT checkpoint
# can be substituted here, e.g. "camembert-base" or "camembert/camembert-large".
MODEL_NAME = "camembert/camembert-base-wikipedia-4gb"

# Load the sub-word tokenizer and the encoder from the same checkpoint so that
# the vocabulary ids produced by the tokenizer match the model's embedding table.
tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
camembert = CamembertModel.from_pretrained(MODEL_NAME)

camembert.eval()  # disable dropout (or leave in train mode to finetune)

Some weights of the model checkpoint at camembert/camembert-base-wikipedia-4gb were not used when initializing CamembertModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CamembertModel(
  (embeddings): CamembertEmbeddings(
    (word_embeddings): Embedding(32005, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): CamembertEncoder(
    (layer): ModuleList(
      (0-11): 12 x CamembertLayer(
        (attention): CamembertAttention(
          (self): CamembertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): CamembertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
     

#### Filling masks using the `fill-mask` pipeline

In [14]:
from transformers import pipeline

# The pipeline loads its own model and tokenizer; both come from the same checkpoint.
fill_mask_checkpoint = "camembert/camembert-base-wikipedia-4gb"
camembert_fill_mask = pipeline(
    "fill-mask",
    model=fill_mask_checkpoint,
    tokenizer=fill_mask_checkpoint,
)

# Ask for candidates for the <mask> slot and display them as the cell output.
results = camembert_fill_mask("Le camembert est un fromage de <mask>!")
results

[{'score': 0.4937813878059387,
  'token': 19370,
  'token_str': 'chèvre',
  'sequence': 'Le camembert est un fromage de chèvre!'},
 {'score': 0.06255923956632614,
  'token': 30616,
  'token_str': 'brebis',
  'sequence': 'Le camembert est un fromage de brebis!'},
 {'score': 0.043401967734098434,
  'token': 2364,
  'token_str': 'montagne',
  'sequence': 'Le camembert est un fromage de montagne!'},
 {'score': 0.02823261171579361,
  'token': 3236,
  'token_str': 'Noël',
  'sequence': 'Le camembert est un fromage de Noël!'},
 {'score': 0.02135733887553215,
  'token': 12329,
  'token_str': 'vache',
  'sequence': 'Le camembert est un fromage de vache!'}]

#### Extract contextual embedding features from CamemBERT output

In [47]:
import torch

# Tokenize in sub-words with SentencePiece
tokenized_sentence = tokenizer.tokenize("J'aime le camembert !")
# e.g. ['▁J', "'", 'aime', '▁le', '▁ca', 'member', 't', '▁!']

# Map the sub-word tokens to integer vocabulary ids (not one-hot vectors)
# and add the special start and end tokens
token_ids = tokenizer.encode(tokenized_sentence)
# e.g. [5, 221, 10, 10600, 14, 8952, 10540, 75, 1114, 6]
# NB: Can be done in one step: tokenizer.encode("J'aime le camembert !")

# Feed the ids to CamemBERT as a torch tensor with a leading batch dimension of 1
encoded_sentence = torch.tensor(token_ids).unsqueeze(0)

# Feature extraction only: no autograd graph is needed for the forward pass
with torch.no_grad():
    embeddings = camembert(encoded_sentence)[0]

In [54]:
# Display the embeddings as a tensor cut from the autograd graph (no grad_fn in the repr)
embeddings.detach()

tensor([[[-0.0928,  0.0506, -0.0094,  ..., -0.2388,  0.1177, -0.1302],
         [ 0.0662,  0.1030, -0.2355,  ..., -0.4224, -0.0574, -0.2802],
         [-0.0729,  0.0547,  0.0192,  ..., -0.1743,  0.0998, -0.2677],
         ...,
         [-0.0033, -0.1228, -0.2961,  ...,  0.3828,  0.0659,  0.2365],
         [ 0.1366,  0.2483,  0.1271,  ...,  0.1985, -0.1444, -0.4721],
         [ 0.0778,  0.1016,  0.0436,  ...,  0.0237, -0.1466, -0.1009]]])

#### Extract contextual embedding features from all CamemBERT layers

In [55]:
from transformers import CamembertConfig

In [56]:
# Per-layer hidden states are only returned when the config requests them,
# so the model has to be reloaded with an updated config.
checkpoint_id = "camembert/camembert-base-wikipedia-4gb"
config = CamembertConfig.from_pretrained(
    checkpoint_id,
    output_hidden_states=True,
)
camembert = CamembertModel.from_pretrained(checkpoint_id, config=config)

Some weights of the model checkpoint at camembert/camembert-base-wikipedia-4gb were not used when initializing CamembertModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [67]:
# Run the model once and read both outputs from the same forward pass;
# no autograd graph is needed since the values are only inspected.
with torch.no_grad():
    outputs = camembert(encoded_sentence)

embeddings = outputs[0]            # last-layer hidden states
all_layer_embeddings = outputs[2]  # per-layer hidden states (requires output_hidden_states=True)
#  all_layer_embeddings list of len(all_layer_embeddings) == 13 (input embedding layer + 12 self attention layers)
all_layer_embeddings[5]
# layer 5 contextual embedding : size torch.Size([1, 10, 768])
#tensor([[[-0.0059, -0.0227,  0.0065,  ..., -0.0770,  0.0369,  0.0095],
#         [ 0.2838, -0.1531, -0.3642,  ..., -0.0027, -0.8502, -0.7914],
#         [-0.0073, -0.0338, -0.0011,  ...,  0.0533, -0.0250, -0.0061],
#         ...,


tensor([[[-0.0059, -0.0227,  0.0065,  ..., -0.0770,  0.0369,  0.0095],
         [ 0.2838, -0.1531, -0.3642,  ..., -0.0027, -0.8502, -0.7914],
         [-0.0073, -0.0338, -0.0011,  ...,  0.0533, -0.0250, -0.0061],
         ...,
         [-0.1932,  0.0468,  0.2520,  ...,  0.8156, -0.4552,  0.2495],
         [ 0.2540,  0.9947,  0.4313,  ..., -0.3552, -0.0192, -1.1114],
         [ 0.1387,  0.2407,  0.0506,  ..., -0.6924,  0.2260, -0.6020]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
# Display the last-layer contextual embeddings computed above
embeddings