In [1]:
import os
import librosa
import re

import torch
from transformers import BertTokenizer, BertModel

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',
                                 output_hidden_states = True,
                                 ) # change model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [4]:
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppteer."
text_3 = "Hello world"
text_4 = "Goodbye cruel world"

text = [text_1, text_2, text_3, text_4]

indexed_token1 = tokenizer.encode(text_3, add_special_tokens=True)
indexed_token2 = tokenizer.encode(text_4, add_special_tokens=True)

indexed_tokens = tokenizer(text,
                           add_special_tokens=True,
                           padding=True,)

# return BatchEncoding class, see indexed_tokens.data
indexed_tokens.data

{'input_ids': [[101, 2040, 2001, 3958, 27227, 1029, 102, 0, 0, 0],
  [101, 3958, 27227, 2001, 1037, 26781, 13876, 11510, 1012, 102],
  [101, 7592, 2088, 102, 0, 0, 0, 0, 0, 0],
  [101, 9119, 10311, 2088, 102, 0, 0, 0, 0, 0]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
  [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}

In [5]:
# Convert inputs to PyTorch tensors

input_tensor = torch.tensor(indexed_tokens.data['input_ids'])
token_type_ids_tensor = torch.tensor(indexed_tokens.data['token_type_ids'])
attention_mask_tensor = torch.tensor(indexed_tokens.data['attention_mask'])


In [6]:
with torch.no_grad():
    x = model(input_tensor,
              token_type_ids=token_type_ids_tensor,
              attention_mask=attention_mask_tensor,)

In [7]:
# x[2] => token_embeddings is hidden states of the model.
# ( num_layer * batch_size * num_token * num_nodes )
# A tensor with (num_token * num_nodes) represents a single input text.

token_hidden = torch.stack(x[2], dim=0).permute(0, 1, 2, 3) # exchange dims
token_hidden.shape

torch.Size([13, 4, 10, 768])

In [8]:
# mean of the second to last layer. (batch_size * num_nodes)
# TODO: test other pooling strategies

sentences_embed_mstl = torch.mean(token_hidden[-2], dim=1)
sentences_embed_mstl.shape


torch.Size([4, 768])

In [9]:
sentences_embed_mstl

tensor([[-0.4064, -0.2781, -0.0824,  ..., -0.0253,  0.2613, -0.1223],
        [-0.3894, -0.1302, -0.4098,  ..., -0.1285,  0.4572,  0.0334],
        [-0.7138, -0.2371,  0.5877,  ..., -0.7257, -0.2156,  0.1068],
        [-0.1726,  0.1058,  1.0248,  ..., -0.5645, -0.4218, -0.1552]])