In [1]:
import os
import librosa
import re
import pickle

import torch
from transformers import BertTokenizer, BertModel

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',
                                 output_hidden_states = True,
                                 ) # change model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [4]:
with open('data/description.pkl', 'rb') as f:
    data = pickle.load(f)

In [5]:
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppteer."
text_3 = "Hello world"
text_4 = "Goodbye cruel world"

text = data.sentence.tolist() # 5천개 하면 개오래걸림
print(data)

indexed_token1 = tokenizer.encode(text_3, add_special_tokens=True)
indexed_token2 = tokenizer.encode(text_4, add_special_tokens=True)

indexed_tokens = tokenizer(text,
                           add_special_tokens=True,
                           padding=True,)

# return BatchEncoding class, see indexed_tokens.data
# indexed_tokens.data

                        name  \
144      Ses01F_impro01_F000   
145      Ses01F_impro01_F001   
146      Ses01F_impro01_F002   
147      Ses01F_impro01_F003   
16       Ses01F_impro02_F000   
...                      ...   
1120  Ses05M_script03_2_F008   
1121  Ses05M_script03_2_F009   
1122  Ses05M_script03_2_F010   
1123  Ses05M_script03_2_F011   
1124  Ses05M_script03_2_F012   

                                               wav_path  \
144   data/IEMOCAP_full_release/Session1/sentences/w...   
145   data/IEMOCAP_full_release/Session1/sentences/w...   
146   data/IEMOCAP_full_release/Session1/sentences/w...   
147   data/IEMOCAP_full_release/Session1/sentences/w...   
16    data/IEMOCAP_full_release/Session1/sentences/w...   
...                                                 ...   
1120  data/IEMOCAP_full_release/Session5/sentences/w...   
1121  data/IEMOCAP_full_release/Session5/sentences/w...   
1122  data/IEMOCAP_full_release/Session5/sentences/w...   
1123  data/IEMOCAP_full_r

In [6]:
# Convert inputs to PyTorch tensors

input_tensor = torch.tensor(indexed_tokens.data['input_ids'])
token_type_ids_tensor = torch.tensor(indexed_tokens.data['token_type_ids'])
attention_mask_tensor = torch.tensor(indexed_tokens.data['attention_mask'])

input_tensor.shape


torch.Size([1391, 126])

In [7]:
with torch.no_grad():
    x = model(input_tensor,
              token_type_ids=token_type_ids_tensor,
              attention_mask=attention_mask_tensor,)

In [8]:
# x[2] => token_embeddings is hidden states of the model.
# ( num_layer * batch_size * num_token * num_nodes )
# A tensor with (num_token * num_nodes) represents a single input text.

token_hidden = torch.stack(x[2], dim=0).permute(0, 1, 2, 3) # exchange dims
token_hidden.shape

torch.Size([13, 1391, 126, 768])

In [9]:
# mean of the second to last layer. (batch_size * num_nodes)
# TODO: test other pooling strategies

sentences_embed_mstl = torch.mean(token_hidden[-2], dim=1)
sentences_embed_mstl.shape


torch.Size([1391, 768])

In [10]:
sentences_embed_mstl

tensor([[ 0.3411, -0.2357,  1.1749,  ...,  0.5249, -0.5267,  0.4661],
        [-0.1666,  0.6816,  0.9921,  ...,  0.0564,  0.0688, -0.0568],
        [ 0.3371, -0.2350,  0.3715,  ..., -0.0317, -0.3865,  0.0636],
        ...,
        [ 0.0419, -0.0738,  0.0595,  ...,  0.0672, -0.0792, -0.1420],
        [ 0.3923,  0.3044,  0.5588,  ...,  0.0269, -0.3092,  0.3782],
        [ 0.3316, -0.4979,  0.4228,  ..., -0.4598, -0.4337, -0.1691]])