In [1]:
%load_ext autoreload
%autoreload 2
from IPython.display import display

In [2]:
import pandas as pd
import numpy as np
import torch


In [3]:
from transformers import BertModel, BertTokenizer
model = BertModel.from_pretrained('bert-base-uncased',
           output_hidden_states = True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def bert_text_preparation(text, tokenizer):
    """
    Preprocesses text input in a way that BERT can interpret.
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    
    display(tokenized_text)
    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    display(indexed_tokens)
    
    segments_ids = [1]*len(indexed_tokens)
    
    display(segments_ids)
    
    # convert inputs to tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    display(tokens_tensor)
    segments_tensor = torch.tensor([segments_ids])
    display(segments_tensor)
    return tokenized_text, tokens_tensor, segments_tensor



In [None]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """
    Obtains BERT embeddings for tokens.
    """
    # gradient calculation id disabled
    with torch.no_grad():
      # obtain hidden states
      outputs = model(tokens_tensor, segments_tensor)
      hidden_states = outputs.hidden_states
    # concatenate the tensors for all layers
    # use "stack" to create new dimension in tensor
    token_embeddings = torch.stack(hidden_states, dim=0)
    # remove dimension 1, the "batches"
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # swap dimensions 0 and 1 so we can loop over tokens
    token_embeddings = token_embeddings.permute(1,0,2)
    # intialized list to store embeddings
    token_vecs_sum = []
    # "token_embeddings" is a [Y x 12 x 768] tensor
    # where Y is the number of tokens in the sentence
    # loop over tokens in sentence
    for token in token_embeddings:
    # "token" is a [12 x 768] tensor
    # sum the vectors from the last four layers
        sum_vec = torch.sum(token[-4:], dim=0)
        token_vecs_sum.append(sum_vec)
    return token_vecs_sum

In [None]:
sentences = ["bank",
         "he eventually sold the shares back to the bank at a premium.",
         "the bank strongly resisted cutting interest rates.",
         "the bank will supply and buy back foreign currency.",
         "the bank is pressing us for repayment of the loan.",
         "the bank left its lending rates unchanged.",
         "the river flowed over the bank.",
         "tall, luxuriant plants grew along the river bank.",
         "his soldiers were arrayed along the river bank.",
         "wild flowers adorned the river bank.",
         "two fox cubs romped playfully on the river bank.",
         "the jewels were kept in a bank vault.",
         "you can stow your jewellery away in the bank.",
         "most of the money was in storage in bank vaults.",
         "the diamonds are shut away in a bank vault somewhere.",
         "thieves broke into the bank vault.",
         "can I bank on your support?",
         "you can bank on him to hand you a reasonable bill for your services.",
         "don't bank on your friends to help you out of trouble.",
         "you can bank on me when you need money.",
         "i bank on your help."
         ]
from collections import OrderedDict
context_embeddings = []
context_tokens = []
for sentence in sentences:
  tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
  list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
  # make ordered dictionary to keep track of the position of each   word
  tokens = OrderedDict()
  # loop over tokens in sensitive sentence
  for token in tokenized_text[1:-1]:
    # keep track of position of word and whether it occurs multiple times
    if token in tokens:
      tokens[token] += 1
    else:
      tokens[token] = 1
  # compute the position of the current token
    token_indices = [i for i, t in enumerate(tokenized_text) if t == token]
    current_index = token_indices[tokens[token]-1]
  # get the corresponding embedding
    token_vec = list_token_embeddings[current_index]
    
    # save values
    context_tokens.append(token)
    context_embeddings.append(token_vec)

In [None]:
context_tokens

In [None]:
tokenizer.tokenize("Here is my text, some gibberish: sdsdasdaf, and animals, hippos etc. [1]")

In [4]:
tokenized = tokenizer.encode("Here is my text, some gibberish: sdsdasdaf, and animals, hippos etc. [1]", return_tensors="pt")
print(type(tokenized))
print(tokenized)
print(tokenized.shape)
embedded = model(input_ids=tokenized)
last = embedded.last_hidden_state
print(last)
print(last.shape)
last.squeeze()
print(last.shape)

<class 'torch.Tensor'>
tensor([[  101,  2182,  2003,  2026,  3793,  1010,  2070, 21025, 29325,  4509,
          1024, 17371, 16150,  3022,  2850,  2546,  1010,  1998,  4176,  1010,
          5099,  6873,  2015,  4385,  1012,  1031,  1015,  1033,   102]])
torch.Size([1, 29])
tensor([[[-0.0725,  0.2891, -0.2205,  ..., -0.3550,  0.5276,  0.5011],
         [-1.0338, -0.0576,  0.0736,  ..., -0.3047,  0.4641, -0.0979],
         [-0.6666, -0.1912,  0.5514,  ...,  0.2056, -0.1143,  0.3831],
         ...,
         [-0.1139,  0.2144,  0.5290,  ..., -0.1144,  0.7179,  1.0353],
         [ 0.0453,  0.3491,  0.3005,  ..., -0.6518, -0.2924,  0.1984],
         [ 0.5270,  0.6090, -0.2633,  ...,  0.1840, -0.5477, -0.1830]]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([1, 29, 768])
torch.Size([1, 29, 768])


In [10]:
tokenized = tokenizer.encode("Here is my text, some gibberish: sdsdasdaf, and animals, hippos etc. [1]", return_tensors="pt")
print(type(tokenized))
print(tokenized)
print(tokenized.shape)
embedded = model(input_ids=tokenized)
last = embedded.hidden_states
print(last)
print(len(last))
for layer in last:
    print(type(layer))
    print(layer.shape)


<class 'torch.Tensor'>
tensor([[  101,  2182,  2003,  2026,  3793,  1010,  2070, 21025, 29325,  4509,
          1024, 17371, 16150,  3022,  2850,  2546,  1010,  1998,  4176,  1010,
          5099,  6873,  2015,  4385,  1012,  1031,  1015,  1033,   102]])
torch.Size([1, 29])
(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [-0.3557,  0.9575, -1.3448,  ...,  0.5972,  0.7392,  0.2581],
         [-0.6270, -0.0633, -0.3143,  ...,  0.3427,  0.4636,  0.4594],
         ...,
         [ 0.0116,  0.4963,  0.2872,  ..., -0.4949,  0.7428,  0.3255],
         [ 0.7604,  0.6366,  0.2540,  ...,  0.1930,  0.8327, -0.8258],
         [-0.4342,  0.1415,  0.2393,  ..., -0.4481, -0.0569, -0.2665]]],
       grad_fn=<NativeLayerNormBackward0>), tensor([[[ 0.0399, -0.0126, -0.1802,  ...,  0.2281, -0.1365, -0.0331],
         [-0.9734,  0.9356, -1.4833,  ...,  0.1141,  0.7378,  0.3468],
         [-1.1578, -0.3954, -0.7789,  ...,  0.1855,  0.3490,  0.2681],
         ...,
         [-

tensor([[[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
            3.8253e-02,  1.6400e-01],
          [-3.5574e-01,  9.5751e-01, -1.3448e+00,  ...,  5.9716e-01,
            7.3916e-01,  2.5809e-01],
          [-6.2703e-01, -6.3313e-02, -3.1428e-01,  ...,  3.4265e-01,
            4.6361e-01,  4.5937e-01],
          ...,
          [ 1.1559e-02,  4.9627e-01,  2.8718e-01,  ..., -4.9486e-01,
            7.4276e-01,  3.2547e-01],
          [ 7.6038e-01,  6.3660e-01,  2.5401e-01,  ...,  1.9296e-01,
            8.3270e-01, -8.2576e-01],
          [-4.3419e-01,  1.4149e-01,  2.3928e-01,  ..., -4.4813e-01,
           -5.6863e-02, -2.6652e-01]]],


        [[[ 3.9913e-02, -1.2561e-02, -1.8017e-01,  ...,  2.2813e-01,
           -1.3654e-01, -3.3117e-02],
          [-9.7344e-01,  9.3563e-01, -1.4833e+00,  ...,  1.1412e-01,
            7.3782e-01,  3.4675e-01],
          [-1.1578e+00, -3.9543e-01, -7.7889e-01,  ...,  1.8548e-01,
            3.4905e-01,  2.6808e-01],
          ...,
   

In [11]:
token_embeddings = torch.stack(last, dim=0)
token_embeddings.shape

torch.Size([13, 1, 29, 768])

In [12]:
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings.shape

torch.Size([13, 29, 768])

In [94]:
tokenized = tokenizer.encode("Here", return_tensors="pt")
print(type(tokenized))
print(tokenized)
print(tokenized.shape)
tokens = tokenizer.convert_ids_to_tokens(tokenized.squeeze(dim=0))
print(tokens)
print(type(tokens))
embedded = model(input_ids=tokenized)
last = embedded.last_hidden_state
print(last)
last
print(last.squeeze().shape)

<class 'torch.Tensor'>
tensor([[ 101, 2182,  102]])
torch.Size([1, 3])
['[CLS]', 'here', '[SEP]']
<class 'list'>
tensor([[[-0.2151,  0.0988, -0.0935,  ..., -0.2806,  0.2266,  0.3764],
         [-0.6091, -0.2245, -0.8580,  ...,  0.2867,  0.6157, -0.5735],
         [ 1.0697,  0.0667, -0.3977,  ...,  0.0703, -0.6323, -0.2769]]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([3, 768])


In [73]:
tokenizer.convert_ids_to_tokens(tokenized.squeeze(), skip_special_tokens=True)

['here',
 'is',
 'my',
 'text',
 ',',
 'some',
 'gi',
 '##bber',
 '##ish',
 ':',
 'sd',
 '##sd',
 '##as',
 '##da',
 '##f',
 ',',
 'and',
 'animals',
 ',',
 'hip',
 '##po',
 '##s',
 'etc',
 '.',
 '[',
 '1',
 ']']

In [77]:
tokenized = tokenizer("Here is my text, some gibberish: sdsdasdaf, and animals, hippos etc. [1]", return_tensors="pt")
print(type(tokenized))
print(tokenized)
embedded = model(**tokenized)
last = embedded.last_hidden_state
print(last)
print(last.shape)

<class 'transformers.tokenization_utils_base.BatchEncoding'>
{'input_ids': tensor([[  101,  2182,  2003,  2026,  3793,  1010,  2070, 21025, 29325,  4509,
          1024, 17371, 16150,  3022,  2850,  2546,  1010,  1998,  4176,  1010,
          5099,  6873,  2015,  4385,  1012,  1031,  1015,  1033,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}
tensor([[[-0.0725,  0.2891, -0.2205,  ..., -0.3550,  0.5276,  0.5011],
         [-1.0338, -0.0576,  0.0736,  ..., -0.3047,  0.4641, -0.0979],
         [-0.6666, -0.1912,  0.5514,  ...,  0.2056, -0.1143,  0.3831],
         ...,
         [-0.1139,  0.2144,  0.5290,  ..., -0.1144,  0.7179,  1.0353],
         [ 0.0453,  0.3491,  0.3005,  ..., -0.6518, -0.2924,  0.1984],
         [ 0.5270,  0.6090, -0.2633,  ...,  0.1840, -0.5477, -0.1830]]]

In [1]:
import torch
from torch import Tensor

t = Tensor([1,2,3,4])
print(t.shape)
print(t)

torch.Size([4])
tensor([1., 2., 3., 4.])


In [2]:
stacked = torch.stack([t,t])
print([t,t])
print(stacked)
print(stacked.shape)

[tensor([1., 2., 3., 4.]), tensor([1., 2., 3., 4.])]
tensor([[1., 2., 3., 4.],
        [1., 2., 3., 4.]])
torch.Size([2, 4])


In [63]:
t2 = torch.randn(1,3,1,3)
print(t2.shape)
print(t2)
squeezed = torch.squeeze(t2)
print(squeezed.shape)
print(squeezed)

torch.Size([1, 3, 1, 3])
tensor([[[[-2.1386, -0.3540, -0.8361]],

         [[-0.0905, -0.3460, -2.0942]],

         [[-1.3964, -0.0758, -0.8706]]]])
torch.Size([3, 3])
tensor([[-2.1386, -0.3540, -0.8361],
        [-0.0905, -0.3460, -2.0942],
        [-1.3964, -0.0758, -0.8706]])


In [68]:
print(t2.permute(0, 1, 2, 3))
print(t2.permute(1, 2, 3, 0))

tensor([[[[-2.1386, -0.3540, -0.8361]],

         [[-0.0905, -0.3460, -2.0942]],

         [[-1.3964, -0.0758, -0.8706]]]])
tensor([[[[-2.1386],
          [-0.3540],
          [-0.8361]]],


        [[[-0.0905],
          [-0.3460],
          [-2.0942]]],


        [[[-1.3964],
          [-0.0758],
          [-0.8706]]]])
