# Notebook for extracting BERT word embeddings from text (using Google Colab)


Set which BERT model you would like to use

In [263]:
import os

# All paths below are relative, so the working directory is moved two levels up
# from the notebook's starting folder (presumably the project root that holds
# data/ -- confirm for your checkout).
# The starting folder is remembered so that re-running this cell is idempotent:
# a plain os.chdir("..") pair would climb two more levels on every re-run.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir, os.pardir))


# ALBERT uses a different tokenizer (sentencepiece instead of usual wordpiece) therefore use distilbert
# bert/distilbert/roberta/sbert (sentencebert)
# normal/smaller/larger
bert_model = "roberta"

# train/devel/test
data_set = "test"

# Use subset to test methods
small_subset = False

# single sentence for detailed inspection, overwrites small_subset
single_sentence = False

# Save new pickle file, if small_subset == True, or single_sentence == True, then no new pickle is made
new_pickle = True

# include punctuation in averaging subwords
with_punctuation = False

# location of files
data_loc = f"data/features_csv/{data_set}_sentences.csv"
pickle_write_to = f"data/embeddings_pickle/{bert_model}_{data_set}"

# UNCOMMENT THIS IF USING COLAB
# !pip install transformers

Upload the necessary files, if using Google Colab

In [264]:
# from google.colab import files
# Upload train, devel and test set
# uploaded = files.upload()

Enable the GPU, if it is available

In [265]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

# Report which device was selected
if cuda_available:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


Load the pretrained tokenizer and model for the selected BERT variant

In [266]:
# Load the pretrained tokenizer and encoder that match `bert_model`.
# output_hidden_states=True makes the model return the hidden states of every
# layer, which the embedding-extraction cell sums over.

# Use standard BERT model
if bert_model == "bert":
    from transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True).to(device)

# Use smaller BERT model
elif bert_model == "distilbert":
    from transformers import DistilBertTokenizer, DistilBertModel

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertModel.from_pretrained("distilbert-base-uncased", output_hidden_states=True).to(device)

# Use RoBERTa model
elif bert_model == "roberta":
    from transformers import RobertaModel, RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = RobertaModel.from_pretrained("roberta-base", output_hidden_states=True).to(device)

# Fail loudly instead of leaving `tokenizer`/`model` undefined (or stale from an
# earlier run of this cell) when an unsupported name is configured.
else:
    raise ValueError(
        f"Unsupported bert_model {bert_model!r}; expected 'bert', 'distilbert' or 'roberta'."
    )

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [267]:
"""IMPORTANT: This method is not necessary when we don't need to pad the input 
sentences to match the longest sentence. We don't need to do this since only 
need to obtain embeddings, and not train the model."""

# import pandas as pd

# # Determine the max input length of a dataset
# def max_length_of_set(dataset: str):
#   df = pd.read_csv(dataset)
#   sentences = df.sentence.values
#   max_len = 0
#   for sent in sentences:
#     # Tokenize text and add special tokens [CLS] and [SEP]
#     input_ids = tokenizer.encode(sent, add_special_tokens=True)

#     # Update max input length to determine padding
#     max_len = max(max_len, len(input_ids))
  
#   return max_len

# len_train = max_length_of_set("train_sentences.csv")
# len_devel = max_length_of_set("devel_sentences.csv")
# len_test = max_length_of_set("test_sentences.csv")

# max_len_all = max([len_train, len_devel, len_test])

# print('Max sentence length: ', max_len_all)

"IMPORTANT: This method is not necessary when we don't need to pad the input \nsentences to match the longest sentence. We don't need to do this since only \nneed to obtain embeddings, and not train the model."

Convert the words in each utterance to tokens, interpretable by BERT

In [268]:
import pandas as pd

# Read the utterances of the selected data set
df = pd.read_csv(data_loc)
sentences = df.sentence.values

# Use this to test methods
if small_subset:
    sentences = sentences[:30]

# A single hand-written sentence for detailed inspection; this check comes last,
# so it takes precedence over small_subset.
if single_sentence:
    sentences = ["Hopefully, this sentence, consists of multiple's embeddings, extraterrestrial."]

# Token IDs per sentence, including the special start/end tokens.
# With return_tensors='pt' each entry is a 2-D tensor holding one row of IDs.
# No padding is requested (the tokenizer default), because every sentence is fed
# through the model on its own; the deprecated pad_to_max_length flag is not needed.
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True, return_tensors='pt')
    for sent in sentences
]

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  Well, I look very helpful to tell folks to lie, because I needed to be to register and Adele University of Charlotte Russe in North Carolina and took me a lot of time to to acquire certain skills, like reading and listening speaking as well, and a good experience because it improves. I'm all my, my reading skills and my listening skills, and I hope I will be best prepared. And I think that top workers well needed phone to  to reach the school.
Token IDs: tensor([[    0,  8346,     6,    38,   356,   182,  7163,     7,  1137,  5450,
             7,  6105,     6,   142,    38,   956,     7,    28,     7,  5124,
             8,  1614,  6902,   589,     9,  5420, 14762,  1090,    11,   369,
          1961,     8,   362,   162,    10,   319,     9,    86,     7,     7,
          6860,  1402,  2417,     6,   101,  2600,     8,  6288,  2686,    25,
           157,     6,     8,    10,   205,   676,   142,    24, 15296,     4,
            38,   437,    70,   127,     6,   127,  2600

Obtain contextualized word embeddings by summing the last 4 hidden layers

In [269]:
from time import perf_counter
import datetime
import sys
from os import path
import pickle

# Measure time to obtain all word embeddings of each sentence
t1_start = perf_counter()

raw_embedding_loc = f"data/embeddings_pickle/{bert_model}_{data_set}_raw.pickle"

# The raw cache describes the FULL data set. In the debugging modes
# (small_subset / single_sentence) it must be neither read nor written,
# otherwise the embeddings no longer line up with `sentences`.
use_cache = not (small_subset or single_sentence)

context_embeddings = []
if use_cache and path.exists(raw_embedding_loc):
    # NOTE: pickle.load can execute arbitrary code -- only load caches you created yourself.
    with open(raw_embedding_loc, "rb") as f:
        context_embeddings = pickle.load(f)

# Recompute when nothing was loaded or when the cache is stale
# (its number of entries differs from the number of tokenized sentences).
if len(context_embeddings) != len(input_ids):
    context_embeddings = []

    for index, i_id in enumerate(input_ids, start=1):
        with torch.no_grad():
            outputs = model(i_id.to(device))

            # Sum the last 4 layers of the transformer model and drop the batch dimension
            summed_last_4_layers = torch.stack(outputs.hidden_states[-4:]).sum(0).squeeze(0)

        context_embeddings.append(summed_last_4_layers)

        # Update progressbar every 50 utterances
        if index % 50 == 0:
            sys.stdout.write(f"\rProgress: extracted {index}/{len(input_ids)} text embeddings.")
            sys.stdout.flush()

    if use_cache:
        with open(raw_embedding_loc, "wb") as f:
            pickle.dump(context_embeddings, f)

# Overwrite the progress line with the final count, then flush it
sys.stdout.write(f"\rProgress: extracted {len(input_ids)}/{len(input_ids)} text embeddings.")
sys.stdout.flush()

t1_stop = perf_counter()
time_difference = t1_stop - t1_start
print(f"\nTime necessary to extract all embeddings: {str(datetime.timedelta(seconds=time_difference))}.")

Progress: extracted 867/867 text embeddings.
Time necessary to extract all embeddings: 0:00:00.387181.


BERT uses sub-word tokenization, meaning that some words are split into multiple fragments. We want to find those fragments, since we are interested in word embeddings. To do so, we split each sentence into words (currently on whitespace), tokenize every word separately, and concatenate the results. To test whether this word-by-word tokenization matches BERT's tokenization of the whole sentence, we compare the two token sequences.

In [270]:
import re
import itertools

for sent in sentences:
    # Split on whitespace. Regex-based splitting (e.g. r"[\w']+|[.,!?;]") was
    # tried before but still needs fixing for RoBERTa.
    whitespace_words = sent.split()

    # Tokenize every word on its own and flatten the per-word sub-word lists
    per_word_tokens = [tokenizer.tokenize(word) for word in whitespace_words]
    tokens_regex = list(itertools.chain.from_iterable(per_word_tokens))

    # Reference: tokenization of the complete sentence
    tokens_bert = tokenizer.tokenize(sent)

    # Only report sentences whose two token sequences differ in length;
    # the check is skipped entirely for RoBERTa.
    if bert_model != "roberta" and len(tokens_bert) != len(tokens_regex):
        print(tokens_bert)
        print(tokens_regex)
        print("Token sequence is not equal")

Since the previous cell printed nothing, we can conclude that the two tokenizations are comparable (note that this check is skipped for RoBERTa). Now we use the tokenization to find the position of every fragmented word and the number of fragments that make up the whole word.

In [271]:
from itertools import groupby
from operator import itemgetter


# For every sentence: a list of (first_token_position, number_of_subword_tokens)
# tuples, one per word that the tokenizer split into more than one token.
# Positions refer to tokenizer.tokenize(sent), i.e. WITHOUT the special tokens.
all_subword_indices = []

for sent in sentences:
    token_index_and_length = []

    if bert_model == "roberta":
        tokens = tokenizer.tokenize(sent)

        # In the RoBERTa tokenization, a token that starts a new word carries a
        # leading 'Ġ'; any other token (except the very first one) continues the
        # word before it.
        non_starting_tokens = [
            position
            for position, token in enumerate(tokens)
            if position != 0 and not token.startswith('Ġ')
        ]

        # Group runs of consecutive continuation positions: (rank - position) is
        # constant within such a run.
        for _, run in groupby(enumerate(non_starting_tokens), lambda ix: ix[0] - ix[1]):
            consecutive_tokens = list(map(itemgetter(1), run))
            # The word starts one token before the run and is one token longer than it
            token_index_and_length.append((consecutive_tokens[0] - 1, len(consecutive_tokens) + 1))

    else:
        # Split on whitespace and tokenize word by word
        index = 0
        for word in sent.split():
            n_subwords = len(tokenizer.tokenize(word))
            if n_subwords > 1:
                token_index_and_length.append((index, n_subwords))
            # Advance by the number of tokens the word really produced. A word
            # that yields no tokens must not move the position, otherwise every
            # later subword position in the sentence would be shifted by one.
            index += n_subwords

    all_subword_indices.append(token_index_and_length)

print(tokenizer.tokenize(sentences[0]))
print(all_subword_indices[:10])

['Well', ',', 'ĠI', 'Ġlook', 'Ġvery', 'Ġhelpful', 'Ġto', 'Ġtell', 'Ġfolks', 'Ġto', 'Ġlie', ',', 'Ġbecause', 'ĠI', 'Ġneeded', 'Ġto', 'Ġbe', 'Ġto', 'Ġregister', 'Ġand', 'ĠAd', 'ele', 'ĠUniversity', 'Ġof', 'ĠCharlotte', 'ĠRus', 'se', 'Ġin', 'ĠNorth', 'ĠCarolina', 'Ġand', 'Ġtook', 'Ġme', 'Ġa', 'Ġlot', 'Ġof', 'Ġtime', 'Ġto', 'Ġto', 'Ġacquire', 'Ġcertain', 'Ġskills', ',', 'Ġlike', 'Ġreading', 'Ġand', 'Ġlistening', 'Ġspeaking', 'Ġas', 'Ġwell', ',', 'Ġand', 'Ġa', 'Ġgood', 'Ġexperience', 'Ġbecause', 'Ġit', 'Ġimproves', '.', 'ĠI', "'m", 'Ġall', 'Ġmy', ',', 'Ġmy', 'Ġreading', 'Ġskills', 'Ġand', 'Ġmy', 'Ġlistening', 'Ġskills', ',', 'Ġand', 'ĠI', 'Ġhope', 'ĠI', 'Ġwill', 'Ġbe', 'Ġbest', 'Ġprepared', '.', 'ĠAnd', 'ĠI', 'Ġthink', 'Ġthat', 'Ġtop', 'Ġworkers', 'Ġwell', 'Ġneeded', 'Ġphone', 'Ġto', 'Ġ', 'Ġto', 'Ġreach', 'Ġthe', 'Ġschool', '.']
[[(0, 2), (10, 2), (20, 2), (25, 2), (41, 2), (49, 2), (57, 2), (59, 2), (62, 2), (70, 2), (79, 2), (95, 2)], [(5, 2), (18, 2), (20, 3), (25, 2), (28, 2), (41, 2), 

In [272]:
# Sanity check: show how the loaded tokenizer splits a short example sentence.
# In the recorded RoBERTa output, every token after the first one carries a
# leading 'Ġ', while the sentence-initial token does not.
print(tokenizer.tokenize("the hardest thing for me was to"))

['the', 'Ġhardest', 'Ġthing', 'Ġfor', 'Ġme', 'Ġwas', 'Ġto']


Using the indices we just obtained, we calculate word embeddings by taking the mean of the individual word segments.

In [273]:
import copy
import numpy as np

# Tokens that are left out of the subword average when with_punctuation is False
punctuation_tokens = (".", ",", "'")

# The three per-sentence lists must line up; a mismatch (e.g. a stale raw
# embedding cache) would silently pair embeddings with the wrong sentence.
if not (len(context_embeddings) == len(sentences) == len(all_subword_indices)):
    raise ValueError(
        f"Misaligned inputs: {len(context_embeddings)} embeddings, "
        f"{len(sentences)} sentences, {len(all_subword_indices)} subword index lists."
    )

# Work on a copy so the means are always computed from the untouched embeddings
context_embeddings_copy = copy.deepcopy(context_embeddings)

word_tensors = []      # per sentence: array with one embedding per word
sentence_tensors = []  # per sentence: mean over that sentence's word embeddings

# Loop over all utterances
for row_index, tensor_row in enumerate(context_embeddings_copy):
    original_row = context_embeddings[row_index]
    tokenized_sent = tokenizer.tokenize(sentences[row_index])

    # Positions of the 2nd..last pieces of every split word; these rows are dropped
    embeddings_to_remove = set()

    # Loop over all words that contain subwords
    for first_token, n_subwords in all_subword_indices[row_index]:
        start_index = first_token + 1  # "+ 1" to compensate for the leading special token
        end_index = start_index + n_subwords - 1
        subword_positions = list(range(start_index, end_index + 1))

        # track indices of subwords we need to remove
        embeddings_to_remove.update(subword_positions[1:])

        if with_punctuation:
            mean_indices = subword_positions
        else:
            # tokenized_sent has no special tokens, hence the "- 1" offset
            mean_indices = [
                position for position in subword_positions
                if tokenized_sent[position - 1] not in punctuation_tokens
            ]
            # A word made up only of punctuation (e.g. "...") would leave nothing
            # to average, and the mean of an empty selection is NaN. Fall back to
            # averaging all of its pieces.
            if not mean_indices:
                mean_indices = subword_positions

        # Replace the first subword segment with the mean of the selected subwords
        tensor_row[start_index] = torch.mean(original_row[mean_indices], 0)

    # Keep every position that is not a folded-in subword piece ...
    selected_embeddings = [
        position for position in range(tensor_row.size(0))
        if position not in embeddings_to_remove
    ]
    # ... and remove the special tokens at the start and the end ([CLS] / [SEP])
    selected_embeddings = selected_embeddings[1:-1]

    # Take subset of original tensor using the subset of indices
    new_tensor_row = tensor_row[selected_embeddings].detach().cpu().numpy()
    word_tensors.append(new_tensor_row)

    # Make sentence embedding by averaging the obtained word embeddings
    # (stored as a 1-D vector of the model's hidden size)
    sentence_tensors.append(np.mean(new_tensor_row, axis=0))

print(word_tensors[:3])

[array([[ 0.23204581, -0.7001724 ,  0.5767639 , ..., -2.5835745 ,
        -0.6695212 ,  0.7478281 ],
       [ 0.60015434, -0.24431926, -0.15470025, ...,  1.2606547 ,
        -0.8145637 ,  1.1486393 ],
       [ 1.3432791 , -0.13238513,  1.2819518 , ...,  0.44860315,
        -0.24987164,  1.8757869 ],
       ...,
       [ 0.59417397, -1.7130679 , -0.88042915, ..., -0.07725899,
         0.046603  ,  0.16554949],
       [ 0.25066665, -2.0049272 , -1.3881013 , ...,  0.7843673 ,
         1.0094335 ,  1.1845194 ],
       [ 0.02390254, -1.668838  , -1.8492166 , ...,  3.5352306 ,
         0.66034067,  1.099202  ]], dtype=float32), array([[-0.546425  , -0.8482804 , -0.5099096 , ...,  2.785665  ,
         1.4938635 ,  2.5296102 ],
       [ 1.0407043 , -0.00374939,  0.37668598, ..., -0.42417538,
         2.0281856 ,  1.1477307 ],
       [ 2.8468757 ,  0.6816466 , -0.3469349 , ...,  1.9391967 ,
         0.25731713,  2.6264846 ],
       ...,
       [ 0.6620666 ,  1.8678962 ,  1.2046592 , ..., -1.338

Now, we have obtained the individual word embeddings, with averaged subword embeddings and stripped special tokens. Finally, we pickle the data.

In [274]:
pickle_write_to = f"data/embeddings_pickle/{bert_model}_{data_set}"

import pickle

# word_tensors / sentence_tensors already hold numpy arrays at this point.
# Pickles are only written for a full run: never for the small subset or the
# single inspection sentence.
if new_pickle and not (single_sentence or small_subset):
    print("Creating new pickle")
    pickle_targets = (
        (f"{pickle_write_to}_words.pickle", word_tensors),
        (f"{pickle_write_to}_sentences.pickle", sentence_tensors),
    )
    for target_path, payload in pickle_targets:
        with open(target_path, 'wb') as f:
            pickle.dump(payload, f)

# UNCOMMENT THIS IF USING COLAB
# files.download("word_embeddings_with_punctuation.pickle")

Creating new pickle


In [275]:
import os

# Report the size of the word-embedding pickle. The file is only written for a
# full run with new_pickle enabled, so guard against it being absent instead of
# failing with FileNotFoundError.
words_pickle_path = f"{pickle_write_to}_words.pickle"
if os.path.exists(words_pickle_path):
    b = os.path.getsize(words_pickle_path)
    print(f"Filesize: {round(b/(1024*1024), 2)} MB")
else:
    print(f"No pickle found at {words_pickle_path}; nothing to report.")

Filesize: 203.33 MB
