In [84]:
import torch
import numpy as np

In [55]:
text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."

In [56]:
# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [58]:
marked_text = "[CLS] " + text + " [SEP]"
# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens.
print (tokenized_text)

['[CLS]', 'after', 'stealing', 'money', 'from', 'the', 'bank', 'vault', ',', 'the', 'bank', 'robber', 'was', 'seen', 'fishing', 'on', 'the', 'mississippi', 'river', 'bank', '.', '[SEP]']


In [59]:
# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

[CLS]           101
after         2,044
stealing     11,065
money         2,769
from          2,013
the           1,996
bank          2,924
vault        11,632
,             1,010
the           1,996
bank          2,924
robber       27,307
was           2,001
seen          2,464
fishing       5,645
on            2,006
the           1,996
mississippi   5,900
river         2,314
bank          2,924
.             1,012
[SEP]           102


In [60]:
# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)
print (segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [61]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [62]:
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

In [64]:
encoded_layers[-1]

tensor([[[-0.2645,  0.0118, -0.5196,  ..., -0.0293,  0.3886,  0.5092],
         [-0.2265, -0.2834, -0.0875,  ..., -0.5227, -0.0107, -0.2487],
         [-0.4162, -0.4183,  0.0928,  ..., -0.3227, -0.2965, -0.2034],
         ...,
         [ 0.2947, -0.2835, -0.0351,  ..., -0.5340, -0.2271,  0.1018],
         [ 0.6511, -0.0945, -0.1933,  ...,  0.0790, -0.4076, -0.2446],
         [-0.1565, -0.0090, -0.1407,  ...,  0.4275,  0.2381, -0.4106]]])

In [65]:
print ("Number of layers:", len(encoded_layers))
layer_i = 0
print ("Number of batches:", len(encoded_layers[layer_i]))
batch_i = 0
print ("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
token_i = 0
print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))

Number of layers: 12
Number of batches: 1
Number of tokens: 22
Number of hidden units: 768


In [69]:
token_embeddings = torch.stack(encoded_layers, dim=0)
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings = token_embeddings.permute(1,0,2)
token_embeddings.shape

torch.Size([22, 12, 768])

In [70]:
# Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# For each token in the sentence...
for token in token_embeddings:
    # `token` is a [12 x 768] tensor
    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 22 x 768


In [71]:
for i, token_str in enumerate(tokenized_text):
    print(i, token_str)

0 [CLS]
1 after
2 stealing
3 money
4 from
5 the
6 bank
7 vault
8 ,
9 the
10 bank
11 robber
12 was
13 seen
14 fishing
15 on
16 the
17 mississippi
18 river
19 bank
20 .
21 [SEP]


In [72]:
print('First 5 vector values for each instance of "bank".')
print('')
print("bank vault   ", str(token_vecs_sum[6][:5]))
print("bank robber  ", str(token_vecs_sum[10][:5]))
print("river bank   ", str(token_vecs_sum[19][:5]))

First 5 vector values for each instance of "bank".

bank vault    tensor([ 2.1319, -2.1413, -1.6260,  0.8638,  3.3173])
bank robber   tensor([ 1.1868, -1.5298, -1.3770,  1.0648,  3.1446])
river bank    tensor([ 1.1295, -1.4725, -0.7296, -0.0901,  2.4970])


In [73]:
from scipy.spatial.distance import cosine

# Calculate the cosine similarity between the word bank 
# in "bank robber" vs "river bank" (different meanings).
diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])

# Calculate the cosine similarity between the word bank
# in "bank robber" vs "bank vault" (same meaning).
same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.95
Vector similarity for *different* meanings:  0.68


# Tokenize 

In [73]:
from pytorch_pretrained_bert import BertModel, BertTokenizer
class BertEmbedding:
    
    def __init__(self):
        self.model = BertModel.from_pretrained('bert-base-uncased').eval()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def preprocess(self, text):
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        return tokens_tensor, segments_tensors
    
    def embed(self, token_embeddings):
        token_vecs_sum = []
        for token in token_embeddings:
            sum_vec = torch.sum(token[-4:], dim=0)
            token_vecs_sum.append(sum_vec)
        return token_vecs_sum
    
    def transform(self, text):
        tokens_tensor, segments_tensors = self.preprocess(text)
        with torch.no_grad():
            encoded_layers, _ = self.model(tokens_tensor, segments_tensors)
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
        embedding = self.embed(token_embeddings)
        return embedding

In [74]:
# Initialize BertEmbeder
EMBert = BertEmbedding()

In [75]:
text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
embbeding = EMBert.transform(text)

In [None]:
from scipy.spatial.distance import cosine
diff_bank = 1 - cosine(embbeding[10], embbeding[19])
same_bank = 1 - cosine(embbeding[10], embbeding[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

# Hypotesis testing

In [77]:
from scipy.spatial.distance import cosine
def measure_similarity(a, b):
    diff_ = 1 - cosine(a, b)
    print('Similarity for  *similar*  meanings: Cosine %.2f' % (diff_),end=" ")
    diff_ = np.linalg.norm(a-b)
    print('Euclidian %.2f' % (diff_))
    
def embedding_index(sentence, word):
    index = sentence.split(" ").index(word)
    return index+1

# Context hypotehesis testing
- BERT encodes words based on context meaning that the model looks at the whole sentence and encodes the word based on the meaining it has in the sentence. Words that are in a more similar context are more similar.
- We also need to define what the core meaining of the word is. The core meaning of the word is the meaining that the word has regardless of the context. The only thing the word knows is its true meaning regarding to polysemi.

In [85]:
max_sentence = "He brutally killed someone"
embedding = EMBert.transform(max_sentence)
max_embedding = embedding[embedding_index(max_sentence,"killed")]

In [86]:
min_sentence = "He accidentaly killed someone"
embedding = EMBert.transform(min_sentence)
min_embedding = embedding[embedding_index(min_sentence,"killed")]

In [87]:
indif_sentence = "He killed someone"
embedding = EMBert.transform(indif_sentence)
indif_embedding = embedding[embedding_index(indif_sentence,"killed")]

In [88]:
huge_sentence = "Someone was killed in a horrifying manner"
embedding = EMBert.transform(huge_sentence)
huge_embedding = embedding[embedding_index(huge_sentence, "killed")]

In [89]:
measure_similarity(max_embedding, huge_embedding)
measure_similarity(min_embedding, huge_embedding)
measure_similarity(indif_embedding, huge_embedding)

Similarity for  *similar*  meanings: Cosine 0.78 Euclidian 42.97
Similarity for  *similar*  meanings: Cosine 0.23 Euclidian 82.50
Similarity for  *similar*  meanings: Cosine 0.78 Euclidian 43.36


In [91]:
core_sentence = "killed"
embedding = EMBert.transform(core_sentence)
core_embedding = embedding[0]

In [75]:
measure_similarity(max_embedding, core_embedding)
measure_similarity(min_embedding, core_embedding)
measure_similarity(indif_embedding, core_embedding)

Similarity for  *similar*  meanings: Cosine 0.34 Euclidian 75.55
Similarity for  *similar*  meanings: Cosine 0.21 Euclidian 84.49
Similarity for  *similar*  meanings: Cosine 0.36 Euclidian 74.31


> <font size="3"> What would be the best way to represent the core meaning of a word so that we can capture the differences between the core meaning and the meaning inside the context?</font>
> - <font size="3"> We can use an embedding which we create from averaging multiple vector embbedings from the same word inside multiple contexts</font>
> - <font size="3"> We can use the basic embbeding of the word without any context at all. This would be the most robust solution since bert probably outputs the most generic embeddings.</font>

# Information retreival for vector embeddings
- <font size="3">We will use a large news corpus since our task data is also news related. The dataset can be found <a href="https://www.kaggle.com/snapcrack/all-the-news"> here </a>. It was provided by Andrew Thompson and it contains 143,000 articles from 15 American publications. </font>
- <font size="3"> To be able to create our vector embeddings we have to extract sentences that contain our words from news articles. Because of that we need to also create an effective sentence extraction system.

In [39]:
class Query:
    def __init__(self, filename, threshold = 20):
        self.news = open(filename, "r").read().replace("\n","")
        self.threshold = threshold
        
    def q(self, word):
        sentences = [sentence.strip() + '.' for sentence in self.news.split(".") if ' '+word+' ' in sentence]
        print("Before:",len(sentences))
        sentences = self.clean(sentences)
        print("After:",len(sentences))
        return sentences
    
    def clean(self, sentences):
        new = []
        for i in sentences:
            if len(i.split(" ")) < self.threshold:
                new.append(i)
        return new

Here we can experiment with multiple tresholds. My assumption is that when the senteces are smaller that the word is closer to its core meaning because there isn't to much side context.

In [38]:
filename = "datasets/News/articles1.txt"
query = Query(filename, 20)
sentences = query.q("killed")[:1000]

Before: 8372
After: 2436


# Batch training

In [32]:
from transformers import BertTokenizer

text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
padded_sequence = tokenizer.batch_encode_plus([text], return_tensors="pt", max_length=40, pad_to_max_length=True)

In [33]:
from transformers import BertModel
import torch
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model = model.eval()
with torch.no_grad():
    out = model(padded_sequence['input_ids'], padded_sequence["attention_mask"])
print(len(out))

3


In [34]:
hidden_states = out[2]
token_embeddings = torch.stack(hidden_states, dim=0)
token_embeddings = token_embeddings[-4:,:]
token_embeddings = torch.sum(token_embeddings, dim=0)
token_embeddings.shape

torch.Size([1, 40, 768])

In [35]:
token_vecs_sum = token_embeddings[0,:,:]
token_vecs_sum.shape

torch.Size([40, 768])

In [36]:
from scipy.spatial.distance import cosine

# Calculate the cosine similarity between the word bank 
# in "bank robber" vs "river bank" (different meanings).
diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])

# Calculate the cosine similarity between the word bank
# in "bank robber" vs "bank vault" (same meaning).
same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.94
Vector similarity for *different* meanings:  0.69


# Bert batch embedding
Since we will feed mutliple sentences in batches we need a way to preprocess the sentences in batches instead of sentence by sentence.

In [93]:
from transformers import BertModel, BertTokenizer

class BertBatchEmbedding:
    
    def __init__(self):
        self.model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True).eval()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    
    def transform(self, text):
        padded_sequence = tokenizer.batch_encode_plus(text, return_tensors="pt", max_length=40, pad_to_max_length=True)
        with torch.no_grad():
            out = self.model(padded_sequence['input_ids'], padded_sequence["attention_mask"])
        hidden_states = out[2]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = self.embed(token_embeddings)
        return token_embeddings
    
    def embed(self, token_embeddings):
        token_embeddings = token_embeddings[-4:,:]
        token_embeddings = torch.sum(token_embeddings, dim=0)
        return token_embeddings

In [94]:
text = ["After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."]
bertbatch = BertBatchEmbedding()
embeddings = bertbatch.transform(text)
token_vecs_sum = token_embeddings[0,:,:]

diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])
same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.94
Vector similarity for *different* meanings:  0.69


There are multiple ways to create vector embeddings from bert. You can avarage all the hidden layers. You can also avarage the last 4 layers. You can concat the last 4 layers instead of averaging. So this is also a part of our work that will need a lot of testing. We should probably create a table that contains all the different vector embeddings and compare their preformance. 

# Multi vector embedding creation

In [62]:
filename = "datasets/News/articles1.txt"
query = Query(filename, 20)
sentences = query.q("killed")[:1000]

Before: 8372
After: 2436


In [63]:
bertbatch = BertBatchEmbedding()
embeddings = bertbatch.transform(sentences)
embeddings.shape

torch.Size([1000, 40, 768])

In [64]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_sentences = tokenizer.batch_encode_plus(sentences)["input_ids"]

In [65]:
core_embeddings = {}
for i, tokenized_sentence in enumerate(tokenized_sentences):
    for j, token in enumerate(tokenized_sentence):
        if token not in core_embeddings:
            core_embeddings[token] = embeddings[i,:][j,:]
        else:
            core_embeddings[token] = (core_embeddings[token]+embeddings[i,:][j,:])/2  

In [66]:
core_embeddings[2730]

tensor([ 4.4054e+00, -1.0079e+00,  1.8521e-01,  9.0695e-01,  1.7104e-02,
        -1.8542e+00, -4.2765e-01,  8.5572e-01, -2.1372e+00,  1.4927e+00,
         6.3412e-01,  1.4819e+00, -1.3638e+00,  1.6461e+00, -2.7768e+00,
        -2.6784e+00, -1.9579e-01, -3.5096e+00, -3.6820e+00,  5.4707e-01,
         1.4618e+00, -7.2132e-01,  1.8020e+00,  1.6613e+00,  2.0224e+00,
        -6.2111e-02,  1.6888e+00,  5.1060e-01, -1.5262e+00, -1.8507e+00,
         8.6806e-01,  4.6912e-01,  8.8319e-01, -1.3538e+00, -8.9868e-01,
        -1.6415e+00,  3.7351e-01, -1.3028e+00, -1.5394e+00,  1.1749e+00,
        -2.0360e+00, -4.2730e+00, -1.3100e+00, -3.7897e-01,  2.2487e+00,
        -9.2968e-01,  3.9641e+00,  3.1158e+00,  4.5993e+00, -8.1057e-01,
        -2.6990e+00,  6.1405e-01,  1.7487e+00, -1.1270e+00,  1.8266e+00,
         1.1985e+00, -1.0412e+00, -3.3504e+00, -8.8455e-01, -1.1766e+00,
        -3.9828e-01,  1.1500e+00,  1.6837e+00, -2.6874e+00, -6.2709e-01,
        -2.1924e-01, -2.6101e+00,  3.8683e+00, -1.7

In [90]:
measure_similarity(max_embedding, core_embeddings[2730])
measure_similarity(min_embedding, core_embeddings[2730])
measure_similarity(indif_embedding, core_embeddings[2730])

Similarity for  *similar*  meanings: Cosine 0.65 Euclidian 52.40
Similarity for  *similar*  meanings: Cosine 0.14 Euclidian 84.00
Similarity for  *similar*  meanings: Cosine 0.65 Euclidian 51.72


In [92]:
measure_similarity(core_embeddings[2730], core_embedding)

Similarity for  *similar*  meanings: Cosine 0.30 Euclidian 76.15


# Baseline model