In [50]:
import torch
import numpy as np
from pytorch_pretrained_bert import BertModel
from transformers import BertTokenizer

In [51]:
text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."

In [52]:
# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [53]:
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens.
print (tokenized_text)

['[CLS]', 'after', 'stealing', 'money', 'from', 'the', 'bank', 'vault', ',', 'the', 'bank', 'robber', 'was', 'seen', 'fishing', 'on', 'the', 'mississippi', 'river', 'bank', '.', '[SEP]']


In [54]:
# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

[CLS]           101
after         2,044
stealing     11,065
money         2,769
from          2,013
the           1,996
bank          2,924
vault        11,632
,             1,010
the           1,996
bank          2,924
robber       27,307
was           2,001
seen          2,464
fishing       5,645
on            2,006
the           1,996
mississippi   5,900
river         2,314
bank          2,924
.             1,012
[SEP]           102


In [55]:
# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)
print (segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [56]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [57]:
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

In [58]:
print ("Number of layers:", len(encoded_layers))
layer_i = 0
print ("Number of batches:", len(encoded_layers[layer_i]))
batch_i = 0
print ("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
token_i = 0
print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))

Number of layers: 12
Number of batches: 1
Number of tokens: 22
Number of hidden units: 768


In [59]:
# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
token_embeddings = torch.stack(encoded_layers, dim=0)
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings = token_embeddings.permute(1,0,2)
token_embeddings.size()

torch.Size([22, 12, 768])

In [60]:
# Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# For each token in the sentence...
for token in token_embeddings:
    # `token` is a [12 x 768] tensor
    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 22 x 768


In [61]:
for i, token_str in enumerate(tokenized_text):
    print(i, token_str)

0 [CLS]
1 after
2 stealing
3 money
4 from
5 the
6 bank
7 vault
8 ,
9 the
10 bank
11 robber
12 was
13 seen
14 fishing
15 on
16 the
17 mississippi
18 river
19 bank
20 .
21 [SEP]


In [62]:
print('First 5 vector values for each instance of "bank".')
print('')
print("bank vault   ", str(token_vecs_sum[6][:5]))
print("bank robber  ", str(token_vecs_sum[10][:5]))
print("river bank   ", str(token_vecs_sum[19][:5]))

First 5 vector values for each instance of "bank".

bank vault    tensor([ 2.1319, -2.1413, -1.6260,  0.8638,  3.3173])
bank robber   tensor([ 1.1868, -1.5298, -1.3770,  1.0648,  3.1446])
river bank    tensor([ 1.1295, -1.4725, -0.7296, -0.0901,  2.4970])


In [63]:
from scipy.spatial.distance import cosine

# Calculate the cosine similarity between the word bank 
# in "bank robber" vs "river bank" (different meanings).
diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])

# Calculate the cosine similarity between the word bank
# in "bank robber" vs "bank vault" (same meaning).
same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.95
Vector similarity for *different* meanings:  0.68


# Tokenize 

In [64]:
class BertEmbedding:
    
    def __init__(self):
        self.model = BertModel.from_pretrained('bert-base-uncased').eval()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def preprocess(self, text):
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        return tokens_tensor, segments_tensors
    
    def embed(self, token_embeddings):
        token_vecs_sum = []
        for token in token_embeddings:
            sum_vec = torch.sum(token[-4:], dim=0)
            token_vecs_sum.append(sum_vec)
        return token_vecs_sum
    
    def transform(self, text):
        tokens_tensor, segments_tensors = self.preprocess(text)
        with torch.no_grad():
            encoded_layers, _ = self.model(tokens_tensor, segments_tensors)
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
        embedding = self.embed(token_embeddings)
        return embedding

In [65]:
# Initialize BertEmbeder
EMBert = BertEmbedding()

In [66]:
text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
embbeding = EMBert.transform(text)

In [67]:
from scipy.spatial.distance import cosine
diff_bank = 1 - cosine(embbeding[10], embbeding[19])
same_bank = 1 - cosine(embbeding[10], embbeding[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.95
Vector similarity for *different* meanings:  0.68


# Hypotesis testing

In [68]:
from scipy.spatial.distance import cosine
def measure_similarity(a, b):
    diff_ = 1 - cosine(a, b)
    print('Similarity for  *similar*  meanings: Cosine %.2f' % (diff_),end=" ")
    diff_ = np.linalg.norm(a-b)
    print('Euclidian %.2f' % (diff_))
    
def embedding_index(sentence, word):
    index = sentence.split(" ").index(word)
    return index+1

# Context hypotehesis testing
- BERT encodes words based on context meaning that the model looks at the whole sentence and encodes the word based on the meaining it has in the sentence. Words that are in a more similar context are more similar.

In [69]:
max_sentence = "He brutally killed someone"
embedding = EMBert.transform(max_sentence)
max_embedding = embedding[embedding_index(max_sentence,"killed")]

In [70]:
min_sentence = "He accidentaly killed someone"
embedding = EMBert.transform(min_sentence)
min_embedding = embedding[embedding_index(min_sentence,"killed")]

In [71]:
indif_sentence = "He killed someone"
embedding = EMBert.transform(indif_sentence)
indif_embedding = embedding[embedding_index(indif_sentence,"killed")]

In [72]:
huge_sentence = "Someone was killed in a horrifying manner"
embedding = EMBert.transform(huge_sentence)
huge_embedding = embedding[embedding_index(huge_sentence, "killed")]

In [73]:
measure_similarity(max_embedding, huge_embedding)
measure_similarity(min_embedding, huge_embedding)
measure_similarity(indif_embedding, huge_embedding)

Similarity for  *similar*  meanings: Cosine 0.78 Euclidian 42.97
Similarity for  *similar*  meanings: Cosine 0.23 Euclidian 82.50
Similarity for  *similar*  meanings: Cosine 0.78 Euclidian 43.36


In [74]:
core_sentence = "killed"
embedding = EMBert.transform(core_sentence)
core_embedding = embedding[0]

In [75]:
measure_similarity(max_embedding, core_embedding)
measure_similarity(min_embedding, core_embedding)
measure_similarity(indif_embedding, core_embedding)

Similarity for  *similar*  meanings: Cosine 0.34 Euclidian 75.55
Similarity for  *similar*  meanings: Cosine 0.21 Euclidian 84.49
Similarity for  *similar*  meanings: Cosine 0.36 Euclidian 74.31


> <font size="3"> What would be the best way to represent the core meaning of a word so that we can capture the differences between the core meaning and the meaning inside the context?</font>
> - <font size="3"> We can use an embedding which we create from averaging multiple vector embbedings from the same word inside multiple contexts</font>
> - <font size="3"> We can use the basic embbeding of the word without any context at all. This would be the most robust solution since bert probably outputs the most generic embeddings.</font>

# Creation of vectors from multiple vector embbedings
- <font size="3">We will use a large news corpus since our task data is also news related. The dataset can be found <a href="https://www.kaggle.com/snapcrack/all-the-news"> here </a>. It was provided by Andrew Thompson and it contains 143,000 articles from 15 American publications. </font>
- <font size="3"> To be able to create our vector embeddings we have to extract sentences that contain our words from news articles. Because of that we need to also create an effective sentence extraction system.

In [1]:
import pandas as pd

In [6]:
class Query:
    def __init__(self, filename, threshold = 20):
        self.news = open(filename, "r").read().replace("\n","")
        self.threshold = threshold
        
    def q(self, word):
        sentences = [sentence.strip() + '.' for sentence in self.news.split(".") if ' '+word+' ' in sentence]
        sentences = self.clean(sentences)
        return sentences
    
    def clean(self, sentences):
        new = []
        for i in sentences:
            if len(i.split(" ")) < self.threshold:
                new.append(i)
        return new

In [7]:
filename = "datasets/News/articles1.txt"
query = Query(filename, 30)
query.q("killed")

['” Several smaller bombings elsewhere in the city on Monday killed at least 20 civilians and wounded at least 70, according to medics and police officials.',
 'Hatab was alleged to have killed American service members and sold an   rifle taken from one of them.',
 'At least two dozen of the people killed were said to be foreigners.',
 '“One year ago, from a base here in Afghanistan, our troops launched the operation that killed Osama bin Laden,” he said.',
 'Turkish officials say they know who killed at least 39 people in an assault on a nightclub last weekend, although they did not release his name.',
 'Israeli soldiers killed one and wounded the other, Mr.',
 'Ahmad Tibi, an Arab member of the Israeli Parliament, said on  Twitter that dozens of soldiers and commanders who killed Palestinians should have been convicted.',
 'Fewer people were murdered with guns in 2016, but the number of people killed by cutting rose to 73, from 50 the year before.',
 '“The defendant didn’t stop after