In [2]:
import torch
import pickle
import numpy as np
from torch import nn
from tqdm.contrib import tzip
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

In [3]:
def embedding_index(sentence, word):
    index = sentence.split(" ").index(word)
    return index

# Measures

In [4]:
from scipy.spatial.distance import cosine
def measure_similarity(a, b):
    cosine_value = 1 - cosine(a, b)
    euclid_value = np.linalg.norm(a-b)
    return cosine_value, euclid_value

# Bert

In [5]:
class BertBatchEmbedding:
    def __init__(self):
        self.model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True).eval().cuda()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    def transform(self, sentences):
        padded_sequence = self.tokenizer.batch_encode_plus(sentences, return_tensors="pt", pad_to_max_length=True)
        out = self.model(padded_sequence['input_ids'].cuda(), padded_sequence["attention_mask"].cuda())
        hidden_states = out[2]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.flatten(token_embeddings.permute(1, 2, 0,3), start_dim=2)[:,1:-1,:]
        return token_embeddings.cpu().detach().numpy()
    
    def parameters(self):
        return self.model.parameters()
    
bert = BertBatchEmbedding()

# Custom Bert

In [6]:
class CustomTokenizer:
    def __init__(self, vocabulary):
        self.vocabulary = vocabulary
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        
    def tokenize(self, sentences : list):
        encoded = []
        for sentence in sentences:
            tokens = self.tokenizer.tokenize(sentence)
            encoded_sentence = []
            for token in tokens:
                if token in self.vocabulary:
                    encoded_sentence.append(self.vocabulary[token])
                else:
                    encoded_sentence.append(self.vocabulary["<UNK>"])
            encoded.append(encoded_sentence)
        return torch.tensor(encoded)

In [7]:
class CustomBertEmbedding:
    def __init__(self):
        with open('datasets/weights.pickle', 'rb') as handle:
            weights = torch.tensor(pickle.load(handle))
            
        with open('datasets/vocab.pickle', 'rb') as handle:
            vocabulary = pickle.load(handle)
            
        self.model = nn.Embedding.from_pretrained(weights, padding_idx = 0, freeze = True)
        self.tokenizer = CustomTokenizer(vocabulary)
        
    def transform(self, text):
        tokenized_text = self.tokenizer.tokenize(text).long()
        embedded_text = self.model(tokenized_text)
        return embedded_text
    
    
    def parameters(self):
        return self.model.parameters()
    
custom = CustomBertEmbedding()

# Context hypotehesis testing
- BERT encodes words based on context meaning that the model looks at the whole sentence and encodes the word based on the meaining it has in the sentence. Words that are in a more similar context are more similar.
- We also need to define what the core meaining of the word is. The core meaning of the word is the meaining that the word has regardless of the context. The only thing the word knows is its true meaning regarding to polysemi.

In [8]:
max_sentence = "He brutally killed someone"
embedding = bert.transform([max_sentence])[0]
max_embedding = embedding[embedding_index(max_sentence,"killed")]

In [9]:
min_sentence = "He accidentally killed someone"
embedding = bert.transform([min_sentence])[0]
min_embedding = embedding[embedding_index(min_sentence,"killed")]

In [10]:
indif_sentence = "He killed someone"
embedding = bert.transform([indif_sentence])[0]
indif_embedding = embedding[embedding_index(indif_sentence,"killed")]

In [11]:
huge_sentence = "Someone was killed in a horrifying manner"
embedding = bert.transform([huge_sentence])[0]
huge_embedding = embedding[embedding_index(huge_sentence, "killed")]

In [12]:
print(measure_similarity(max_embedding, huge_embedding))
print(measure_similarity(min_embedding, huge_embedding))
print(measure_similarity(indif_embedding, huge_embedding))

(0.7988684177398682, 43.771984)
(0.7915605306625366, 44.648533)
(0.7995526790618896, 43.83039)


In [13]:
core_sentence = "killed"
embedding = custom.transform([core_sentence])[0].data.numpy()
core_embedding = embedding[0]

In [14]:
print(measure_similarity(max_embedding, core_embedding))
print(measure_similarity(min_embedding, core_embedding))
print(measure_similarity(indif_embedding, core_embedding))

(0.8312858157107608, 38.07012018152099)
(0.8240340527046277, 38.98820765753455)
(0.8210544978745334, 39.37731779561605)


# Examples

In [34]:
def word_similarity(propaganda, nonpropaganda, word):
    embedding = bert.transform([propaganda])[0]
    propaganda = embedding[embedding_index(propaganda, word)][-768:]
    embedding = bert.transform([nonpropaganda])[0]
    nonpropaganda = embedding[embedding_index(nonpropaganda, word)][-768:]
    embedding = bert.transform([word])[0]
    bert_core_embedding = embedding[0][-768:]
    embedding = custom.transform([word])[0].data.numpy()
    core_embedding = embedding[0][-768:]
    print("Differences between propaganda and non propaganda")
    print("Using core embeddings","cosine",abs(measure_similarity(propaganda, core_embedding)[0]-measure_similarity(nonpropaganda, core_embedding)[0]),"euclid",abs(measure_similarity(propaganda, core_embedding)[1]-measure_similarity(nonpropaganda, core_embedding)[1]))
    print("Using bert core embeddings","cosine", abs(measure_similarity(propaganda, bert_core_embedding)[0]-measure_similarity(nonpropaganda, bert_core_embedding)[0]),"euclid",abs(measure_similarity(propaganda, bert_core_embedding)[1]-measure_similarity(nonpropaganda, bert_core_embedding)[1]))
    print("Direct difference between words","cosine",1-measure_similarity(nonpropaganda, propaganda)[0], "euclid",measure_similarity(nonpropaganda, propaganda)[1])

In [36]:
propaganda = "In the future everyone will be the subject of social justice crybullying for 15 minutes."
nonpropaganda = "In the future everyone will be the subject of social justice for 15 minutes."

In [37]:
word_similarity(propaganda,nonpropaganda, "social")

Differences between propaganda and non propaganda
Using core embeddings cosine 0.02750954261729066 euclid 0.14267734322225145
Using bert core embeddings cosine 0.04128524661064148 euclid 0.7919636
Direct difference between words cosine 0.08723562955856323 euclid 5.8261724


In [38]:
word_similarity(propaganda,nonpropaganda, "justice")

Differences between propaganda and non propaganda
Using core embeddings cosine 0.00807510322788807 euclid 0.07085715982751317
Using bert core embeddings cosine 0.04355967044830322 euclid 0.6902571
Direct difference between words cosine 0.1435481309890747 euclid 6.940008


In [39]:
propaganda = "During the event, Patel’s performance featured commentary on his experience living in a diverse area of New York"
nonpropaganda = "During the event, Patel’s performance featured commentary on his experience living in a area of New York"

In [40]:
word_similarity(propaganda,nonpropaganda, "area")

Differences between propaganda and non propaganda
Using core embeddings cosine 0.03688372388358874 euclid 0.9201254427506029
Using bert core embeddings cosine 0.007760718464851379 euclid 0.5483227
Direct difference between words cosine 0.22605055570602417 euclid 9.4450445


In [41]:
propaganda = "That's what Columbia snowflakes thought was offensive"
nonpropaganda = "That's what Columbia thought was offensive"

In [42]:
word_similarity(propaganda,nonpropaganda, "Columbia")

Differences between propaganda and non propaganda
Using core embeddings cosine 0.005112637015793564 euclid 0.19526626633392397
Using bert core embeddings cosine 0.01566094160079956 euclid 0.05400467
Direct difference between words cosine 0.05840563774108887 euclid 5.2522326


In [43]:
word_similarity(propaganda,nonpropaganda, "offensive")

Differences between propaganda and non propaganda
Using core embeddings cosine 0.14327158173455334 euclid 3.951414532332768
Using bert core embeddings cosine 0.15078258514404297 euclid 3.8719482
Direct difference between words cosine 0.7403800189495087 euclid 18.854862


In [49]:
propaganda = "It’s got to be either one of the stupidest acts that I can recall or a very wicked plan by Washington neocons to sabotage Korean peace talks."
nonpropaganda = "It’s got to be either one of the acts that I can recall or a very wicked plan by Washington neocons to sabotage Korean peace talks."

In [50]:
word_similarity(propaganda,nonpropaganda, "acts")

Differences between propaganda and non propaganda
Using core embeddings cosine 0.09871787939178611 euclid 2.281394350262664
Using bert core embeddings cosine 0.0027364641427993774 euclid 1.0827847
Direct difference between words cosine 0.40493500232696533 euclid 14.424915


> <font size="3"> What would be the best way to represent the core meaning of a word so that we can capture the differences between the core meaning and the meaning inside the context?</font>
> - <font size="3"> We can use an embedding which we create from averaging multiple vector embbedings from the same word inside multiple contexts</font>
> - <font size="3"> We can use the basic embbeding of the word without any context at all. This would be the most robust solution since bert probably outputs the most generic embeddings.</font>

# Bert batch embedding
Since we will feed mutliple sentences in batches we need a way to preprocess the sentences in batches instead of sentence by sentence.

In [104]:
text = ["After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."]
embeddings = bert.transform(text)[0]
diff_bank = 1 - cosine(embeddings[9], embeddings[18])
same_bank = 1 - cosine(embeddings[9], embeddings[5])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.92
Vector similarity for *different* meanings:  0.71


There are multiple ways to create vector embeddings from bert. You can avarage all the hidden layers. You can also avarage the last 4 layers. You can concat the last 4 layers instead of averaging. So this is also a part of our work that will need a lot of testing. We should probably create a table that contains all the different vector embeddings and compare their preformance. 

# Dataset exploration

In [105]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 5000)

  import pandas.util.testing as tm


In [106]:
filename = "train"
data = open("datasets/Processed/"+filename+".txt", "r").read().split("\n")

In [107]:
X = []
Y = []
x = []
y = []
for i in data:
    if i=="":
        X.append(" ".join(x))
        Y.append(y)
        x = []
        y = []
    else:
        row = i.split(" ")
        x.append(row[0])
        y.append(int(row[1]))

In [108]:
import re
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Y_new = []
X_new = []
locations = []
special = []
for x,y in zip(X,Y):
    tokenized = tokenizer.tokenize(x)
    X_new.append(tokenized)
    split_x = x.lower().split(" ")
    counter = 0
    new_y = []
    local = []
    local_special = []
    word = ""
    for i, token in enumerate(tokenized):
        word = word + token.replace("#","")
        if word == split_x[counter]:
            new_y.append(y[counter])
            counter+=1
            word = ""
            if "#" in token:
                local.append(i)
        else:
            new_y.append(y[counter])
            if "#" in token:
                local.append(i)
        if re.findall('[^A-Za-z0-9]',token) and not "#" in token:
            local_special.append(i)
    locations.append(local)
    special.append(local_special)
    assert len(new_y)!=tokenized
    for i,j in enumerate(tokenized):
        if not re.search('[a-zA-Z]', j):
            new_y[i] = 0
    Y_new.append(new_y)

In [109]:
def mean_std(df, lenght):
    cos_1 = df.loc[(df["label"]==1) & (df['token'].str.len()>lenght) & (df["is_word"]==1)]["cosine_value"].values
    cos_0 = df.loc[(df["label"]==0) & (df['token'].str.len()>lenght) & (df["is_word"]==1)]["cosine_value"].values
    euclid_1 = df.loc[(df["label"]==1) & (df['token'].str.len()>lenght) & (df["is_word"]==1)]["euclid_value"].values
    euclid_0 = df.loc[(df["label"]==0) & (df['token'].str.len()>lenght) & (df["is_word"]==1)]["euclid_value"].values
    sns.distplot(cos_1, hist=False, label="1")
    sns.distplot(cos_0, hist=False, label="0")
    plt.legend(loc="upper left")
    plt.show()
    print("1:",cos_1.mean(),cos_1.std(), euclid_1.mean(), euclid_1.std())
    sns.distplot(euclid_1, hist=False, label="1")
    sns.distplot(euclid_0, hist=False, label="0")
    plt.legend(loc="upper right")
    plt.show()
    print("0:",cos_0.mean(),cos_0.std(), euclid_0.mean(), euclid_0.std())


In [110]:
with open('datasets/vocab.pickle', 'rb') as handle:
    vocabulary = pickle.load(handle)
vocabulary

{'<PAD>': 0,
 '<UNK>': 1,
 'the': 2,
 ',': 3,
 '.': 4,
 'to': 5,
 'of': 6,
 'and': 7,
 'in': 8,
 'a': 9,
 'that': 10,
 '’': 11,
 's': 12,
 'is': 13,
 '“': 14,
 '”': 15,
 '-': 16,
 'for': 17,
 'it': 18,
 'on': 19,
 'was': 20,
 'he': 21,
 'with': 22,
 'as': 23,
 'this': 24,
 'be': 25,
 'by': 26,
 'not': 27,
 'his': 28,
 'have': 29,
 'are': 30,
 '##s': 31,
 ':': 32,
 'has': 33,
 'i': 34,
 'from': 35,
 'at': 36,
 'they': 37,
 'who': 38,
 "'": 39,
 'an': 40,
 'said': 41,
 '"': 42,
 'but': 43,
 'we': 44,
 'you': 45,
 '?': 46,
 't': 47,
 'will': 48,
 'trump': 49,
 'or': 50,
 'had': 51,
 ')': 52,
 '(': 53,
 'all': 54,
 'were': 55,
 'about': 56,
 'their': 57,
 'what': 58,
 'one': 59,
 'which': 60,
 'been': 61,
 'no': 62,
 'our': 63,
 'out': 64,
 'there': 65,
 'if': 66,
 '—': 67,
 'would': 68,
 'so': 69,
 'people': 70,
 'do': 71,
 'also': 72,
 'she': 73,
 'when': 74,
 'after': 75,
 'up': 76,
 'her': 77,
 'more': 78,
 'should': 79,
 'us': 80,
 'can': 81,
 'president': 82,
 'him': 83,
 'church': 8

In [111]:
token_df = []
for sentence, tokens, labels in tzip(X, X_new, Y_new):
    assert len(tokens)==len(labels)
    bembeddings = torch.tensor(bert.transform([sentence])[0])[:,768:][:,768*11:768*12]
    cembeddings = custom.transform([sentence])[0][:,768:][:,768*11:768*12]
    words = re.sub(r'[^A-Za-z0-9 ]+', '', sentence).split(" ")
    assert bembeddings.shape==cembeddings.shape
    for i,token in enumerate(tokens):
        cosine_value, euclid_value = measure_similarity(bembeddings[i,:], cembeddings[i,:])
        if token.isalpha():
            is_word = 1
        else:
            is_word = 0
        token_df.append([token, is_word, cosine_value, euclid_value, labels[i]])

HBox(children=(FloatProgress(value=0.0, max=13485.0), HTML(value='')))




KeyboardInterrupt: 

In [None]:
df = df = pd.DataFrame.from_records(token_df)
df.columns = ["token", "is_word","cosine_value","euclid_value","label"]
df.head(1000)

In [None]:
df.shape

In [None]:
mean_std(df, 2)

In [None]:
filtered = df.loc[df["token"].isin(list(vocabulary.keys())[100:])]
filtered.head(100)

In [None]:
mean_std(filtered, 4)