BERT Representation Segment

In [2]:
# Running example: a paper abstract about HBase/Hadoop deployment, used as the input text of the preprocessing cell below.
sample_text = 'Deploying a large-scale distributed ecosystem such as HBase/Hadoop in the cloud is complicated and error-prone. Multiple layers of largely independently evolving software are deployed across distributed nodes on third party infrastructures. In addition to software incompatibility and typical misconfiguration within each layer, many subtle and hard to diagnose errors happen due to misconfigurations across layers and nodes. These errors are difficult to diagnose because of scattered log management and lack of ecosystem-awareness in many diagnosis tools and processes. We report on some failure experiences in a real world deployment of HBase/Hadoop and propose some initial ideas for better trouble-shooting during deployment. We identify the following types of subtle errors and the corresponding challenges in trouble-shooting: 1) dealing with inconsistency among distributed logs, 2) distinguishing useful information from noisy logging, and 3) probabilistic determination of root causes.'

In [56]:
%% script false --no-raise-error
import re
from gensim.parsing.preprocessing import remove_stopwords

def text_preprocess(text):
    """Normalise raw text before it is tokenized.

    Applies, in this order: removal of every run of digits, case-folding,
    and gensim's stop-word removal.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    without_digits = re.sub(r'\d+', '', text)
    folded = without_digits.casefold()
    return remove_stopwords(folded)

In [70]:
%% script false --no-raise-error
# Clean the sample abstract, then wrap it in the '[CLS]' / '[SEP]' markers
# that delimit a BERT input sequence.
cleaned_text = text_preprocess(sample_text)
processed_text = f'[CLS] {cleaned_text} [SEP]'

In [116]:
%% script false --no-raise-error
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer matching the uncased BERT-base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split the text into word pieces and look up the vocabulary id of each piece.
tokenized_text = tokenizer.tokenize(processed_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Show every word piece next to its id.
for piece, piece_id in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(piece, piece_id))

# One segment id per token; the whole text is treated as a single segment
# and every token gets id 1.
segments_id = [1 for _ in tokenized_text]

[CLS]           101
deploy       21,296
##ing         2,075
large         2,312
-             1,011
scale         4,094
distributed   5,500
ecosystem    16,927
h             1,044
##base       15,058
/             1,013
had           2,018
##oop        18,589
cloud         6,112
complicated   8,552
error         7,561
-             1,011
prone        13,047
.             1,012
multiple      3,674
layers        9,014
largely       4,321
independently  9,174
evolving     20,607
software      4,007
deployed      7,333
distributed   5,500
nodes        14,164
party         2,283
infrastructure  6,502
##s           2,015
.             1,012
addition      2,804
software      4,007
inc           4,297
##omp        25,377
##ati        10,450
##bility      8,553
typical       5,171
mis          28,616
##con         8,663
##fi          8,873
##gur        27,390
##ation       3,370
layer         6,741
,             1,010
subtle       11,259
hard          2,524
dia          22,939
##gno        26,7

In [13]:
%% script false --no-raise-error
# Load the pretrained encoder and ask it to return the hidden states of
# every layer instead of only the final one.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)
# Switch the module to evaluation (inference) mode. As the last expression
# of the cell, the call also displays the module structure.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [117]:
%% script false --no-raise-error
# Turn the id lists into tensors with a leading batch dimension of size 1.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensor = torch.tensor(segments_id).unsqueeze(0)

In [118]:
%% script false --no-raise-error
# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    output = model(tokens_tensor, segments_tensor)

# Element 2 of the model output holds the hidden states of all layers.
hidden_states = output[2]

In [74]:
%% script false --no-raise-error
# Walk down the nesting of `hidden_states` one level at a time:
# layer -> batch (sequence) -> token -> hidden unit.
layer_i, batch_i, token_i = 0, 0, 0

print("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")

first_layer = hidden_states[layer_i]
print("Number of batches (sequences):", len(first_layer))

first_sequence = first_layer[batch_i]
print("Number of tokens:", len(first_sequence))

first_token = first_sequence[token_i]
print("Number of hidden units (tensor size):", len(first_token))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches (sequences): 1
Number of tokens: 134
Number of hidden units (tensor size): 768


In [62]:
%% script false --no-raise-error
# `hidden_states` is a tuple with one entry per layer.
print('      Type of hidden_states: ', type(hidden_states))

# Every entry is a torch tensor; show the shape of the first one.
print('Tensor shape for each layer: ', hidden_states[0].shape)

      Type of hidden_states:  <class 'tuple'>
Tensor shape for each layer:  torch.Size([1, 132, 768])


In [75]:
%% script false --no-raise-error
# Combine the per-layer tensors into a single tensor. `stack` adds a new
# leading dimension that indexes the layers.
token_embeddings = torch.stack(hidden_states)

token_embeddings.shape

torch.Size([13, 1, 134, 768])

In [76]:
%% script false --no-raise-error
# Drop dimension 1 (the batch axis): only one sequence was encoded.
token_embeddings = token_embeddings.squeeze(dim=1)

token_embeddings.shape

torch.Size([13, 134, 768])

In [77]:
%% script false --no-raise-error
# Reorder the axes from [layers, tokens, hidden units]
# to [tokens, layers, hidden units].
# Note: this rebinds `token_embeddings`, so running the cell a second time
# swaps the two axes back.
token_embeddings = token_embeddings.transpose(0, 1)

token_embeddings.shape

torch.Size([134, 13, 768])

In [78]:
%% script false --no-raise-error
# Represent each token by the sum of its hidden states from the last 4 layers.
#
# `token_embeddings` is a [tokens x 13 x 768] tensor, so iterating over it
# yields one [13 x 768] tensor (layers x hidden units) per token.
token_vecs_sum = [torch.sum(layers[-4:], dim=0) for layers in token_embeddings]

print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 134 x 768


In [129]:
%% script false --no-raise-error
# Length of a single summed token vector.
token_vecs_sum[0].size(0)

768

In [79]:
%% script false --no-raise-error
# Positions of every occurrence of the word piece 'distributed'.
index_of_token = [pos for pos, piece in enumerate(tokenized_text) if piece == 'distributed']

# Average the vectors of all occurrences into one vector for the word.
token_vecs = [token_vecs_sum[pos] for pos in index_of_token]
token_tensors = torch.stack(token_vecs)
token_vecs_avg = token_tensors.mean(dim=0)
token_vecs_avg.shape

torch.Size([768])

In [80]:
%% script false --no-raise-error
from scipy.spatial.distance import cosine

# Cosine distance between the vectors of three occurrences of the same word
# piece, printed pair by pair.
for left, right in ((0, 1), (2, 1), (0, 2)):
    print(cosine(token_vecs_sum[index_of_token[left]], token_vecs_sum[index_of_token[right]]))

0.20884549617767334
0.2738270163536072
0.32876014709472656


In [81]:
%% script false --no-raise-error
# Cosine distance from each occurrence's vector to the averaged vector.
for occurrence in index_of_token:
    print(cosine(token_vecs_sum[occurrence], token_vecs_avg))

0.09470295906066895
0.07165509462356567
0.11758136749267578


In [55]:
%% script false --no-raise-error
# Check how the tokenizer splits single- and multi-word terms into word pieces.
sample_term = [
    'misconfiguration',
    'hadoop',
    'infrastructures',
    'deploying',
    'cloud computing',
    'face recognition',
    'embeddings',
]
for term in sample_term:
    print(tokenizer.tokenize(term))

['mis', '##con', '##fi', '##gur', '##ation']
['had', '##oop']
['infrastructure', '##s']
['deploy', '##ing']
['cloud', 'computing']
['face', 'recognition']
['em', '##bed', '##ding', '##s']


In [82]:
%% script false --no-raise-error
# Second example abstract (object recognition); it is combined with `sample_text` into `text_list` in the next cell.
sample_text_2 = 'Several investigations [11,16,19–21] have recently been undertaken into object recognition based on matching image intensity neighborhoods rather than geometric matching of features extracted from the images. These projects have used small subwindows or complete image regions and matching has been based on the similarity of extracted descriptors to previously stored descriptors. One characteristic common to these approaches is the representation of objects as a whole, rather than as a structured ensemble. This paper describes an extension to these approaches wherein a set of related features recognized at an earlier iteration also contribute to the complete object recognition. The paper describes an iconic, or image-based, matching approach that incorporates an element of geometric matching and shows that use of the subfeatures improves matching efficiency, position accuracy and completeness.'

In [105]:
%% script false --no-raise-error
# Documents to encode, and the keywords whose representations are extracted
# from them.
text_list = [
    sample_text,
    sample_text_2,
]
kw_list = [
    'object recognition',
    'geometric matching',
    'distributed ecosystem',
    'distributed nodes',
]

In [122]:
%% script false --no-raise-error
def encode_text(text, model, tokenizer):
    """Encode `text` and return one summed vector per token.

    The text is wrapped in '[CLS]' / '[SEP]', tokenized, and passed through
    `model` as a single sequence whose segment ids are all 1. Each token is
    then represented by the sum of its hidden states from the last four
    layers. The resulting shape is also printed.

    Args:
        text: Raw input string.
        model: Callable invoked as `model(tokens, segments)`; element 2 of
            its output must be the sequence of per-layer hidden states
            (presumably a BertModel loaded with output_hidden_states=True).
        tokenizer: Object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        A pair `(tokenized_text, token_vecs_sum)`: the list of tokens and a
        list holding one 1-D tensor per token.
    """
    tokenized_text = tokenizer.tokenize('[CLS] ' + text + ' [SEP]')
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    segment_ids = [1] * len(tokenized_text)

    # The outer list adds the batch dimension (a batch of one sequence).
    tokens_tensor = torch.tensor([token_ids])
    segments_tensor = torch.tensor([segment_ids])

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        hidden_states = model(tokens_tensor, segments_tensor)[2]

    # [layers, 1, tokens, hidden] -> [layers, tokens, hidden]
    #                             -> [tokens, layers, hidden]
    per_token_layers = (
        torch.stack(hidden_states, dim=0)
        .squeeze(dim=1)
        .permute(1, 0, 2)
    )

    # Each item is a [layers x hidden] tensor; sum its last four layers.
    token_vecs_sum = [torch.sum(layers[-4:], dim=0) for layers in per_token_layers]

    print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

    return tokenized_text, token_vecs_sum

In [125]:
%% script false --no-raise-error
def get_term_representation(encoded_text, tokenized_text, term, tokenizer):
    """Collect one vector per occurrence of `term` in a tokenized text.

    `term` is tokenized with `tokenizer`, and every position where that
    token sequence occurs contiguously in `tokenized_text` counts as one
    occurrence. The occurrence is represented by the mean of the matching
    token vectors. The number of occurrences is also printed.

    Args:
        encoded_text: List of per-token tensors, parallel to `tokenized_text`.
        tokenized_text: List of tokens of the document.
        term: The (possibly multi-word) term to look for.
        tokenizer: Object providing `tokenize`.

    Returns:
        A list with one tensor per occurrence; empty when the term does not
        occur or tokenizes to nothing.
    """
    tokenized_term = tokenizer.tokenize(term)
    term_length = len(tokenized_term)

    term_representations = []

    # A term that tokenizes to nothing would "match" an empty window at every
    # position, and torch.stack raises on an empty list. Treat it as a term
    # with zero occurrences instead.
    if term_length > 0:
        # Stop at the last position where a full-length window still fits;
        # shorter windows at the tail can never equal the term.
        for start in range(len(tokenized_text) - term_length + 1):
            end = start + term_length
            if tokenized_text[start:end] == tokenized_term:
                term_token_tensors = torch.stack(encoded_text[start:end])
                term_representations.append(torch.mean(term_token_tensors, dim=0))

    print('Term appearances: %d' % len(term_representations))

    return term_representations

In [127]:
%% script false --no-raise-error
import pandas as pd

# Encode every document once, keeping tokens and token vectors in two
# parallel lists.
corpus_tokenized = []
corpus_token_vecs = []

# A single row labelled 1975 and one column per keyword.
term_vector_df = pd.DataFrame(columns=kw_list, index=[1975])

for text in text_list:
    tokenized_text, token_vecs = encode_text(text, model=model, tokenizer=tokenizer)
    corpus_tokenized.append(tokenized_text)
    corpus_token_vecs.append(token_vecs)

for term in kw_list:
    # Gather the occurrences of the keyword across all documents.
    term_representations = []
    for doc_token_vecs, doc_tokens in zip(corpus_token_vecs, corpus_tokenized):
        term_representations += get_term_representation(encoded_text=doc_token_vecs,
                                                        tokenized_text=doc_tokens,
                                                        term=term, tokenizer=tokenizer)

    print('Total Appearances: %d' % len(term_representations))

    # NOTE(review): torch.stack raises on an empty list, so a keyword that
    # occurs in none of the texts stops the cell here.
    term_final_vector = torch.stack(term_representations).mean(dim=0)

    # Cast the column to object dtype before storing the tensor in the cell.
    term_vector_df[term] = term_vector_df[term].astype(object)
    term_vector_df.at[1975, term] = term_final_vector

Shape is: 192 x 768
Shape is: 158 x 768
Term appearances: 0
Term appearances: 2
Total Appearances: 2
Term appearances: 0
Term appearances: 2
Total Appearances: 2
Term appearances: 1
Term appearances: 0
Total Appearances: 1
Term appearances: 1
Term appearances: 0
Total Appearances: 1


Term Context Evolution Experiment

In [183]:
import pickle

# Load the precomputed term representations (SciBERT variant).
# Alternative input file: 'term_representation_1985_2005.pickle'.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# from a trusted source.
with open('term_representation_scibert_1985_2005.pickle', 'rb') as pickle_file:
    term_vector_df = pickle.load(pickle_file)

In [184]:
# The keywords are the column labels of the loaded frame.
kw_list = list(term_vector_df.columns)

In [185]:
# Keep only the rows that have a value for every keyword.
term_vector_df = term_vector_df.dropna(axis=0)

In [186]:
# Imports for the context-evolution experiment.
# `pandas` and `torch` are used by the cells below, but the only earlier
# cells that import them are disabled with the `script false` cell magic, so
# they are imported here to keep a fresh-kernel run working.
import warnings

import numpy as np
import pandas as pd
import torch
from scipy.spatial.distance import cosine
from scipy.stats import linregress
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Sliding-window configuration, in years.
time_past_buffer = 10  # number of preceding years compared with the current year
time_start = 1995      # first year evaluated
time_end = 2004        # last year evaluated (inclusive)

In [187]:
# For every keyword and every year in [time_start, time_end]: compare the
# year's vector with the vectors of the preceding `time_past_buffer` years
# and store the slope of a linear fit through those similarities.
context_evo_df = pd.DataFrame(columns=kw_list, index=range(time_start, time_end+1))

for kw in kw_list:
    print(kw)
    for current_time in range(time_start, time_end+1):
        print(f'Current Time: {current_time}')
        window_start = current_time - time_past_buffer

        # Label-based slice, so both end points are included.
        tensor_list = term_vector_df.loc[window_start:current_time, kw].tolist()
        zero_tensor = torch.zeros(tensor_list[0].shape[0], dtype=torch.float)

        # Forward-fill: an all-zero vector is replaced by the vector of the
        # previous position. The first entry has no predecessor and stays
        # as it is.
        for pos in range(1, len(tensor_list)):
            if torch.equal(tensor_list[pos], zero_tensor):
                tensor_list[pos] = tensor_list[pos-1]

        vector_np_list = [tensor.numpy() for tensor in tensor_list]

        # Row `time_past_buffer` is taken as the current year and its first
        # `time_past_buffer` columns as the earlier years.
        # NOTE(review): assumes no year is missing from the window - confirm.
        period_similarity = cosine_similarity(vector_np_list)
        past_similarity = period_similarity[time_past_buffer][:time_past_buffer]
        print('Similarity Value: ', past_similarity)

        # NOTE(review): linregress(x, y) receives the similarities as x and
        # the years as y, so the slope is in years per unit of similarity.
        # Its sign matches the similarity-over-time trend - confirm the
        # argument order is intended.
        slope_value = linregress(past_similarity, range(window_start, current_time)).slope

        print('Slope Value: ', slope_value)
        print('---------------------------\n')
        context_evo_df.at[current_time, kw] = slope_value

with open('context_evo.pickle', 'wb') as f:
    pickle.dump(context_evo_df, f)

neural network
Current Time: 1995
Similarity Value:  [0.         0.         0.8248844  0.9451669  0.9292238  0.9600707
 0.96855664 0.9357511  0.9509116  0.93303317]
Slope Value:  5.607118856039813
---------------------------

Current Time: 1996
Similarity Value:  [0.         0.8161502  0.9416206  0.9377596  0.9613466  0.96860385
 0.93977106 0.9676707  0.9352894  0.97609425]
Slope Value:  6.018899242245281
---------------------------

Current Time: 1997
Similarity Value:  [0.8009435  0.9591466  0.9465803  0.97618496 0.9765601  0.94945705
 0.95451987 0.91591907 0.97428167 0.9784683 ]
Slope Value:  28.23357224926642
---------------------------

Current Time: 1998
Similarity Value:  [0.9534458  0.94429207 0.9719892  0.97256875 0.94729185 0.9588229
 0.9262146  0.97618145 0.98231214 0.98365885]
Slope Value:  67.69470743015243
---------------------------

Current Time: 1999
Similarity Value:  [0.93973243 0.9693913  0.9701036  0.9386368  0.95983255 0.9198681
 0.9749342  0.98595285 0.9829153  0

In [None]:
def calculate_regression(data_series, time_span_onward = 3):
    """Least-squares slope of the series over each forward-looking window.

    For every start position `i`, the slope of the values against their
    index labels is computed over the window `[i, i + time_span_onward)`.
    Start positions run over `range(len(data_series) - time_span_onward)`,
    so the last full window (the one ending at the final element) is not
    evaluated.

    Args:
        data_series: pandas Series with numeric values and a numeric index
            (the index labels are used as the x values).
        time_span_onward: Number of consecutive points in each window.

    Returns:
        A pandas Series of slopes, indexed by the label of each window's
        first point.

    Raises:
        ZeroDivisionError: If a window has no spread in its index labels
            (for example `time_span_onward` of 0 or 1).
    """
    time_stamp = data_series.index.tolist()
    array_data = data_series.tolist()
    slope_list = []
    for i in range(len(array_data) - time_span_onward):
        # Slice with the configured span. A hard-coded width here would make
        # the sums below disagree with the window means for any other span.
        window_times = time_stamp[i:i+time_span_onward]
        window_values = array_data[i:i+time_span_onward]
        period_mean = sum(window_times)/time_span_onward
        data_mean = sum(window_values)/time_span_onward
        dividend = 0
        divisor = 0
        for j, k in zip(window_times, window_values):
            dividend += (j - period_mean) * (k - data_mean)
            divisor += (j - period_mean) ** 2
        slope_list.append(dividend/divisor)
    # One slope per evaluated window, labelled by the window's first index.
    return pd.Series(slope_list, index=time_stamp[:len(slope_list)])

In [None]:
# Load the records from a gzip-compressed JSON Lines file.
# NOTE(review): absolute, machine-specific path - consider making it
# configurable.
data_path = '/Users/khoanguyen/Workspace/dataset/trendnert/trendnert_partial.gz'
trendnert_data = pd.read_json(data_path, lines=True, compression='gzip')

# Rows that carry a label, and the distinct labels that occur in them.
has_label = trendnert_data['label'].notnull()
trendnert_data_not_null = trendnert_data.loc[has_label]

label_list = trendnert_data_not_null['label'].unique().tolist()

In [None]:
# Number of labelled records per year (1995 <= year < 2010) for every label.
yearly_counts = {}
for label in label_list:
    # Plain substring match of the label text (no regex).
    label_specific_df = trendnert_data_not_null.loc[trendnert_data_not_null['label'].str.contains(label, regex=False)]
    in_period = (label_specific_df['year'] >= 1995) & (label_specific_df['year'] < 2010)
    yearly_counts[label] = label_specific_df.loc[in_period, 'year'].value_counts().sort_index()

# Building the frame from the dict aligns the columns on the union of all
# years. Assigning the columns one by one to an empty frame would keep only
# the years of the first label and drop any other year.
topic_frequency_df = pd.DataFrame(yearly_counts, columns=label_list).sort_index()

# Slope of the yearly counts per label; years without records count as 0.
freq_regression_df = pd.DataFrame(
    {label: calculate_regression(topic_frequency_df[label].fillna(0)) for label in label_list},
    columns=label_list,
)

In [None]:
from sklearn import metrics

def evaluate_results(topic_kw_map, freq_regression_df, context_evo_df):
    """Print a classification report for every keyword/label pair.

    For each keyword column of `context_evo_df`, the labels of all topics
    that list the keyword are looked up in `topic_kw_map`. A positive
    context-evolution slope is the prediction and a positive frequency slope
    is the reference; both are compared position by position.

    Args:
        topic_kw_map: Iterable of mappings with a 'label' and a 'keywords'
            entry.
        freq_regression_df: Frame of frequency slopes, one column per label.
        context_evo_df: Frame of context-evolution slopes, one column per
            keyword.

    Returns:
        None. All results are printed.
    """
    for kw in context_evo_df.columns.tolist():
        print('Keyword: ', kw)
        mapped_label = [topic['label'] for topic in topic_kw_map if kw in topic['keywords']]
        prediction = (context_evo_df[kw] > 0).tolist()
        for label in mapped_label:
            print('Corresponding Label: ', label)
            label_trend = (freq_regression_df[label] > 0).tolist()
            # The last two reference entries are dropped before comparing.
            # NOTE(review): presumably so both lists cover the same years -
            # confirm against the frames passed in.
            print(metrics.classification_report(label_trend[:-2], prediction))

In [None]:
import json
# Load the topic -> keyword map and print the evaluation for every keyword.
# The file is only parsed, so it is opened read-only; 'r+' would also
# require write permission on it.
# NOTE(review): absolute, machine-specific path - consider making it
# configurable.
with open('/Users/khoanguyen/Workspace/git/trendnert_experiment/kw_label_map_gensim.json', 'r') as f:
    topic_kw_map = json.load(f)

evaluate_results(topic_kw_map=topic_kw_map, freq_regression_df=freq_regression_df, context_evo_df=context_evo_df)