In [1]:
import os
import sys

# Make the project's `gipsy` package directory (a sibling of this notebook's
# folder) importable. os.path.join is used instead of string concatenation so
# the path separator is correct on every platform.
sys.path.insert(0, os.path.join(os.path.abspath('..'), 'gipsy'))

from data_reader import DataReader
from data_reader import convert_docs
from utils import find_mrc_word
from gist import GIST

In [2]:
# Read the input documents with the project reader, then convert them into
# the DataFrame `df` that the following cells work on.
reader = DataReader()
doc = reader.load_input_files()
df = convert_docs(doc)

In [3]:
# Preview up to the first 50 rows of the frame.
df.head(50)

Unnamed: 0,d_id,p_id,sen_id,token_id,token_text,token_lemma,token_pos
0,0,0,0,0,Residents,resident,NOUN
1,0,0,0,1,are,be,AUX
2,0,0,0,2,clamoring,clamor,VERB
3,0,0,0,3,to,to,PART
4,0,0,0,4,see,see,VERB
5,0,0,0,5,whether,whether,SCONJ
6,0,0,0,6,the,the,DET
7,0,0,0,7,virus,virus,NOUN
8,0,0,0,8,has,have,AUX
9,0,0,0,9,been,be,AUX


In [4]:
# Number of rows in `df`.
len(df)

99

In [4]:
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer

# Surface form of every token, in document order.
doc_context = df['token_text'].tolist()
verbs_indices = []  # not used in this cell; kept in case another cell references it

# Inference only: switch autograd off for the rest of the kernel session.
torch.set_grad_enabled(False)

tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
model = AutoModel.from_pretrained('bert-large-uncased')

# Re-join the words so the whole document is encoded as one sequence.
# NOTE(review): no truncation is applied here; documents longer than the
# model's maximum sequence length will fail in the forward pass - confirm
# the inputs are short enough.
doc_string = ' '.join(doc_context)
tokens = tokenizer.tokenize(doc_string)
print("Tokens: {}".format(tokens))

# This is not sufficient for the model, as it requires integers as input,
# not a problem, let's convert tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the required special tokens
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.
# Wrapping the id list in another list adds the batch dimension (batch of one).
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Now we're ready to go through BERT with our input.
# Index the result instead of tuple-unpacking it: integer indexing works both
# when the model returns a plain tuple and when it returns a dict-like
# ModelOutput (unpacking the latter would yield its string keys).
model_output = model(tokens_pt)
outputs, pooled = model_output[0], model_output[1]
print("Token wise output: {}, Pooled output: {}".format(outputs.shape, pooled.shape))

Tokens: ['residents', 'are', 'cl', '##amo', '##ring', 'to', 'see', 'whether', 'the', 'virus', 'has', 'been', 'detected', 'in', 'their', 'neighborhoods', 'so', 'they', 'can', 'take', 'more', 'steps', 'to', 'avoid', 'any', 'contact', '.', 'american', 'researchers', 'are', 'starved', 'for', 'data', ',', 'unlike', 'their', 'colleagues', 'in', 'other', 'countries', 'who', 'are', 'harness', '##ing', 'rivers', 'of', 'information', 'from', 'their', 'more', 'centralized', 'medical', 'systems', '.', 'and', 'local', 'politicians', 'complain', 'that', 'they', 'can', 'not', 'provide', 'basic', 'information', 'on', 'the', 'spread', 'of', 'the', 'virus', 'to', 'their', 'constituents', '.', 'in', 'the', 'perennial', 'tug', '-', 'of', '-', 'war', 'between', 'privacy', 'and', 'transparency', 'in', 'the', 'united', 'states', ',', 'privacy', 'appears', 'to', 'be', 'winning', 'in', 'the', 'corona', '##virus', 'pan', '##de', '##mic', '.']
Tokens id: [3901, 2024, 18856, 22591, 4892, 2000, 2156, 3251, 1996, 7

In [5]:
# `outputs` was computed for a batch of one sequence, so indexing [0] leaves
# one hidden vector per input position.
last_hidden_states = outputs[0]

# The encoded sequence is the word-pieces wrapped in two special tokens (one
# at the start, one at the end). Verify that before relying on the +1 offset
# below; this also catches `outputs` left over from a different cell.
assert last_hidden_states.shape[0] == len(tokens) + 2, \
    "expected {} hidden states, got {}".format(len(tokens) + 2, last_hidden_states.shape[0])

# creating a list of [word-piece, embedding] pairs; position 0 holds the
# leading special token, hence the shift by one.
token_embeddings = [
    [piece, last_hidden_states[position + 1]]
    for position, piece in enumerate(tokens)
]
assert len(tokens) == len(token_embeddings)

In [6]:
# Size of the first entry of `last_hidden_states`.
last_hidden_states[0].size()

torch.Size([1024])

In [8]:
import copy
import itertools
import statistics
import torch.nn as nn

cosine = nn.CosineSimilarity(dim=0)

# Walk the words of `df` and the word-piece list in lock-step, keeping one
# embedding per word whose POS tag is in `pos_tags` (mean of its word-piece
# embeddings when the word was split into several pieces).
pos_tags = ['VERB']
verb_embeddings = []
i_token = 0  # index in `token_embeddings` of the current word's first word-piece
for i in range(len(df)):
    row = df.iloc[i]
    # How many word-pieces this word occupies. The document was tokenized as
    # the space-joined words, so tokenizing each word on its own yields its
    # share of pieces. Advancing by this count for EVERY word (not only verbs)
    # keeps the two sequences aligned when a non-verb is split as well.
    n_pieces = len(tokenizer.tokenize(row['token_text']))
    if row['token_pos'] in pos_tags:
        if n_pieces == 0:
            raise ValueError(
                "word {!r} at row {} produced no word-pieces".format(row['token_text'], i)
            )
        pieces = [token_embeddings[k][1] for k in range(i_token, i_token + n_pieces)]
        if n_pieces == 1:
            verb_embeddings.append(pieces[0])
        else:
            # getting embeddings of all sub-tokens of current token and then computing their mean
            verb_embeddings.append(torch.mean(torch.stack(pieces), dim=0))
    i_token += n_pieces

# every word-piece must have been consumed, otherwise words and pieces drifted apart
assert i_token == len(token_embeddings), \
    "consumed {} word-pieces but {} are available".format(i_token, len(token_embeddings))

# checking if we have the embeddings of all VERBs
# (compare the two counts; `assert a, b` would only use b as the failure message)
n_expected = int(df['token_pos'].isin(pos_tags).sum())
assert n_expected == len(verb_embeddings), \
    "expected {} embeddings, got {}".format(n_expected, len(verb_embeddings))

# computing the cosine similarity among all VERBs (every unordered pair once)
scores = [
    cosine(first, second).item()
    for first, second in itertools.combinations(verb_embeddings, r=2)
]
# statistics.mean raises StatisticsError when there are fewer than two verbs
print(statistics.mean(scores))

0.4427109473408797


In [65]:
# Size of the first collected verb embedding.
# NOTE(review): the recorded output below is torch.Size([]) (a 0-dim tensor),
# which does not match the 1024-element token embeddings shown elsewhere; it
# was presumably produced from a different kernel state - re-run to confirm.
verb_embeddings[0].size()

torch.Size([])

In [31]:
# Size of the embedding stored for the first word-piece
# (each entry of token_embeddings is a [word-piece, embedding] pair).
token_embeddings[0][1].size()

torch.Size([1024])

In [18]:
# Overall size of `last_hidden_states`.
last_hidden_states.size()

torch.Size([107, 1024])

In [12]:
# Python type of a single indexed element of `last_hidden_states`.
type(last_hidden_states[0][1])

torch.Tensor

In [8]:
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer

# NOTE(review): this cell rebinds `tokenizer`, `model`, `outputs` and
# `last_hidden_states`, which earlier cells also define; running cells out of
# order will mix the two models' results.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# `doc_context` is already a list of words, so tell the tokenizer the input is
# pre-split. `is_split_into_words` is the current name of the former
# `is_pretokenized` flag; without it a list of strings is treated as a batch
# of separate sequences. No special tokens are added, so the output positions
# correspond to word-pieces only.
inputs = tokenizer(doc_context, return_tensors="pt", is_split_into_words=True, add_special_tokens=False)
outputs = model(**inputs)

last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output (batch dimension still present)