In [1]:
import json
import torch
import pandas as pd
from transformers import *

#from transformers import AutoModel, AutoTokenizer, BertTokenizer

# Inference-only notebook: switch autograd off globally so the model calls in
# later cells do not record gradients.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7feb3ea79390>

In [2]:
# Location of the annotated-examples JSON file that is read with json.load in a later cell.
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere
# without editing this line.
path = '/nfs/research/regan/data_propara/data/corenlp_fd_physical_completed.json'

In [3]:
# Name of the pretrained checkpoint shared by the encoder and its tokenizer
MODEL_NAME = "bert-base-uncased"

# Instantiate the pretrained encoder, then the tokenizer that matches it
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other cell visible in this notebook.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
# Read the annotated examples. The encoding is given explicitly so decoding does
# not depend on the locale default of the machine running the notebook.
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Display one record (key "664") to show the structure of an entry.
data["664"]

{'vnclass': 'bump-18.4',
 'syntax': 'Sbj V PathP',
 'fd1': 'Autonomous',
 'fd2': 'Place',
 'fd3': '',
 'example': 'The grocery cart hit against the wall',
 'domain': 'Physical',
 'pred_calc': '+MER(grocery cart) & PTH(grocery cart,wall)',
 'features': [{'text': 'The',
   'dep': 'det',
   'head': '3',
   'lemma': 'the',
   'children': [],
   'index': '1',
   'type': 'O',
   'pos': 'DT'},
  {'text': 'grocery',
   'dep': 'compound',
   'head': '3',
   'lemma': 'grocery',
   'children': [],
   'index': '2',
   'type': 'O',
   'pos': 'NN'},
  {'text': 'cart',
   'dep': 'nsubj',
   'head': '4',
   'lemma': 'cart',
   'children': ['The', 'grocery'],
   'index': '3',
   'type': 'O',
   'pos': 'NN',
   'q_value': '+MER',
   'fd_relation': 'PTH',
   'fd_subsequent': '7',
   'subevent_aspect': 'CycAch'},
  {'text': 'hit',
   'dep': 'ROOT',
   'head': '0',
   'lemma': 'hit',
   'children': ['cart', 'wall'],
   'index': '4',
   'type': 'O',
   'pos': 'VBD',
   'vnclass': '18.4'},
  {'text': 'agains

In [17]:
def create_causal_chain(features):
    """Order the annotated arguments of the ROOT token into a causal chain.

    Parameters
    ----------
    features : list of dict
        Per-token annotations of one sentence. Every token is read for
        "text", "dep", "head" and "index"; the ROOT token is also read for
        "children". Tokens may additionally carry "q_value" and
        "fd_subsequent" (the index of the token that follows them in the
        chain, as a string; an empty string means "no successor").

    Returns
    -------
    list of dict
        One dict per chain participant with keys "text", "q_value",
        "index" (int) and, when the participant has a successor,
        "fd_subsequent" (int). Participants without a successor come last
        (in sentence order); every other participant appears before the
        participant its "fd_subsequent" points to. Candidates whose
        "fd_subsequent" cannot be traced to a participant already in the
        chain are left out. Returns an empty list when no token is tagged
        ROOT.
    """
    # Find the ROOT token; when several tokens are tagged ROOT the last one wins.
    root = None
    for token in features:
        if token["dep"] == "ROOT":
            root = token
    if root is None:
        return []
    root_children = root["children"]
    root_index = root["index"]

    # Candidates: direct dependents of ROOT that carry a "q_value" annotation.
    entities = []
    for token in features:
        if token["text"] not in root_children or token["head"] != root_index:
            continue
        if "q_value" not in token:
            continue
        entity = {
            "text": token["text"],
            "q_value": token["q_value"],
            "index": int(token["index"]),
        }
        # A missing or empty "fd_subsequent" marks the end of a chain.
        if token.get("fd_subsequent", "") != "":
            entity["fd_subsequent"] = int(token["fd_subsequent"])
        entities.append(entity)

    # Chain ends go in first; everything else waits until its successor is placed.
    # Two separate lists are built so nothing is removed from a list while it is
    # being iterated.
    ordered_causal_chain = [e for e in entities if "fd_subsequent" not in e]
    pending = [e for e in entities if "fd_subsequent" in e]

    # Walk the links backwards: for each placed index, move every pending entity
    # that points at it to the front of the chain, then look for its own
    # predecessors. Each entity is placed at most once, so the loop terminates.
    to_visit = [e["index"] for e in ordered_causal_chain]
    while to_visit:
        target = to_visit.pop(0)
        still_pending = []
        for candidate in pending:
            if candidate["fd_subsequent"] == target:
                ordered_causal_chain.insert(0, candidate)
                to_visit.append(candidate["index"])
            else:
                still_pending.append(candidate)
        pending = still_pending

    return ordered_causal_chain

# One record per example: the ordered causal chain, the raw sentence and its
# labels, plus empty slots that the tokenisation/encoding cell fills in later.
all_items = []

for record in data.values():
    chain = create_causal_chain(record["features"])

    # "fd1" and "fd2" are always joined; "fd3" is appended only when non-empty.
    fd_parts = [record["fd1"], record["fd2"]]
    if record["fd3"] != "":
        fd_parts.append(record["fd3"])

    all_items.append({
        "cc": chain,
        "sentence": record["example"],
        "syntax": record["syntax"],
        "fd": " ".join(fd_parts),
        "tokens": [],
        "tokens_pt": [],
        "tokens_ids": [],
        "token_wise_output": [],
        "pooled_output": [],
    })

all_items[200]

{'cc': [{'text': 'jeweler', 'q_value': 'VOL', 'index': 2, 'fd_subsequent': 6},
  {'text': 'NI1', 'q_value': 'INTL', 'index': 6, 'fd_subsequent': 5},
  {'text': 'ring', 'q_value': '+MER', 'index': 5}],
 'sentence': 'The jeweler decorated the ring',
 'syntax': 'Sbj V Obj',
 'fd': 'Volitional Provide',
 'tokens': [],
 'tokens_pt': [],
 'tokens_ids': [],
 'token_wise_output': [],
 'pooled_output': []}

In [21]:
# Tokenise every sentence and run it through the model, storing the
# intermediate and final results back on the item dict.
for item in all_items:
    # Split the sentence into the tokenizer's sub-word tokens.
    tokens = tokenizer.tokenize(item["sentence"])
    item["tokens"] = tokens

    # The model needs integers as input, so map the tokens to vocabulary ids.
    # The ids stored on the item do NOT include the special tokens.
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    item["tokens_ids"] = tokens_ids

    # Add the required special tokens (only the tensor below contains them).
    tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

    # Convert to a PyTorch tensor with a leading batch dimension of one.
    tokens_pt = torch.tensor([tokens_ids])
    item["tokens_pt"] = tokens_pt

    # Now we're ready to go through BERT with our input.
    # Slice instead of unpacking the return value directly: unpacking a
    # dict-like ModelOutput (returned by recent transformers releases) would
    # yield its keys rather than the tensors, while slicing works for both
    # plain tuples and ModelOutput objects.
    outputs, pooled = model(tokens_pt)[:2]
    item["token_wise_output"] = outputs
    item["pooled_output"] = pooled


In [22]:
# Build a DataFrame from the list of item dicts: one row per item, one column per dict key.
df = pd.DataFrame(all_items)

In [23]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,cc,sentence,syntax,fd,tokens,tokens_pt,tokens_ids,token_wise_output,pooled_output
0,"[{'text': 'cart', 'q_value': '+MER', 'index': ...",The grocery cart hit against the wall,Sbj V PathP,Autonomous Place,"[the, grocery, cart, hit, against, the, wall]","[[tensor(101), tensor(1996), tensor(13025), te...","[1996, 13025, 11122, 2718, 2114, 1996, 2813]","[[[tensor(-0.2603), tensor(0.5590), tensor(-0....","[[tensor(-0.8683), tensor(-0.2977), tensor(-0...."
1,"[{'text': 'type', 'q_value': '+MER', 'index': ...",That type of rope coiled easily around the post,Sbj V PathP,Autonomous Place,"[that, type, of, rope, coiled, easily, around,...","[[tensor(101), tensor(2008), tensor(2828), ten...","[2008, 2828, 1997, 8164, 24599, 4089, 2105, 19...","[[[tensor(-0.1397), tensor(-0.2141), tensor(0....","[[tensor(-0.7615), tensor(-0.0670), tensor(0.1..."
2,"[{'text': 'rope', 'q_value': '+MER', 'index': ...",The rope coiled around the post,Sbj V PathP,Autonomous Place,"[the, rope, coiled, around, the, post]","[[tensor(101), tensor(1996), tensor(8164), ten...","[1996, 8164, 24599, 2105, 1996, 2695]","[[[tensor(-0.0777), tensor(0.0408), tensor(0.1...","[[tensor(-0.6512), tensor(-0.0065), tensor(0.7..."
3,"[{'text': 'company', 'q_value': '+MER', 'index...",The company is wedging into new markets,Sbj V PathP,Autonomous Place,"[the, company, is, wed, ##ging, into, new, mar...","[[tensor(101), tensor(1996), tensor(2194), ten...","[1996, 2194, 2003, 21981, 4726, 2046, 2047, 6089]","[[[tensor(0.0615), tensor(0.1368), tensor(0.30...","[[tensor(-0.8246), tensor(-0.1659), tensor(0.2..."
4,"[{'text': 'computer', 'q_value': '+MER', 'inde...",The computer connected well to the network,Sbj V PathP,Autonomous Place,"[the, computer, connected, well, to, the, netw...","[[tensor(101), tensor(1996), tensor(3274), ten...","[1996, 3274, 4198, 2092, 2000, 1996, 2897]","[[[tensor(-0.0860), tensor(-0.0513), tensor(0....","[[tensor(-0.6017), tensor(-0.1227), tensor(0.6..."


In [None]:
# Persist the DataFrame as a pickle file.
# NOTE(review): the target is relative to the notebook's working directory and has no
# file extension; this cell shows no execution count, so it may never have been run.
df.to_pickle("../data/fd_ph")

In [43]:
# # model_class = BertForSequenceClassification
# model_class = BertForTokenClassification
# pretrained_weights = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# len(tokenizer)

30522

In [38]:
# `model_class` and `pretrained_weights` were only ever assigned in commented-out
# code, so this cell raised NameError on a fresh kernel. Define them here, using
# the class named in that commented-out code and the checkpoint already used for
# the tokenizer.
model_class = BertForTokenClassification
pretrained_weights = MODEL_NAME

# Models can return the full list of hidden states and attention weights at each layer.
# NOTE: this rebinds `model`, replacing the encoder loaded earlier in the notebook.
model = model_class.from_pretrained(pretrained_weights,
                                    output_hidden_states=True,
                                    output_attentions=True)

In [44]:
# Quick check of how the tokenizer splits a sample sentence into sub-word tokens.
tokenizer.tokenize("This is the text to encode")

['this', 'is', 'the', 'text', 'to', 'en', '##code']

In [47]:
# Encode the first sentence of the DataFrame as a batch of one, with special tokens.
# The frame built from `all_items` stores the text in the "sentence" column;
# there is no "example" column, so the previous lookup raised KeyError.
input_ids = torch.tensor([tokenizer.encode(df["sentence"].iloc[0], add_special_tokens=True)])

In [51]:
# Keep only the last two elements of the model output.
# NOTE(review): presumes `model` was created with output_hidden_states=True and
# output_attentions=True, so that these two elements are the per-layer hidden states
# and attention weights -- confirm which `model` is bound when this cell runs.
all_hidden_states, all_attentions = model(input_ids)[-2:]