In [14]:
import json
import torch
import pandas as pd
from transformers import *

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [17]:
# Location of the annotated corpus (JSON) that the next cell loads.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this NFS mount is available.
path = '/nfs/research/regan/data_propara/data/corenlp_fd_physical_completed.json'

In [90]:
# Read the annotation file in one go and parse it; the handle is closed as
# soon as the `with` block exits.
with open(path, 'r') as f:
    data = json.loads(f.read())
# Only the last expression of a cell is displayed, so the length below is
# evaluated but not shown; the entry stored under key "664" is.
len(data)
data["664"]

{'vnclass': 'bump-18.4',
 'syntax': 'Sbj V PathP',
 'fd1': 'Autonomous',
 'fd2': 'Place',
 'fd3': '',
 'example': 'The grocery cart hit against the wall',
 'domain': 'Physical',
 'pred_calc': '+MER(grocery cart) & PTH(grocery cart,wall)',
 'features': [{'text': 'The',
   'dep': 'det',
   'head': '3',
   'lemma': 'the',
   'children': [],
   'index': '1',
   'type': 'O',
   'pos': 'DT'},
  {'text': 'grocery',
   'dep': 'compound',
   'head': '3',
   'lemma': 'grocery',
   'children': [],
   'index': '2',
   'type': 'O',
   'pos': 'NN'},
  {'text': 'cart',
   'dep': 'nsubj',
   'head': '4',
   'lemma': 'cart',
   'children': ['The', 'grocery'],
   'index': '3',
   'type': 'O',
   'pos': 'NN',
   'q_value': '+MER',
   'fd_relation': 'PTH',
   'fd_subsequent': '7',
   'subevent_aspect': 'CycAch'},
  {'text': 'hit',
   'dep': 'ROOT',
   'head': '0',
   'lemma': 'hit',
   'children': ['cart', 'wall'],
   'index': '4',
   'type': 'O',
   'pos': 'VBD',
   'vnclass': '18.4'},
  {'text': 'agains

In [94]:
# Print the id and example sentence of every corpus entry that mentions "Doug".
for k, v in data.items():
    example = v["example"]
    if "Doug" not in example:
        continue
    print(k, example)

1136 Doug cleaned the dishes from the table
1142 Doug removed the smudges
1143 Doug removed the smudges from the tabletop
1167 Doug cleaned the table of dishes


In [141]:
def create_causal_chain(features):
    """Order the ROOT token's annotated arguments into a causal chain.

    An "entity" is a token that (a) is listed by text in the ROOT token's
    ``children``, (b) has the ROOT token's index as its ``head`` and
    (c) carries a ``q_value``. An entity's optional ``fd_subsequent`` holds the
    index of the entity that follows it in the chain; an empty string is
    treated the same as a missing key.

    Parameters
    ----------
    features : list of dict
        Per-token annotations of one sentence. Every token is read for
        ``"dep"``, ``"text"`` and ``"head"``; the ROOT token also for
        ``"children"`` and ``"index"``; entities also for ``"q_value"``,
        ``"index"`` and, when present, ``"fd_subsequent"``.

    Returns
    -------
    list of dict
        Entities as ``{"text", "q_value", "index"[, "fd_subsequent"]}`` with
        ``index`` / ``fd_subsequent`` converted to ``int``. Entities without a
        successor come last (in sentence order); each entity pointing at an
        already placed entity is put at the front, so the list reads from the
        start of the chain to its end. Entities whose ``fd_subsequent`` never
        reaches a placed entity are left out. Returns ``[]`` when there is no
        ROOT token or no entity.
    """
    # Locate the root; if several tokens are tagged ROOT the last one wins.
    root = None
    for token in features:
        if token["dep"] == "ROOT":
            root = token
    if root is None:
        # Nothing to anchor the chain on (previously an UnboundLocalError).
        return []
    root_children = root["children"]
    root_index = root["index"]

    # Collect the root's direct arguments that carry a q_value annotation.
    entities = []
    for token in features:
        is_root_argument = token["text"] in root_children and token["head"] == root_index
        if not is_root_argument or "q_value" not in token:
            continue
        entity = {
            "text": token["text"],
            "q_value": token["q_value"],
            "index": int(token["index"]),
        }
        if token.get("fd_subsequent", "") != "":
            entity["fd_subsequent"] = int(token["fd_subsequent"])
        entities.append(entity)

    # Split into chain ends (no successor) and entities still to be placed.
    # Building two new lists avoids removing from `entities` while iterating
    # over it, which used to skip the element after every removal.
    ordered_causal_chain = [ent for ent in entities if "fd_subsequent" not in ent]
    pending = [ent for ent in entities if "fd_subsequent" in ent]
    frontier = [ent["index"] for ent in ordered_causal_chain]

    # Walk backwards from the chain ends: whoever points at an already placed
    # index is its predecessor and goes to the front of the chain.
    while frontier:
        idx = frontier.pop(0)
        still_pending = []
        for ent in pending:
            if ent["fd_subsequent"] == idx:
                ordered_causal_chain.insert(0, ent)
                frontier.append(ent["index"])
            else:
                still_pending.append(ent)
        # Each entity is placed at most once, so the loop always terminates
        # and no entity can be inserted twice.
        pending = still_pending

    return ordered_causal_chain

# Turn every corpus entry into a flat record: its ordered causal chain, the
# example sentence, the syntax pattern and the space-joined fd labels.
all_items = []

for k, v in data.items():
    cc = create_causal_chain(v["features"])
    fd_labels = [v["fd1"], v["fd2"]]
    if v["fd3"] != "":
        # The third label is optional and only added when it is non-empty.
        fd_labels.append(v["fd3"])
    fd = " ".join(fd_labels)
    item = dict(cc=cc, sentence=v["example"], syntax=v["syntax"], fd=fd)
    all_items.append(item)

# Display one record as a spot check.
all_items[200]

{'cc': [{'text': 'jeweler', 'q_value': 'VOL', 'index': 2, 'fd_subsequent': 6},
  {'text': 'NI1', 'q_value': 'INTL', 'index': 6, 'fd_subsequent': 5},
  {'text': 'ring', 'q_value': '+MER', 'index': 5}],
 'sentence': 'The jeweler decorated the ring',
 'syntax': 'Sbj V Obj',
 'fd': 'Volitional Provide'}

In [None]:
from transformers import AutoModel, AutoTokenizer, BertTokenizer

# Switch autograd off globally; the cells visible in this notebook only run
# forward passes through the model.
torch.set_grad_enabled(False)

# Name of the pretrained checkpoint shared by the model and the tokenizer.
MODEL_NAME = "bert-base-uncased"

# Load the pretrained model and its matching tokenizer from that checkpoint.
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [146]:
# Walk one record through tokenization and a forward pass, printing each step.
item = all_items[0]
print(item)

# The tokenizer splits the sentence into the pieces of the model's vocabulary.
tokens = tokenizer.tokenize(item["sentence"])
print("Tokens: {}".format(tokens))

# The model takes integers as input, not strings, so convert tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the required special tokens (ids 101 and 102 in the recorded output).
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# Wrap the ids in a batch of size one as a PyTorch tensor.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Now we're ready to run the model on our input.
# Index the result instead of tuple-unpacking it: positional indexing works
# both when the model returns a plain tuple and when it returns a dict-like
# output object (where unpacking would yield keys) or more than two elements.
model_output = model(tokens_pt)
outputs, pooled = model_output[0], model_output[1]
print("Token wise output: {}, Pooled output: {}".format(outputs.shape, pooled.shape))
print()

{'cc': [{'text': 'cart', 'q_value': '+MER', 'index': 3, 'fd_subsequent': 7}, {'text': 'wall', 'q_value': 'EXIST', 'index': 7}], 'sentence': 'The grocery cart hit against the wall', 'syntax': 'Sbj V PathP', 'fd': 'Autonomous Place'}
Tokens: ['the', 'grocery', 'cart', 'hit', 'against', 'the', 'wall']
Tokens id: [1996, 13025, 11122, 2718, 2114, 1996, 2813]
Tokens PyTorch: tensor([[  101,  1996, 13025, 11122,  2718,  2114,  1996,  2813,   102]])
Token wise output: torch.Size([1, 9, 768]), Pooled output: torch.Size([1, 768])



In [43]:
# # model_class = BertForSequenceClassification
# model_class = BertForTokenClassification
# pretrained_weights = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# len(tokenizer)

30522

In [38]:
# Models can return the full list of hidden states and attention weights at
# each layer.
# `model_class` and `pretrained_weights` were only defined in the
# commented-out cell above, so this cell raised NameError on a fresh kernel;
# they are restored here with the values recorded there.
# NOTE(review): confirm BertForTokenClassification (rather than the also
# commented-out BertForSequenceClassification) is the intended class.
model_class = BertForTokenClassification
pretrained_weights = 'bert-base-uncased'
model = model_class.from_pretrained(pretrained_weights,
                                    output_hidden_states=True,
                                    output_attentions=True)

In [44]:
# Show how the tokenizer splits a sentence; in the recorded output "encode"
# is broken into the pieces 'en' and '##code'.
tokenizer.tokenize("This is the text to encode")

['this', 'is', 'the', 'text', 'to', 'en', '##code']

In [47]:
# Earlier variant kept for reference:
# input_ids = torch.tensor([tokenizer.encode("This is the text to encode", add_special_tokens=True)])
# all_hidden_states, all_attentions = model(input_ids)[-2:]

# Encode one example sentence into ids and wrap it as a batch of size one.
# NOTE(review): `df` is not defined in any cell visible here; presumably a
# DataFrame with an "example" column. Confirm it is created before this cell
# runs, otherwise this raises NameError on a fresh kernel.
input_ids = torch.tensor([tokenizer.encode(df["example"][0])])

In [51]:
# Keep the last two entries of the model output.
# NOTE(review): presumably the per-layer hidden states and attention weights
# requested via output_hidden_states / output_attentions when the model was
# loaded; confirm the order for the model class in use.
all_hidden_states, all_attentions = model(input_ids)[-2:]