In [1]:
import json
import pandas as pd
from collections import defaultdict

In [2]:
# Location of the JSON file read by the loading cell below.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
path = '/nfs/research/regan/data_propara/data/corenlp_propara_v9.json'

In [3]:
# JSON text is UTF-8 by specification, so name the encoding explicitly instead
# of inheriting whatever the platform's locale default happens to be.
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
# Number of top-level entries in the loaded mapping.
len(data)

3521

In [4]:
def create_causal_chain(features):
    """Collect the force-dynamic entities of one event and order them as a chain.

    Every feature whose ``"fd_relation"`` key is present and is not the empty
    string becomes an entity dict with the keys ``text``, ``q_value``,
    ``index``, ``dep``, ``fd_relation`` and ``fd_subsequent``.
    ``fd_subsequent`` is the ``index`` of the entity this one points at;
    ``-1`` marks a chain-final entity, whose relation is stored as the string
    ``"None"``.  A feature that carries a relation but has no
    ``"fd_subsequent"`` key is treated as chain-final too, so every returned
    entity always has both ``fd_relation`` and ``fd_subsequent``.

    Ordering of the result:

    1. chain-final entities (``fd_subsequent == -1``), in feature order;
    2. after them, breadth-first, the entities that point at an already
       placed entity (its predecessors), then their predecessors, and so on;
    3. finally, in feature order, any entity that was not reached this way
       (its ``fd_subsequent`` names an index that is not an entity, or it is
       part of a cycle).

    Each entity appears exactly once.

    Args:
        features: sequence of per-token dicts.  Only tokens with a non-empty
            ``"fd_relation"`` are read further; those must provide ``text``,
            ``q_value``, ``index`` and ``dep``.  ``index`` and
            ``fd_subsequent`` may be ints or int-like strings.

    Returns:
        list[dict]: the ordered entity dicts (empty if no token qualifies).

    Raises:
        KeyError: if a qualifying token lacks ``text``, ``q_value``,
            ``index`` or ``dep``.
        ValueError: if ``index`` or ``fd_subsequent`` is not int-like.
    """
    entities = []
    for feature in features:
        # Tokens without a relation label take no part in the chain.
        if feature.get("fd_relation", "") == "":
            continue

        entity = {
            "text": feature["text"],
            "q_value": feature["q_value"],
            "index": int(feature["index"]),
            "dep": feature["dep"],
        }
        # A missing pointer is handled like an explicit -1 (chain-final).
        subsequent = int(feature.get("fd_subsequent", -1))
        if subsequent != -1:
            entity["fd_subsequent"] = subsequent
            entity["fd_relation"] = feature["fd_relation"]
        else:
            entity["fd_relation"] = "None"
            entity["fd_subsequent"] = -1
        entities.append(entity)

    # Map "index pointed at" -> positions (in `entities`) of the entities that
    # point at it.  Positions rather than dict equality identify an entity, so
    # two entities with identical content are still kept apart.
    pointing_at = {}
    for position, entity in enumerate(entities):
        pointing_at.setdefault(entity["fd_subsequent"], []).append(position)

    ordered_causal_chain = []
    placed = set()

    # Walk backwards from the chain-final entities.  `pending` is a FIFO queue
    # read through `cursor`, so nothing is removed from a list while it is
    # being iterated.
    pending = list(pointing_at.get(-1, ()))
    cursor = 0
    while cursor < len(pending):
        position = pending[cursor]
        cursor += 1
        if position in placed:
            continue
        placed.add(position)
        entity = entities[position]
        ordered_causal_chain.append(entity)
        # Index 0 is a valid token index, so look predecessors up by key
        # instead of testing the index for truthiness.
        pending.extend(pointing_at.get(entity["index"], ()))

    # Keep entities with dangling or cyclic references instead of dropping them.
    ordered_causal_chain.extend(
        entity for position, entity in enumerate(entities) if position not in placed
    )

    return ordered_causal_chain

all_items = []

for k, v in data.items():
    # Entries whose key contains 'metadata' are not processed as events.
    if 'metadata' in k:
        continue

    cc = create_causal_chain(v["features"])
    if len(cc) <= 1:
        # Events whose chain has at most one entity are excluded.
        continue

    # Join the force-dynamic labels; fd3 is optional and skipped when empty.
    fd_parts = [v["fd1"], v["fd2"]]
    if v["fd3"] != "":
        fd_parts.append(v["fd3"])
    fd = " ".join(fd_parts)

    item = {
        "pk": k,
        "cc": cc,
        "sentence": v["example"],
        "syntax": v["syntax"],
        "fd": fd,
        "domain": v["domain"],
        "verb": v["verb"],
    }
    all_items.append(item)

# Spot-check one of the collected items.
all_items[2]

{'pk': '4.3',
 'cc': [{'text': 'sediment',
   'q_value': 'EXIST',
   'index': 12,
   'dep': 'nmod',
   'fd_relation': 'None',
   'fd_subsequent': -1},
  {'text': 'sediment',
   'q_value': '+MER',
   'index': 4,
   'dep': 'nmod',
   'fd_subsequent': 12,
   'fd_relation': 'PTH'}],
 'sentence': 'Large amounts of sediment gradually pile on top of original sediment',
 'syntax': 'Sbj V PathP',
 'fd': 'Autonomous Place',
 'domain': 'physical',
 'verb': 'pile.01'}

In [5]:
# Number of events kept, i.e. those whose causal chain has more than one entity.
len(all_items)

912

In [12]:
def count_number_relations_in_all_items(all_items):
    """Tally the ``fd_relation`` labels over the causal chains of all items.

    Args:
        all_items: iterable of dicts, each with a ``'cc'`` entry holding a
            list of entity dicts that have an ``'fd_relation'`` key.

    Returns:
        defaultdict(int): relation label -> number of chain entities carrying
        it, with keys in order of first occurrence.

    Raises:
        KeyError: if an item lacks ``'cc'`` or an entity lacks
            ``'fd_relation'``.
    """
    relation_counts = defaultdict(int)
    # Flatten item -> chain -> entity into one lazy stream of labels.
    labels = (link['fd_relation'] for entry in all_items for link in entry['cc'])
    for label in labels:
        relation_counts[label] += 1
    return relation_counts

# Tally how often each fd_relation label occurs across all kept causal chains.
cnt_relations = count_number_relations_in_all_items(all_items)

cnt_relations

defaultdict(int,
            {'None': 973,
             'PTH': 455,
             'FRC': 373,
             'FORM': 113,
             'PTH-complex': 37,
             'EXP': 5,
             'XPR': 1,
             'CAUSE': 32,
             'INT': 35})

In [13]:
# Total number of chain entities counted across all relation labels.
total_cnt = sum(cnt_relations.values())

# Share of entities whose relation label is the placeholder "None".
cnt_relations["None"]/total_cnt

0.4807312252964427

In [7]:
# One row per kept event; columns follow the item dict keys
# (pk, cc, sentence, syntax, fd, domain, verb).
df = pd.DataFrame(all_items)

In [8]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,pk,cc,sentence,syntax,fd,domain,verb
0,4.1,"[{'text': 'sediment', 'q_value': 'EXIST', 'ind...",They are buried in sediment,Sbj V LocP,Autonomous Location,physical_passive,bury.01
1,4.2,"[{'text': 'sediment', 'q_value': 'EXIST', 'ind...",Bacteria is buried in sediment,Sbj V LocP,Autonomous Location,physical_passive,bury.01
2,4.3,"[{'text': 'sediment', 'q_value': 'EXIST', 'ind...",Large amounts of sediment gradually pile on to...,Sbj V PathP,Autonomous Place,physical,pile.01
3,5.1,"[{'text': 'algae', 'q_value': 'COS', 'index': ...",dead algae and plankton end up part of sedimen...,Sbj V Obj,Autonomous COS,physical,end_up.03
4,5.3,"[{'text': 'pressure', 'q_value': 'INTL', 'inde...",bottom layers of sediment become compacted by ...,Sbj V Obj,Physical COS,physical,compact.01


In [9]:
# Inspect the causal chain of a single event, picked by row position 119.
df.iloc[119].cc

[{'text': 'rock',
  'q_value': 'DES',
  'index': 6,
  'dep': 'xcomp',
  'fd_relation': 'None',
  'fd_subsequent': -1},
 {'text': 'sediment',
  'q_value': 'INTL',
  'index': 2,
  'dep': 'nsubj',
  'fd_subsequent': 6,
  'fd_relation': 'FORM'}]

In [10]:
# df.to_pickle("~/src/coling2020-code/data/propara_fd_relations_v5.pkl")