In [1]:
#Import relevant libraries
from collections import defaultdict, deque
import os
import re
import pandas as pd
import numpy as np
import json
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts
from itertools import combinations
from collections import Iterable
from random import shuffle
import csv
from factslab.datastructures import ConstituencyTree, DependencyTree

In [2]:
# Data locations.
# Every input path is built by appending a fixed suffix to `home_dir`.
home_dir = "/Users"

# UD English splits (files named en-ud-<split>.conllu).
# The doubled slash in the train path is kept exactly as it was.
ud_train, ud_dev, ud_test = (
    home_dir + "/sidvash/Dropbox//facts_lab/veridicality_sid/UD_English/en-ud-train.conllu",
    home_dir + "/sidvash/Dropbox/facts_lab/veridicality_sid/UD_English/en-ud-dev.conllu",
    home_dir + "/sidvash/Dropbox/facts_lab/veridicality_sid/UD_English/en-ud-test.conllu",
)
ud_data = [ud_train, ud_dev, ud_test]

# "It happened" annotation table; it is read as a tab-separated file further down.
it_happnd = home_dir + "/sidvash/Dropbox/facts_lab/veridicality_sid/it-happened_eng_ud1.2_07092017.tsv"

# Taken from github - includes doc ids and sent ids
ud_train_detailed, ud_dev_detailed, ud_test_detailed = (
    home_dir + "/sidvash/Dropbox/facts_lab/veridicality_sid/UD_English/en_ewt-ud-train.conllu",
    home_dir + "/sidvash/Dropbox/facts_lab/veridicality_sid/UD_English/en_ewt-ud-dev.conllu",
    home_dir + "/sidvash/Dropbox/facts_lab/veridicality_sid/UD_English/en_ewt-ud-test.conllu",
)

# Tab-separated table with one "<sentence id> <tree> <sentence>" row per line.
ud_trees = home_dir + "/sidvash/Dropbox/facts_lab/UD_data_trees/structures.tsv"

# Placeholder string; not referenced in the notebook cells shown here.
null_str = "_null_"

## Load Sentence Mappings:

In [3]:
# Sentence id -> DependencyTree, built from the tab-separated structures file.
struct_dict = {}

# Each row is expected to split into exactly three fields: id, tree string, sentence.
with open(ud_trees, 'r') as f:
    structs_sents = [row.strip().split('\t') for row in f]

for sent_id, tree_list, sent in structs_sents:
    parsed_tree = DependencyTree.fromstring(tree_list)
    # Attach the space-split sentence tokens to the tree object.
    parsed_tree.sentence = sent.split(" ")
    struct_dict[sent_id] = parsed_tree

print(struct_dict['en-ud-dev.conllu sent_1'].sentence)

['From', 'the', 'AP', 'comes', 'this', 'story', ':']


## Load Structures (Dependency Graph object)

In [4]:
from nltk import DependencyGraph
import re
# Directory holding the en-ud-{train,dev,test}.conllu files that get_structs reads.
ud_path = home_dir + "/sidvash/Dropbox/facts_lab/veridicality_sid/UD_English/"

def html_ify(s):
    '''
    Replace literal parentheses with placeholder entities.

    ")" becomes "&rcrb;" and "(" becomes "&lcrb;"; every other character
    is returned unchanged.
    '''
    # Closing parentheses are substituted first, then opening ones.
    return re.sub(r'\(', r'&lcrb;', re.sub(r'\)', r'&rcrb;', s))

def get_structs(ud_path):
    '''
    Build one nltk DependencyGraph per sentence of the three UD English split files.

    Parameters
    ----------
    ud_path : str
        Directory containing en-ud-train.conllu, en-ud-dev.conllu and
        en-ud-test.conllu.

    Returns
    -------
    dict
        Maps "<file name> sent_<n>" to the DependencyGraph built from that
        sentence's lines, where n is the 1-based running count of sentence
        blocks within that file.
    '''
    files = ['en-ud-train.conllu', 'en-ud-dev.conllu', 'en-ud-test.conllu']
    structures = {}

    for fname in files:
        iden = 0
        sentence_lines = []

        # os.path.join yields the same path as plain concatenation when ud_path
        # ends in "/", and also works when the trailing slash is missing.
        with open(os.path.join(ud_path, fname), 'r') as f:
            for line in f:
                if line != "\n":
                    sentence_lines.append(line)
                    continue

                # A blank line closes the current sentence block.
                iden += 1
                # Parentheses are replaced by entities before the block is parsed.
                block = html_ify("".join(sentence_lines))
                structures[fname + " sent_" + str(iden)] = DependencyGraph(
                    block, top_relation_label='root')
                sentence_lines = []

        # A file that does not end with a blank line still has one pending sentence;
        # without this flush it would be silently dropped.
        if sentence_lines:
            iden += 1
            block = html_ify("".join(sentence_lines))
            structures[fname + " sent_" + str(iden)] = DependencyGraph(
                block, top_relation_label='root')

    return structures

# Parse all three UD splits once; extract_list_adjacent later hands this dict to
# find_pivot_predicate, which looks sentences up by "<file name> sent_<n>".
structures = get_structs(ud_path)

## Happen data

In [5]:
# Load the "it happened" annotations (tab-separated).
data = pd.read_csv(it_happnd, sep='\t')

# Keep only rows that (a) are marked as having happened, (b) carry a
# confidence of 4 or 3, and (c) have Keep set to True.
# NOTE(review): Happened and Confidence are compared against strings; if pandas
# infers bool/int dtypes for these columns nothing will match -- confirm.
happnd = data[
    (data.Happened == "true")
    & data.Confidence.isin(['4', '3'])
    & (data.Keep == True)
]

# Set of (Sentence.ID, Pred.Token) pairs used for filtering later.
happen_set = {tuple(row) for row in happnd[['Sentence.ID', 'Pred.Token']].values}

## Functions for turk_parse

In [6]:
def replace_to_html(s):
    '''
    Return `s` with quotes, square brackets and commas written as HTML entities.
    '''
    # (regex for one special character, entity that replaces it), applied in order.
    entity_rules = (
        (r"\'", r"&#39;"),
        (r'\"', r"&quot;"),
        (r"\[", r"&lsqb;"),
        (r"\]", r"&rsqb;"),
        (r"\,", r"&#44;"),
    )

    escaped = s
    for pattern, entity in entity_rules:
        escaped = re.sub(pattern, entity, escaped)

    return escaped

def replace_to_turk(s):
    '''
    Rewrite the stringified HIT list `s` into the form written to the Turk CSV.

    The rules below are applied one after another, each on the result of the
    previous one.
    '''
    turk_rules = (
        # Double every double quote that follows a character other than "]"
        # (so a quote at the very start, or right after "]", is left alone).
        (r'([^\]])\"', r'\1""'),
        # Drop the single quote before "{" and the one after "}".
        (r"\'\{", r"{"),
        (r"\}\'", r"}"),
        # Two consecutive backslashes become three.
        (r"\\\\", r"\\\\\\"),
        # Put a space before each opening <span and move the space that
        # precedes </span> to after it.
        (r"<span", r" <span"),
        (r" </span>", r"</span> "),
    )

    rewritten = s
    for pattern, replacement in turk_rules:
        rewritten = re.sub(pattern, replacement, rewritten)

    return rewritten

def load_ud_english(fpath):
    """Find the sentences at which new documents start in a UD English file.

    Parameters
    ----------
    fpath : str
        Path to UD corpus file ending in .conllu

    Returns
    -------
    list of int
        For every line starting with "# newdoc id", the 1-based running number
        of the sentence being read at that point (i.e. the first sentence of
        the new document when the marker precedes its token lines).
    """
    n = 1                  # running number of the sentence currently being read
    in_sentence = False    # True once a token line of sentence n has been seen
    newdoc_ids = []

    # The context manager closes the file; the original left the handle open.
    with open(fpath) as infile:
        for l in infile:
            if re.match(r'\# newdoc id', l):
                newdoc_ids.append(n)

            if re.match(r'^\d', l):
                # Token lines start with a digit (the token index).
                in_sentence = True
            elif in_sentence:
                # First non-token line after some tokens: the sentence is over.
                n += 1
                in_sentence = False

    return newdoc_ids

#### The functions below are used only when two adjacent sentences are combined into one

In [7]:
def filter_preds(pred_tuples):
    '''
    Keep only the (sent_id_num, predicate) pairs whose predicate root qualifies.

    Input: an iterable of (sent_id_num, predicate object) pairs

    Output: list of (sent_id_num, predicate object) tuples, in input order, where
            the predicate root's tag is VERB or AUX, or is ADJ, or is one of
            NOUN/NUM/DET/PROPN/PRON and one of the predicate's tokens has
            gov_rel 'cop'. Any other root tag is dropped.
    '''
    allowed_tags = {"ADJ", "NOUN", "NUM", "DET", "PROPN", "PRON", "VERB", "AUX"}
    verbal_tags = ("VERB", "AUX")

    kept = []
    for sent_id, pred_obj in pred_tuples:
        root_tag = pred_obj.root.tag

        if root_tag not in allowed_tags:
            continue

        if root_tag in verbal_tags:
            kept.append((sent_id, pred_obj))
            continue

        # Non-verbal root: needs a copula among its tokens, unless it is an adjective.
        has_copula = 'cop' in [tok.gov_rel for tok in pred_obj.tokens]
        if has_copula or root_tag == 'ADJ':
            kept.append((sent_id, pred_obj))

    return kept

def depth_in_tree(idx, dep_obj):
    '''
    Input: idx     - 0-based index of the word in the linear sequence of words
           dep_obj - object whose `nodes` mapping is keyed by idx+1 and holds,
                     per node, a 'rel' label and a 'head' key

    Output: number of head links followed from that word until a node whose
            'rel' is 'root' is reached (0 for the root word itself)
    '''
    graph = dep_obj.nodes

    # Node keys are offset by one relative to the word index.
    node = graph[idx + 1]
    hops = 0
    while node['rel'] != 'root':
        node = graph[node['head']]
        hops += 1

    return hops
            
    
def find_pivot_predicate(fname, sentid_num, predp_object, structures):
    '''
    Find the pivot-predicate of a given sentence's id
    
    Heuristic/Algo:  Follow the root predicate until you find a predicate which doesn't have
                any xcomp, ccomp or csubj dependencies.

    Inputs:
    1. fname: file name; combined with sentid_num into the key
              "<fname> sent_<sentid_num>" used to look up `structures`
    2. sentid_num: sentence number; also stored as the first element of each
              (sentid_num, predicate) pair
    3. predp_object: object whose `.instances` are the candidate predicates
    4. structures: dict from "<fname> sent_<n>" to a dependency object exposing
              `.nodes` (nodes are addressed by token position + 1)

    Output: one (sentid_num, predicate) tuple taken from the filtered predicates,
            or an empty list when no predicate survives filter_preds.
                
    '''
    # Candidate predicates and the positions of their root tokens (parallel lists).
    preds = filter_preds([(sentid_num, x) for x in predp_object.instances])
    tokens = [y.root.position for x, y in preds]
    
    if tokens:
        # Written to below but never read.
        tokens_covered = set()
        
        struct_id = fname + " sent_" + str(sentid_num)
        dep_object = structures[struct_id]
        # (position, depth) pairs, shallowest first.
        pred_heights = sorted([(x, depth_in_tree(x,dep_object)) for x in tokens], key=lambda x:x[1])
        # Positions ordered deepest first, so .pop() removes the shallowest remaining one.
        tokens_reverse = [x for x,y in pred_heights][::-1]
        
        # Start from the shallowest predicate.
        root_idx = tokens.index(pred_heights[0][0])
        root_predicate = preds[root_idx]
        deps = dep_object.nodes[tokens[root_idx]+1]['deps']
        
        tokens_covered.add(tokens[root_idx])
        tokens_reverse.pop()
        
        # Keep descending while the current predicate has a clausal dependent.
        while ('ccomp' in deps) or ('xcomp' in deps) or ('csubj') in deps:
            variables = ['ccomp', 'xcomp', 'csubj']
            # Only the first relation present (in this priority order) is followed.
            for var in variables:
                if var in deps:
                    # First dependent of that relation, converted back to a 0-based position.
                    tok_idx = deps[var][0]-1
                    if tok_idx in tokens:
                        root_idx = tokens.index(tok_idx)
                        tokens_covered.add(tokens[root_idx])
                        # Pops the shallowest remaining entry, which is not necessarily tok_idx.
                        # NOTE(review): unguarded -- would raise IndexError if tokens_reverse
                        # were already empty here; confirm this cannot happen.
                        tokens_reverse.pop()
                    else:
                        # The dependent is not one of the filtered predicates: fall back to
                        # the shallowest predicate not popped yet, if any.
                        if tokens_reverse:
                            root_idx = tokens.index(tokens_reverse[-1])
                            tokens_covered.add(tokens[root_idx])
                            tokens_reverse.pop()
                        else:
                            return root_predicate
                    break
                    
            # Re-read the dependents of the newly selected predicate.
            deps = dep_object.nodes[tokens[root_idx]+1]['deps']
            root_predicate = preds[root_idx]
            
        return root_predicate 

    # No usable predicate in this sentence (callers test the result for truthiness).
    return []

def predicate_info(predicate, sent_id):
    '''
    Input: predicate object, and the sentence id (used only in the error message)
    Output: (pred_text, pred_token, def_pred_token)
        pred_text      - display text of the predicate
        pred_token     - list of token positions covered by pred_text
        def_pred_token - position of the predicate root (needed for it_happen set)

    Rules:
    - VERB/AUX root: text and token list contain the root only.
    - Other root with a 'cop' token: text and token list run from the copula to
      the end of the predicate's tokens.
    - ADJ root without a copula: root only.

    Note: pred_text is cut to the first 5 words followed by "..." when longer;
          pred_token is never cut.

    Raises: ValueError for a non-verbal, non-ADJ root without a copula. The
            original only printed a message and then failed with an unbound
            local variable.
    '''
    root = predicate.root

    if root.tag in ["VERB", "AUX"]:
        pred = [root.text]
        pred_token = [root.position]
    else:
        all_pred = predicate.tokens
        gov_rels = [tok.gov_rel for tok in all_pred]
        if 'cop' in gov_rels:
            # Extend the predicate so that it starts from the copula.
            from_copula = all_pred[gov_rels.index('cop'):]
            pred = [x.text for x in from_copula]
            pred_token = [x.position for x in from_copula]
        elif root.tag == "ADJ":
            pred = [root.text]
            pred_token = [root.position]
        else:
            raise ValueError(
                "Incompatible predicate found in sentence {}: root tag {!r} "
                "without a copula".format(sent_id, root.tag))

    def_pred_token = root.position

    if len(pred) > 5:
        pred = " ".join(pred[:5]) + "..."
    else:
        pred = " ".join(pred)

    return pred, pred_token, def_pred_token

In [8]:
def _mark_predicate_tokens(tokens, openers_at):
    '''
    Return a new token list in which every position present in `openers_at`
    is preceded by its opening span string(s) and followed by one '</span> '
    per opener. `tokens` itself is not modified.
    '''
    marked = []
    for position, token in enumerate(tokens):
        openers = openers_at.get(position, ())
        marked.extend(openers)
        marked.append(token)
        marked.extend('</span> ' for _ in openers)
    return marked


def dict_pred_double(pred_comb, raw_sentence, fname, sentid_num, sentid_num_next, happen_set=[]):
    '''
    Extract turk_parse dict from input predicate combination 
    
    Inputs:
    1. pred_comb: pair of (sent_id, predicate object) tuples -- predicate 1, then predicate 2
    2. raw_sentence: a dict of two token lists, with keys sentid_num and sentid_num_next
    3. fname: file name, prefixed to the sentence ids stored in the dict
    4. sentid_num: 1st sentence in adjacent sentence
    5. sentid_num_next: 2nd sentence in adjacent sentence
    6. happen_set: not used by this function

    Output: (token_dict, pred1_token, pred2_token)
        token_dict holds the predicate texts/tokens, the two sentence ids and
        'sentence': both sentences joined by spaces, with predicate 1 tokens
        wrapped in <span class="predicate1"> and predicate 2 tokens in
        <span class="predicate2">.

    Where the spans go:
    - both predicates tagged with sentid_num      -> both marked in the 1st sentence
    - both predicates tagged with sentid_num_next -> both marked in the 2nd sentence
    - otherwise -> predicate 1 in the 1st sentence, predicate 2 in the 2nd

    The spans are placed by token position rather than by a running insertion
    offset, so the result no longer depends on predicate 2's tokens coming
    after predicate 1's tokens within the same sentence. A position claimed by
    both predicates is wrapped in both spans.
    '''
    token_dict = {}
    (sent_id1, pred1_obj), (sent_id2, pred2_obj) = pred_comb
    
    pred1_text, pred1_token, pred1_root_token = predicate_info(pred1_obj, sentid_num)
    pred2_text, pred2_token, pred2_root_token = predicate_info(pred2_obj, sentid_num_next)

    token_dict['pred1_token'] = "_".join(map(str, pred1_token))
    token_dict['pred1_text'] = pred1_text
    token_dict['pred2_token'] = "_".join(map(str, pred2_token))
    token_dict['pred2_text'] = pred2_text
    token_dict['sentence_id_1'] = fname + " " + sent_id1
    token_dict['sentence_id_2'] = fname + " " + sent_id2
    token_dict['pred1_root_token'] = pred1_root_token
    token_dict['pred2_root_token'] = pred2_root_token
    
    ## Raw Sentence:
    open1 = ' <span class=\"predicate1\">' + '<sup>1</sup>'
    open2 = ' <span class=\"predicate2\">' + '<sup>2</sup>'

    # position -> list of opening strings, one mapping per sentence
    marks_sent1 = {}
    marks_sent2 = {}

    if sent_id1 == sent_id2 == sentid_num:
        marks_for_pred1 = marks_for_pred2 = marks_sent1
    elif sent_id1 == sent_id2 == sentid_num_next:
        marks_for_pred1 = marks_for_pred2 = marks_sent2
    else:
        marks_for_pred1, marks_for_pred2 = marks_sent1, marks_sent2

    for position in pred1_token:
        marks_for_pred1.setdefault(position, []).append(open1)
    for position in pred2_token:
        marks_for_pred2.setdefault(position, []).append(open2)

    pred_sent1 = _mark_predicate_tokens(raw_sentence[sentid_num], marks_sent1)
    pred_sent2 = _mark_predicate_tokens(raw_sentence[sentid_num_next], marks_sent2)
            
    pred_sentence = pred_sent1 + pred_sent2
    token_dict['sentence'] = " ".join(pred_sentence)
        
    return token_dict, pred1_token, pred2_token

In [9]:
def extract_list_adjacent(ud_data, happen_set, doc_id_dict, cut_option = True):
    '''
    Extract a list of predicate-pair records from the ud data
    
    #Difference from original: Combining two adjacent sentences to form a single one
    
    Input: 
    1. ud_data: list of ud data paths ending in .conllu
    2. happen_set: a set of sentence_id where the event did happen
                   (accepted but not read anywhere in this function)
    3. doc_id_dict: dict keyed by split name -- the text after the last "-" in the
                    part of the path before its first "." -- whose values are lists
                    of 1-based sentence numbers at which a new document starts
                    (e.g. as built with load_ud_english)
    4. cut_option: passed to PredPattOpts as `cut`

    Output: list of (token_dict, pred1_token, pred2_token) tuples, one per call to
            dict_pred_double.

    For every sentence that is followed by another sentence of the same document:
    - every 2-combination of the sentence's filtered predicates yields one record;
    - the sentence's pivot predicate (find_pivot_predicate) is paired with every
      filtered predicate of the next sentence.
    Sentences that are last in their document, or last in the file, yield nothing.

    Note: reads the module-level `structures` dict when calling find_pivot_predicate,
          and prints progress and summary counts.
    
    '''
    # Several of these are initialised but never updated or read below
    # (iden, sent_removed stays 0, single_preds; local_list stays empty).
    global_list = []
    local_list = []
    iden = 1
    sent_removed = 0
    sent_total = 0
    total_preds = 0
    filtered_preds = 0
    single_preds=0
    global_tuples = []
    
    # Resolve relative clause
    options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=cut_option)
    
    #Predicate tags to be used later  
    # (none of these counters is read or written anywhere else in this function)
    copula_cnts = defaultdict(int)
    pred_cnts = defaultdict(int)
    auxverb_cnts = defaultdict(int)
    ignore_cnts =  defaultdict(int)
    copula_indxs = defaultdict(list)
    discont_indxs = defaultdict(list)
    ###
    
    for ud_data_path in ud_data:
        # 0-based indices into `parsed` of the sentences already handled for this file.
        covered_set = set()
        fname = ud_data_path.split("/")[-1]
        # e.g. ".../en-ud-train.conllu" -> "train".
        # NOTE(review): splits on the FIRST "." of the whole path, so a dot in a
        # directory name would change the result -- confirm paths never contain one.
        data_name = ud_data_path.split(".")[0].split("-")[-1]
        doc_ids = doc_id_dict[data_name]
        
        with open(ud_data_path) as infile:
            # The whole file text is entity-escaped before it is parsed.
            data = replace_to_html(infile.read())
            parsed = [(PredPatt(ud_parse, opts=options), sent_id) for sent_id, ud_parse in load_conllu(data)]
        
        total_sents = len(parsed)
        
        # Document starts after the first one; `nextdoc` is the next upcoming start.
        # NOTE(review): popleft() raises IndexError when the split has a single
        # document -- confirm every split has at least two.
        docslist = deque(doc_ids[1:])
        nextdoc = docslist.popleft()
        
        #Iterating through each parsed sentence
        for i, parse_sen in enumerate(parsed):   
            
            total_preds += len(parse_sen[0].instances)
            
            if i in covered_set:
                continue
            
            # NOTE(review): nextdoc is a 1-based sentence number while covered_set
            # holds 0-based indices -- confirm this comparison is intended.
            if docslist:
                if nextdoc in covered_set:
                    nextdoc = docslist.popleft()
                
            pred_object = parse_sen[0] 
            
            
            # Sentence number = text after the last "_" of the sentence id.
            sentid_num = parse_sen[1].split("_")[-1]
            ##########################################
            #Next item in parsed is not a new document
            ##########################################
            # i is 0-based, so the following sentence has 1-based number i+2.
            if (i+2) != nextdoc and (i+1)!= total_sents:
                covered_set.add(i)
                #covered_set.add(i+1)
                
                parse_sen_next = parsed[i+1]
                pred_object_next = parse_sen_next[0]
                sentid_num_next = parse_sen_next[1].split("_")[-1]
            
                raw_sentence =  {sentid_num : [token.text for token in pred_object.tokens] ,
                                sentid_num_next: [token.text for token in pred_object_next.tokens]}
                
                preds_curr = filter_preds([(sentid_num,pred) for pred in pred_object.instances])
                preds_next = filter_preds([(sentid_num_next,pred) for pred in pred_object_next.instances])
                
                # Only sentences reaching this branch contribute to the filtered count.
                filtered_preds += len(preds_curr)
                
                pred_combs_curr = combinations(preds_curr,2)
                
                #Curr_sent combinations
                for pred_comb in pred_combs_curr:      
                    #token dict from all predicates in both sentences:
                    token_dict, pred_token1, pred_token2 = dict_pred_double(pred_comb, raw_sentence, 
                                                                                  fname, sentid_num, 
                                                                                    sentid_num_next)
                    global_tuples.append((token_dict, pred_token1, pred_token2))
                    sent_total+=1
                
                #Root pred of curr_sent with preds of next_sent:
                # `structures` here is the module-level dict, not a parameter.
                pivot_curr_pred = find_pivot_predicate(fname, sentid_num, pred_object, structures)
                if pivot_curr_pred:
                    for tupl in preds_next:
                        pred_comb = [pivot_curr_pred, tupl]
                        token_dict, pred_token1, pred_token2 = dict_pred_double(pred_comb, raw_sentence, 
                                                                                      fname, sentid_num, 
                                                                                        sentid_num_next)
                        global_tuples.append((token_dict, pred_token1, pred_token2))
                        sent_total+=1
                
            ##########################################
            #Next item in parsed is from a new document
            # (or the current sentence is the last one of the file)
            ##########################################
            else: 
                # Advance to the following document start, if there is one.
                if docslist:
                    nextdoc = docslist.popleft()
                covered_set.add(i)
                
        print("{} finished".format(data_name))
        print("\n")
    #Append the last remaining sentences into the global list
    # (local_list is always empty and global_list is not returned)
    global_list.append(local_list)
    
    print("Total number of sent_tokens found: {}".format(sent_total))
    print("Total number of predicates found: {}".format(total_preds))
    print("Total number of filtered predicates: {}".format(filtered_preds))
    print("Number of sentences removed due to non-event: {}".format(sent_removed))
    
                    
    return global_tuples



In [10]:
def createHITs(global_tuples, offset_max=12):
    '''
    Create HITs based on input global tuples

    Inputs:
    1. global_tuples: list of (token_dict, pred1_token, pred2_token) tuples.
                      The list is shuffled IN PLACE and every token_dict gets
                      an 'id' key written into it.
    2. offset_max: target number of items per HIT

    Output: list of HITs, each a list of JSON strings (json.dumps of the
            token_dicts), with ids running 1..k inside each HIT.

    Grouping: items are cut into consecutive groups of offset_max. If the
    remainder is smaller than offset_max//2 it is merged into the last full
    group; otherwise it forms a shorter final HIT of its own.

    An empty input returns [] (the per-HIT statistics are skipped, since
    np.max/np.min raise on an empty list).
    '''
    shuffle(global_tuples)

    total_len = len(global_tuples)
    rem = total_len % offset_max
    # Decided once, with the offset_max the caller passed in.
    merge_tail = rem < offset_max // 2

    global_list = []
    local_list = []

    for j, (token_dict, _pred_token1, _pred_token2) in enumerate(global_tuples):
        # On the last item of the final full group, enlarge the group so that
        # it also absorbs the small remainder.
        if merge_tail and j == total_len - (rem + 1):
            offset_max += rem

        token_dict['id'] = len(local_list) + 1
        local_list.append(json.dumps(token_dict))

        if len(local_list) >= offset_max:
            global_list.append(local_list)
            local_list = []

    #Append the last remaining sentences into the global list if rem was not used.
    if local_list:
        global_list.append(local_list)

    print("Total number of predicate-pairs: {}".format(total_len))
    if global_list:
        hit_sizes = [len(x) for x in global_list]
        print("Average number of sentences per HIT {}".format(np.mean(hit_sizes)))
        print("Max number of sentences per HIT {}".format(np.max(hit_sizes)))
        print("Min number of sentences per HIT {}".format(np.min(hit_sizes)))
    print("\n")

    return global_list

#### Combining adjacent sentences into a single one

In [11]:
# Per split: the sentence numbers at which a new document starts, read from the
# "detailed" treebank files (the ones that include doc ids and sent ids).
doc_ids = {
    'train': load_ud_english(ud_train_detailed),
    'dev': load_ud_english(ud_dev_detailed),
    'test': load_ud_english(ud_test_detailed),
}

In [12]:
# Predicate-pair tuples per split.
tups_train = extract_list_adjacent([ud_train], happen_set, doc_ids)

tups_dev = extract_list_adjacent([ud_dev], happen_set, doc_ids)
tups_test = extract_list_adjacent([ud_test], happen_set, doc_ids)

# The HIT-creation and CSV cells below consume a combined dev+test list. It used to
# be produced by a (now commented-out) extract_list_adjacent([ud_dev, ud_test], ...)
# call, which left the name undefined on a fresh kernel. extract_list_adjacent
# processes its input files one after another, so concatenating the two per-split
# lists gives the same tuples in the same order.
tups_dev_test = tups_dev + tups_test

train finished


Total number of sent_tokens found: 59593
Total number of predicates found: 27390
Total number of filtered predicates: 25413
Number of sentences removed due to non-event: 0
dev finished


Total number of sent_tokens found: 5638
Total number of predicates found: 3345
Total number of filtered predicates: 2782
Number of sentences removed due to non-event: 0
test finished


Total number of sent_tokens found: 5137
Total number of predicates found: 3200
Total number of filtered predicates: 2637
Number of sentences removed due to non-event: 0


In [23]:
# Group the predicate pairs into HITs with a target size of 5 items each.
# createHITs shuffles its input list in place and prints per-HIT statistics.
gl_list_train = createHITs(tups_train, offset_max=5)
gl_list_dev_test = createHITs(tups_dev_test, offset_max=5)

# Number of HITs produced per split.
print("\n")
print("Length of train HIT: {}".format(len(gl_list_train)))
print("Length of dev_test HIT: {}".format(len(gl_list_dev_test)))


Total number of predicate-pairs: 59593
Average number of sentences per HIT 4.999832200687977
Max number of sentences per HIT 5
Min number of sentences per HIT 3


Total number of predicate-pairs: 10775
Average number of sentences per HIT 5.0
Max number of sentences per HIT 5
Min number of sentences per HIT 5




Length of train HIT: 11919
Length of dev_test HIT: 2155


In [14]:
# Write the train HITs as a one-column CSV for MTurk: a "var_arrays" header, then
# one row per HIT holding the double-quoted str() of that HIT's list of JSON
# strings, passed through replace_to_turk.
with open('event_temporal_duration_turk_file_train.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    file_handler.writelines(
        replace_to_turk("\"" + str(item) + "\"\n") for item in gl_list_train
    )


In [15]:
# Write the dev+test HITs as a one-column CSV for MTurk: a "var_arrays" header, then
# one row per HIT holding the double-quoted str() of that HIT's list of JSON
# strings, passed through replace_to_turk.
with open('event_temporal_duration_turk_file_dev_test.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    file_handler.writelines(
        replace_to_turk("\"" + str(item) + "\"\n") for item in gl_list_dev_test
    )

## Rough Cost Calculations

In [16]:
# import math

# def nCr(n,r):
#     f = math.factorial
#     if n<r:
#         return 0
#     else:
#         return f(n) // f(r) // f(n-r)

In [17]:
# P1 = 1
# P2 = 0
# global_list_train_within = []
# global_list_train_between = []
# for i, tup in enumerate(tups_train):
#     ns1 = len(tup[0]['pred1'].split(","))
#     ns2 = len(tup[0]['pred2'].split(","))
#     x3 = P1*ns2
#     x4 = P2*ns1
#     #global_list_train.append([ns1,ns2,x3,x4])
#     global_list_train_within.append(nCr(ns1,2))
# #     if i % 2:
#     global_list_train_between.extend([x3,x4])
    
# global_list_dev_test_within = []
# global_list_dev_test_between = []
# for i, tup in enumerate(tups_dev_test):
#     ns1 = len(tup[0]['pred1'].split(","))
#     ns2 = len(tup[0]['pred1'].split(","))
#     x3 = P1*ns2
#     x4 = P2*ns1
#     #global_list_dev_test.append([ns1,ns2,x3,x4])
#     global_list_dev_test_within.append(nCr(ns1, 2))
# #     if i % 2:
#     global_list_dev_test_between.extend([x3,x4])
    

In [18]:
# tups_train[0]

In [19]:
# #sum([sum(x) for x in global_list_train])
# sum(global_list_train_within)+sum(global_list_train_between)

In [20]:
# (sum(global_list_dev_test_within)+sum(global_list_dev_test_between))*3

In [21]:
# .4*.1*1.2*(sum(global_list_train_within)+sum(global_list_train_between)+(sum(global_list_dev_test_within)+sum(global_list_dev_test_between))*3)

In [22]:
# .4*.1*1.2*3000