## Revamped version with all predicates and copulas

In [1]:
#Import relevant libraries
from collections import defaultdict, deque
import os
import re
import pandas as pd
import numpy as np
import json
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts
from itertools import combinations
from collections import Iterable
from random import shuffle
import csv

### 1. Extract old pilot sentence ids

In [2]:
# Pilot CSV inputs (relative paths, resolved against the working directory)
pilot_data_upd, pilot_data_file = (
    "pilot_sent_token_data.csv",
    "pilot_data.csv",
)

In [3]:
#Pilot data extracted from pilot protocol1 annotations
# Read the CSV named by `pilot_data_upd` into a DataFrame; its `sent_token`
# column is used below to collect the pilot sentence ids.
df_pilot1 = pd.read_csv(pilot_data_upd)

In [4]:
# Unique sentence ids: the part of every `sent_token` value before the first "_"
set_ids_pilot1 = {value.split("_")[0] for value in df_pilot1.sent_token.values}
print(len(set_ids_pilot1))

100


### 2. Run Turk_Parse as original 

In [5]:
#Data Locations:
_project_root = "/Users/sidvash/facts_lab/veridicality_sid"
_ud_root = _project_root + "/UD_English"

# One treebank file per split, in train / dev / test order
ud_train, ud_dev, ud_test = (
    "{}/en-ud-{}.conllu".format(_ud_root, split) for split in ("train", "dev", "test")
)
ud_data = [ud_train, ud_dev, ud_test]
it_happnd = _project_root + "/it-happened_eng_ud1.2_07092017.tsv"

#Taken from github - includes doc ids and sent ids
ud_train_detailed, ud_dev_detailed, ud_test_detailed = (
    "{}/en_ewt-ud-{}.conllu".format(_ud_root, split) for split in ("train", "dev", "test")
)

# Placeholder written in place of an empty predicate field
null_str = "_null_"

In [6]:
def replace_to_html(s):
    '''
    Return `s` with quotes, square brackets and commas replaced by HTML
    character references.

    The five characters are rewritten in a single pass; none of the
    replacement strings contains a character that is itself rewritten.
    '''
    html_entities = {
        "'": "&#39;",
        '"': "&quot;",
        "[": "&lsqb;",
        "]": "&rsqb;",
        ",": "&#44;",
    }
    return s.translate(str.maketrans(html_entities))

def replace_to_turk(s):
    '''
    Return `s` rewritten for the Turk CSV: double quotes are doubled,
    the single quotes wrapping each {...} item are dropped, pairs of
    backslashes become triples and <span> tags get outer spacing.
    '''
    # A double quote is doubled only when it follows a character other than "]";
    # this leaves the quote at the very start and the one after the closing "]" alone.
    s = re.sub(r'([^\]])\"', r'\1""', s)

    # The remaining rules are plain text substitutions, applied in this order.
    literal_swaps = (
        ("'{", "{"),              # quote before an opening brace
        ("}'", "}"),              # quote after a closing brace
        ("\\\\", "\\\\\\"),       # two backslashes -> three
        ("<span", " <span"),      # space before an opening span
        (" </span>", "</span> "), # move the space after the closing span
    )
    for old, new in literal_swaps:
        s = s.replace(old, new)

    return s


In [7]:
def load_ud_english(fpath):
    """Return the sentence numbers at which new documents start in a UD file.

    Sentences are counted from 1. A sentence is a run of lines that start
    with a digit (token rows); the counter advances on the first
    non-token line that follows such a run. Each time a line starting
    with ``# newdoc id`` is seen, the current counter value is recorded.

    Parameters
    ----------
    fpath : str
        Path to UD corpus file ending in .conllu

    Returns
    -------
    list of int
        The 1-based sentence number current at each ``# newdoc id`` line,
        in file order.
    """
    newdoc_ids = []
    n = 1                # number of the sentence currently being read
    has_tokens = False   # True once a token row was seen for sentence n

    # The file is closed deterministically; CoNLL-U files are UTF-8 encoded.
    with open(fpath, encoding="utf-8") as infile:
        for l in infile:
            if l.startswith('# newdoc id'):
                newdoc_ids.append(n)

            if re.match(r'\d', l):
                has_tokens = True
            elif has_tokens:
                # First non-token line after the token rows closes the sentence
                n += 1
                has_tokens = False

    return newdoc_ids

In [8]:
def copula_predicates(predicate):
    """Return ``(text, token_positions, root_position)`` for a predicate, or None.

    None is returned when the root's tag is not one of the accepted tags,
    or when a non-verbal root has no token whose ``gov_rel`` is 'cop'.
    For VERB/AUX roots only the root itself is used; otherwise the span
    runs from the first 'cop' token to the end of ``predicate.tokens``.
    The text is cut to five words followed by "..." when it is longer.
    """
    root = predicate.root
    if root.tag not in ("ADJ", "NOUN", "NUM", "DET", "PROPN", "PRON", "VERB", "AUX"):
        return None

    if root.tag in ("VERB", "AUX"):
        # Verbal predicate: keep the root only
        words = [root.text]
        positions = [root.position]
    else:
        # Non-verbal predicate: it must contain a copula to be kept
        span = predicate.tokens
        relations = [tok.gov_rel for tok in span]
        if 'cop' not in relations:
            return None
        span = span[relations.index('cop'):]
        words = [tok.text for tok in span]
        positions = [tok.position for tok in span]

    text = " ".join(words[:5])
    if len(words) > 5:
        text += "..."

    # The root position is returned separately (needed for the it_happen set)
    return text, positions, root.position

In [9]:
def _annotate_sentence(preds_all, sent_id, tokens, label_offset):
    '''
    Collect the predicates of one sentence and wrap their tokens in <span> tags.

    `tokens` (a list of token strings) is modified in place. Predicates are
    numbered from `label_offset + 1` in the order they appear in `preds_all`.

    Returns three parallel lists: predicate texts, predicate token ids
    (positions joined with "_") and predicate root positions.
    '''
    # Local import: the ABC aliases in `collections` were removed in Python 3.10
    from collections.abc import Iterable

    extracted = []
    for s_id, pred in preds_all:
        if s_id == sent_id:
            info = copula_predicates(pred)
            if info:
                extracted.append(info)

    texts = [text for text, _, _ in extracted]
    root_positions = [root for _, _, root in extracted]
    token_ids = [
        "_".join(map(str, positions)) if isinstance(positions, Iterable) else positions
        for _, positions, _ in extracted
    ]

    # Each marked token adds two list items (opening tag before, closing tag
    # after), so later insertion points are shifted by 2 per token marked so far.
    # NOTE(review): this assumes positions arrive in increasing order without
    # repeats across predicates - confirm for overlapping copula spans.
    shift = 0
    for idx, token_id in enumerate(token_ids):
        label = str(label_offset + idx + 1)
        opening = ' <span class=\"predicate' + label + '\">' + '<sup>' + label + '</sup>'
        if isinstance(token_id, int):
            positions = [token_id]
        else:
            positions = [int(x) for x in token_id.split("_")]
        for pos in positions:
            tokens.insert(pos + shift, opening)
            tokens.insert(pos + shift + 2, '</span> ')
            shift += 2

    return texts, token_ids, root_positions


def dict_pred_all_double(preds_all, raw_sentence, fname, sentid_num, sentid_num_next, happen_set=None):
    '''
    Extract turk_parse dict from input all predicates

    Inputs:
    1. preds_all: one list of (sentence id, predicate) pairs for both sentences
    2. raw_sentence: a dict of two token lists, keyed by sentence id
    3. fname: file name used as prefix of the sentence ids in the output
    4. sentid_num: id of the 1st sentence of the adjacent pair
    5. sentid_num_next: id of the 2nd sentence of the adjacent pair
    6. happen_set: unused; kept for backward compatibility

    Returns:
    (token_dict, pred_token1, pred_token2, num_preds) where the two
    pred_token values are comma-joined strings (or `null_str` when the
    sentence has no kept predicate) and num_preds counts the kept
    predicates of both sentences. Predicate numbering continues from the
    first sentence into the second. The lists in `raw_sentence` are not
    modified.
    '''
    #Work on copies so the caller's token lists stay untouched
    pred_sent1 = raw_sentence[sentid_num].copy()
    pred_sent2 = raw_sentence[sentid_num_next].copy()

    pred1, pred_token1, pred_root_token1 = _annotate_sentence(
        preds_all, sentid_num, pred_sent1, 0)
    pred2, pred_token2, pred_root_token2 = _annotate_sentence(
        preds_all, sentid_num_next, pred_sent2, len(pred1))

    num_preds = len(pred1) + len(pred2)

    def _joined(values):
        #Comma-join the values, or fall back to the null marker when empty
        return ",".join(map(str, values)) if values else null_str

    pred_token1 = _joined(pred_token1)
    pred_token2 = _joined(pred_token2)

    #Key order is kept as-is because the dict is later serialised to JSON
    token_dict = {}
    token_dict['sentence'] = " ".join(pred_sent1 + pred_sent2)
    token_dict['pred_token1'] = pred_token1
    token_dict['pred1'] = _joined(pred1)
    token_dict['pred_token2'] = pred_token2
    token_dict['pred2'] = _joined(pred2)
    token_dict['sentence_id_1'] = fname + " " + sentid_num
    token_dict['sentence_id_2'] = fname + " " + sentid_num_next
    token_dict['pred_root_token1'] = _joined(pred_root_token1)
    token_dict['pred_root_token2'] = _joined(pred_root_token2)

    return token_dict, pred_token1, pred_token2, num_preds

In [10]:
def extract_list_adjacent(ud_data, doc_id_dict, set_ids_pilot1):
    '''
    Build one turk_parse tuple per selected sentence, pairing the sentence
    with the sentence that follows it in the same document.

    #Difference from original: Combining two adjacent sentences to form a single one

    Input:
    1. ud_data: list of paths to UD files ending in .conllu; the split name
       is the part of the file name after the last "-" and before the
       first "." (e.g. "train")
    2. doc_id_dict: dict mapping split name to the list of 1-based sentence
       numbers at which a new document starts
    3. set_ids_pilot1: set of sentence ids of the form "<file name> <sentence number>";
       only these sentences are used as the first sentence of a pair

    Returns:
    list of (token_dict, pred_token1, pred_token2, num_preds) tuples as
    produced by dict_pred_all_double. A selected sentence is skipped when
    it is the last sentence of its file or when the following sentence
    starts a new document.
    '''
    global_tuples = []
    sent_total = 0
    sent_removed = 0   # nothing is filtered out here; kept for the summary line

    # Resolve relative clause
    options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False)

    for ud_data_path in ud_data:
        fname = ud_data_path.split("/")[-1]
        # Derive the split name from the file name only, so dots or dashes
        # in directory names cannot interfere
        data_name = fname.split(".")[0].split("-")[-1]

        # Set lookup instead of a running "next document" pointer: the pointer
        # used to go stale whenever sentences were skipped by the id filter,
        # and popping it failed for splits with fewer than two documents.
        doc_starts = set(doc_id_dict[data_name])

        with open(ud_data_path, encoding="utf-8") as infile:
            data = replace_to_html(infile.read())
        parsed = [(PredPatt(ud_parse, opts=options), sent_id) for sent_id, ud_parse in load_conllu(data)]

        total_sents = len(parsed)

        #Iterating through each parsed sentence
        for i, (pred_object, sent_id) in enumerate(parsed):
            sentid_num = sent_id.split("_")[-1]

            if fname + " " + sentid_num not in set_ids_pilot1:
                continue

            # parsed[i] is sentence number i+1, so its successor is number i+2.
            # Skip when there is no successor or it opens a new document.
            if (i + 1) == total_sents or (i + 2) in doc_starts:
                continue

            pred_object_next, sent_id_next = parsed[i + 1]
            sentid_num_next = sent_id_next.split("_")[-1]

            raw_sentence = {sentid_num: [token.text for token in pred_object.tokens],
                            sentid_num_next: [token.text for token in pred_object_next.tokens]}

            preds_curr = [(sentid_num, pred) for pred in pred_object.instances]
            preds_next = [(sentid_num_next, pred) for pred in pred_object_next.instances]
            preds_all = preds_curr + preds_next

            sent_total += len(preds_all)

            #token dict from all predicates in both sentences:
            global_tuples.append(dict_pred_all_double(preds_all, raw_sentence,
                                                      fname, sentid_num,
                                                      sentid_num_next))

        print("{} finished".format(data_name))
        print("\n")

    print("Total number of sent_tokens found: {}".format(sent_total))
    print("Number of sentences removed due to non-event: {}".format(sent_removed))

    return global_tuples


In [11]:
def createHITs(global_tuples, offset_max=40, keep_remainder=False):
    '''
    Create HITs based on input global tuples

    Inputs:
    1. global_tuples: iterable of (token_dict, pred_token1, pred_token2, num_preds);
       it is shuffled on a copy, and the dicts are not modified
    2. offset_max: a HIT is closed as soon as adding the next item brings the
       running predicate count to this value or above; that item starts the
       next HIT and is not counted toward the new HIT's running total
    3. keep_remainder: when True, the items left over after the last closed
       HIT are returned as a final HIT; by default they are dropped

    Items with fewer than two predicates are skipped.

    Returns:
    list of HITs, each a list of JSON strings; every JSON object is the
    token_dict plus an 'id' giving its 1-based rank inside the HIT.
    '''
    shuffled = list(global_tuples)
    shuffle(shuffled)

    global_list = []
    global_ids = []     # number of sentences in each emitted HIT
    local_list = []
    total_preds = 0
    global_preds = 0

    for token_dict, pred_token1, pred_token2, num_preds in shuffled:
        if num_preds < 2:
            continue

        total_preds += num_preds
        global_preds += num_preds

        if total_preds >= offset_max:
            # Close the current HIT; never emit an empty one (this used to
            # happen when the very first item already reached offset_max)
            if local_list:
                global_ids.append(len(local_list))
                global_list.append(local_list)
            local_list = []
            total_preds = 0

        local_list.append(json.dumps(dict(token_dict, id=len(local_list) + 1)))

    #Append the last remaining sentences into the global list (off by default: removed for the pilot)
    if keep_remainder and local_list:
        global_ids.append(len(local_list))
        global_list.append(local_list)

    print("Total number of predicates: {}".format(global_preds))
    if global_ids:
        print("Average number of sentences per HIT {}".format(np.mean(global_ids)))
        print("Max number of sentences per HIT {}".format(np.max(global_ids)))
        print("Min number of sentences per HIT {}".format(np.min(global_ids)))
    else:
        # np.max / np.min raise ValueError on an empty sequence
        print("No complete HIT was created")
    print("\n")

    return global_list


#### Combining adjacent sentences into a single one

In [12]:
# Document-start sentence numbers for each split, read from the detailed UD files
doc_ids = {
    'train': load_ud_english(ud_train_detailed),
    'dev': load_ud_english(ud_dev_detailed),
    'test': load_ud_english(ud_test_detailed),
}

In [13]:
# Build the sentence-pair tuples from the train and dev files only
# (the test file is not passed), restricted to the pilot sentence ids.
tups_train = extract_list_adjacent([ud_train, ud_dev], doc_ids, set_ids_pilot1)

train finished


dev finished


Total number of sent_tokens found: 740
Number of sentences removed due to non-event: 0


In [14]:
# Group the tuples into HITs (default offset_max) and report how many HITs were built
gl_list = createHITs(tups_train)
print("\n")
print("Length of HIT: {}".format(len(gl_list)))

Total number of predicates: 721
Average number of sentences per HIT 6.125
Max number of sentences per HIT 8
Min number of sentences per HIT 4




Length of HIT: 16


In [15]:
# Write the MTurk input file: a header line, then one double-quoted row per HIT
# holding the string form of that HIT's list, post-processed by replace_to_turk.
with open('test_protocol2.csv', 'w+') as csv_out:
    csv_out.write("var_arrays\n")
    csv_out.writelines(
        replace_to_turk('"{}"\n'.format(hit)) for hit in gl_list
    )