In [1]:
#Import relevant libraries
from collections import defaultdict, deque
import os
import re
import pandas as pd
import numpy as np
import json
from predpatt import load_conllu
from predpatt import PredPatt
from itertools import combinations
import csv

In [2]:
#Data Locations:
#All inputs live under one root folder. The default is the original author's
#location; set the VERIDICALITY_DATA_DIR environment variable to point the
#notebook at a different copy of the data without editing this cell.
#Paths are joined with "/" on purpose: extract_list_adjacent splits on "/"
#to recover the file name.
DATA_DIR = os.environ.get("VERIDICALITY_DATA_DIR",
                          "/Users/sidvash/facts_lab/veridicality_sid").rstrip("/")
UD_DIR = DATA_DIR + "/UD_English"

ud_train = UD_DIR + "/en-ud-train.conllu"
ud_dev = UD_DIR + "/en-ud-dev.conllu"
ud_test = UD_DIR + "/en-ud-test.conllu"
ud_data = [ud_train, ud_dev, ud_test]
it_happnd = DATA_DIR + "/it-happened_eng_ud1.2_07092017.tsv"

#Taken from github - includes doc ids and sent ids
ud_train_detailed = UD_DIR + "/en_ewt-ud-train.conllu"
ud_dev_detailed = UD_DIR + "/en_ewt-ud-dev.conllu"
ud_test_detailed = UD_DIR + "/en_ewt-ud-test.conllu"


In [3]:
#Load the it-happened annotation table (tab separated)
data = pd.read_csv(it_happnd , sep='\t')

#A row is kept only when all three conditions hold:
#  - the event is marked as having happened
#  - the confidence value is one of the two listed values
#  - the row is flagged Keep
did_happen = data.Happened == "true"
is_confident = data.Confidence.isin(['4', '3'])
is_kept = data.Keep == True
happnd = data[did_happen & is_confident & is_kept]

#Set of (Sentence.ID, Pred.Token) tuples used to filter predicates later
happen_set = {tuple(row) for row in happnd[['Sentence.ID', 'Pred.Token']].values}

In [4]:
def replace_to_html(s):
    '''
    Return s with quotes and square brackets swapped for HTML entities

    Only four characters are rewritten: single quote, double quote,
    "[" and "]". Everything else is passed through unchanged.
    '''
    #single pass character -> entity mapping
    entity_table = str.maketrans({
        "'": "&#39;",
        '"': "&quot;",
        "[": "&lsqb;",
        "]": "&rsqb;",
    })
    return s.translate(entity_table)

def replace_to_turk(s):
    '''
    Make some changes to the input string to make it Turk readable

    The input is expected to be one CSV row of the form "<str(list of JSON strings)>"
    followed by a newline, as built by the csv-writing cells of this notebook.

    Steps, in order:
    1. double every double quote, except one at the very start of the string
       or one directly after "]" (the quotes that delimit the CSV field)
    2. drop the single quotes that str(list) puts around each JSON object
    3. turn each run of two backslashes into three
    4. move the space around <span ...> / </span> markers to the outside

    Returns the rewritten string.
    '''
    #replace double quotes to appear twice : except at the start/end of the list.
    #A look-behind is used so that the preceding character is not consumed:
    #with a consuming group the second quote of an adjacent pair (e.g. the JSON
    #empty string "") was skipped, leaving an odd number of quotes in the field.
    s = re.sub(r'(?<=[^\]])"', '""', s)

    #replace single quotes at the beginning and end of list
    s = re.sub(r"\'\{", r"{", s)
    s = re.sub(r"\}\'", r"}", s)

    #replace two backslash to three
    s = re.sub(r"\\\\", r"\\\\\\", s)

    #Leave spaces around <span>
    s = re.sub(r"<span", r" <span", s)
    s = re.sub(r" </span>", r"</span> ", s)

    return s


In [5]:
def load_ud_english(fpath):
    """Find where each document starts in a UD English corpus file

    Sentences are counted from 1. A sentence is a run of lines starting with a
    digit; it is considered finished at the first following line that does not
    start with a digit.

    Parameters
    ----------
    fpath : str
        Path to UD corpus file ending in .conllu

    Returns
    -------
    list of int
        For every line starting with "# newdoc id", the value of the sentence
        counter when that line was read, in file order.
    """

    newdoc_ids = []

    n = 1
    # True while at least one token line of sentence n has been read
    in_sentence = False

    # the context manager closes the file; the token rows themselves are not
    # needed for the result, so they are no longer accumulated in memory
    with open(fpath, encoding="utf-8") as infile:
        for l in infile:
            if re.match(r'\# newdoc id', l):
                newdoc_ids.append(n)

            if re.match(r'^\d', l):
                in_sentence = True

            elif in_sentence:
                # first non-token line after a sentence: move on to the next one
                n += 1
                in_sentence = False

    return newdoc_ids

#### The functions below are used only when two adjacent sentences are combined into one

In [6]:
def _wrap_predicates(tokens, positions):
    '''
    Return a copy of tokens with span markers inserted around each position

    Positions are handled from the highest to the lowest so that an insertion
    never shifts a position that is still to be processed; the result does not
    depend on the order in which the positions are given.
    '''
    marked = list(tokens)
    for pos in sorted(set(positions), reverse=True):
        marked.insert(pos + 1, '</span> ')
        marked.insert(pos, ' <span class=\"predicate\">')
    return marked


def _build_turk_dict(preds_all, raw_sentence, fname, sent_ids, happen_set):
    '''
    Shared implementation of dict_pred_all_double / dict_pred_all_single

    sent_ids is a list with one or two sentence id strings; the returned values
    are the ones documented on the two public functions.
    '''
    ## Remove predicates if not in happen set:
    filter_preds_all = []
    sent_removed = 0
    for s_id, pred_obj in preds_all:
        #lookup key is (file name + " " + sentence id, root position + 1)
        #NOTE(review): the +1 presumably converts to the 1-based Pred.Token - confirm
        tup = (fname + " " + s_id, pred_obj.root.position + 1)
        if tup in happen_set:
            filter_preds_all.append((s_id, pred_obj))
        else:
            sent_removed += 1

    #Extract each predicate's position and text, sentence by sentence
    pred_sentence = []
    pred_texts = []
    pred_tokens = []
    for sid in sent_ids:
        roots = [pred.root for s_id, pred in filter_preds_all if s_id == sid]
        positions = [root.position for root in roots]
        pred_sentence += _wrap_predicates(raw_sentence[sid], positions)
        pred_texts.append(",".join(root.text for root in roots))
        pred_tokens.append(",".join(map(str, positions)))

    #with a single sentence the second slot is empty
    pred_token1 = pred_tokens[0]
    pred1 = pred_texts[0]
    pred_token2 = pred_tokens[1] if len(sent_ids) > 1 else ""
    pred2 = pred_texts[1] if len(sent_ids) > 1 else ""

    #key order is kept as is: it determines the order in the JSON dump
    token_dict = {
        'sentence': pred_sentence,
        'pred_token1': pred_token1,
        'pred1': pred1,
        'pred_token2': pred_token2,
        'pred2': pred2,
        'sentence_id_1': fname + " " + sent_ids[0],
        'sentence_id_2': fname + " " + sent_ids[-1],
    }

    num_preds = len(filter_preds_all)

    return token_dict, pred_token1, pred_token2, num_preds, sent_removed


def dict_pred_all_double(preds_all, raw_sentence, fname, sentid_num, sentid_num_next, happen_set):
    '''
    Extract turk_parse dict from all predicates of two adjacent sentences

    Inputs:
    1. preds_all : one list of (sentence id, predicate) tuples for both sentences;
       each predicate must expose root.position and root.text
    2. raw_sentence: a dict of two token lists, with key: sentence id
    3. fname: file name, prefixed to the sentence ids
    4. sentid_num: 1st sentence in adjacent sentence
    5. sentid_num_next: 2nd sentence in adjacent sentence
    6. happen_set: set of (fname + " " + sentence id, root position + 1) tuples;
       predicates whose tuple is not in the set are dropped

    Returns:
    (token_dict, pred_token1, pred_token2, num_preds, sent_removed) where
    token_dict['sentence'] is the token list of both sentences with span markers
    around the kept predicates, pred_token1/pred_token2 are comma-joined root
    positions per sentence, num_preds is the number of kept predicates and
    sent_removed the number of dropped ones. raw_sentence is not modified.
    '''
    return _build_turk_dict(preds_all, raw_sentence, fname,
                            [sentid_num, sentid_num_next], happen_set)

def dict_pred_all_single(preds_all, raw_sentence, fname, sentid_num, happen_set):
    '''
    Extract turk_parse dict from all predicates of a single sentence

    Inputs:
    1. preds_all : list of (sentence id, predicate) tuples;
       each predicate must expose root.position and root.text
    2. raw_sentence: a dict with the token list of the sentence, with key: sentence id
    3. fname: file name, prefixed to the sentence id
    4. sentid_num: id of the sentence
    5. happen_set: set of (fname + " " + sentence id, root position + 1) tuples;
       predicates whose tuple is not in the set are dropped

    Returns:
    Same tuple as dict_pred_all_double; pred_token2 and token_dict['pred2'] are
    empty strings and token_dict['sentence_id_2'] equals token_dict['sentence_id_1'].
    '''
    return _build_turk_dict(preds_all, raw_sentence, fname, [sentid_num], happen_set)

In [7]:
def extract_list_adjacent(ud_data, happen_set, doc_id_dict):
    '''
    Extract batches of JSON strings from the ud data, combining adjacent sentences

    #Difference from original: Combining two adjacent sentences to form a single one

    A sentence is combined with the sentence that follows it, unless the
    following sentence starts a new document or the sentence is the last one of
    the file; in those cases it is processed on its own. Items with fewer than
    two kept predicates are skipped.

    Input:
    1. ud_data: list of "/"-separated ud data paths ending in .conllu
    2. happen_set: a set of (sentence_id, pred_token) tuples where the event did
       happen; passed on to dict_pred_all_double / dict_pred_all_single
    3. doc_id_dict: dict keyed by split name (the part of the file name between
       its last "-" and its first ".", e.g. "train"); each value is the list of
       sentence numbers (counted from 1) at which a document starts, as
       returned by load_ud_english

    Output:
    list of batches, each batch being a list of JSON strings (one per item)
    '''
    global_list = []
    local_list = []
    iden = 1
    sent_removed = 0
    sent_total = 0
    total_preds = 0

    for ud_data_path in ud_data:
        covered_set = set()
        fname = ud_data_path.split("/")[-1]
        #Derive the split name from the file name only: taking it from the full
        #path broke on "./relative" paths and on directories containing "."
        data_name = fname.split(".")[0].split("-")[-1]
        doc_ids = doc_id_dict[data_name]

        with open(ud_data_path, encoding="utf-8") as infile:
            data = replace_to_html(infile.read())
        parsed = [(PredPatt(ud_parse), sent_id) for sent_id, ud_parse in load_conllu(data)]

        total_sents = len(parsed)

        #The first document start is skipped: only the upcoming boundaries matter.
        #With a single document (or none) there is no boundary to wait for.
        docslist = deque(doc_ids[1:])
        nextdoc = docslist.popleft() if docslist else None

        #Iterating through each parsed sentence
        for i, parse_sen in enumerate(parsed):
            if i in covered_set:
                continue

            if docslist:
                #NOTE(review): nextdoc is a sentence number counted from 1 while
                #covered_set holds enumerate indices counted from 0 - confirm
                #this comparison is intended
                if nextdoc in covered_set:
                    nextdoc = docslist.popleft()

            pred_object = parse_sen[0]
            sentid_num = parse_sen[1].split("_")[-1]

            if (i+2) != nextdoc and (i+1) != total_sents:
                ##########################################
                #Next item in parsed is not a new document
                ##########################################
                covered_set.add(i)
                covered_set.add(i+1)

                parse_sen_next = parsed[i+1]
                pred_object_next = parse_sen_next[0]
                sentid_num_next = parse_sen_next[1].split("_")[-1]

                raw_sentence = {sentid_num: [token.text for token in pred_object.tokens],
                                sentid_num_next: [token.text for token in pred_object_next.tokens]}

                preds_curr = [(sentid_num, pred) for pred in pred_object.instances]
                preds_next = [(sentid_num_next, pred) for pred in pred_object_next.instances]

                preds_all = preds_curr + preds_next
                sent_total += len(preds_all)

                #token dict from all predicates in both sentences:
                token_dict, _, _, num_preds, local_removed = dict_pred_all_double(
                    preds_all, raw_sentence, fname, sentid_num, sentid_num_next, happen_set)
            else:
                ##########################################
                #Next item in parsed is from a new document
                ##########################################
                if docslist:
                    nextdoc = docslist.popleft()
                covered_set.add(i)
                raw_sentence = {sentid_num: [token.text for token in pred_object.tokens]}

                preds_all = [(sentid_num, pred) for pred in pred_object.instances]
                sent_total += len(preds_all)

                #token dict from all predicates in the sentence:
                token_dict, _, _, num_preds, local_removed = dict_pred_all_single(
                    preds_all, raw_sentence, fname, sentid_num, happen_set)

            #Batching is the same for paired and single sentences
            sent_removed += local_removed
            if num_preds < 2:
                continue

            token_dict['id'] = iden
            total_preds += num_preds
            iden += 1

            if total_preds >= 20:
                #Close the current batch and start a new one with this item.
                #NOTE(review): the item keeps the id assigned above (not 1) and
                #its predicates are not counted towards the new batch; the
                #closed batch can also be empty. Behaviour kept as is - confirm.
                total_preds = 0
                global_list.append(local_list)
                local_list = [json.dumps(token_dict)]
                iden = 1
            else:
                local_list.append(json.dumps(token_dict))

        print("{} finished".format(data_name))
        print("\n")

    #Append the last remaining sentences into the global list
    global_list.append(local_list)

    #NOTE(review): both counters below are incremented per predicate, not per sentence
    print("Total number of sentences found: {}".format(sent_total))
    print("Number of sentences removed due to non-event: {}".format(sent_removed))

    return global_list

#### Considering individual sentences

In [8]:
#gl_list = extract_list(ud_data, happen_set)

#### Combining adjacent sentences into a single one

In [9]:
#Document-start sentence numbers per split, as returned by load_ud_english
doc_ids = {
    'train': load_ud_english(ud_train_detailed),
    'dev': load_ud_english(ud_dev_detailed),
    'test': load_ud_english(ud_test_detailed),
}

In [10]:
#Build the MTurk batches: train on its own, dev and test together in one call
#(batches of the second call can therefore mix items from dev and test)
gl_list_train = extract_list_adjacent([ud_train], happen_set, doc_ids)
gl_list_dev_test = extract_list_adjacent([ud_dev, ud_test], happen_set, doc_ids)

train finished


Total number of sentences found: 23091
Number of sentences removed due to non-event: 12651
dev finished


test finished


Total number of sentences found: 5567
Number of sentences removed due to non-event: 3183


In [11]:
#create a csv file for MTurk:
with open('event_temporal_turk_file_adjacent_train.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    for item in gl_list_train:
        local_str = "\"" + str(item) + "\"\n"
        file_handler.write(replace_to_turk(local_str))

In [12]:
#create a csv file for MTurk:
with open('event_temporal_turk_file_adjacent_dev_test.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    for item in gl_list_dev_test:
        local_str = "\"" + str(item) + "\"\n"
        file_handler.write(replace_to_turk(local_str))