In [1]:
#Import relevant libraries
from collections import defaultdict, deque
import os
import re
import pandas as pd
import numpy as np
import json
from predpatt import load_conllu
from predpatt import PredPatt
from itertools import combinations
import csv

In [2]:
#Data Locations:
ud_train  =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en-ud-train.conllu"
ud_dev  =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en-ud-dev.conllu"
ud_test  =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en-ud-test.conllu"
ud_data = [ud_train, ud_dev, ud_test]
it_happnd = "/Users/sidvash/facts_lab/veridicality_sid/it-happened_eng_ud1.2_07092017.tsv"

#Taken from github - includes doc ids and sent ids
ud_train_detailed =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en_ewt-ud-train.conllu"
ud_dev_detailed =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en_ewt-ud-dev.conllu"
ud_test_detailed =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en_ewt-ud-test.conllu"


In [3]:
#Extract Sentence IDS of event-happening sentences
data = pd.read_csv(it_happnd , sep='\t')

#Select only sentences which did happen
happnd = data[data.Happened == "true"]

#Select only sentences with high confidence
happnd = happnd[happnd.Confidence.isin(['4', '3'])]

#Select only sentences where Keep = True
happnd = happnd[happnd.Keep == True]

#Create a set of ID's to filter later
happen_set = list(happnd[['Sentence.ID', 'Pred.Token']].values)
happen_set = [tuple(x) for x in happen_set]
happen_set = set(happen_set)

In [4]:
def replace_to_html(s):
    '''
    Make some changes to the input string to make it html readable
    '''
    #replace special chars as per html format
    s = re.sub(r"\'", r"&#39;", s)
    s = re.sub(r'\"', r"&quot;", s)
    s = re.sub(r"\[", r"&lsqb;", s)
    s = re.sub(r"\]", r"&rsqb;", s)
        
    return s

def replace_to_turk(s):
    '''
    Make some changes to the input string to make it Turk readable
    '''
    #replace double quotes to appear twice : except at the start/end of the list
    s = re.sub(r'([^\]])\"', r'\1""', s)
    
    #replace single quotes at the beginning and end of list
    s = re.sub(r"\'\{", r"{", s)  
    s = re.sub(r"\}\'", r"}", s)
    
    #replace two backslash to three
    s = re.sub(r"\\\\", r"\\\\\\", s)
    
    #Leave spaces around <span> 
    s = re.sub(r"<span", r" <span", s)
    s = re.sub(r" </span>", r"</span> ", s)
        
    return s

def combs(l):
    '''
    Input: List containing some elements
    
    Returns: nC2 combinations of items in the list, 
            where n = length of list
    '''
    ans = []
    
    for x in combinations(l, 2):
        ans.append(x)
            
    return ans

In [5]:
def load_ud_english(fpath):
    """Load a file from the UD English corpus

    Parameters
    ----------
    fpath : str
        Path to UD corpus file ending in .conllu
    """

    n = 1

    fname = os.path.split(fpath)[1]

    parses = defaultdict(list)
    sent_ids = []
    newdoc_ids = []
    
    for l in open(fpath):
        ident = fname+' '+str(n)
        
        if re.match(r'\# newdoc id', l):
            newdoc_ids.append(n)
            
        if re.match(r'^\d', l):
            l_split = l.strip().split()
            parses[ident].append(l_split)
        
        elif parses[ident]:
            sent_ids.append(ident)
            n += 1

    return newdoc_ids

In [6]:
def extract_list(ud_data, happen_set):
    '''
    Extract a list of JSON objects from the ud data
    
    Input: 
    1. ud data path ending in .conll
    2. happen_set: a set of sentence_id where the event did happen
    
    '''
    global_list = []
    local_list = []
    iden = 1
    sent_removed = 0
    sent_total = 0
    
    for ud_data_path in ud_data:
        fname = ud_data_path.split("/")[-1]

        with open(ud_data_path) as infile:
            data = replace_to_html(infile.read())
            parsed = [(PredPatt(ud_parse), sent_id) for sent_id, ud_parse in load_conllu(data)]
        
        #Iterating through each parsed sentence
        for parse_sen in parsed:
            pred_object = parse_sen[0]            
            raw_sentence = [token.text for token in pred_object.tokens]
            pred_combs = combs(pred_object.instances)
            sentid_num = parse_sen[1].split("_")[-1]
            
            if len(pred_combs) == 0:
                continue
            #Iterating through each predicate combination
            for pred_comb in pred_combs:
                #Extract each predicate's position and text
                pred_sentence = raw_sentence.copy()
                pred1 = pred_comb[0].root.text
                pred2 = pred_comb[1].root.text
                pred_token1 = pred_comb[0].root.position
                pred_token2 = pred_comb[1].root.position
                
                #Insert predicate span into the sentence
                pred_sentence.insert(pred_token1, '<span class=\"predicate\">')
                pred_sentence.insert(pred_token1 + 2, '</span>')
                pred_sentence.insert(pred_token2 + 2, '<span class=\"predicate\">')
                pred_sentence.insert(pred_token2 + 4, '</span>')
                
                #Insert items into local dict
                token_dict = {}
                token_dict['pred_token1'] = str(pred_token1)
                token_dict['pred1'] = pred1
                token_dict['pred_token2'] = str(pred_token2)
                token_dict['pred2'] = pred2
                token_dict['sentence'] = " ".join(pred_sentence)
                token_dict['sentence_id'] = fname + " " + sentid_num
                token_dict['id'] = iden
            
                #create tuple to check if it exists in the event_happened master set
                sent_total += 1
                sent_tuple1 = (token_dict['sentence_id'], pred_token1+1)
                sent_tuple2 = (token_dict['sentence_id'], pred_token2+1)
                #remove if either predicate didn't happen
                if (sent_tuple1 not in happen_set) or (sent_tuple1 not in happen_set):
                    sent_removed += 1
                    continue

                iden += 1

                if iden == 11:
                    iden = 1 

                if len(local_list) == 10:
                    global_list.append(local_list)
                    local_list = []
                    local_list.append(json.dumps(token_dict))
                    
                else:
                    local_list.append(json.dumps(token_dict))
        
    
    #Append the last remaining sentences into the global list
    global_list.append(local_list)
    
    print("Total number of sentences found: {}".format(sent_total))
    print("Number of sentences removed due to non-event: {}".format(sent_removed))
                    
    return global_list

#### Below Functions are used only if two adjacent sentences are combined into one

In [7]:
def dict_pred_comb_double(pred_comb, raw_sentence, fname, sentid_num, sentid_num_next):
    '''
    Extract turk_parse dict from input predicate combinations 
    
    INputs:
    1. pred_comb : one tuple of a combination of predicates
    2. raw_sentence: a dict of two sentences, with key: sent_id_num
    '''
    token_dict = {}

    #Extract each predicate's position and text
    pred_sent = raw_sentence

    sentid_idx1 = pred_comb[0][0]
    sentid_idx2 = pred_comb[1][0]

    pred1 = pred_comb[0][1].root.text
    pred2 = pred_comb[1][1].root.text

    pred_token1 = pred_comb[0][1].root.position
    pred_token2 = pred_comb[1][1].root.position

    #Insert predicate span into the sentence
    temp1 = pred_sent[sentid_idx1][:(pred_token1)] + ['<span class=\"predicate\">'] +\
            [pred_sent[sentid_idx1][pred_token1]] + ['</span>'] + \
            pred_sent[sentid_idx1][(pred_token1+1):]
    
    
    if sentid_idx1 == sentid_idx2:
        temp2 = temp1[:(pred_token2+2)] + ['<span class=\"predicate\">'] + \
                [temp1[(pred_token2+2)]] + ['</span>'] + \
                temp1[(pred_token2+3):]
        
        if sentid_idx1 != sentid_num_next:
            temp3 = temp2 + pred_sent[sentid_num_next]
        else:
            temp3 = pred_sent[sentid_num] + temp2 
                
        token_dict['sentence'] = " ".join(temp3)
                    
    else:
        temp2 = pred_sent[sentid_idx2][:pred_token2] + ['<span class=\"predicate\">'] + \
                [pred_sent[sentid_idx2][pred_token2]] + ['</span>'] + \
                pred_sent[sentid_idx2][pred_token2+1:]
                
        token_dict['sentence'] = " ".join(temp1 + temp2)

    
    #print(token_dict['sentence'])
    #print("\n")
    token_dict['pred_token1'] = str(pred_token1)
    token_dict['pred1'] = pred1
    token_dict['pred_token2'] = str(pred_token2)
    token_dict['pred2'] = pred2
    token_dict['sentence_id_1'] = fname + " " + sentid_idx1 
    token_dict['sentence_id_2'] = fname + " " + sentid_idx2 
        
    return token_dict, pred_token1, pred_token2

def dict_pred_comb_single(pred_comb, raw_sentence, fname, sentid_num):
    '''
    Extract turk_parse dict from input predicate combinations 
    
    INputs:
    1. pred_comb : one tuple of a combination of predicates
    2. raw_sentence: a single sentence 
    '''
    token_dict = {}
    pred_sentence = raw_sentence.copy()
    pred1 = pred_comb[0].root.text
    pred2 = pred_comb[1].root.text
    pred_token1 = pred_comb[0].root.position
    pred_token2 = pred_comb[1].root.position

    #Insert predicate span into the sentence
    pred_sentence.insert(pred_token1, '<span class=\"predicate\">')
    pred_sentence.insert(pred_token1 + 2, '</span>')
    pred_sentence.insert(pred_token2 + 2, '<span class=\"predicate\">')
    pred_sentence.insert(pred_token2 + 4, '</span>')

    #Insert items into local dict
    token_dict['pred_token1'] = str(pred_token1)
    token_dict['pred1'] = pred1
    token_dict['pred_token2'] = str(pred_token2)
    token_dict['pred2'] = pred2
    token_dict['sentence'] = " ".join(pred_sentence)
    token_dict['sentence_id_1'] = fname + " " + sentid_num
    token_dict['sentence_id_2'] = fname + " " + sentid_num

    return token_dict, pred_token1, pred_token2

In [8]:
def extract_list_adjacent(ud_data, happen_set, doc_id_dict):
    '''
    Extract a list of JSON objects from the ud data
    
    #Difference from original: Combining two adjacent sentences to form a single one
    
    Input: 
    1. ud data path ending in .conll
    2. happen_set: a set of sentence_id where the event did happen
    
    '''
    global_list = []
    local_list = []
    iden = 1
    sent_removed = 0
    sent_total = 0
    
    for ud_data_path in ud_data:
        covered_set = set()
        fname = ud_data_path.split("/")[-1]
        data_name = ud_data_path.split(".")[0].split("-")[-1]
        doc_ids = doc_id_dict[data_name]
        
        with open(ud_data_path) as infile:
            data = replace_to_html(infile.read())
            parsed = [(PredPatt(ud_parse), sent_id) for sent_id, ud_parse in load_conllu(data)]
        
        total_sents = len(parsed)
        
        docslist = deque(doc_ids[1:])
        nextdoc = docslist.popleft()
        
        #Iterating through each parsed sentence
        for i, parse_sen in enumerate(parsed):
            
            if i in covered_set:
                continue
                
            pred_object = parse_sen[0]      
            sentid_num = parse_sen[1].split("_")[-1]
            
            ##########################################
            #Next item in parsed is not a new document
             ##########################################
            if (i+2) != nextdoc and (i+1)!= total_sents:
                covered_set.add(i)
                covered_set.add(i+1)
                
                parse_sen_next = parsed[i+1]
                pred_object_next = parse_sen_next[0]
                sentid_num_next = parse_sen_next[1].split("_")[-1]
            
                raw_sentence =  {sentid_num : [token.text for token in pred_object.tokens] ,
                                sentid_num_next: [token.text for token in pred_object_next.tokens]}
                
                preds_curr = [(sentid_num,pred) for pred in pred_object.instances]
                preds_next = [(sentid_num_next,pred) for pred in pred_object_next.instances]
                pred_combs = combs(preds_curr + preds_next)
                
                if len(pred_combs) == 0:
                    continue
                    
                #Iterating through each predicate combination
                for pred_comb in pred_combs:
                    
                    token_dict, pred_token1, pred_token2 = dict_pred_comb_double(pred_comb, raw_sentence, 
                                                                          fname, sentid_num, sentid_num_next)
                    
                    token_dict['id'] = iden

                    #create tuple to check if it exists in the event_happened master set
                    sent_total += 1
                    sent_tuple1 = (token_dict['sentence_id_1'], pred_token1+1)
                    sent_tuple2 = (token_dict['sentence_id_2'], pred_token2+1)
                    #remove if either predicate didn't happen
                    if (sent_tuple1 not in happen_set) or (sent_tuple1 not in happen_set):
                        sent_removed += 1
                        continue

                    iden += 1

                    if iden == 11:
                        iden = 1 

                    if len(local_list) == 10:
                        global_list.append(local_list)
                        local_list = []
                        local_list.append(json.dumps(token_dict))

                    else:
                        local_list.append(json.dumps(token_dict))
            ##########################################
            #Next item in parsed is from a new document
             ##########################################
            else: 
                nextdoc = docslist.popleft()
                covered_set.add(i)
                raw_sentence = [token.text for token in pred_object.tokens]
                pred_combs = combs(pred_object.instances)
                
                if len(pred_combs) == 0:
                    continue
        
                #Iterating through each predicate combination
                for pred_comb in pred_combs:
                    #Extract each predicate's position and text
                    token_dict, pred_token1, pred_token2 = dict_pred_comb_single(pred_comb, raw_sentence, 
                                                                          fname, sentid_num)
                
                    token_dict['id'] = iden
                    #create tuple to check if it exists in the event_happened master set
                    sent_total += 1
                    sent_tuple1 = (token_dict['sentence_id_1'], pred_token1+1)
                    sent_tuple2 = (token_dict['sentence_id_2'], pred_token2+1)
                    #remove if either predicate didn't happen
                    if (sent_tuple1 not in happen_set) or (sent_tuple1 not in happen_set):
                        sent_removed += 1
                        continue

                    iden += 1

                    if iden == 11:
                        iden = 1 

                    if len(local_list) == 10:
                        global_list.append(local_list)
                        local_list = []
                        local_list.append(json.dumps(token_dict))

                    else:
                        local_list.append(json.dumps(token_dict))
                        
        print("{} finished".format(data_name))

    #Append the last remaining sentences into the global list
    global_list.append(local_list)
    
    print("Total number of sentences found: {}".format(sent_total))
    print("Number of sentences removed due to non-event: {}".format(sent_removed))
                    
    return global_list

#### Considering individual sentences

In [9]:
#gl_list = extract_list(ud_data, happen_set)

#### Combining adjacent sentences into a single one

In [10]:
doc_ids = {}
doc_ids['train'] = load_ud_english(ud_train_detailed)
doc_ids['dev'] = load_ud_english(ud_dev_detailed)
doc_ids['test'] = load_ud_english(ud_test_detailed)

In [11]:
gl_list_train = extract_list_adjacent([ud_train], happen_set, doc_ids)

gl_list_dev_test = extract_list_adjacent([ud_dev, ud_test], happen_set, doc_ids)

train finished
Total number of sentences found: 51962
Number of sentences removed due to non-event: 26688
dev finished
test finished
Total number of sentences found: 9598
Number of sentences removed due to non-event: 5167


In [12]:
#create a csv file for MTurk:
with open('event_temporal_turk_file_adjacent_train.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    for item in gl_list_train:
        local_str = "\"" + str(item) + "\"\n"
        file_handler.write(replace_to_turk(local_str))

In [13]:
#create a csv file for MTurk:
with open('event_temporal_turk_file_adjacent_dev_test.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    for item in gl_list_dev_test:
        local_str = "\"" + str(item) + "\"\n"
        file_handler.write(replace_to_turk(local_str))