In [1]:
#Import relevant libraries
from collections import defaultdict
import os
import re
import pandas as pd
import numpy as np
import json
from predpatt import load_conllu
from predpatt import PredPatt
import csv

In [2]:
#Data Locations:
ud_train  =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en-ud-train.conllu"
ud_dev  =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en-ud-dev.conllu"
ud_test  =  "/Users/sidvash/facts_lab/veridicality_sid/UD_English/en-ud-test.conllu"
ud_data = [ud_train, ud_dev, ud_test]
it_happnd = "/Users/sidvash/facts_lab/veridicality_sid/it-happened_eng_ud1.2_07092017.tsv"

In [3]:
#Extract Sentence IDS of event-happening sentences
data = pd.read_csv(it_happnd , sep='\t')

#Select only sentences which did happen
happnd = data[data.Happened == "true"]

#Exclude sentences with low confidence
happnd = happnd[~happnd.Confidence.isin(['0', '1', '2'])]

#Create a set of ID's to filter later
happen_set = list(happnd[['Sentence.ID', 'Pred.Token']].values)
happen_set = [tuple(x) for x in happen_set]
happen_set = set(happen_set)

In [4]:
def replace_to_html(s):
    '''
    Make some changes to the input string to make it html readable
    '''
    #replace special chars as per html format
    s = re.sub(r"\'", r"&#39;", s)
    s = re.sub(r'\"', r"&quot;", s)
    s = re.sub(r"\[", r"&lsqb;", s)
    s = re.sub(r"\]", r"&rsqb;", s)
        
    return s

def replace_to_turk(s):
    '''
    Make some changes to the input string to make it Turk readable
    '''
    #replace double quotes to appear twice : except at the start/end of the list
    s = re.sub(r'([^\]])\"', r'\1""', s)
    
    #replace single quotes at the beginning and end of list
    s = re.sub(r"\'\{", r"{", s)  
    s = re.sub(r"\}\'", r"}", s)
    
    #replace two backslash to three
    s = re.sub(r"\\\\", r"\\\\\\", s)
    
    #Leave spaces around <span> 
    s = re.sub(r"<span", r" <span", s)
    s = re.sub(r" </span>", r"</span> ", s)
        
    return s

def combinations(l):
    '''
    Input: List containing some elements
    
    Returns: nC2 combinations of items in the list, 
            where n = length of list
    '''
    ans = []
    
    for idx in range(len(l)-1):
        for item2 in l[(idx+1):]:
            ans.append((l[idx],item2))
            
    return ans

In [5]:
def extract_list(ud_data, happen_set):
    '''
    Extract a list of JSON objects from the ud data
    
    Input: 
    1. ud data path ending in .conll
    2. happen_set: a set of sentence_id where the event did happen
    
    '''
    global_list = []
    local_list = []
    iden = 1
    sent_removed = 0
    sent_total = 0
    
    for ud_data_path in ud_data:
        fname = ud_data_path.split("/")[-1]

        with open(ud_data_path) as infile:
            data = replace_to_html(infile.read())
            parsed = [(PredPatt(ud_parse), sent_id) for sent_id, ud_parse in load_conllu(data)]
        
        #Iterating through each parsed sentence
        for parse_sen in parsed:
            pred_object = parse_sen[0]            
            raw_sentence = [token.text for token in pred_object.tokens]
            pred_combs = combinations(pred_object.instances)
            sentid_num = parse_sen[1].split("_")[-1]
            
            if len(pred_combs) == 0:
                continue
            #Iterating through each predicate combination
            for pred_comb in pred_combs:
                #Extract each predicate's position and text
                pred_sentence = raw_sentence.copy()
                pred1 = pred_comb[0].root.text
                pred2 = pred_comb[1].root.text
                pred_token1 = pred_comb[0].root.position
                pred_token2 = pred_comb[1].root.position
                
                #Insert predicate span into the sentence
                pred_sentence.insert(pred_token1, '<span class=\"predicate\">')
                pred_sentence.insert(pred_token1 + 2, '</span>')
                pred_sentence.insert(pred_token2 + 2, '<span class=\"predicate\">')
                pred_sentence.insert(pred_token2 + 4, '</span>')
                
                #Insert items into local dict
                token_dict = {}
                token_dict['pred_token1'] = str(pred_token1)
                token_dict['pred1'] = pred1
                token_dict['pred_token2'] = str(pred_token2)
                token_dict['pred2'] = pred2
                token_dict['sentence'] = " ".join(pred_sentence)
                token_dict['sentence_id'] = fname + " " + sentid_num
                token_dict['id'] = iden
            
                #create tuple to check if it exists in the event_happened master set
                sent_total += 1
                sent_tuple1 = (token_dict['sentence_id'], pred_token1+1)
                sent_tuple2 = (token_dict['sentence_id'], pred_token2+1)
                #remove if either predicate didn't happen
                if (sent_tuple1 not in happen_set) or (sent_tuple1 not in happen_set):
                    sent_removed += 1
                    continue

                iden += 1

                if iden == 11:
                    iden = 1 

                if len(local_list) == 10:
                    global_list.append(local_list)
                    local_list = []
                    local_list.append(json.dumps(token_dict))
                    
                else:
                    local_list.append(json.dumps(token_dict))
        
    
    #Append the last remaining sentences into the global list
    global_list.append(local_list)
    
    print("Total number of sentences found: {}".format(sent_total))
    print("Number of sentences removed due to non-event: {}".format(sent_removed))
                    
    return global_list 

In [6]:
gl_list = extract_list(ud_data, happen_set)

Total number of sentences found: 31867
Number of sentences removed due to non-event: 14570


In [7]:
#create a csv file for MTurk:
with open('test_temporal.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    for item in gl_list:
        local_str = "\"" + str(item) + "\"\n"
        file_handler.write(replace_to_turk(local_str))