In [1]:
from collections import defaultdict
import os
import re
import pandas as pd
import numpy as np
import json
from predpatt import load_conllu
from predpatt import PredPatt
import csv

# Directory holding the UD English treebank files. Set the UD_ENGLISH_DIR
# environment variable to point somewhere else; the default is the location
# this notebook was originally written against.
UD_DIR = os.environ.get(
    "UD_ENGLISH_DIR",
    "/Users/sidvash/facts_lab/veridicality_sid/UD_English",
)

# Joined with "/" on purpose: later code takes the file name as the part
# after the last "/" of these paths.
ud_train = UD_DIR + "/en-ud-train.conllu"
ud_dev = UD_DIR + "/en-ud-dev.conllu"
ud_test = UD_DIR + "/en-ud-test.conllu"

# Tab-separated annotation file, looked up relative to the working directory
it_happnd = "it-happened_eng_ud1.2_07092017.tsv"

In [2]:
#Extract Sentence IDS of event-happening sentences
# Every column is read as text: the filters below compare against the strings
# "true", '0' and '1', and would silently match nothing if pandas inferred a
# bool or integer dtype for a column.
data = pd.read_csv(it_happnd, sep='\t', dtype=str)

#Select only sentences which did happen
happnd = data[data.Happened == "true"]

#Exclude sentences with low confidence (confidence value 0 or 1)
happnd = happnd[~happnd.Confidence.isin(['0', '1'])]

#Create a set of IDs to filter later
happen_set = set(happnd['Sentence.ID'])

In [3]:
def extract_list(ud_data_path, happen_set, batch_size=10, keep_partial=False):
    '''
    Build batches of JSON-encoded predicate records from a UD data file.

    Every sentence whose id ("<file name> <sentence number>") is in
    happen_set contributes one record per predicate found by PredPatt.
    A record is a JSON object with these keys, in this order:

    - pred_token:  position of the predicate root token, as a string
    - sentence:    the space-joined tokens, with the predicate root wrapped
                   in <span class="predicate"> ... </span>
    - pred:        text of the predicate root token
    - sentence_id: "<file name> <sentence number>"
    - id:          1-based position of the record inside its batch

    Input:
    1. ud_data_path: ud data path ending in .conllu
    2. happen_set: a set of sentence_id where the event did happen
    3. batch_size: number of records per batch (default 10)
    4. keep_partial: when True, a trailing batch with fewer than batch_size
       records is returned as well; by default that remainder is discarded

    Output: a list of batches, each batch being a list of JSON strings.
    Every complete batch is returned, including the last one.

    Raises: ValueError if batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Sentence ids are prefixed with the bare file name, not the full path
    fname = os.path.basename(ud_data_path)

    # Read explicitly as UTF-8 (the CoNLL-U encoding) rather than relying on
    # the platform default
    with open(ud_data_path, encoding="utf-8") as infile:
        data = infile.read()

    global_list = []
    local_list = []

    for sent_id, ud_parse in load_conllu(data):
        sentence_id = fname + " " + sent_id.split("_")[-1]

        # Filter first so PredPatt only runs on sentences that are kept
        if sentence_id not in happen_set:
            continue

        parse = PredPatt(ud_parse)
        raw_sentence = [token.text for token in parse.tokens]

        for predicate in parse.instances:
            pred_token = predicate.root.position

            # Wrap the predicate root in span tags. The closing tag goes at
            # +2 because the opening tag already shifted the root by one.
            pred_sentence = list(raw_sentence)
            pred_sentence.insert(pred_token, '<span class=\"predicate\">')
            pred_sentence.insert(pred_token + 2, '</span>')

            token_dict = {
                'pred_token': str(pred_token),
                'sentence': " ".join(pred_sentence),
                'pred': predicate.root.text,
                'sentence_id': sentence_id,
                # Records are numbered 1..batch_size within each batch
                'id': len(local_list) + 1,
            }
            local_list.append(json.dumps(token_dict))

            # Flush as soon as the batch is full, so a complete final batch
            # is not lost when no further record follows it
            if len(local_list) == batch_size:
                global_list.append(local_list)
                local_list = []

    if local_list and keep_partial:
        global_list.append(local_list)

    return global_list

In [4]:
def replace_string(s):
    '''
    Make some changes to the input string to make it Turk readable

    Applies, in order: doubling of double quotes, removal of the single
    quotes around {...} objects, turning two backslashes into three, and
    removal of the spaces next to angle brackets.
    '''

    # Double every double quote, except one that is the very first character
    # of the string and any that directly follows "]" (start/end of the list).
    # A lookbehind is used so that runs of adjacent quotes are all doubled;
    # consuming the preceding character instead skips every second quote.
    # NOTE(review): a quote right after a "]" inside the data is still left
    # alone -- confirm whether that can occur in the input.
    s = re.sub(r'(?<=[^\]])"', '""', s)

    # Remove the single quote before "{" and the one after "}"
    s = s.replace("'{", "{").replace("}'", "}")

    # Replace two backslashes with three
    s = s.replace("\\\\", "\\\\\\")

    # Remove the space after ">" and the space before "<" (around span tags)
    s = s.replace("> ", ">").replace(" <", "<")

    return s

In [5]:
# Batched JSON records for the sentences of the ud_dev file
gl_list = extract_list(ud_data_path=ud_dev, happen_set=happen_set)

In [6]:
# Write the MTurk csv file: a "var_arrays" header line, then one line per
# batch holding the batch's list representation wrapped in double quotes.
with open('test.csv', 'w+') as file_handler:
    rows = ["var_arrays\n"]
    rows.extend("\"" + str(batch) + "\"\n" for batch in gl_list)
    file_handler.writelines(rows)
    

In [7]:
#Read csv file created above and store into list
# The with-block closes the file even if reading raises
with open('test.csv', 'r') as file_temp:
    lines = file_temp.readlines()

In [8]:
# Write a second MTurk csv file in which every line held in `lines` has been
# passed through replace_string.
with open('test_replace.csv', 'w+') as file_handler:
    file_handler.writelines(replace_string(line) for line in lines)

In [9]:
#Temporary workaround: manually delete the lines that throw errors on Turk:

#####  run from terminal:
#sed '5d;6d;8d;9d;14d;15d;28d;60d;93d;127d;128d;130d;131d;182d;210d' < test_replace.csv > test_replace_v2.csv