In [1]:
#Import relevant libraries
from collections import defaultdict
import os
import re
import pandas as pd
import numpy as np
import json
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts
import csv
import random

In [2]:
#Data Locations:
#The base directory can be overridden with the VERIDICALITY_DATA_DIR environment
#variable; when it is unset the original location is used, so existing runs are unaffected.
DATA_DIR = os.environ.get("VERIDICALITY_DATA_DIR", "/Users/sidvash/facts_lab/veridicality_sid").rstrip("/")

#Universal Dependencies English splits (CoNLL-U files)
ud_train = DATA_DIR + "/UD_English/en-ud-train.conllu"
ud_dev = DATA_DIR + "/UD_English/en-ud-dev.conllu"
ud_test = DATA_DIR + "/UD_English/en-ud-test.conllu"
ud_data = [ud_train, ud_dev, ud_test]

#Tab-separated "it-happened" annotation file
it_happnd = DATA_DIR + "/it-happened_eng_ud1.2_07092017.tsv"

In [3]:
#Collect the (sentence id, predicate token) pairs of events that happened
data = pd.read_csv(it_happnd, sep='\t')

#Keep a row only when all three conditions hold:
#  - the event is marked as having happened
#  - the confidence is one of the two highest values
#  - the row is flagged Keep
happnd = data[
    (data.Happened == "true")
    & data.Confidence.isin(['4', '3'])
    & (data.Keep == True)
]

#Unique (Sentence.ID, Pred.Token) tuples, used for filtering later
happen_set = {tuple(pair) for pair in happnd[['Sentence.ID', 'Pred.Token']].values}

In [4]:
def replace_to_html(s):
    '''
    Return `s` with a few special characters rewritten as HTML entities.

    The characters handled are the single quote, the double quote, both
    square brackets and the comma; every other character is left untouched.
    '''
    #(regex, entity) pairs, applied one after another in this order
    substitutions = (
        (r"\'", r"&#39;"),
        (r'\"', r"&quot;"),
        (r"\[", r"&lsqb;"),
        (r"\]", r"&rsqb;"),
        (r"\,", r"&#44;"),
    )

    for pattern, entity in substitutions:
        s = re.sub(pattern, entity, s)

    return s

def replace_to_turk(s):
    '''
    Rewrite the stringified list `s` into the form written to the Turk csv file.

    The rewrites run strictly in the order listed below, each one working on
    the output of the previous one.
    '''
    rewrites = (
        #double every double quote that follows a character other than "]"
        #(a quote at the very start of the string has no preceding character
        #and is therefore left alone)
        (r'([^\]])\"', r'\1""'),
        #drop the single quote directly before "{" and directly after "}"
        (r"\'\{", r"{"),
        (r"\}\'", r"}"),
        #turn each run of two backslashes into three
        (r"\\\\", r"\\\\\\"),
        #put a space before an opening <span and move the space found in
        #front of a closing </span> to after it
        (r"<span", r" <span"),
        (r" </span>", r"</span> "),
    )

    for pattern, replacement in rewrites:
        s = re.sub(pattern, replacement, s)

    return s

def _wrap_predicate_tokens(tokens, positions):
    '''
    Return a copy of `tokens` in which every token whose index is listed in
    `positions` is preceded by an opening predicate <span> item and followed
    by a closing </span> item. The order of `positions` does not matter.
    '''
    marked_positions = set(positions)
    marked = []
    for index, text in enumerate(tokens):
        if index in marked_positions:
            marked.extend([' <span class=\"predicate\">', text, '</span> '])
        else:
            marked.append(text)
    return marked


def extract_list_with_copulas(ud_data, happen_set=None, batch_size=10):
    '''
    Updated: Pred extended for Copulas

    Extract batches of JSON objects (one object per predicate) from the ud data.

    The text of every file is passed through replace_to_html, parsed with
    load_conllu / PredPatt, and each predicate instance is handled as follows:
      * root tag not in the accepted tag list: skipped
      * root tag VERB or AUX: the predicate is the root token alone
      * any other accepted tag: kept only if one of the predicate's tokens
        has gov_rel 'cop'; the predicate then runs from that token to the
        end of predicate.tokens, otherwise the predicate is skipped

    Every JSON object has the keys
      pred_token    : predicate token positions joined by "_"
      sentence      : sentence tokens joined by spaces, predicate tokens
                      wrapped in <span class="predicate"> ... </span>
      pred          : predicate words joined by spaces (first five words
                      followed by "..." when there are more than five)
      sentence_id   : file name, a space, and the last "_"-separated part
                      of the sentence id
      id            : 1-based position of the object inside its batch
      pred_root_pos : position of the predicate root, as a string

    For every file the total and copula predicate counts are printed.

    Input:
    1. ud_data: iterable of paths to CoNLL-U files
    2. happen_set: currently unused; kept so existing callers keep working
    3. batch_size: number of JSON objects per batch (default 10)

    Output:
    A list of batches, each batch being a list of JSON strings. Every batch
    holds batch_size objects except possibly the last one; an empty batch is
    never returned.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Resolve relative clause
    options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False)

    #Root tags accepted for a predicate
    pos_tags = ["ADJ", "NOUN", "NUM", "DET", "PROPN", "PRON", "VERB", "AUX"]
    #Longest predicate (in words) shown in full in the 'pred' field
    max_pred_words = 5

    copula_cnts = defaultdict(int)
    pred_cnts = defaultdict(int)

    global_list = []
    local_list = []

    for ud_data_path in ud_data:
        fname = os.path.basename(ud_data_path)

        #Read as UTF-8 explicitly instead of relying on the locale's default encoding
        with open(ud_data_path, encoding="utf-8") as infile:
            data = replace_to_html(infile.read())
        parsed = [(PredPatt(ud_parse, opts=options), sent_id) for sent_id, ud_parse in load_conllu(data)]

        for parse, sent_id in parsed:
            #Identical for every predicate of the sentence, so computed once
            raw_sentence = [token.text for token in parse.tokens]
            sentid_num = sent_id.split("_")[-1]

            for predicate in parse.instances:
                root = predicate.root

                #Ignore predicates that do not have one of the pos tags
                if root.tag not in pos_tags:
                    continue

                if root.tag in ("VERB", "AUX"):
                    #Keep the root only
                    pred_tokens = [root]
                else:
                    #Extend predicate to start from the copula
                    all_pred = predicate.tokens
                    gov_rels = [tok.gov_rel for tok in all_pred]
                    if 'cop' not in gov_rels:
                        continue
                    copula_cnts[fname] += 1
                    pred_tokens = all_pred[gov_rels.index('cop'):]

                pred_words = [tok.text for tok in pred_tokens]
                pred_positions = [tok.position for tok in pred_tokens]

                if len(pred_words) > max_pred_words:
                    pred = " ".join(pred_words[:max_pred_words]) + "..."
                else:
                    pred = " ".join(pred_words)

                #Key order is kept as is: it determines the key order in the JSON text
                token_dict = {
                    'pred_token': "_".join(map(str, pred_positions)),
                    'sentence': " ".join(_wrap_predicate_tokens(raw_sentence, pred_positions)),
                    'pred': pred,
                    'sentence_id': fname + " " + sentid_num,
                    'id': len(local_list) + 1,
                    'pred_root_pos': str(root.position),
                }
                pred_cnts[fname] += 1

                local_list.append(json.dumps(token_dict))
                if len(local_list) == batch_size:
                    global_list.append(local_list)
                    local_list = []

        print("Total predicates in {}: {}".format(fname, pred_cnts[fname]))
        print("Copula predicates in {} : {}".format(fname, copula_cnts[fname]))
        print("\n")

    #Append the last remaining sentences into the global list
    if local_list:
        global_list.append(local_list)

    return global_list

In [5]:
#Build the predicate batches: train on its own, dev and test together
gl_list_train = extract_list_with_copulas(ud_data=[ud_train])
gl_list_dev_test = extract_list_with_copulas(ud_data=[ud_dev, ud_test])

Total predicates in en-ud-train.conllu: 23538
Copula predicates in en-ud-train.conllu : 3898


Total predicates in en-ud-dev.conllu: 2884
Copula predicates in en-ud-dev.conllu : 541


Total predicates in en-ud-test.conllu: 2755
Copula predicates in en-ud-test.conllu : 492




In [6]:
#create a csv file for MTurk (train split):
#one header line, then one quoted line per batch of JSON objects
with open('event_type_turk_file_train.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    file_handler.writelines(
        replace_to_turk("\"" + str(batch) + "\"\n") for batch in gl_list_train
    )

In [7]:
#create a csv file for MTurk (dev and test splits combined):
#one header line, then one quoted line per batch of JSON objects
with open('event_type_turk_file_dev_test.csv', 'w+') as file_handler:
    file_handler.write("var_arrays\n")
    file_handler.writelines(
        replace_to_turk("\"" + str(batch) + "\"\n") for batch in gl_list_dev_test
    )