In [1]:
#Import relevant libraries
from collections import defaultdict
import os
import re
import pandas as pd
import numpy as np
import json
from predpatt import load_conllu
from predpatt import PredPatt
import csv

In [2]:
#Data Locations:
# Root folder that holds the UD English treebank and the "it-happened" file.
# The default is the original hard-coded location; set the environment variable
# VERIDICALITY_DATA_DIR to run the notebook elsewhere without editing this cell.
data_root = os.environ.get("VERIDICALITY_DATA_DIR",
                           "/Users/sidvash/facts_lab/veridicality_sid").rstrip("/")

# The three UD English splits, in the order they are processed
ud_train = data_root + "/UD_English/en-ud-train.conllu"
ud_dev = data_root + "/UD_English/en-ud-dev.conllu"
ud_test = data_root + "/UD_English/en-ud-test.conllu"
ud_data = [ud_train, ud_dev, ud_test]

# Tab-separated "it-happened" annotation file
it_happnd = data_root + "/it-happened_eng_ud1.2_07092017.tsv"

# Pilot CSVs, resolved relative to the notebook's working directory
pilot_data_upd = "pilot_sent_token_data.csv"
pilot_data_file = "pilot_data.csv"

In [3]:
# Event-type pilot data (one row per annotation; has a `sent_token` column)
pilot_data = pd.read_csv(pilot_data_file)

# "It happened" annotations, stored tab-separated; used to find the
# sentence ids of event-happening sentences
data = pd.read_csv(it_happnd, sep="\t")

# Per-token data extracted from the pilot protocol-1 annotations
df_glmer = pd.read_csv(pilot_data_upd)

In [4]:
# Turn each glmer intercept into a boolean flag: True when the intercept is >= 0
df_glmer['instant_final'] = df_glmer['glmer_intercept_inst'].ge(0)
df_glmer['start_final'] = df_glmer['glmer_intercept_start'].ge(0)
df_glmer['end_final'] = df_glmer['glmer_intercept_end'].ge(0)

# NOTE: start/end flags are NOT reset to False for instantaneous predicates;
# the three flags are thresholded independently of each other.
df_glmer.head(10)

Unnamed: 0,sent_token,glmer_intercept_inst,glmer_intercept_start,glmer_intercept_end,instant_final,start_final,end_final
0,en-ud-dev.conllu 110_11,-0.759728,1.194579,0.585228,False,True,True
1,en-ud-dev.conllu 110_18,0.230165,1.194579,1.122885,True,True,True
2,en-ud-dev.conllu 13_14,0.429219,1.748161,1.656412,True,True,True
3,en-ud-dev.conllu 13_6,-0.259216,1.748161,1.656412,False,True,True
4,en-ud-dev.conllu 1379_1,-0.259216,1.053771,0.408542,False,True,True
5,en-ud-dev.conllu 1379_8,0.429219,1.053771,1.656412,True,True,True
6,en-ud-dev.conllu 1415_10,0.230165,1.194579,0.585228,True,True,True
7,en-ud-dev.conllu 1415_6,-0.759728,1.194579,0.585228,False,True,True
8,en-ud-dev.conllu 1705_19,-0.759728,1.745843,0.585228,False,True,True
9,en-ud-dev.conllu 1705_6,-0.759728,2.339925,2.242671,False,True,True


In [5]:
# Spot-check the instant flag of one sentence-token id
df_glmer.loc[df_glmer['sent_token'] == 'en-ud-dev.conllu 110_11', 'instant_final'].values[0]

False

#### Extracting instant, start and end values for each sent_token id and storing in dicts

In [6]:
# Lookup tables keyed by (sentence id, token position); the values are the
# string form of the boolean flags ("True" / "False").
instant_dict = {}
start_dict = {}
end_dict = {}

sent_tokens = list(df_glmer.sent_token.values)

# Walk the four columns in lock-step instead of re-filtering the whole frame
# three times for every token.
for sent_token, is_instant, has_start, has_end in zip(sent_tokens,
                                                      df_glmer.instant_final.values,
                                                      df_glmer.start_final.values,
                                                      df_glmer.end_final.values):
    # Only the last underscore separates the token position from the sentence id
    sent_id, token_id = sent_token.rsplit("_", 1)
    key = (sent_id, int(token_id))

    # setdefault keeps the first row seen for an id, matching a `.values[0]` lookup
    instant_dict.setdefault(key, str(is_instant))
    start_dict.setdefault(key, str(has_start))
    end_dict.setdefault(key, str(has_end))

### Sent_token ids where both "has_start" and "has_end" are False

In [7]:
# Sentence-token ids flagged with neither a start nor an end
df_glmer.loc[~(df_glmer.start_final | df_glmer.end_final), 'sent_token'].values

array(['en-ud-train.conllu 11844_7', 'en-ud-train.conllu 12377_42',
       'en-ud-train.conllu 301_1', 'en-ud-train.conllu 5948_12',
       'en-ud-train.conllu 5948_17', 'en-ud-train.conllu 6173_6',
       'en-ud-train.conllu 6387_2', 'en-ud-train.conllu 65_57',
       'en-ud-train.conllu 867_49', 'en-ud-train.conllu 9491_22'],
      dtype=object)

#### Select sentence-token ids where at least one of "has_start" or "has_end" is True

In [8]:
#Ids to be removed: rows whose start and end flags are both False
neither_start_nor_end = (df_glmer.start_final == False) & (df_glmer.end_final == False)
temp_ids = list(df_glmer[neither_start_nor_end].sent_token.values)

# "<sentence id>_<token position>"  ->  (sentence id, token position as int)
remove_set = {(parts[0], int(parts[1])) for parts in (x.split("_") for x in temp_ids)}
print("IDs to be removed: ", remove_set, "\n", sep="\n")

#All ids present in the event-type pilot data
temp_total_ids = list(pilot_data.sent_token.unique())
total_set = {(parts[0], int(parts[1])) for parts in (x.split("_") for x in temp_total_ids)}

#Ids to be selected: everything that is not in the removal set
final_select_set = total_set - remove_set
print("No. of Selected token-ids: {}".format(len(final_select_set)))

IDs to be removed: 
{('en-ud-train.conllu 12377', 42), ('en-ud-train.conllu 867', 49), ('en-ud-train.conllu 9491', 22), ('en-ud-train.conllu 11844', 7), ('en-ud-train.conllu 5948', 12), ('en-ud-train.conllu 6387', 2), ('en-ud-train.conllu 5948', 17), ('en-ud-train.conllu 65', 57), ('en-ud-train.conllu 6173', 6), ('en-ud-train.conllu 301', 1)}


No. of Selected token-ids: 190


In [9]:
# Number of distinct (sentence id, token position) pairs held in total_set
len(total_set)

200

In [10]:
def replace_to_html(s):
    '''
    Return a copy of the string `s` in which quotes and square brackets are
    replaced by their HTML entities:

        '  ->  &#39;      "  ->  &quot;
        [  ->  &lsqb;     ]  ->  &rsqb;

    All other characters are left untouched.
    '''
    # One translation table handles all four characters in a single pass;
    # none of the entities contains a character that is itself replaced.
    html_entities = str.maketrans({
        "'": "&#39;",
        '"': "&quot;",
        "[": "&lsqb;",
        "]": "&rsqb;",
    })

    return s.translate(html_entities)

def replace_to_turk(s):
    '''
    Rewrite the string `s` (a quoted, stringified list of JSON strings) into
    the form expected in the Turk input CSV and return the result.

    The rewrite rules below are applied one after another, in order.
    '''
    rewrite_rules = (
        # double every double quote that follows a character other than "]"
        # (so the quote opening the string and the one right after the
        #  closing "]" stay single)
        (r'([^\]])\"', r'\1""'),
        # drop the single quote in front of "{" ...
        (r"\'\{", r"{"),
        # ... and the one right after "}"
        (r"\}\'", r"}"),
        # turn two consecutive backslashes into three
        (r"\\\\", r"\\\\\\"),
        # put a space before every opening <span ...
        (r"<span", r" <span"),
        # ... and move the space in front of </span> to after it
        (r" </span>", r"</span> "),
    )

    for pattern, replacement in rewrite_rules:
        s = re.sub(pattern, replacement, s)

    return s

def combinations(l):
    '''
    Input: List containing some elements

    Returns: list of all nC2 pairs (as 2-tuples) of items in the list,
            where n = length of list. Every item is paired with each item
            that comes after it, so pairs keep the order of the input.
    '''
    return [(first, second)
            for pos, first in enumerate(l)
            for second in l[pos + 1:]]

In [11]:
def _mark_predicates(tokens, position1, position2):
    '''Return a copy of `tokens` with the tokens at both positions wrapped in predicate spans.'''
    marked = list(tokens)

    # Wrap the right-most predicate first: inserting the markers there does not
    # shift the index of the other predicate, so the result is correct
    # regardless of the order in which the two positions are given.
    for position in sorted({position1, position2}, reverse=True):
        marked[position:position + 1] = ['<span class=\"predicate\">',
                                         marked[position],
                                         '</span>']
    return marked


def extract_list(ud_data, happen_set, instant_dict, start_dict, end_dict, hit_size=10):
    '''
    Build the HIT batches of predicate pairs from the UD data.

    Every pair of predicates that PredPatt finds in a sentence is considered;
    a pair is kept only when both of its (sentence id, token position) keys
    are in `happen_set`. Kept pairs are serialised to JSON strings and grouped
    into batches of `hit_size`.

    Input:
    1. ud_data: list of paths to CoNLL-U files
    2. happen_set: set of (sentence_id, token position) tuples to keep, where
       sentence_id is "<file name> <sentence number>"
    3. instant_dict, start_dict, end_dict: mappings from the same kind of
       tuple to the instant / start / end value stored for that token
    4. hit_size: number of pairs per batch (default 10)

    Returns: (global_list, out_list)
    - global_list: list of batches; each batch is a list of at most
      `hit_size` JSON strings. The 'id' field of each JSON object is the
      1-based position of the pair inside its batch.
    - out_list: one (sentence_id, pred_token1, pred_token2, sentence) tuple
      per kept pair, with the token positions as strings.

    Raises:
    - ValueError if hit_size is smaller than 1
    - KeyError if a kept token is missing from one of the three dicts

    Also prints the number of pairs seen and the number filtered out.
    '''
    if hit_size < 1:
        raise ValueError("hit_size must be a positive integer")

    global_list = []
    local_list = []
    out_list = []
    sent_total = 0
    sent_removed = 0

    for ud_data_path in ud_data:
        fname = os.path.basename(ud_data_path)

        # CoNLL-U files are UTF-8; do not depend on the platform default encoding
        with open(ud_data_path, encoding="utf-8") as infile:
            data = replace_to_html(infile.read())
        parsed = [(PredPatt(ud_parse), sent_id) for sent_id, ud_parse in load_conllu(data)]

        #Iterating through each parsed sentence
        for pred_object, sent_id in parsed:
            raw_sentence = [token.text for token in pred_object.tokens]
            # The part after the last underscore of the parser's sentence id is
            # used as the sentence number
            sentence_id = fname + " " + sent_id.split("_")[-1]

            #Iterating through each predicate combination
            for pred_comb in combinations(pred_object.instances):
                pred_token1 = pred_comb[0].root.position
                pred_token2 = pred_comb[1].root.position

                #create tuples to check if they exist in the event_happened master set
                sent_total += 1
                sent_tuple1 = (sentence_id, pred_token1)
                sent_tuple2 = (sentence_id, pred_token2)

                #remove if either predicate didn't happen
                # (checked before any string building: most pairs are dropped here)
                if (sent_tuple1 not in happen_set) or (sent_tuple2 not in happen_set):
                    sent_removed += 1
                    continue

                # Start a new batch once the current one is full
                if len(local_list) == hit_size:
                    global_list.append(local_list)
                    local_list = []

                pred_sentence = _mark_predicates(raw_sentence, pred_token1, pred_token2)

                # Key order is kept stable because it is the order of the
                # fields in the emitted JSON
                token_dict = {
                    'pred_token1': str(pred_token1),
                    'pred1': pred_comb[0].root.text,
                    'pred_token2': str(pred_token2),
                    'pred2': pred_comb[1].root.text,
                    'sentence': " ".join(pred_sentence),
                    'sentence_id': sentence_id,
                    'id': len(local_list) + 1,
                    #Instant, start and end values for each sent_token id
                    'instant_pred1': instant_dict[sent_tuple1],
                    'instant_pred2': instant_dict[sent_tuple2],
                    'start_pred1': start_dict[sent_tuple1],
                    'start_pred2': start_dict[sent_tuple2],
                    'end_pred1': end_dict[sent_tuple1],
                    'end_pred2': end_dict[sent_tuple2],
                }

                out_list.append((token_dict['sentence_id'], token_dict['pred_token1'],
                                 token_dict['pred_token2'], token_dict['sentence']))
                local_list.append(json.dumps(token_dict))

    #Append the last (possibly partial) batch; skip it when nothing was kept
    #so that no empty HIT is produced
    if local_list:
        global_list.append(local_list)

    print("Total number of sentences found: {}".format(sent_total))
    print("Number of sentences removed due to non-event: {}".format(sent_removed))

    return global_list, out_list

In [12]:
#use "final_select_set" instead of "total_set" if you want to
#exclude sentences with neither a start point nor an end-point.

gl_list, out_list = extract_list(ud_data, total_set, instant_dict, start_dict, end_dict)

Total number of sentences found: 31867
Number of sentences removed due to non-event: 31767


In [13]:
# gl_list holds one entry per batch returned by extract_list; each batch becomes one HIT
print("No. of HITs: {}".format(len(gl_list)))

No. of HITs: 10


In [14]:
#create a csv file for MTurk:
# The file has a single column: the header row "var_arrays", then one row per
# batch in gl_list. A row is the string form of the batch wrapped in double
# quotes and passed through replace_to_turk.
# The file is only written, so plain 'w' is enough; the encoding is explicit so
# that non-ASCII tokens are written the same way on every platform.
with open('test_protocol2.csv', 'w', encoding='utf-8') as file_handler:
    file_handler.write("var_arrays\n")
    for item in gl_list:
        local_str = "\"" + str(item) + "\"\n"
        file_handler.write(replace_to_turk(local_str))